fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

zoekt: use b-tree for filenames (#530)

With this change we use the b-tree for filenames just like we do for content. Based on heap profiles from production instances, we anticipate a reduction of the heap by 15-20%. The change is behind the feature flag "ZOEKT_ENABLE_BTREE_NAME".

+75 -16
+11 -1
btree.go
··· 291 291 292 292 func (b btreeIndex) SizeBytes() (sz int) { 293 293 // btree 294 - sz += int(pointerSize) + b.bt.sizeBytes() 294 + if b.bt != nil { 295 + sz += int(pointerSize) + b.bt.sizeBytes() 296 + } 295 297 // ngramSec 296 298 sz += 8 297 299 // postingIndex ··· 308 310 // 3. Binary search the bucket (in MEM) 309 311 // 4. Return the simple section pointing to the posting list (in MEM) 310 312 func (b btreeIndex) Get(ng ngram) (ss simpleSection) { 313 + if b.bt == nil { 314 + return simpleSection{} 315 + } 316 + 311 317 // find bucket 312 318 bucketIndex, postingIndexOffset := b.bt.find(ng) 313 319 ··· 396 402 } 397 403 398 404 func (b btreeIndex) DumpMap() map[ngram]simpleSection { 405 + if b.bt == nil { 406 + return nil 407 + } 408 + 399 409 m := make(map[ngram]simpleSection, b.ngramSec.sz/ngramEncoding) 400 410 401 411 b.bt.visit(func(no node) {
+4 -1
hititer.go
··· 122 122 iters := make([]hitIterator, 0, len(variants)) 123 123 for _, v := range variants { 124 124 if fileName { 125 - blob := d.fileNameNgrams[v] 125 + blob, err := d.fileNameNgrams.GetBlob(v) 126 + if err != nil { 127 + return nil, err 128 + } 126 129 if len(blob) > 0 { 127 130 iters = append(iters, newCompressedPostingIterator(blob, v)) 128 131 }
+3 -3
indexdata.go
··· 56 56 57 57 fileNameContent []byte 58 58 fileNameIndex []uint32 59 - fileNameNgrams map[ngram][]byte 59 + fileNameNgrams fileNameNgrams 60 60 61 61 // fileEndSymbol[i] is the index of the first symbol for document i. 62 62 fileEndSymbol []uint32 ··· 317 317 if d.ngrams != nil { 318 318 sz += d.ngrams.SizeBytes() 319 319 } 320 - sz += 12 * len(d.fileNameNgrams) // these slices reference mmap-ed memory 320 + sz += d.fileNameNgrams.SizeBytes() 321 321 return sz 322 322 } 323 323 ··· 349 349 350 350 func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 { 351 351 if filename { 352 - return uint32(len(data.fileNameNgrams[ng])) 352 + return data.fileNameNgrams.Frequency(ng) 353 353 } 354 354 355 355 if data.ngrams == nil {
+37
ngramoffset.go
··· 364 364 SizeBytes() int 365 365 } 366 366 367 + // This is a temporary type to wrap two very different implementations of the 368 + // inverted index for the purpose of feature-flagging. We will remove this after 369 + // we enable the b-tree permanently. 370 + // 371 + // Alternatively we could have adapted/extended the interface "ngramIndex". 372 + // However, adapting the existing implementations and their tests to match the 373 + // access pattern of map[ngram][]byte seems more cumbersome than this makeshift 374 + // wrapper. In the end, both ngramIndex and this wrapper will be replaced by a 375 + // concrete type. 376 + type fileNameNgrams struct { 377 + m map[ngram][]byte 378 + bt btreeIndex 379 + } 380 + 381 + func (n fileNameNgrams) GetBlob(ng ngram) ([]byte, error) { 382 + if n.m != nil { 383 + return n.m[ng], nil 384 + } 385 + sec := n.bt.Get(ng) 386 + return n.bt.file.Read(sec.off, sec.sz) 387 + } 388 + 389 + func (n fileNameNgrams) Frequency(ng ngram) uint32 { 390 + if n.m != nil { 391 + return uint32(len(n.m[ng])) 392 + } 393 + return n.bt.Get(ng).sz 394 + } 395 + 396 + func (n fileNameNgrams) SizeBytes() int { 397 + if n.m != nil { 398 + // these slices reference mmap-ed memory 399 + return 12 * len(n.m) 400 + } 401 + return n.bt.SizeBytes() 402 + } 403 + 367 404 type binarySearchNgram struct { 368 405 // ngramText is the bytes at indexTOC.ngramText 369 406 //
+13 -10
read.go
··· 226 226 227 227 func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) { 228 228 d := indexData{ 229 - file: r.r, 230 - fileNameNgrams: map[ngram][]byte{}, 231 - branchIDs: []map[string]uint{}, 232 - branchNames: []map[uint]string{}, 229 + file: r.r, 230 + branchIDs: []map[string]uint{}, 231 + branchNames: []map[uint]string{}, 233 232 } 234 233 235 234 repos, md, err := r.readMetadata(toc) ··· 290 289 } 291 290 292 291 if os.Getenv("ZOEKT_ENABLE_BTREE") != "" { 293 - bt, err := d.newBtreeIndex(toc) 292 + bt, err := d.newBtreeIndex(toc.ngramText, toc.postings) 294 293 if err != nil { 295 294 return nil, err 296 295 } ··· 321 320 322 321 d.fileNameIndex = toc.fileNames.relativeIndex() 323 322 324 - d.fileNameNgrams, err = d.readFileNameNgrams(toc) 323 + if os.Getenv("ZOEKT_ENABLE_BTREE_NAME") != "" { 324 + d.fileNameNgrams.bt, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings) 325 + } else { 326 + d.fileNameNgrams.m, err = d.readFileNameNgrams(toc) 327 + } 325 328 if err != nil { 326 329 return nil, err 327 330 } ··· 496 499 }, nil 497 500 } 498 501 499 - func (d *indexData) newBtreeIndex(toc *indexTOC) (btreeIndex, error) { 502 + func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSection) (btreeIndex, error) { 500 503 bi := btreeIndex{file: d.file} 501 504 502 - textContent, err := d.readSectionBlob(toc.ngramText) 505 + textContent, err := d.readSectionBlob(ngramSec) 503 506 if err != nil { 504 507 return btreeIndex{}, err 505 508 } ··· 517 520 bi.bt = bt 518 521 519 522 // hold on to simple sections (8 bytes each) 520 - bi.ngramSec = toc.ngramText 521 - bi.postingIndex = toc.postings.index 523 + bi.ngramSec = ngramSec 524 + bi.postingIndex = postings.index 522 525 523 526 return bi, nil 524 527 }
+7 -1
read_test.go
··· 116 116 if !reflect.DeepEqual([]uint32{0, 4}, data.fileNameIndex) { 117 117 t.Errorf("got index %v, want {0,4}", data.fileNameIndex) 118 118 } 119 - if got := data.fileNameNgrams[stringToNGram("bCd")]; !reflect.DeepEqual(got, []byte{1}) { 119 + 120 + gotBlob, err := data.fileNameNgrams.GetBlob(stringToNGram("bCd")) 121 + if err != nil { 122 + t.Fatalf("fileNameNgrams.GetBlob: %v", err) 123 + } 124 + 125 + if !reflect.DeepEqual(gotBlob, []byte{1}) { 120 126 t.Errorf("got trigram bcd at bits %v, want sz 2", data.fileNameNgrams) 121 127 } 122 128 }