···512512 ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding]))
513513 bt.insert(ng)
514514 }
515515-516516- // backfill "pointers" to the buckets and posting lists. Instead of
517517- // backfilling we could maintain state during insertion, however the
518518- // visitor pattern seems more natural and shouldn't be a performance issue,
519519- // because, based on the typical number of trigrams (500k) per shard, the
520520- // b-trees we construct here only have around 1000 leaf nodes.
521521- offset, bucketIndex := 0, 0
522522- bt.visit(func(no node) {
523523- switch n := no.(type) {
524524- case *leaf:
525525- n.bucketIndex = bucketIndex
526526- bucketIndex++
527527-528528- n.postingIndexOffset = offset
529529- offset += n.bucketSize
530530- case *innerNode:
531531- return
532532- }
533533- })
515515+ bt.freeze()
534516535517 bi.bt = bt
536518537537- bi.bucketOffsets = createBucketOffsets(toc.ngramText, btreeBucketSize)
519519+ bi.ngramSec = toc.ngramText
538520539521 bi.postingOffsets = toc.postings.offsets
540522 bi.postingDataSentinelOffset = toc.postings.data.off + toc.postings.data.sz
541523542524 return bi, nil
543543-}
544544-545545-// Because we insert ngrams into the btree in order, we can easily reconstruct
546546-// the buckets from the ngramText simpleSection just by knowing the bucketSize.
547547-// The last item of the returned slice is the sentinel value sec.off + sec.sz.
548548-func createBucketOffsets(sec simpleSection, bucketSize int) []uint32 {
549549- step := uint32((bucketSize / 2) * ngramEncoding)
550550-551551- offsets := make([]uint32, 0, ((sec.sz-1)/step)+1)
552552- offsets = append(offsets, sec.off)
553553-554554- sentinel := sec.off + sec.sz
555555- for off := sec.off + step; off+step < sentinel; off = off + step {
556556- offsets = append(offsets, off)
557557- }
558558- offsets = append(offsets, sentinel)
559559- return offsets
560525}
561526562527func (d *indexData) readFileNameNgrams(toc *indexTOC) (map[ngram][]byte, error) {