fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

zoekt: read posting offset from disk (#529)

The btree still referenced the offsets to the posting lists which had a
big impact on the heap usage of webserver.

With this change we just hold on to the simple section pointing to the
offsets, i.e. we read offsets lazily and therefore incur another disk
access.

I added a unit test for ngramIndex.Get, because there wasn't really a
good test so far that captured the expected size of a posting list.

In total, after this change, the retrieval of a posting list for a given
ngram via the btree requires 2 disk accesses (1 for getting the bucket
ngrams, 1 for getting the offset to the posting list).

If this turns out to be too costly with regard to latency, we should
consider storing the offset together with the ngrams in the bucket.

+124 -21
+48 -18
btree.go
··· 286 286 // buckets 287 287 ngramSec simpleSection 288 288 289 - postingOffsets []uint32 290 - postingDataSentinelOffset uint32 289 + postingIndex simpleSection 291 290 } 292 291 293 292 func (b btreeIndex) SizeBytes() (sz int) { ··· 295 294 sz += int(pointerSize) + b.bt.sizeBytes() 296 295 // ngramSec 297 296 sz += 8 298 - // postingOffsets 299 - sz += int(sliceHeaderBytes) + 4*len(b.postingOffsets) 297 + // postingIndex 298 + sz += 8 300 299 // postingDataSentinelOffset 301 300 sz += 4 302 301 return ··· 338 337 return b.getPostingList(postingIndexOffset + x) 339 338 } 340 339 340 + // getPostingList returns the simple section pointing to the posting list of 341 + // the ngram at ngramIndex. 342 + // 343 + // Assumming we don't hit a page boundary, which should be rare given that we 344 + // only read 8 bytes, we need 1 disk access to read the posting offset. 345 + func (b btreeIndex) getPostingList(ngramIndex int) simpleSection { 346 + relativeOffsetBytes := uint32(ngramIndex) * 4 347 + 348 + if relativeOffsetBytes+8 <= b.postingIndex.sz { 349 + // read 2 offsets 350 + o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 8) 351 + if err != nil { 352 + return simpleSection{} 353 + } 354 + 355 + start := binary.BigEndian.Uint32(o[0:4]) 356 + end := binary.BigEndian.Uint32(o[4:8]) 357 + return simpleSection{ 358 + off: start, 359 + sz: end - start, 360 + } 361 + } else { 362 + // last ngram => read 1 offset and calculate the size of the posting 363 + // list from the offset of index section. 
364 + o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 4) 365 + if err != nil { 366 + return simpleSection{} 367 + } 368 + 369 + start := binary.BigEndian.Uint32(o[0:4]) 370 + return simpleSection{ 371 + off: start, 372 + // The layout of the posting list compound section on disk is 373 + // 374 + // start b.postingIndex.off 375 + // v v 376 + // [[posting lists (simple section)][index (simple section)]] 377 + // <----------> 378 + // last posting list 379 + // 380 + sz: b.postingIndex.off - start, 381 + } 382 + } 383 + } 384 + 341 385 func (b btreeIndex) getBucket(bucketIndex int) (off uint32, sz uint32) { 342 386 // All but the rightmost bucket have exactly bucketSize/2 ngrams 343 387 sz = uint32(b.bt.opts.bucketSize / 2 * ngramEncoding) ··· 373 417 374 418 return m 375 419 } 376 - 377 - func (b btreeIndex) getPostingList(postingIndex int) simpleSection { 378 - if postingIndex+1 < len(b.postingOffsets) { 379 - return simpleSection{ 380 - off: b.postingOffsets[postingIndex], 381 - sz: b.postingOffsets[postingIndex+1] - b.postingOffsets[postingIndex], 382 - } 383 - } else { 384 - return simpleSection{ 385 - off: b.postingOffsets[postingIndex], 386 - sz: b.postingDataSentinelOffset - b.postingOffsets[postingIndex], 387 - } 388 - } 389 - }
+2 -3
read.go
··· 516 516 517 517 bi.bt = bt 518 518 519 + // hold on to simple sections (8 bytes each) 519 520 bi.ngramSec = toc.ngramText 520 - 521 - bi.postingOffsets = toc.postings.offsets 522 - bi.postingDataSentinelOffset = toc.postings.data.off + toc.postings.data.sz 521 + bi.postingIndex = toc.postings.index 523 522 524 523 return bi, nil 525 524 }
+74
read_test.go
··· 121 121 } 122 122 } 123 123 124 + func TestGet(t *testing.T) { 125 + b, err := NewIndexBuilder(nil) 126 + if err != nil { 127 + t.Fatalf("NewIndexBuilder: %v", err) 128 + } 129 + 130 + if err := b.AddFile("file_name", []byte("aaa bbbaaa")); err != nil { 131 + t.Fatalf("AddFile: %v", err) 132 + } 133 + 134 + var buf bytes.Buffer 135 + if err := b.Write(&buf); err != nil { 136 + t.Fatal(err) 137 + } 138 + f := &memSeeker{buf.Bytes()} 139 + 140 + r := reader{r: f} 141 + 142 + var toc indexTOC 143 + if err := r.readTOC(&toc); err != nil { 144 + t.Errorf("got read error %v", err) 145 + } 146 + 147 + id, err := r.readIndexData(&toc) 148 + if err != nil { 149 + t.Fatalf("readIndexData: %v", err) 150 + } 151 + 152 + var off uint32 = 96 153 + 154 + cases := []struct { 155 + ng string 156 + wantPostingList simpleSection 157 + }{ 158 + { 159 + ng: " bb", 160 + wantPostingList: simpleSection{off: off, sz: 1}, 161 + }, 162 + { 163 + ng: "a b", 164 + wantPostingList: simpleSection{off: off + 1, sz: 1}, 165 + }, 166 + { 167 + ng: "aa ", 168 + wantPostingList: simpleSection{off: off + 2, sz: 1}, 169 + }, 170 + { 171 + ng: "aaa", 172 + wantPostingList: simpleSection{off: off + 3, sz: 2}, 173 + }, 174 + { 175 + ng: "baa", 176 + wantPostingList: simpleSection{off: off + 5, sz: 1}, 177 + }, 178 + { 179 + ng: "bba", 180 + wantPostingList: simpleSection{off: off + 6, sz: 1}, 181 + }, 182 + { 183 + ng: "bbb", 184 + wantPostingList: simpleSection{off: off + 7, sz: 1}, 185 + }, 186 + } 187 + 188 + for _, tt := range cases { 189 + t.Run(tt.ng, func(t *testing.T) { 190 + havePostingList := id.ngrams.Get(stringToNGram(tt.ng)) 191 + if !reflect.DeepEqual(tt.wantPostingList, havePostingList) { 192 + t.Fatalf("\nwant:%+v\ngot: %+v", tt.wantPostingList, havePostingList) 193 + } 194 + }) 195 + } 196 + } 197 + 124 198 func loadShard(fn string) (Searcher, error) { 125 199 f, err := os.Open(fn) 126 200 if err != nil {