matchtree: capture Stats before pruning (#607)

We now call updateStats up to twice per shard search. The intention of
this is to capture statistics before pruning the matchtree. Previously
we would have done work in creating a matchtree but would then prune those
items away and would then never capture those statistics.

In practice that work was reading just one or two varints (the size of a
posting list) so it likely had minimal impact on the reported statistics.
However, in the next commit we want to introduce a statistic which is
recorded even if we generate a noMatchTree.

The main technical part of this commit is ensuring all existing
updateStats functions can be called twice without overcounting.

Test Plan: go test

author

Keegan Carruthers-Smith committer

GitHub date 3 years ago (Jun 29, 2023, 12:17 PM +0200) commit 93f7b0c9 93f7b0c983a62f6aaf355935b8bbb482e44bb590 parent b9e6d943 b9e6d9433e2e4438be6727ac3f1ae0a00e283220

+31 -13

4 changed files

Expand all

eval.go

hititer.go

matchiter.go

matchtree.go

+5 -5

eval.go

··· 181 181 return nil, err 182 182 } 183 183 184 + // Capture the costs of construction before pruning 185 + updateMatchTreeStats(mt, &res.Stats) 186 + 184 187 mt, err = pruneMatchTree(mt) 185 188 if err != nil { 186 189 return nil, err ··· 380 383 } 381 384 } 382 385 383 - visitMatchTree(mt, func(mt matchTree) { 384 - if atom, ok := mt.(interface{ updateStats(*Stats) }); ok { 385 - atom.updateStats(&res.Stats) 386 - } 387 - }) 386 + // Update stats based on work done during document search. 387 + updateMatchTreeStats(mt, &res.Stats) 388 388 389 389 // If document ranking is enabled, then we can rank and truncate the files to save memory. 390 390 if limit := opts.MaxDocDisplayCount; opts.UseDocumentRanks && limit > 0 && limit < len(res.Files) {

+11 -8

hititer.go

··· 183 183 // compressedPostingIterator goes over a delta varint encoded posting 184 184 // list. 185 185 type compressedPostingIterator struct { 186 - blob, orig []byte 187 - _first uint32 188 - what ngram 186 + blob []byte 187 + indexBytesLoaded int 188 + _first uint32 189 + what ngram 189 190 } 190 191 191 192 func newCompressedPostingIterator(b []byte, w ngram) *compressedPostingIterator { 192 193 d, sz := binary.Uvarint(b) 193 194 return &compressedPostingIterator{ 194 - _first: uint32(d), 195 - blob: b[sz:], 196 - orig: b, 197 - what: w, 195 + _first: uint32(d), 196 + blob: b[sz:], 197 + indexBytesLoaded: sz, 198 + what: w, 198 199 } 199 200 } 200 201 ··· 216 217 for i._first <= limit && len(i.blob) > 0 { 217 218 delta, sz := binary.Uvarint(i.blob) 218 219 i._first += uint32(delta) 220 + i.indexBytesLoaded += sz 219 221 i.blob = i.blob[sz:] 220 222 } 221 223 ··· 225 227 } 226 228 227 229 func (i *compressedPostingIterator) updateStats(s *Stats) { 228 - s.IndexBytesLoaded += int64(len(i.orig) - len(i.blob)) 230 + s.IndexBytesLoaded += int64(i.indexBytesLoaded) 231 + i.indexBytesLoaded = 0 229 232 } 230 233 231 234 // mergingIterator forms the merge of a set of hitIterators, to

matchiter.go

··· 63 63 docIterator 64 64 65 65 candidates() []*candidateMatch 66 + 67 + // updateStats is called twice. After matchtree construction and after 68 + // searching is done. Implementations must take care to not report 69 + // statistics twice. 66 70 updateStats(*Stats) 67 71 } 68 72 ··· 150 154 func (i *ngramDocIterator) updateStats(s *Stats) { 151 155 i.iter.updateStats(s) 152 156 s.NgramMatches += i.matchCount 157 + i.matchCount = 0 153 158 } 154 159 155 160 func (i *ngramDocIterator) candidates() []*candidateMatch {

+10

matchtree.go

··· 520 520 } 521 521 } 522 522 523 + // updateMatchTreeStats calls updateStats on all atoms in mt which have that 524 + // function defined. 525 + func updateMatchTreeStats(mt matchTree, stats *Stats) { 526 + visitMatchTree(mt, func(mt matchTree) { 527 + if atom, ok := mt.(interface{ updateStats(*Stats) }); ok { 528 + atom.updateStats(stats) 529 + } 530 + }) 531 + } 532 + 523 533 // visitMatches visits all atoms which can contribute matches. Note: This 524 534 // skips noVisitMatchTree. 525 535 func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) {

Configure Feed

Configure Feed