fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

score: do scoring on candidateMatches (#723)

This is a refactor which removes our duplicated scoring logic for
ChunkMatch vs LineMatch. Instead we score slices of []*candidateMatch.

Besides being a good refactor in its own right, this matters because
candidateMatch is a much more appropriate structure for carrying extra
scoring information than our public API types. So this enables the work
we want to do around atom-based scoring.

The only behaviour changes in this commit are two fixes:
- DocumentSection caching would fail if empty since we relied on the
empty cache to be non-nil. This led to inflated ContentBytesLoaded.
- Empty FileMatch would still read in DocumentSection, even though it
wasn't needed.

Test Plan: go test. Our coverage is decent: we have lots of ranking
tests, none of which changed, and we have hardcoded stats of work done,
which also did not change (except for the above fixes).

+98 -68
+88 -66
contentprovider.go
··· 140 140 func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 141 141 var result []LineMatch 142 142 if ms[0].fileName { 143 + score, debugScore, _ := p.candidateMatchScore(ms, language, debug) 144 + 143 145 // There is only "line" in a filename. 144 146 res := LineMatch{ 145 147 Line: p.id.fileName(p.idx), 146 148 FileName: true, 149 + 150 + Score: score, 151 + DebugScore: debugScore, 147 152 } 148 153 149 154 for _, m := range ms { ··· 157 162 } 158 163 } else { 159 164 ms = breakMatchesOnNewlines(ms, p.data(false)) 160 - result = p.fillContentMatches(ms, numContextLines) 161 - } 162 - 163 - sects := p.docSections() 164 - for i, m := range result { 165 - result[i].Score, result[i].DebugScore = p.matchScore(sects, &m, language, debug) 165 + result = p.fillContentMatches(ms, numContextLines, language, debug) 166 166 } 167 167 168 168 return result ··· 179 179 if ms[0].fileName { 180 180 // If the first match is a filename match, there will only be 181 181 // one match and the matched content will be the filename. 
182 + 183 + score, debugScore, _ := p.candidateMatchScore(ms, language, debug) 182 184 183 185 fileName := p.id.fileName(p.idx) 184 186 ranges := make([]Range, 0, len(ms)) ··· 202 204 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 203 205 Ranges: ranges, 204 206 FileName: true, 207 + 208 + Score: score, 209 + DebugScore: debugScore, 205 210 }} 206 211 } else { 207 - result = p.fillContentChunkMatches(ms, numContextLines) 208 - } 209 - 210 - sects := p.docSections() 211 - for i, m := range result { 212 - result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug) 212 + result = p.fillContentChunkMatches(ms, numContextLines, language, debug) 213 213 } 214 214 215 215 return result 216 216 } 217 217 218 - func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch { 218 + func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 219 219 var result []LineMatch 220 220 for len(ms) > 0 { 221 221 m := ms[0] ··· 271 271 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines) 272 272 } 273 273 274 - for _, m := range lineCands { 274 + score, debugScore, symbolInfo := p.candidateMatchScore(lineCands, language, debug) 275 + finalMatch.Score = score 276 + finalMatch.DebugScore = debugScore 277 + 278 + for i, m := range lineCands { 275 279 fragment := LineFragmentMatch{ 276 280 Offset: m.byteOffset, 277 281 LineOffset: int(m.byteOffset) - lineStart, 278 282 MatchLength: int(m.byteMatchSz), 279 283 } 280 - if m.symbol { 281 - start := p.id.fileEndSymbol[p.idx] 282 - fragment.SymbolInfo = p.id.symbols.data(start + m.symbolIdx) 283 - if fragment.SymbolInfo != nil { 284 - sec := p.docSections()[m.symbolIdx] 285 - fragment.SymbolInfo.Sym = string(data[sec.Start:sec.End]) 286 - } 284 + if i < len(symbolInfo) && symbolInfo[i] != nil { 285 + fragment.SymbolInfo = symbolInfo[i] 287 286 } 288 287 289 288 
finalMatch.LineFragments = append(finalMatch.LineFragments, fragment) ··· 293 292 return result 294 293 } 295 294 296 - func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch { 295 + func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 297 296 newlines := p.newlines() 298 297 data := p.data(false) 299 298 ··· 311 310 chunks := chunkCandidates(ms, newlines, numContextLines) 312 311 chunkMatches := make([]ChunkMatch, 0, len(chunks)) 313 312 for _, chunk := range chunks { 313 + score, debugScore, symbolInfo := p.candidateMatchScore(chunk.candidates, language, debug) 314 + 314 315 ranges := make([]Range, 0, len(chunk.candidates)) 315 - var symbolInfo []*Symbol 316 - for i, cm := range chunk.candidates { 316 + for _, cm := range chunk.candidates { 317 317 startOffset := cm.byteOffset 318 318 endOffset := cm.byteOffset + cm.byteMatchSz 319 319 startLine, startLineOffset, _ := newlines.atOffset(startOffset) ··· 331 331 Column: columnHelper.get(endLineOffset, endOffset), 332 332 }, 333 333 }) 334 - 335 - if cm.symbol { 336 - if symbolInfo == nil { 337 - symbolInfo = make([]*Symbol, len(chunk.candidates)) 338 - } 339 - start := p.id.fileEndSymbol[p.idx] 340 - si := p.id.symbols.data(start + cm.symbolIdx) 341 - if si != nil { 342 - sec := p.docSections()[cm.symbolIdx] 343 - si.Sym = string(data[sec.Start:sec.End]) 344 - } 345 - symbolInfo[i] = si 346 - } 347 334 } 348 335 349 336 firstLineNumber := int(chunk.firstLine) - numContextLines ··· 362 349 FileName: false, 363 350 Ranges: ranges, 364 351 SymbolInfo: symbolInfo, 352 + Score: score, 353 + DebugScore: debugScore, 365 354 }) 366 355 } 367 356 return chunkMatches ··· 548 537 549 538 // findSection checks whether a section defined by offset and size lies within 550 539 // one of the sections in secs. 
551 - func findSection(secs []DocumentSection, off, sz uint32) (int, bool) { 540 + func findSection(secs []DocumentSection, off, sz uint32) (uint32, bool) { 552 541 j := sort.Search(len(secs), func(i int) bool { 553 542 return secs[i].End >= off+sz 554 543 }) ··· 558 547 } 559 548 560 549 if secs[j].Start <= off && off+sz <= secs[j].End { 561 - return j, true 550 + return uint32(j), true 562 551 } 563 552 return 0, false 564 553 } 565 554 566 - func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) { 555 + func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) { 556 + if cm.fileName { 557 + return DocumentSection{}, nil, false 558 + } 559 + 560 + secs := p.docSections() 561 + 562 + secIdx, ok := cm.symbolIdx, cm.symbol 563 + if !ok { 564 + // Not from a symbol matchtree. Lets see if it intersects with a symbol. 565 + secIdx, ok = findSection(secs, cm.byteOffset, cm.byteMatchSz) 566 + } 567 + if !ok { 568 + return DocumentSection{}, nil, false 569 + } 570 + 571 + sec := secs[secIdx] 572 + 573 + // Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym 574 + // since some callsites do not need it stored, and that incurs an extra 575 + // copy. 576 + // 577 + // 2024-01-08 we are refactoring this and the code path indicates this can 578 + // fail, so callers need to handle nil symbol. However, it would be 579 + // surprising that we have a matching section but not symbol data. 
580 + start := p.id.fileEndSymbol[p.idx] 581 + si := p.id.symbols.data(start + secIdx) 582 + 583 + return sec, si, true 584 + } 585 + 586 + func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language string, debug bool) (float64, string, []*Symbol) { 567 587 type debugScore struct { 568 588 what string 569 589 score float64 ··· 579 599 score.score += s 580 600 } 581 601 582 - data := p.data(m.FileName) 583 602 filename := p.data(true) 603 + var symbolInfo []*Symbol 584 604 585 - for i, r := range m.Ranges { 586 - // calculate the start and end offset relative to the start of the content 587 - relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset) 588 - relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset) 605 + for i, m := range ms { 606 + data := p.data(m.fileName) 589 607 590 - startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset])) 591 - endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset])) 608 + endOffset := m.byteOffset + m.byteMatchSz 609 + startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset])) 610 + endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset])) 592 611 593 612 score.score = 0 594 613 score.what = "" ··· 599 618 addScore("PartialWordMatch", scorePartialWordMatch) 600 619 } 601 620 602 - if m.FileName { 603 - sep := bytes.LastIndexByte(m.Content, '/') 604 - startMatch := relStartOffset == sep+1 605 - endMatch := relEndOffset == len(m.Content) 621 + if m.fileName { 622 + sep := bytes.LastIndexByte(data, '/') 623 + startMatch := int(m.byteOffset) == sep+1 624 + endMatch := endOffset == uint32(len(data)) 606 625 if startMatch && endMatch { 607 626 addScore("Base", 
scoreBase) 608 627 } else if startMatch || endMatch { 609 628 addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 610 - } else if sep < relStartOffset { 629 + } else if sep < int(m.byteOffset) { 611 630 addScore("InnerBase", scorePartialBase) 612 631 } 613 - } else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok { 614 - sec := secs[secIdx] 615 - startMatch := sec.Start == uint32(r.Start.ByteOffset) 616 - endMatch := sec.End == uint32(r.End.ByteOffset) 632 + } else if sec, si, ok := p.findSymbol(m); ok { 633 + startMatch := sec.Start == m.byteOffset 634 + endMatch := sec.End == endOffset 617 635 if startMatch && endMatch { 618 636 addScore("Symbol", scoreSymbol) 619 637 } else if startMatch || endMatch { ··· 622 640 addScore("InnerSymbol", scorePartialSymbol) 623 641 } 624 642 625 - var si *Symbol 626 - if m.SymbolInfo != nil { 627 - si = m.SymbolInfo[i] 628 - } 629 - if si == nil { 630 - // for non-symbol queries, we need to hydrate in SymbolInfo. 631 - start := p.id.fileEndSymbol[p.idx] 632 - si = p.id.symbols.data(start + uint32(secIdx)) 633 - } 643 + // Score based on symbol data 634 644 if si != nil { 635 645 symbolKind := ctags.ParseSymbolKind(si.Kind) 636 646 sym := sectionSlice(data, sec) 647 + 637 648 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind)) 649 + 650 + // This is from a symbol tree, so we need to store the symbol 651 + // information. 652 + if m.symbol { 653 + if symbolInfo == nil { 654 + symbolInfo = make([]*Symbol, len(ms)) 655 + } 656 + // findSymbols does not hydrate in Sym. So we need to store it. 
657 + si.Sym = string(sym) 658 + symbolInfo[i] = si 659 + } 638 660 } 639 661 } 640 662 ··· 648 670 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", ")) 649 671 } 650 672 651 - return maxScore.score, maxScore.what 673 + return maxScore.score, maxScore.what, symbolInfo 652 674 } 653 675 654 676 func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) {
+1 -1
eval_test.go
··· 160 160 161 161 t.Run("stats", func(t *testing.T) { 162 162 got, want := sr.Stats, Stats{ 163 - ContentBytesLoaded: 2, 163 + ContentBytesLoaded: 0, 164 164 FileCount: 2, 165 165 FilesConsidered: 2, 166 166 FilesSkipped: 2,
+9 -1
read.go
··· 532 532 return nil, 0, err 533 533 } 534 534 535 - return unmarshalDocSections(blob, buf), sec.sz, nil 535 + ds := unmarshalDocSections(blob, buf) 536 + 537 + // can be nil if buf is nil and there are no doc sections. However, we rely 538 + // on it being non-nil to cache the read. 539 + if ds == nil { 540 + ds = make([]DocumentSection, 0) 541 + } 542 + 543 + return ds, sec.sz, nil 536 544 } 537 545 538 546 func (d *indexData) readRanks(toc *indexTOC) error {