score: do scoring on candidateMatches (#723) · boltless.me/zoekt@b82a981

+88 -66

contentprovider.go

··· 140 140 func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 141 141 var result []LineMatch 142 142 if ms[0].fileName { 143 + score, debugScore, _ := p.candidateMatchScore(ms, language, debug) 144 + 143 145 // There is only "line" in a filename. 144 146 res := LineMatch{ 145 147 Line: p.id.fileName(p.idx), 146 148 FileName: true, 149 + 150 + Score: score, 151 + DebugScore: debugScore, 147 152 } 148 153 149 154 for _, m := range ms { ··· 157 162 } 158 163 } else { 159 164 ms = breakMatchesOnNewlines(ms, p.data(false)) 160 - result = p.fillContentMatches(ms, numContextLines) 161 - } 162 - 163 - sects := p.docSections() 164 - for i, m := range result { 165 - result[i].Score, result[i].DebugScore = p.matchScore(sects, &m, language, debug) 165 + result = p.fillContentMatches(ms, numContextLines, language, debug) 166 166 } 167 167 168 168 return result ··· 179 179 if ms[0].fileName { 180 180 // If the first match is a filename match, there will only be 181 181 // one match and the matched content will be the filename. 
182 + 183 + score, debugScore, _ := p.candidateMatchScore(ms, language, debug) 182 184 183 185 fileName := p.id.fileName(p.idx) 184 186 ranges := make([]Range, 0, len(ms)) ··· 202 204 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 203 205 Ranges: ranges, 204 206 FileName: true, 207 + 208 + Score: score, 209 + DebugScore: debugScore, 205 210 }} 206 211 } else { 207 - result = p.fillContentChunkMatches(ms, numContextLines) 208 - } 209 - 210 - sects := p.docSections() 211 - for i, m := range result { 212 - result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug) 212 + result = p.fillContentChunkMatches(ms, numContextLines, language, debug) 213 213 } 214 214 215 215 return result 216 216 } 217 217 218 - func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch { 218 + func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 219 219 var result []LineMatch 220 220 for len(ms) > 0 { 221 221 m := ms[0] ··· 271 271 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines) 272 272 } 273 273 274 - for _, m := range lineCands { 274 + score, debugScore, symbolInfo := p.candidateMatchScore(lineCands, language, debug) 275 + finalMatch.Score = score 276 + finalMatch.DebugScore = debugScore 277 + 278 + for i, m := range lineCands { 275 279 fragment := LineFragmentMatch{ 276 280 Offset: m.byteOffset, 277 281 LineOffset: int(m.byteOffset) - lineStart, 278 282 MatchLength: int(m.byteMatchSz), 279 283 } 280 - if m.symbol { 281 - start := p.id.fileEndSymbol[p.idx] 282 - fragment.SymbolInfo = p.id.symbols.data(start + m.symbolIdx) 283 - if fragment.SymbolInfo != nil { 284 - sec := p.docSections()[m.symbolIdx] 285 - fragment.SymbolInfo.Sym = string(data[sec.Start:sec.End]) 286 - } 284 + if i < len(symbolInfo) && symbolInfo[i] != nil { 285 + fragment.SymbolInfo = symbolInfo[i] 287 286 } 288 287 289 288 
finalMatch.LineFragments = append(finalMatch.LineFragments, fragment) ··· 293 292 return result 294 293 } 295 294 296 - func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch { 295 + func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 297 296 newlines := p.newlines() 298 297 data := p.data(false) 299 298 ··· 311 310 chunks := chunkCandidates(ms, newlines, numContextLines) 312 311 chunkMatches := make([]ChunkMatch, 0, len(chunks)) 313 312 for _, chunk := range chunks { 313 + score, debugScore, symbolInfo := p.candidateMatchScore(chunk.candidates, language, debug) 314 + 314 315 ranges := make([]Range, 0, len(chunk.candidates)) 315 - var symbolInfo []*Symbol 316 - for i, cm := range chunk.candidates { 316 + for _, cm := range chunk.candidates { 317 317 startOffset := cm.byteOffset 318 318 endOffset := cm.byteOffset + cm.byteMatchSz 319 319 startLine, startLineOffset, _ := newlines.atOffset(startOffset) ··· 331 331 Column: columnHelper.get(endLineOffset, endOffset), 332 332 }, 333 333 }) 334 - 335 - if cm.symbol { 336 - if symbolInfo == nil { 337 - symbolInfo = make([]*Symbol, len(chunk.candidates)) 338 - } 339 - start := p.id.fileEndSymbol[p.idx] 340 - si := p.id.symbols.data(start + cm.symbolIdx) 341 - if si != nil { 342 - sec := p.docSections()[cm.symbolIdx] 343 - si.Sym = string(data[sec.Start:sec.End]) 344 - } 345 - symbolInfo[i] = si 346 - } 347 334 } 348 335 349 336 firstLineNumber := int(chunk.firstLine) - numContextLines ··· 362 349 FileName: false, 363 350 Ranges: ranges, 364 351 SymbolInfo: symbolInfo, 352 + Score: score, 353 + DebugScore: debugScore, 365 354 }) 366 355 } 367 356 return chunkMatches ··· 548 537 549 538 // findSection checks whether a section defined by offset and size lies within 550 539 // one of the sections in secs. 
551 - func findSection(secs []DocumentSection, off, sz uint32) (int, bool) { 540 + func findSection(secs []DocumentSection, off, sz uint32) (uint32, bool) { 552 541 j := sort.Search(len(secs), func(i int) bool { 553 542 return secs[i].End >= off+sz 554 543 }) ··· 558 547 } 559 548 560 549 if secs[j].Start <= off && off+sz <= secs[j].End { 561 - return j, true 550 + return uint32(j), true 562 551 } 563 552 return 0, false 564 553 } 565 554 566 - func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) { 555 + func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) { 556 + if cm.fileName { 557 + return DocumentSection{}, nil, false 558 + } 559 + 560 + secs := p.docSections() 561 + 562 + secIdx, ok := cm.symbolIdx, cm.symbol 563 + if !ok { 564 + // Not from a symbol matchtree. Let's see if it intersects with a symbol. 565 + secIdx, ok = findSection(secs, cm.byteOffset, cm.byteMatchSz) 566 + } 567 + if !ok { 568 + return DocumentSection{}, nil, false 569 + } 570 + 571 + sec := secs[secIdx] 572 + 573 + // Now let's hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym 574 + // since some callsites do not need it stored, and that incurs an extra 575 + // copy. 576 + // 577 + // 2024-01-08 we are refactoring this and the code path indicates this can 578 + // fail, so callers need to handle nil symbol. However, it would be 579 + // surprising that we have a matching section but not symbol data. 
580 + start := p.id.fileEndSymbol[p.idx] 581 + si := p.id.symbols.data(start + secIdx) 582 + 583 + return sec, si, true 584 + } 585 + 586 + func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language string, debug bool) (float64, string, []*Symbol) { 567 587 type debugScore struct { 568 588 what string 569 589 score float64 ··· 579 599 score.score += s 580 600 } 581 601 582 - data := p.data(m.FileName) 583 602 filename := p.data(true) 603 + var symbolInfo []*Symbol 584 604 585 - for i, r := range m.Ranges { 586 - // calculate the start and end offset relative to the start of the content 587 - relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset) 588 - relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset) 605 + for i, m := range ms { 606 + data := p.data(m.fileName) 589 607 590 - startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset])) 591 - endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset])) 608 + endOffset := m.byteOffset + m.byteMatchSz 609 + startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset])) 610 + endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset])) 592 611 593 612 score.score = 0 594 613 score.what = "" ··· 599 618 addScore("PartialWordMatch", scorePartialWordMatch) 600 619 } 601 620 602 - if m.FileName { 603 - sep := bytes.LastIndexByte(m.Content, '/') 604 - startMatch := relStartOffset == sep+1 605 - endMatch := relEndOffset == len(m.Content) 621 + if m.fileName { 622 + sep := bytes.LastIndexByte(data, '/') 623 + startMatch := int(m.byteOffset) == sep+1 624 + endMatch := endOffset == uint32(len(data)) 606 625 if startMatch && endMatch { 607 626 addScore("Base", 
scoreBase) 608 627 } else if startMatch || endMatch { 609 628 addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 610 - } else if sep < relStartOffset { 629 + } else if sep < int(m.byteOffset) { 611 630 addScore("InnerBase", scorePartialBase) 612 631 } 613 - } else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok { 614 - sec := secs[secIdx] 615 - startMatch := sec.Start == uint32(r.Start.ByteOffset) 616 - endMatch := sec.End == uint32(r.End.ByteOffset) 632 + } else if sec, si, ok := p.findSymbol(m); ok { 633 + startMatch := sec.Start == m.byteOffset 634 + endMatch := sec.End == endOffset 617 635 if startMatch && endMatch { 618 636 addScore("Symbol", scoreSymbol) 619 637 } else if startMatch || endMatch { ··· 622 640 addScore("InnerSymbol", scorePartialSymbol) 623 641 } 624 642 625 - var si *Symbol 626 - if m.SymbolInfo != nil { 627 - si = m.SymbolInfo[i] 628 - } 629 - if si == nil { 630 - // for non-symbol queries, we need to hydrate in SymbolInfo. 631 - start := p.id.fileEndSymbol[p.idx] 632 - si = p.id.symbols.data(start + uint32(secIdx)) 633 - } 643 + // Score based on symbol data 634 644 if si != nil { 635 645 symbolKind := ctags.ParseSymbolKind(si.Kind) 636 646 sym := sectionSlice(data, sec) 647 + 637 648 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind)) 649 + 650 + // This is from a symbol tree, so we need to store the symbol 651 + // information. 652 + if m.symbol { 653 + if symbolInfo == nil { 654 + symbolInfo = make([]*Symbol, len(ms)) 655 + } 656 + // findSymbol does not hydrate in Sym. So we need to store it. 
657 + si.Sym = string(sym) 658 + symbolInfo[i] = si 659 + } 638 660 } 639 661 } 640 662 ··· 648 670 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", ")) 649 671 } 650 672 651 - return maxScore.score, maxScore.what 673 + return maxScore.score, maxScore.what, symbolInfo 652 674 } 653 675 654 676 func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) {

+1 -1

eval_test.go

··· 160 160 161 161 t.Run("stats", func(t *testing.T) { 162 162 got, want := sr.Stats, Stats{ 163 - ContentBytesLoaded: 2, 163 + ContentBytesLoaded: 0, 164 164 FileCount: 2, 165 165 FilesConsidered: 2, 166 166 FilesSkipped: 2,

+9 -1

read.go

··· 532 532 return nil, 0, err 533 533 } 534 534 535 - return unmarshalDocSections(blob, buf), sec.sz, nil 535 + ds := unmarshalDocSections(blob, buf) 536 + 537 + // can be nil if buf is nil and there are no doc sections. However, we rely 538 + // on it being non-nil to cache the read. 539 + if ds == nil { 540 + ds = make([]DocumentSection, 0) 541 + } 542 + 543 + return ds, sec.sz, nil 536 544 } 537 545 538 546 func (d *indexData) readRanks(toc *indexTOC) error {

Configure Feed

Configure Feed