fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Support BM25 scoring for chunk matches (#889)

Before this change, BM25 scoring only applied to the overall `FileMatch` score.
The algorithm gathered term frequencies from all candidate matches in the file
to produce a file-level score. However, `LineMatch` and `ChunkMatch` scores were
still calculated using the classic Zoekt scoring algorithm.

This PR implements BM25 scoring for `LineMatch` and `ChunkMatch`. It does so by
calculating a BM25 score per line. Compared to the classic Zoekt algorithm, this
rewards multiple term matches on a line. Because our term frequency calculation
also boosts symbol matches, the score smoothly balances between "many term
matches" and "interesting term matches".

Now, the code is structured as follows:
* `scoreChunk`: goes through each line in the chunk, calculating its score
through `scoreLine`, and returns the score of the best-scoring line along with its line number
* `scoreLine`: calculates the score for a single line

The mental model is that "the score of a chunk is always the score of its best
line".

+475 -206
+21 -3
api.go
··· 19 19 "encoding/json" 20 20 "errors" 21 21 "fmt" 22 + "math" 22 23 "reflect" 23 24 "strconv" 24 25 "strings" ··· 133 134 sz += sliceHeaderBytes + uint64(len(m.Checksum)) 134 135 135 136 return 137 + } 138 + 139 + // addScore increments the score of the FileMatch by the computed score. If 140 + // debugScore is true, it also adds a debug string to the FileMatch. If raw is 141 + // -1, it is ignored. Otherwise, it is added to the debug string. 142 + func (m *FileMatch) addScore(what string, computed float64, raw float64, debugScore bool) { 143 + if computed != 0 && debugScore { 144 + var b strings.Builder 145 + fmt.Fprintf(&b, "%s", what) 146 + if raw != -1 { 147 + fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64)) 148 + } 149 + fmt.Fprintf(&b, ":%.2f, ", computed) 150 + m.Debug += b.String() 151 + } 152 + m.Score += computed 136 153 } 137 154 138 155 // ChunkMatch is a set of non-overlapping matches within a contiguous range of ··· 671 688 // Normalize the repo score within [0, maxUint16), with the midpoint at 5,000. 672 689 // This means popular repos (roughly ones with over 5,000 stars) see diminishing 673 690 // returns from more stars. 674 - r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16) 691 + r.Rank = uint16(r.priority / (5000.0 + r.priority) * math.MaxUint16) 675 692 } 676 693 } 677 694 ··· 687 704 return 0 688 705 } 689 706 months := int(t.Year()-1970)*12 + int(t.Month()-1) 690 - return uint16(min(months, maxUInt16)) 707 + return uint16(min(months, math.MaxUint16)) 691 708 } 692 709 693 710 // MergeMutable will merge x into r. mutated will be true if it made any ··· 976 993 977 994 // EXPERIMENTAL. If true, use text-search style scoring instead of the default 978 995 // scoring formula. The scoring algorithm treats each match in a file as a term 979 - // and computes an approximation to BM25. 996 + // and computes an approximation to BM25. 
When enabled, BM25 scoring is used for 997 + // the overall FileMatch score, as well as individual LineMatch and ChunkMatch scores. 980 998 // 981 999 // The calculation of IDF assumes that Zoekt visits all documents containing any 982 1000 // of the query terms during evaluation. This is true, for example, if all query
+15 -2
build/scoring_test.go
··· 94 94 language: "Java", 95 95 // bm25-score: 1.81 <- sum-termFrequencyScore: 116.00, length-ratio: 1.00 96 96 wantScore: 1.81, 97 - // line 3: public class InnerClasses { 98 - wantBestLineMatch: 3, 97 + // line 54: private static <A, B> B runInnerInterface(InnerInterface<A, B> fn, A a) { 98 + wantBestLineMatch: 54, 99 + }, { 100 + // Another content-only match 101 + fileName: "example.java", 102 + query: &query.And{Children: []query.Q{ 103 + &query.Substring{Pattern: "system"}, 104 + &query.Substring{Pattern: "time"}, 105 + }}, 106 + content: exampleJava, 107 + language: "Java", 108 + // bm25-score: 0.96 <- sum-termFrequencies: 12, length-ratio: 1.00 109 + wantScore: 0.96, 110 + // line 59: if (System.nanoTime() > System.currentTimeMillis()) { 111 + wantBestLineMatch: 59, 99 112 }, 100 113 { 101 114 // Matches only on filename
+22 -162
contentprovider.go
··· 16 16 17 17 import ( 18 18 "bytes" 19 - "fmt" 20 19 "log" 21 20 "path" 22 21 "slices" 23 22 "sort" 24 - "strings" 25 23 "unicode" 26 24 "unicode/utf8" 27 25 ··· 145 143 // 146 144 // Note: the byte slices may be backed by mmapped data, so before being 147 145 // returned by the API it needs to be copied. 148 - func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 146 + func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, opts *SearchOptions) []LineMatch { 149 147 var filenameMatches []*candidateMatch 150 148 contentMatches := make([]*candidateMatch, 0, len(ms)) 151 149 ··· 160 158 // If there are any content matches, we only return these and skip filename matches. 161 159 if len(contentMatches) > 0 { 162 160 contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false)) 163 - return p.fillContentMatches(contentMatches, numContextLines, language, debug) 161 + return p.fillContentMatches(contentMatches, numContextLines, language, opts) 164 162 } 165 163 166 164 // Otherwise, we return a single line containing the filematch match. 167 - bestMatch, _ := p.candidateMatchScore(filenameMatches, language, debug) 165 + lineScore, _ := p.scoreLine(filenameMatches, language, -1 /* must pass -1 for filenames */, opts) 168 166 res := LineMatch{ 169 167 Line: p.id.fileName(p.idx), 170 168 FileName: true, 171 - Score: bestMatch.score, 172 - DebugScore: bestMatch.debugScore, 169 + Score: lineScore.score, 170 + DebugScore: lineScore.debugScore, 173 171 } 174 172 175 173 for _, m := range ms { ··· 192 190 // 193 191 // Note: the byte slices may be backed by mmapped data, so before being 194 192 // returned by the API it needs to be copied. 
195 - func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 193 + func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, opts *SearchOptions) []ChunkMatch { 196 194 var filenameMatches []*candidateMatch 197 195 contentMatches := make([]*candidateMatch, 0, len(ms)) 198 196 ··· 206 204 207 205 // If there are any content matches, we only return these and skip filename matches. 208 206 if len(contentMatches) > 0 { 209 - return p.fillContentChunkMatches(contentMatches, numContextLines, language, debug) 207 + return p.fillContentChunkMatches(contentMatches, numContextLines, language, opts) 210 208 } 211 209 212 210 // Otherwise, we return a single chunk representing the filename match. 213 - bestMatch, _ := p.candidateMatchScore(filenameMatches, language, debug) 211 + lineScore, _ := p.scoreLine(filenameMatches, language, -1 /* must pass -1 for filenames */, opts) 214 212 fileName := p.id.fileName(p.idx) 215 213 ranges := make([]Range, 0, len(ms)) 216 214 for _, m := range ms { ··· 233 231 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 234 232 Ranges: ranges, 235 233 FileName: true, 236 - Score: bestMatch.score, 237 - DebugScore: bestMatch.debugScore, 234 + Score: lineScore.score, 235 + DebugScore: lineScore.debugScore, 238 236 }} 239 237 } 240 238 241 - func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 239 + func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, opts *SearchOptions) []LineMatch { 242 240 var result []LineMatch 243 241 for len(ms) > 0 { 244 242 m := ms[0] ··· 296 294 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines) 297 295 } 298 296 299 - bestMatch, symbolInfo := p.candidateMatchScore(lineCands, language, debug) 300 - finalMatch.Score = bestMatch.score 301 - 
finalMatch.DebugScore = bestMatch.debugScore 297 + lineScore, symbolInfo := p.scoreLine(lineCands, language, num, opts) 298 + finalMatch.Score = lineScore.score 299 + finalMatch.DebugScore = lineScore.debugScore 302 300 303 301 for i, m := range lineCands { 304 302 fragment := LineFragmentMatch{ ··· 306 304 LineOffset: int(m.byteOffset) - lineStart, 307 305 MatchLength: int(m.byteMatchSz), 308 306 } 307 + 309 308 if i < len(symbolInfo) && symbolInfo[i] != nil { 310 309 fragment.SymbolInfo = symbolInfo[i] 311 310 } ··· 317 316 return result 318 317 } 319 318 320 - func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 321 - newlines := p.newlines() 319 + func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, opts *SearchOptions) []ChunkMatch { 322 320 data := p.data(false) 323 321 324 322 // columnHelper prevents O(len(ms) * len(data)) lookups for all columns. 
··· 332 330 sort.Sort((sortByOffsetSlice)(ms)) 333 331 } 334 332 333 + newlines := p.newlines() 335 334 chunks := chunkCandidates(ms, newlines, numContextLines) 336 335 chunkMatches := make([]ChunkMatch, 0, len(chunks)) 337 336 for _, chunk := range chunks { 338 - bestMatch, symbolInfo := p.candidateMatchScore(chunk.candidates, language, debug) 339 - 340 337 ranges := make([]Range, 0, len(chunk.candidates)) 341 338 for _, cm := range chunk.candidates { 342 339 startOffset := cm.byteOffset ··· 363 360 } 364 361 firstLineStart := newlines.lineStart(firstLineNumber) 365 362 366 - bestLineMatch := 0 367 - if bestMatch.match != nil { 368 - bestLineMatch = newlines.atOffset(bestMatch.match.byteOffset) 369 - if debug { 370 - bestMatch.debugScore = fmt.Sprintf("%s, (line: %d)", bestMatch.debugScore, bestLineMatch) 371 - } 372 - } 373 - 363 + chunkScore, symbolInfo := p.scoreChunk(chunk.candidates, language, opts) 374 364 chunkMatches = append(chunkMatches, ChunkMatch{ 375 365 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1), 376 366 ContentStart: Location{ ··· 381 371 FileName: false, 382 372 Ranges: ranges, 383 373 SymbolInfo: symbolInfo, 384 - BestLineMatch: uint32(bestLineMatch), 385 - Score: bestMatch.score, 386 - DebugScore: bestMatch.debugScore, 374 + BestLineMatch: uint32(chunkScore.bestLine), 375 + Score: chunkScore.score, 376 + DebugScore: chunkScore.debugScore, 387 377 }) 388 378 } 389 379 return chunkMatches ··· 405 395 // output invariants: if you flatten candidates the input invariant is retained. 406 396 func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk { 407 397 var chunks []candidateChunk 398 + 408 399 for _, m := range ms { 409 400 startOffset := m.byteOffset 410 401 endOffset := m.byteOffset + m.byteMatchSz ··· 536 527 scoreKindMatch = 100.0 537 528 scoreFactorAtomMatch = 400.0 538 529 539 - // File-only scoring signals. 
For now these are also bounded ~9000 to give them 540 - // equal weight with the query-dependent signals. 541 - scoreFileRankFactor = 9000.0 542 - 543 530 // Used for ordering line and chunk matches within a file. 544 531 scoreLineOrderFactor = 1.0 545 532 ··· 641 628 si := p.id.symbols.data(start + secIdx) 642 629 643 630 return sec, si, true 644 - } 645 - 646 - // calculateTermFrequency computes the term frequency for the file match. 647 - // Notes: 648 - // * Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles. 649 - // * Symbol matches also count more than content matches, to reward matches on symbol definitions. 650 - func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int { 651 - // Treat each candidate match as a term and compute the frequencies. For now, ignore case 652 - // sensitivity and treat filenames and symbols the same as content. 653 - termFreqs := map[string]int{} 654 - for _, m := range cands { 655 - term := string(m.substrLowered) 656 - if m.fileName || p.matchesSymbol(m) { 657 - termFreqs[term] += 5 658 - } else { 659 - termFreqs[term]++ 660 - } 661 - } 662 - 663 - for term := range termFreqs { 664 - df[term] += 1 665 - } 666 - return termFreqs 667 - } 668 - 669 - // scoredMatch holds the score information for a candidate match. 670 - type scoredMatch struct { 671 - score float64 672 - debugScore string 673 - match *candidateMatch 674 - } 675 - 676 - // candidateMatchScore scores all candidate matches and returns the best-scoring match plus its score information. 677 - // Invariant: there should be at least one input candidate, len(ms) > 0. 
678 - func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language string, debug bool) (scoredMatch, []*Symbol) { 679 - score := 0.0 680 - what := "" 681 - 682 - addScore := func(w string, s float64) { 683 - if s != 0 && debug { 684 - what += fmt.Sprintf("%s:%.2f, ", w, s) 685 - } 686 - score += s 687 - } 688 - 689 - filename := p.data(true) 690 - var symbolInfo []*Symbol 691 - 692 - var bestMatch scoredMatch 693 - for i, m := range ms { 694 - data := p.data(m.fileName) 695 - 696 - endOffset := m.byteOffset + m.byteMatchSz 697 - startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset])) 698 - endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset])) 699 - 700 - score = 0 701 - what = "" 702 - 703 - if startBoundary && endBoundary { 704 - addScore("WordMatch", scoreWordMatch) 705 - } else if startBoundary || endBoundary { 706 - addScore("PartialWordMatch", scorePartialWordMatch) 707 - } 708 - 709 - if m.fileName { 710 - sep := bytes.LastIndexByte(data, '/') 711 - startMatch := int(m.byteOffset) == sep+1 712 - endMatch := endOffset == uint32(len(data)) 713 - if startMatch && endMatch { 714 - addScore("Base", scoreBase) 715 - } else if startMatch || endMatch { 716 - addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 717 - } else if sep < int(m.byteOffset) { 718 - addScore("InnerBase", scorePartialBase) 719 - } 720 - } else if sec, si, ok := p.findSymbol(m); ok { 721 - startMatch := sec.Start == m.byteOffset 722 - endMatch := sec.End == endOffset 723 - if startMatch && endMatch { 724 - addScore("Symbol", scoreSymbol) 725 - } else if startMatch || endMatch { 726 - addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) 727 - } else { 728 - addScore("OverlapSymbol", scorePartialSymbol) 729 - } 730 - 731 - // Score based on symbol data 732 - if si != nil { 733 - symbolKind := 
ctags.ParseSymbolKind(si.Kind) 734 - sym := sectionSlice(data, sec) 735 - 736 - addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind)) 737 - 738 - // This is from a symbol tree, so we need to store the symbol 739 - // information. 740 - if m.symbol { 741 - if symbolInfo == nil { 742 - symbolInfo = make([]*Symbol, len(ms)) 743 - } 744 - // findSymbols does not hydrate in Sym. So we need to store it. 745 - si.Sym = string(sym) 746 - symbolInfo[i] = si 747 - } 748 - } 749 - } 750 - 751 - // scoreWeight != 1 means it affects score 752 - if !epsilonEqualsOne(m.scoreWeight) { 753 - score = score * m.scoreWeight 754 - if debug { 755 - what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight) 756 - } 757 - } 758 - 759 - if score > bestMatch.score { 760 - bestMatch.score = score 761 - bestMatch.debugScore = what 762 - bestMatch.match = m 763 - } 764 - } 765 - 766 - if debug { 767 - bestMatch.debugScore = fmt.Sprintf("score:%.2f <- %s", bestMatch.score, strings.TrimSuffix(bestMatch.debugScore, ", ")) 768 - } 769 - 770 - return bestMatch, symbolInfo 771 631 } 772 632 773 633 // sectionSlice will return data[sec.Start:sec.End] but will clip Start and
+2 -2
eval.go
··· 327 327 finalCands := d.gatherMatches(nextDoc, mt, known, shouldMergeMatches) 328 328 329 329 if opts.ChunkMatches { 330 - fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore) 330 + fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts) 331 331 } else { 332 - fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore) 332 + fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts) 333 333 } 334 334 335 335 var tf map[string]int
+160 -6
index_test.go
··· 45 45 } 46 46 } 47 47 48 - func testIndexBuilder(t *testing.T, repo *Repository, docs ...Document) *IndexBuilder { 49 - t.Helper() 48 + func testIndexBuilder(tb testing.TB, repo *Repository, docs ...Document) *IndexBuilder { 49 + tb.Helper() 50 50 51 51 b, err := NewIndexBuilder(repo) 52 52 if err != nil { 53 - t.Fatalf("NewIndexBuilder: %v", err) 53 + tb.Fatalf("NewIndexBuilder: %v", err) 54 54 } 55 55 56 56 for i, d := range docs { 57 57 if err := b.Add(d); err != nil { 58 - t.Fatalf("Add %d: %v", i, err) 58 + tb.Fatalf("Add %d: %v", i, err) 59 59 } 60 60 } 61 61 ··· 303 303 return res 304 304 } 305 305 306 - func searcherForTest(t *testing.T, b *IndexBuilder) Searcher { 306 + func searcherForTest(t testing.TB, b *IndexBuilder) Searcher { 307 307 var buf bytes.Buffer 308 308 if err := b.Write(&buf); err != nil { 309 309 t.Fatal(err) ··· 375 375 func wordsAsSymbols(doc Document) Document { 376 376 re := regexp.MustCompile(`\b\w{2,}\b`) 377 377 var symbols []DocumentSection 378 + var symbolsMetadata []*Symbol 378 379 for _, match := range re.FindAllIndex(doc.Content, -1) { 379 380 symbols = append(symbols, DocumentSection{ 380 381 Start: uint32(match[0]), 381 382 End: uint32(match[1]), 382 383 }) 384 + symbolsMetadata = append(symbolsMetadata, &Symbol{Kind: "method"}) 383 385 } 384 386 doc.Symbols = symbols 387 + doc.SymbolsMetaData = symbolsMetadata 385 388 return doc 386 389 } 387 390 ··· 992 995 }) 993 996 } 994 997 998 + func TestSearchBM25MatchScores(t *testing.T) { 999 + ctx := context.Background() 1000 + searcher := searcherForTest(t, testIndexBuilder(t, nil, 1001 + Document{Name: "f1", Content: []byte("one two three\naaaaaaaaaa\nbbbbbbbb\none two two")}, 1002 + Document{Name: "f2", Content: []byte("four five six\naaaaaaaaaa\nbbbbbbbb\nfour five five\nsix six")}, 1003 + wordsAsSymbols(Document{Name: "f3", Content: []byte("public static void main")}), 1004 + )) 1005 + 1006 + t.Run("LineMatches", func(t *testing.T) { 1007 + q := &query.Substring{Pattern: 
"two"} 1008 + sres, err := searcher.Search(ctx, q, &SearchOptions{UseBM25Scoring: true}) 1009 + if err != nil { 1010 + t.Fatal(err) 1011 + } 1012 + matches := sres.Files 1013 + if len(matches) != 1 { 1014 + t.Fatalf("want 1 file match, got %d", len(matches)) 1015 + } 1016 + 1017 + if len(matches[0].LineMatches) != 2 { 1018 + t.Fatalf("want 2 chunk matches, got %d", len(matches[0].ChunkMatches)) 1019 + } 1020 + 1021 + if matches[0].LineMatches[0].LineNumber != 4 { 1022 + t.Fatalf("want best-scoring line to be line 4, got %d", matches[0].LineMatches[0].LineNumber) 1023 + } 1024 + }) 1025 + 1026 + t.Run("ChunkMatches", func(t *testing.T) { 1027 + q := &query.Substring{Pattern: "five"} 1028 + sres, err := searcher.Search(ctx, q, &SearchOptions{UseBM25Scoring: true, ChunkMatches: true, NumContextLines: 1}) 1029 + if err != nil { 1030 + t.Fatal(err) 1031 + } 1032 + 1033 + matches := sres.Files 1034 + if len(matches) != 1 { 1035 + t.Fatalf("want 1 file match, got %d", len(matches)) 1036 + } 1037 + 1038 + if len(matches[0].ChunkMatches) != 2 { 1039 + t.Fatalf("want 2 chunk matches, got %d", len(matches[0].ChunkMatches)) 1040 + } 1041 + 1042 + if matches[0].ChunkMatches[0].BestLineMatch != 4 { 1043 + t.Fatalf("want best-scoring line to be line 4, got %d", matches[0].ChunkMatches[0].BestLineMatch) 1044 + } 1045 + }) 1046 + 1047 + t.Run("ChunkMatches with symbols", func(t *testing.T) { 1048 + q := &query.Or{ 1049 + Children: []query.Q{ 1050 + &query.Symbol{Expr: &query.Substring{Pattern: "main"}}, 1051 + &query.Substring{Pattern: "five"}, 1052 + }, 1053 + } 1054 + 1055 + sres, err := searcher.Search(ctx, q, &SearchOptions{UseBM25Scoring: true, ChunkMatches: true, NumContextLines: 1}) 1056 + if err != nil { 1057 + t.Fatal(err) 1058 + } 1059 + 1060 + matches := sres.Files 1061 + if len(matches) != 2 { 1062 + t.Fatalf("want 2 file match, got %d", len(matches)) 1063 + } 1064 + 1065 + foundSymbolInfo := false 1066 + for _, m := range matches { 1067 + for _, cm := range 
m.ChunkMatches { 1068 + if len(cm.SymbolInfo) > 0 { 1069 + foundSymbolInfo = true 1070 + } 1071 + } 1072 + } 1073 + 1074 + if !foundSymbolInfo { 1075 + t.Fatalf("want symbol info, got none") 1076 + } 1077 + }) 1078 + } 1079 + 995 1080 func TestFileRestriction(t *testing.T) { 996 1081 b := testIndexBuilder(t, nil, 997 1082 Document{Name: "banana1", Content: []byte("x orange y")}, ··· 2453 2538 res := searchForTest(t, b, q) 2454 2539 2455 2540 // 4096 (content) + 2 (overhead: newlines or doc sections) 2456 - if got, want := res.Stats.ContentBytesLoaded, int64(4100); got != want { 2541 + if got, want := res.Stats.ContentBytesLoaded, int64(4098); got != want { 2457 2542 t.Errorf("got content I/O %d, want %d", got, want) 2458 2543 } 2459 2544 ··· 2467 2552 t.Run("ChunkMatches", func(t *testing.T) { 2468 2553 q := &query.Substring{Pattern: "abc", CaseSensitive: true, Content: true} 2469 2554 res := searchForTest(t, b, q, chunkOpts) 2555 + 2556 + // 4096 (content) + 2 (overhead: newlines or doc sections) 2557 + if got, want := res.Stats.ContentBytesLoaded, int64(4098); got != want { 2558 + t.Errorf("got content I/O %d, want %d", got, want) 2559 + } 2560 + 2561 + // 1024 entries, each 4 bytes apart. 4 fits into single byte 2562 + // delta encoded. 2563 + if got, want := res.Stats.IndexBytesLoaded, int64(1024); got != want { 2564 + t.Errorf("got index I/O %d, want %d", got, want) 2565 + } 2566 + }) 2567 + 2568 + t.Run("LineMatches with BM25", func(t *testing.T) { 2569 + q := &query.Substring{Pattern: "abc", CaseSensitive: true, Content: true} 2570 + res := searchForTest(t, b, q, SearchOptions{UseBM25Scoring: true}) 2571 + 2572 + // 4096 (content) + 2 (overhead: newlines or doc sections) 2573 + if got, want := res.Stats.ContentBytesLoaded, int64(4098); got != want { 2574 + t.Errorf("got content I/O %d, want %d", got, want) 2575 + } 2576 + 2577 + // 1024 entries, each 4 bytes apart. 4 fits into single byte 2578 + // delta encoded. 
2579 + if got, want := res.Stats.IndexBytesLoaded, int64(1024); got != want { 2580 + t.Errorf("got index I/O %d, want %d", got, want) 2581 + } 2582 + }) 2583 + 2584 + t.Run("ChunkMatches with BM25", func(t *testing.T) { 2585 + q := &query.Substring{Pattern: "abc", CaseSensitive: true, Content: true} 2586 + res := searchForTest(t, b, q, SearchOptions{UseBM25Scoring: true, ChunkMatches: true}) 2470 2587 2471 2588 // 4096 (content) + 2 (overhead: newlines or doc sections) 2472 2589 if got, want := res.Stats.ContentBytesLoaded, int64(4098); got != want { ··· 3781 3898 } 3782 3899 }) 3783 3900 } 3901 + 3902 + // Simple benchmark focused on chunk match scoring. It creates a single file that will have a 1000-line chunk match. 3903 + // The benchmark time is expected to be strongly correlated with time spent assembling and scoring this chunk. 3904 + func BenchmarkScoreChunkMatches(b *testing.B) { 3905 + ctx := context.Background() 3906 + var builder strings.Builder 3907 + for i := 0; i < 1000; i++ { 3908 + builder.WriteString(fmt.Sprintf("line-%d one one one two two two three three three four four four five five\n", i)) 3909 + } 3910 + 3911 + searcher := searcherForTest(b, testIndexBuilder(b, nil, 3912 + Document{Name: "f1", Content: []byte(builder.String())}, 3913 + )) 3914 + 3915 + q := &query.Or{ 3916 + Children: []query.Q{ 3917 + &query.Substring{Pattern: "f"}, 3918 + &query.Substring{Pattern: "t"}, 3919 + }} 3920 + 3921 + b.Run("score large ChunkMatch", func(b *testing.B) { 3922 + b.ReportAllocs() 3923 + b.ResetTimer() 3924 + 3925 + for i := 0; i < b.N; i++ { 3926 + sres, err := searcher.Search(ctx, q, &SearchOptions{ChunkMatches: true, NumContextLines: 1}) 3927 + if err != nil { 3928 + b.Fatal(err) 3929 + } 3930 + 3931 + matches := sres.Files 3932 + if len(matches) == 0 { 3933 + b.Fatalf("want file match, got none") 3934 + } 3935 + } 3936 + }) 3937 + }
+8 -1
read.go
··· 533 533 return nil, 0, err 534 534 } 535 535 536 - return fromSizedDeltas(blob, buf), sec.sz, nil 536 + nl := fromSizedDeltas(blob, buf) 537 + 538 + // can be nil if buf is nil and there are no doc sections. However, we rely 539 + // on it being non-nil to cache the read. 540 + if nl == nil { 541 + nl = make([]uint32, 0) 542 + } 543 + return nl, sec.sz, nil 537 544 } 538 545 539 546 func (d *indexData) readDocSections(i uint32, buf []DocumentSection) ([]DocumentSection, uint32, error) {
+247 -30
score.go
··· 15 15 package zoekt 16 16 17 17 import ( 18 + "bytes" 18 19 "fmt" 19 20 "math" 20 - "strconv" 21 21 "strings" 22 + 23 + "github.com/sourcegraph/zoekt/ctags" 22 24 ) 23 25 24 26 const ( 25 - maxUInt16 = 0xffff 26 27 ScoreOffset = 10_000_000 27 28 ) 28 29 29 - // addScore increments the score of the FileMatch by the computed score. If 30 - // debugScore is true, it also adds a debug string to the FileMatch. If raw is 31 - // -1, it is ignored. Otherwise, it is added to the debug string. 32 - func (m *FileMatch) addScore(what string, computed float64, raw float64, debugScore bool) { 33 - if computed != 0 && debugScore { 34 - var b strings.Builder 35 - fmt.Fprintf(&b, "%s", what) 36 - if raw != -1 { 37 - fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64)) 30 + type chunkScore struct { 31 + score float64 32 + debugScore string 33 + bestLine int 34 + } 35 + 36 + // scoreChunk calculates the score for each line in the chunk based on its candidate matches, and returns the score of 37 + // the best-scoring line, along with its line number. 38 + // Invariant: there should be at least one input candidate, len(ms) > 0. 39 + func (p *contentProvider) scoreChunk(ms []*candidateMatch, language string, opts *SearchOptions) (chunkScore, []*Symbol) { 40 + nl := p.newlines() 41 + 42 + var bestScore lineScore 43 + bestLine := 0 44 + var symbolInfo []*Symbol 45 + 46 + start := 0 47 + currentLine := -1 48 + for i, m := range ms { 49 + lineNumber := -1 50 + if !m.fileName { 51 + lineNumber = nl.atOffset(m.byteOffset) 52 + } 53 + 54 + // If this match represents a new line, then score the previous line and update 'start'. 55 + if i != 0 && lineNumber != currentLine { 56 + score, si := p.scoreLine(ms[start:i], language, currentLine, opts) 57 + symbolInfo = append(symbolInfo, si...) 
58 + if score.score > bestScore.score { 59 + bestScore = score 60 + bestLine = currentLine 61 + } 62 + start = i 63 + } 64 + currentLine = lineNumber 65 + } 66 + 67 + // Make sure to score the last line 68 + line, si := p.scoreLine(ms[start:], language, currentLine, opts) 69 + symbolInfo = append(symbolInfo, si...) 70 + if line.score > bestScore.score { 71 + bestScore = line 72 + bestLine = currentLine 73 + } 74 + 75 + cs := chunkScore{ 76 + score: bestScore.score, 77 + bestLine: bestLine, 78 + } 79 + if opts.DebugScore { 80 + cs.debugScore = fmt.Sprintf("%s, (line: %d)", bestScore.debugScore, bestLine) 81 + } 82 + return cs, symbolInfo 83 + } 84 + 85 + type lineScore struct { 86 + score float64 87 + debugScore string 88 + } 89 + 90 + // scoreLine calculates a score for the line based on its candidate matches. 91 + // Invariants: 92 + // - All candidate matches are assumed to come from the same line in the content. 93 + // - If this line represents a filename, then lineNumber must be -1. 94 + // - There should be at least one input candidate, len(ms) > 0. 
95 + func (p *contentProvider) scoreLine(ms []*candidateMatch, language string, lineNumber int, opts *SearchOptions) (lineScore, []*Symbol) { 96 + if opts.UseBM25Scoring { 97 + score, symbolInfo := p.scoreLineBM25(ms, lineNumber) 98 + ls := lineScore{score: score} 99 + if opts.DebugScore { 100 + ls.debugScore = fmt.Sprintf("tfScore:%.2f, ", score) 101 + } 102 + return ls, symbolInfo 103 + } 104 + 105 + score := 0.0 106 + what := "" 107 + addScore := func(w string, s float64) { 108 + if s != 0 && opts.DebugScore { 109 + what += fmt.Sprintf("%s:%.2f, ", w, s) 110 + } 111 + score += s 112 + } 113 + 114 + filename := p.data(true) 115 + var symbolInfo []*Symbol 116 + 117 + var bestScore lineScore 118 + for i, m := range ms { 119 + data := p.data(m.fileName) 120 + 121 + endOffset := m.byteOffset + m.byteMatchSz 122 + startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset])) 123 + endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset])) 124 + 125 + score = 0 126 + what = "" 127 + 128 + if startBoundary && endBoundary { 129 + addScore("WordMatch", scoreWordMatch) 130 + } else if startBoundary || endBoundary { 131 + addScore("PartialWordMatch", scorePartialWordMatch) 132 + } 133 + 134 + if m.fileName { 135 + sep := bytes.LastIndexByte(data, '/') 136 + startMatch := int(m.byteOffset) == sep+1 137 + endMatch := endOffset == uint32(len(data)) 138 + if startMatch && endMatch { 139 + addScore("Base", scoreBase) 140 + } else if startMatch || endMatch { 141 + addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 142 + } else if sep < int(m.byteOffset) { 143 + addScore("InnerBase", scorePartialBase) 144 + } 145 + } else if sec, si, ok := p.findSymbol(m); ok { 146 + startMatch := sec.Start == m.byteOffset 147 + endMatch := sec.End == endOffset 148 + if startMatch && endMatch { 149 + addScore("Symbol", scoreSymbol) 150 + } else if 
startMatch || endMatch { 151 + addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) 152 + } else { 153 + addScore("OverlapSymbol", scorePartialSymbol) 154 + } 155 + 156 + // Score based on symbol data 157 + if si != nil { 158 + symbolKind := ctags.ParseSymbolKind(si.Kind) 159 + sym := sectionSlice(data, sec) 160 + 161 + addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind)) 162 + 163 + // This is from a symbol tree, so we need to store the symbol 164 + // information. 165 + if m.symbol { 166 + if symbolInfo == nil { 167 + symbolInfo = make([]*Symbol, len(ms)) 168 + } 169 + // findSymbols does not hydrate in Sym. So we need to store it. 170 + si.Sym = string(sym) 171 + symbolInfo[i] = si 172 + } 173 + } 174 + } 175 + 176 + // scoreWeight != 1 means it affects score 177 + if !epsilonEqualsOne(m.scoreWeight) { 178 + score = score * m.scoreWeight 179 + if opts.DebugScore { 180 + what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight) 181 + } 182 + } 183 + 184 + if score > bestScore.score { 185 + bestScore.score = score 186 + bestScore.debugScore = what 38 187 } 39 - fmt.Fprintf(&b, ":%.2f, ", computed) 40 - m.Debug += b.String() 188 + } 189 + 190 + if opts.DebugScore { 191 + bestScore.debugScore = fmt.Sprintf("score:%.2f <- %s", bestScore.score, strings.TrimSuffix(bestScore.debugScore, ", ")) 41 192 } 42 - m.Score += computed 193 + 194 + return bestScore, symbolInfo 195 + } 196 + 197 + // scoreLineBM25 computes the score of a line according to BM25, the most common scoring algorithm for text search: 198 + // https://en.wikipedia.org/wiki/Okapi_BM25. Compared to the standard scoreLine algorithm, this score rewards multiple 199 + // term matches on a line. 200 + // Notes: 201 + // - This BM25 calculation skips inverse document frequency (idf) to keep the implementation simple. 202 + // - It uses the same calculateTermFrequency method as BM25 file scoring, which boosts filename and symbol matches. 
203 + func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (float64, []*Symbol) { 204 + // If this is a filename, then don't compute BM25. The score would not be comparable to line scores. 205 + if lineNumber < 0 { 206 + return 0, nil 207 + } 208 + 209 + // Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html) 210 + k, b := 1.2, 0.75 211 + 212 + // Calculate the length ratio of this line. As a heuristic, we assume an average line length of 100 characters. 213 + // Usually the calculation would be based on terms, but using bytes should work fine, as we're just computing a ratio. 214 + nl := p.newlines() 215 + lineLength := nl.lineStart(lineNumber+1) - nl.lineStart(lineNumber) 216 + L := float64(lineLength) / 100.0 217 + 218 + score := 0.0 219 + tfs := p.calculateTermFrequency(ms, termDocumentFrequency{}) 220 + for _, f := range tfs { 221 + score += ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f)) 222 + } 223 + 224 + // Check if any match comes from a symbol match tree, and if so hydrate in symbol information 225 + var symbolInfo []*Symbol 226 + for _, m := range ms { 227 + if m.symbol { 228 + if sec, si, ok := p.findSymbol(m); ok && si != nil { 229 + // findSymbols does not hydrate in Sym. So we need to store it. 230 + sym := sectionSlice(p.data(false), sec) 231 + si.Sym = string(sym) 232 + symbolInfo = append(symbolInfo, si) 233 + } 234 + } 235 + } 236 + return score, symbolInfo 237 + } 238 + 239 + // termDocumentFrequency is a map "term" -> "number of documents that contain the term" 240 + type termDocumentFrequency map[string]int 241 + 242 + // calculateTermFrequency computes the term frequency for the file match. 243 + // Notes: 244 + // - Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles. 
245 + // - Symbol matches also count more than content matches, to reward matches on symbol definitions. 246 + func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int { 247 + // Treat each candidate match as a term and compute the frequencies. For now, ignore case sensitivity and 248 + // ignore whether the match is a word boundary. 249 + termFreqs := map[string]int{} 250 + for _, m := range cands { 251 + term := string(m.substrLowered) 252 + if m.fileName || p.matchesSymbol(m) { 253 + termFreqs[term] += 5 254 + } else { 255 + termFreqs[term]++ 256 + } 257 + } 258 + 259 + for term := range termFreqs { 260 + df[term] += 1 261 + } 262 + return termFreqs 43 263 } 44 264 45 265 // scoreFile computes a score for the file match using various scoring signals, like ··· 110 330 } 111 331 } 112 332 113 - // idf computes the inverse document frequency for a term. nq is the number of 114 - // documents that contain the term and documentCount is the total number of 115 - // documents in the corpus. 116 - func idf(nq, documentCount int) float64 { 117 - return math.Log(1.0 + ((float64(documentCount) - float64(nq) + 0.5) / (float64(nq) + 0.5))) 118 - } 119 - 120 - // termDocumentFrequency is a map "term" -> "number of documents that contain the term" 121 - type termDocumentFrequency map[string]int 122 - 123 333 // termFrequency stores the term frequencies for doc. 124 334 type termFrequency struct { 125 335 doc uint32 126 336 tf map[string]int 127 337 } 128 338 129 - // scoreFilesUsingBM25 computes the score according to BM25, the most common 130 - // scoring algorithm for text search: https://en.wikipedia.org/wiki/Okapi_BM25. 339 + // scoreFilesUsingBM25 computes the score according to BM25, the most common scoring algorithm for text search: 340 + // https://en.wikipedia.org/wiki/Okapi_BM25. 131 341 // 132 - // This scoring strategy ignores all other signals including document ranks. 
133 - // This keeps things simple for now, since BM25 is not normalized and can be 134 - // tricky to combine with other scoring signals. 342 + // Unlike standard file scoring, this scoring strategy ignores all other signals including document ranks. This keeps 343 + // things simple for now, since BM25 is not normalized and can be tricky to combine with other scoring signals. It also 344 + // ignores the individual LineMatch and ChunkMatch scores, instead calculating a score over all matches in the file. 135 345 func (d *indexData) scoreFilesUsingBM25(fileMatches []FileMatch, tfs []termFrequency, df termDocumentFrequency, opts *SearchOptions) { 136 - // Use standard parameter defaults (used in Lucene and academic papers) 346 + // Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html) 137 347 k, b := 1.2, 0.75 138 348 139 349 averageFileLength := float64(d.boundaries[d.numDocs()]) / float64(d.numDocs()) ··· 166 376 } 167 377 } 168 378 } 379 + 380 + // idf computes the inverse document frequency for a term. nq is the number of 381 + // documents that contain the term and documentCount is the total number of 382 + // documents in the corpus. 383 + func idf(nq, documentCount int) float64 { 384 + return math.Log(1.0 + ((float64(documentCount) - float64(nq) + 0.5) / (float64(nq) + 0.5))) 385 + }