fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Boost symbol matches in BM25 (#876)

When digging into our Natural Language Search (NLS) eval results, I found that one of the leading causes of poor results for flexible search types like "Fuzzy symbol search" and "Find logic" was noisy matches in the top results. Currently, our BM25 ranking rewards any substring match equally, so for queries like 'extract tar', any match on 'tar' (even within unrelated terms like 'start') counts towards the term frequency.

This PR helps reduce that noise by boosting symbol matches the same way we already boost filename matches. Our NLS evals show a positive improvement, and context evals are the tiniest bit better.

+44 -80
+4 -4
build/scoring_test.go
··· 77 77 query: &query.Substring{Pattern: "example"}, 78 78 content: exampleJava, 79 79 language: "Java", 80 - // bm25-score: 0.57 <- sum-termFrequencyScore: 10.00, length-ratio: 1.00 81 - wantScore: 0.57, 80 + // bm25-score: 0.58 <- sum-termFrequencyScore: 14.00, length-ratio: 1.00 81 + wantScore: 0.58, 82 82 }, { 83 83 // Matches only on content 84 84 fileName: "example.java", ··· 89 89 }}, 90 90 content: exampleJava, 91 91 language: "Java", 92 - // bm25-score: 1.75 <- sum-termFrequencyScore: 56.00, length-ratio: 1.00 93 - wantScore: 1.75, 92 + // bm25-score: 1.81 <- sum-termFrequencyScore: 116.00, length-ratio: 1.00 93 + wantScore: 1.81, 94 94 }, 95 95 { 96 96 // Matches only on filename
+39
contentprovider.go
··· 588 588 return uint32(j), ol1 > 0 589 589 } 590 590 591 + func (p *contentProvider) matchesSymbol(cm *candidateMatch) bool { 592 + if cm.fileName { 593 + return false 594 + } 595 + 596 + // Check if this candidate came from a symbol matchTree 597 + if cm.symbol { 598 + return true 599 + } 600 + 601 + // Check if it overlaps with a symbol. 602 + secs := p.docSections() 603 + _, ok := findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz) 604 + return ok 605 + } 606 + 591 607 func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) { 592 608 if cm.fileName { 593 609 return DocumentSection{}, nil, false ··· 617 633 si := p.id.symbols.data(start + secIdx) 618 634 619 635 return sec, si, true 636 + } 637 + 638 + // calculateTermFrequency computes the term frequency for the file match. 639 + // Notes: 640 + // * Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles. 641 + // * Symbol matches also count more than content matches, to reward matches on symbol definitions. 642 + func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int { 643 + // Treat each candidate match as a term and compute the frequencies. For now, ignore case 644 + // sensitivity and treat filenames and symbols the same as content. 645 + termFreqs := map[string]int{} 646 + for _, m := range cands { 647 + term := string(m.substrLowered) 648 + if m.fileName || p.matchesSymbol(m) { 649 + termFreqs[term] += 5 650 + } else { 651 + termFreqs[term]++ 652 + } 653 + } 654 + 655 + for term := range termFreqs { 656 + df[term] += 1 657 + } 658 + return termFreqs 620 659 } 621 660 622 661 func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language string, debug bool) (float64, string, []*Symbol) {
+1 -1
eval.go
··· 339 339 // document frequencies. Since we don't store document frequencies in the index, 340 340 // we have to defer the calculation of the final BM25 score to after the whole 341 341 // shard has been processed. 342 - tf = calculateTermFrequency(finalCands, df) 342 + tf = cp.calculateTermFrequency(finalCands, df) 343 343 } else { 344 344 // Use the standard, non-experimental scoring method by default 345 345 d.scoreFile(&fileMatch, nextDoc, mt, known, opts)
-24
score.go
··· 110 110 } 111 111 } 112 112 113 - // calculateTermFrequency computes the term frequency for the file match. 114 - // 115 - // Filename matches count more than content matches. This mimics a common text 116 - // search strategy where you 'boost' matches on document titles. 117 - func calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int { 118 - // Treat each candidate match as a term and compute the frequencies. For now, ignore case 119 - // sensitivity and treat filenames and symbols the same as content. 120 - termFreqs := map[string]int{} 121 - for _, cand := range cands { 122 - term := string(cand.substrLowered) 123 - if cand.fileName { 124 - termFreqs[term] += 5 125 - } else { 126 - termFreqs[term]++ 127 - } 128 - } 129 - 130 - for term := range termFreqs { 131 - df[term] += 1 132 - } 133 - 134 - return termFreqs 135 - } 136 - 137 113 // idf computes the inverse document frequency for a term. nq is the number of 138 114 // documents that contain the term and documentCount is the total number of 139 115 // documents in the corpus.
-51
score_test.go
··· 1 - package zoekt 2 - 3 - import ( 4 - "maps" 5 - "testing" 6 - ) 7 - 8 - func TestCalculateTermFrequency(t *testing.T) { 9 - cases := []struct { 10 - cands []*candidateMatch 11 - wantDF termDocumentFrequency 12 - wantTermFrequencies map[string]int 13 - }{{ 14 - cands: []*candidateMatch{ 15 - {substrLowered: []byte("foo")}, 16 - {substrLowered: []byte("foo")}, 17 - {substrLowered: []byte("bar")}, 18 - { 19 - substrLowered: []byte("bas"), 20 - fileName: true, 21 - }, 22 - }, 23 - wantDF: termDocumentFrequency{ 24 - "foo": 1, 25 - "bar": 1, 26 - "bas": 1, 27 - }, 28 - wantTermFrequencies: map[string]int{ 29 - "foo": 2, 30 - "bar": 1, 31 - "bas": 5, 32 - }, 33 - }, 34 - } 35 - 36 - for _, c := range cases { 37 - t.Run("", func(t *testing.T) { 38 - fm := FileMatch{} 39 - df := make(termDocumentFrequency) 40 - tf := calculateTermFrequency(c.cands, df) 41 - 42 - if !maps.Equal(df, c.wantDF) { 43 - t.Errorf("got %v, want %v", df, c.wantDF) 44 - } 45 - 46 - if !maps.Equal(tf, c.wantTermFrequencies) { 47 - t.Errorf("got %v, want %v", fm, c.wantTermFrequencies) 48 - } 49 - }) 50 - } 51 - }