Boost symbol matches in BM25 (#876) · boltless.me/zoekt@c03b77f

+4 -4

build/scoring_test.go

···
77 77 	query: &query.Substring{Pattern: "example"},
78 78 	content: exampleJava,
79 79 	language: "Java",
80 - 	// bm25-score: 0.57 <- sum-termFrequencyScore: 10.00, length-ratio: 1.00
81 - 	wantScore: 0.57,
80 + 	// bm25-score: 0.58 <- sum-termFrequencyScore: 14.00, length-ratio: 1.00
81 + 	wantScore: 0.58,
82 82 }, {
83 83 	// Matches only on content
84 84 	fileName: "example.java",
···
89 89 	}},
90 90 	content: exampleJava,
91 91 	language: "Java",
92 - 	// bm25-score: 1.75 <- sum-termFrequencyScore: 56.00, length-ratio: 1.00
93 - 	wantScore: 1.75,
92 + 	// bm25-score: 1.81 <- sum-termFrequencyScore: 116.00, length-ratio: 1.00
93 + 	wantScore: 1.81,
94 94 },
95 95 {
96 96 	// Matches only on filename

+39

contentprovider.go

···
588 588 	return uint32(j), ol1 > 0
589 589 }
590 590
591 + func (p *contentProvider) matchesSymbol(cm *candidateMatch) bool {
592 + 	if cm.fileName {
593 + 		return false
594 + 	}
595 +
596 + 	// Check if this candidate came from a symbol matchTree
597 + 	if cm.symbol {
598 + 		return true
599 + 	}
600 +
601 + 	// Check if it overlaps with a symbol.
602 + 	secs := p.docSections()
603 + 	_, ok := findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
604 + 	return ok
605 + }
606 +
591 607 func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) {
592 608 	if cm.fileName {
593 609 		return DocumentSection{}, nil, false
···
617 633 	si := p.id.symbols.data(start + secIdx)
618 634
619 635 	return sec, si, true
636 + }
637 +
638 + // calculateTermFrequency computes the term frequency for the file match.
639 + // Notes:
640 + // * Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles.
641 + // * Symbol matches also count more than content matches, to reward matches on symbol definitions.
642 + func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int {
643 + 	// Treat each candidate match as a term and compute the frequencies. For now, ignore case
644 + 	// sensitivity and treat filenames and symbols the same as content.
645 + 	termFreqs := map[string]int{}
646 + 	for _, m := range cands {
647 + 		term := string(m.substrLowered)
648 + 		if m.fileName || p.matchesSymbol(m) {
649 + 			termFreqs[term] += 5
650 + 		} else {
651 + 			termFreqs[term]++
652 + 		}
653 + 	}
654 +
655 + 	for term := range termFreqs {
656 + 		df[term] += 1
657 + 	}
658 + 	return termFreqs
620 659 }
621 660
622 661 func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language string, debug bool) (float64, string, []*Symbol) {

+1 -1

eval.go

···
339 339 	// document frequencies. Since we don't store document frequencies in the index,
340 340 	// we have to defer the calculation of the final BM25 score to after the whole
341 341 	// shard has been processed.
342 - 	tf = calculateTermFrequency(finalCands, df)
342 + 	tf = cp.calculateTermFrequency(finalCands, df)
343 343 } else {
344 344 	// Use the standard, non-experimental scoring method by default
345 345 	d.scoreFile(&fileMatch, nextDoc, mt, known, opts)

-24

score.go

···
110 110 	}
111 111 }
112 112
113 - // calculateTermFrequency computes the term frequency for the file match.
114 - //
115 - // Filename matches count more than content matches. This mimics a common text
116 - // search strategy where you 'boost' matches on document titles.
117 - func calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int {
118 - 	// Treat each candidate match as a term and compute the frequencies. For now, ignore case
119 - 	// sensitivity and treat filenames and symbols the same as content.
120 - 	termFreqs := map[string]int{}
121 - 	for _, cand := range cands {
122 - 		term := string(cand.substrLowered)
123 - 		if cand.fileName {
124 - 			termFreqs[term] += 5
125 - 		} else {
126 - 			termFreqs[term]++
127 - 		}
128 - 	}
129 -
130 - 	for term := range termFreqs {
131 - 		df[term] += 1
132 - 	}
133 -
134 - 	return termFreqs
135 - }
136 -
137 113 // idf computes the inverse document frequency for a term. nq is the number of
138 114 // documents that contain the term and documentCount is the total number of
139 115 // documents in the corpus.

-51

score_test.go

···
1 - package zoekt
2 -
3 - import (
4 - 	"maps"
5 - 	"testing"
6 - )
7 -
8 - func TestCalculateTermFrequency(t *testing.T) {
9 - 	cases := []struct {
10 - 		cands []*candidateMatch
11 - 		wantDF termDocumentFrequency
12 - 		wantTermFrequencies map[string]int
13 - 	}{{
14 - 		cands: []*candidateMatch{
15 - 			{substrLowered: []byte("foo")},
16 - 			{substrLowered: []byte("foo")},
17 - 			{substrLowered: []byte("bar")},
18 - 			{
19 - 				substrLowered: []byte("bas"),
20 - 				fileName: true,
21 - 			},
22 - 		},
23 - 		wantDF: termDocumentFrequency{
24 - 			"foo": 1,
25 - 			"bar": 1,
26 - 			"bas": 1,
27 - 		},
28 - 		wantTermFrequencies: map[string]int{
29 - 			"foo": 2,
30 - 			"bar": 1,
31 - 			"bas": 5,
32 - 		},
33 - 	},
34 - 	}
35 -
36 - 	for _, c := range cases {
37 - 		t.Run("", func(t *testing.T) {
38 - 			fm := FileMatch{}
39 - 			df := make(termDocumentFrequency)
40 - 			tf := calculateTermFrequency(c.cands, df)
41 -
42 - 			if !maps.Equal(df, c.wantDF) {
43 - 				t.Errorf("got %v, want %v", df, c.wantDF)
44 - 			}
45 -
46 - 			if !maps.Equal(tf, c.wantTermFrequencies) {
47 - 				t.Errorf("got %v, want %v", fm, c.wantTermFrequencies)
48 - 			}
49 - 		})
50 - 	}
51 - }

Configure Feed

Configure Feed