scoring: remove IDF from BM25 scoring (#912) · boltless.me/zoekt@b437dc7

+3 -28

index/eval.go

··· 25 25 26 26 enry_data "github.com/go-enry/go-enry/v2/data" 27 27 "github.com/grafana/regexp" 28 + 28 29 "github.com/sourcegraph/zoekt" 29 30 "github.com/sourcegraph/zoekt/internal/tenant" 30 31 "github.com/sourcegraph/zoekt/query" ··· 189 190 docCount := uint32(len(d.fileBranchMasks)) 190 191 lastDoc := int(-1) 191 192 192 - // document frequency per term 193 - df := make(termDocumentFrequency) 194 - 195 - // term frequency per file index 196 - var tfs []termFrequency 197 - 198 193 nextFileMatch: 199 194 for { 200 195 canceled := false ··· 320 315 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts) 321 316 } 322 317 323 - var tf map[string]int 324 318 if opts.UseBM25Scoring { 325 - // For BM25 scoring, the calculation of the score is split in two parts. Here we 326 - // calculate the term frequencies for the current document and update the 327 - // document frequencies. Since we don't store document frequencies in the index, 328 - // we have to defer the calculation of the final BM25 score to after the whole 329 - // shard has been processed. 330 - tf = cp.calculateTermFrequency(finalCands, df) 319 + tf := cp.calculateTermFrequency(finalCands) 320 + d.scoreFilesUsingBM25(&fileMatch, nextDoc, tf, opts) 331 321 } else { 332 322 // Use the standard, non-experimental scoring method by default 333 323 d.scoreFile(&fileMatch, nextDoc, mt, known, opts) ··· 348 338 repoMatchCount += len(fileMatch.LineMatches) 349 339 repoMatchCount += matchedChunkRanges 350 340 351 - if opts.UseBM25Scoring { 352 - // Invariant: tfs[i] belongs to res.Files[i] 353 - tfs = append(tfs, termFrequency{ 354 - doc: nextDoc, 355 - tf: tf, 356 - }) 357 - } 358 341 res.Files = append(res.Files, fileMatch) 359 342 360 343 res.Stats.MatchCount += len(fileMatch.LineMatches) 361 344 res.Stats.MatchCount += matchedChunkRanges 362 345 res.Stats.FileCount++ 363 - } 364 - 365 - // Calculate BM25 score for all file matches in the shard. 
We assume that we 366 - // have seen all documents containing any of the terms in the query so that df 367 - // correctly reflects the document frequencies. This is true, for example, if 368 - // all terms in the query are ORed together. 369 - if opts.UseBM25Scoring { 370 - d.scoreFilesUsingBM25(res.Files, tfs, df, opts) 371 346 } 372 347 373 348 for _, md := range d.repoMetaData {

+26 -42

index/score.go

··· 217 217 L := float64(lineLength) / 100.0 218 218 219 219 score := 0.0 220 - tfs := p.calculateTermFrequency(ms, termDocumentFrequency{}) 220 + tfs := p.calculateTermFrequency(ms) 221 221 for _, f := range tfs { 222 - score += ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f)) 222 + score += tfScore(k, b, L, f) 223 223 } 224 224 225 225 // Check if any index comes from a symbol match tree, and if so hydrate in symbol information ··· 237 237 return score, symbolInfo 238 238 } 239 239 240 - // termDocumentFrequency is a map "term" -> "number of documents that contain the term" 241 - type termDocumentFrequency map[string]int 240 + // tfScore is the term frequency score for BM25. 241 + func tfScore(k float64, b float64, L float64, f int) float64 { 242 + return ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f)) 243 + } 242 244 243 245 // calculateTermFrequency computes the term frequency for the file match. 244 246 // Notes: 245 247 // - Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles. 246 248 // - Symbol matches also count more than content matches, to reward matches on symbol definitions. 247 - func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int { 249 + func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch) map[string]int { 248 250 // Treat each candidate match as a term and compute the frequencies. For now, ignore case sensitivity and 249 251 // ignore whether the index is a word boundary. 250 252 termFreqs := map[string]int{} ··· 257 259 } 258 260 } 259 261 260 - for term := range termFreqs { 261 - df[term] += 1 262 - } 263 262 return termFreqs 264 263 } 265 264 ··· 331 330 } 332 331 } 333 332 334 - // termFrequency stores the term frequencies for doc. 
335 - type termFrequency struct { 336 - doc uint32 337 - tf map[string]int 338 - } 339 - 340 333 // scoreFilesUsingBM25 computes the score according to BM25, the most common scoring algorithm for text search: 341 - // https://en.wikipedia.org/wiki/Okapi_BM25. 334 + // https://en.wikipedia.org/wiki/Okapi_BM25. Note that we treat the inverse document frequency (idf) as constant. This 335 + // is supported by our evaluations which showed that for keyword style queries, idf can down-weight the score of some 336 + // keywords too much, leading to a worse ranking. The intuition is that each keyword is important independently of how 337 + // frequently it appears in the corpus. 342 338 // 343 339 // Unlike standard file scoring, this scoring strategy ignores all other signals including document ranks. This keeps 344 - // things simple for now, since BM25 is not normalized and can be tricky to combine with other scoring signals. It also 340 + // things simple for now, since BM25 is not normalized and can be tricky to combine with other scoring signals. It also 345 341 // ignores the individual LineMatch and ChunkMatch scores, instead calculating a score over all matches in the file. 346 - func (d *indexData) scoreFilesUsingBM25(fileMatches []zoekt.FileMatch, tfs []termFrequency, df termDocumentFrequency, opts *zoekt.SearchOptions) { 342 + func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32, tf map[string]int, opts *zoekt.SearchOptions) { 347 343 // Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html) 348 344 k, b := 1.2, 0.75 349 345 ··· 353 349 averageFileLength++ 354 350 } 355 351 356 - for i := range tfs { 357 - score := 0.0 358 - 359 - // Compute the file length ratio. Usually the calculation would be based on terms, but using 360 - // bytes should work fine, as we're just computing a ratio. 
361 - doc := tfs[i].doc 362 - fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 352 + // Compute the file length ratio. Usually the calculation would be based on terms, but using 353 + // bytes should work fine, as we're just computing a ratio. 354 + fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 363 355 364 - L := fileLength / averageFileLength 356 + L := fileLength / averageFileLength 365 357 366 - sumTF := 0 // Just for debugging 367 - for term, f := range tfs[i].tf { 368 - sumTF += f 369 - tfScore := ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f)) 370 - score += idf(df[term], int(d.numDocs())) * tfScore 371 - } 358 + score := 0.0 359 + sumTF := 0 // Just for debugging 360 + for _, f := range tf { 361 + sumTF += f 362 + score += tfScore(k, b, L, f) 363 + } 372 364 373 - fileMatches[i].Score = score 365 + fileMatch.Score = score 374 366 375 - if opts.DebugScore { 376 - fileMatches[i].Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L) 377 - } 367 + if opts.DebugScore { 368 + fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L) 378 369 } 379 370 } 380 - 381 - // idf computes the inverse document frequency for a term. nq is the number of 382 - // documents that contain the term and documentCount is the total number of 383 - // documents in the corpus. 384 - func idf(nq, documentCount int) float64 { 385 - return math.Log(1.0 + ((float64(documentCount) - float64(nq) + 0.5) / (float64(nq) + 0.5))) 386 - }

+10 -10

internal/e2e/scoring_test.go

··· 79 79 query: &query.Substring{Pattern: "example"}, 80 80 content: exampleJava, 81 81 language: "Java", 82 - // bm25-score: 0.58 <- sum-termFrequencyScore: 14.00, length-ratio: 1.00 83 - wantScore: 0.58, 82 + // sum-termFrequencyScore: 14.00, length-ratio: 1.00 83 + wantScore: 2.02, 84 84 // line 5: private final int exampleField; 85 85 wantBestLineMatch: 5, 86 86 }, { ··· 93 93 }}, 94 94 content: exampleJava, 95 95 language: "Java", 96 - // bm25-score: 1.81 <- sum-termFrequencyScore: 116.00, length-ratio: 1.00 97 - wantScore: 1.81, 96 + // sum-termFrequencyScore: 116.00, length-ratio: 1.00 97 + wantScore: 6.30, 98 98 // line 54: private static <A, B> B runInnerInterface(InnerInterface<A, B> fn, A a) { 99 99 wantBestLineMatch: 54, 100 100 }, { ··· 106 106 }}, 107 107 content: exampleJava, 108 108 language: "Java", 109 - // bm25-score: 0.96 <- sum-termFrequencies: 12, length-ratio: 1.00 110 - wantScore: 0.96, 109 + // sum-termFrequencies: 12, length-ratio: 1.00 110 + wantScore: 3.33, 111 111 // line 59: if (System.nanoTime() > System.currentTimeMillis()) { 112 112 wantBestLineMatch: 59, 113 113 }, ··· 117 117 query: &query.Substring{Pattern: "java"}, 118 118 content: exampleJava, 119 119 language: "Java", 120 - // bm25-score: 0.51 <- sum-termFrequencyScore: 5.00, length-ratio: 1.00 121 - wantScore: 0.51, 120 + // sum-termFrequencyScore: 5.00, length-ratio: 1.00 121 + wantScore: 1.77, 122 122 }, 123 123 { 124 124 // Matches only on filename, and content is missing 125 125 fileName: "a/b/c/config.go", 126 126 query: &query.Substring{Pattern: "config.go"}, 127 127 language: "Go", 128 - // bm25-score: 0.60 <- sum-termFrequencyScore: 5.00, length-ratio: 0.00 129 - wantScore: 0.60, 128 + // sum-termFrequencyScore: 5.00, length-ratio: 0.00 129 + wantScore: 2.07, 130 130 }, 131 131 } 132 132

Configure Feed

Configure Feed