fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

scoring: remove IDF from BM25 scoring (#912)

We remove IDF from our BM25 scoring, effectively treating it as constant.

This is supported by our evaluations, which showed that for keyword-style queries, IDF can down-weight the score of important keywords too much, leading to a worse ranking. The intuition is that for code search, each keyword is important regardless of how frequently it appears in the corpus.

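Removing IDF allows us to apply BM25 scoring to a wider range of query types. Previously, BM25 was limited to queries with individual terms combined using OR: because document frequencies are not stored in the index, IDF was calculated on the fly at query time, which is only correct when the search visits every document containing any of the query terms.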

Test plan:
updated tests

+39 -80
+3 -28
index/eval.go
··· 25 25 26 26 enry_data "github.com/go-enry/go-enry/v2/data" 27 27 "github.com/grafana/regexp" 28 + 28 29 "github.com/sourcegraph/zoekt" 29 30 "github.com/sourcegraph/zoekt/internal/tenant" 30 31 "github.com/sourcegraph/zoekt/query" ··· 189 190 docCount := uint32(len(d.fileBranchMasks)) 190 191 lastDoc := int(-1) 191 192 192 - // document frequency per term 193 - df := make(termDocumentFrequency) 194 - 195 - // term frequency per file index 196 - var tfs []termFrequency 197 - 198 193 nextFileMatch: 199 194 for { 200 195 canceled := false ··· 320 315 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts) 321 316 } 322 317 323 - var tf map[string]int 324 318 if opts.UseBM25Scoring { 325 - // For BM25 scoring, the calculation of the score is split in two parts. Here we 326 - // calculate the term frequencies for the current document and update the 327 - // document frequencies. Since we don't store document frequencies in the index, 328 - // we have to defer the calculation of the final BM25 score to after the whole 329 - // shard has been processed. 330 - tf = cp.calculateTermFrequency(finalCands, df) 319 + tf := cp.calculateTermFrequency(finalCands) 320 + d.scoreFilesUsingBM25(&fileMatch, nextDoc, tf, opts) 331 321 } else { 332 322 // Use the standard, non-experimental scoring method by default 333 323 d.scoreFile(&fileMatch, nextDoc, mt, known, opts) ··· 348 338 repoMatchCount += len(fileMatch.LineMatches) 349 339 repoMatchCount += matchedChunkRanges 350 340 351 - if opts.UseBM25Scoring { 352 - // Invariant: tfs[i] belongs to res.Files[i] 353 - tfs = append(tfs, termFrequency{ 354 - doc: nextDoc, 355 - tf: tf, 356 - }) 357 - } 358 341 res.Files = append(res.Files, fileMatch) 359 342 360 343 res.Stats.MatchCount += len(fileMatch.LineMatches) 361 344 res.Stats.MatchCount += matchedChunkRanges 362 345 res.Stats.FileCount++ 363 - } 364 - 365 - // Calculate BM25 score for all file matches in the shard. 
We assume that we 366 - // have seen all documents containing any of the terms in the query so that df 367 - // correctly reflects the document frequencies. This is true, for example, if 368 - // all terms in the query are ORed together. 369 - if opts.UseBM25Scoring { 370 - d.scoreFilesUsingBM25(res.Files, tfs, df, opts) 371 346 } 372 347 373 348 for _, md := range d.repoMetaData {
+26 -42
index/score.go
··· 217 217 L := float64(lineLength) / 100.0 218 218 219 219 score := 0.0 220 - tfs := p.calculateTermFrequency(ms, termDocumentFrequency{}) 220 + tfs := p.calculateTermFrequency(ms) 221 221 for _, f := range tfs { 222 - score += ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f)) 222 + score += tfScore(k, b, L, f) 223 223 } 224 224 225 225 // Check if any index comes from a symbol match tree, and if so hydrate in symbol information ··· 237 237 return score, symbolInfo 238 238 } 239 239 240 - // termDocumentFrequency is a map "term" -> "number of documents that contain the term" 241 - type termDocumentFrequency map[string]int 240 + // tfScore is the term frequency score for BM25. 241 + func tfScore(k float64, b float64, L float64, f int) float64 { 242 + return ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f)) 243 + } 242 244 243 245 // calculateTermFrequency computes the term frequency for the file match. 244 246 // Notes: 245 247 // - Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles. 246 248 // - Symbol matches also count more than content matches, to reward matches on symbol definitions. 247 - func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int { 249 + func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch) map[string]int { 248 250 // Treat each candidate match as a term and compute the frequencies. For now, ignore case sensitivity and 249 251 // ignore whether the index is a word boundary. 250 252 termFreqs := map[string]int{} ··· 257 259 } 258 260 } 259 261 260 - for term := range termFreqs { 261 - df[term] += 1 262 - } 263 262 return termFreqs 264 263 } 265 264 ··· 331 330 } 332 331 } 333 332 334 - // termFrequency stores the term frequencies for doc. 
335 - type termFrequency struct { 336 - doc uint32 337 - tf map[string]int 338 - } 339 - 340 333 // scoreFilesUsingBM25 computes the score according to BM25, the most common scoring algorithm for text search: 341 - // https://en.wikipedia.org/wiki/Okapi_BM25. 334 + // https://en.wikipedia.org/wiki/Okapi_BM25. Note that we treat the inverse document frequency (idf) as constant. This 335 + // is supported by our evaluations which showed that for keyword style queries, idf can down-weight the score of some 336 + // keywords too much, leading to a worse ranking. The intuition is that each keyword is important independently of how 337 + // frequent it appears in the corpus. 342 338 // 343 339 // Unlike standard file scoring, this scoring strategy ignores all other signals including document ranks. This keeps 344 - // things simple for now, since BM25 is not normalized and can be tricky to combine with other scoring signals. It also 340 + // things simple for now, since BM25 is not normalized and can be tricky to combine with other scoring signals. It also 345 341 // ignores the individual LineMatch and ChunkMatch scores, instead calculating a score over all matches in the file. 346 - func (d *indexData) scoreFilesUsingBM25(fileMatches []zoekt.FileMatch, tfs []termFrequency, df termDocumentFrequency, opts *zoekt.SearchOptions) { 342 + func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32, tf map[string]int, opts *zoekt.SearchOptions) { 347 343 // Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html) 348 344 k, b := 1.2, 0.75 349 345 ··· 353 349 averageFileLength++ 354 350 } 355 351 356 - for i := range tfs { 357 - score := 0.0 358 - 359 - // Compute the file length ratio. Usually the calculation would be based on terms, but using 360 - // bytes should work fine, as we're just computing a ratio. 
361 - doc := tfs[i].doc 362 - fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 352 + // Compute the file length ratio. Usually the calculation would be based on terms, but using 353 + // bytes should work fine, as we're just computing a ratio. 354 + fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 363 355 364 - L := fileLength / averageFileLength 356 + L := fileLength / averageFileLength 365 357 366 - sumTF := 0 // Just for debugging 367 - for term, f := range tfs[i].tf { 368 - sumTF += f 369 - tfScore := ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f)) 370 - score += idf(df[term], int(d.numDocs())) * tfScore 371 - } 358 + score := 0.0 359 + sumTF := 0 // Just for debugging 360 + for _, f := range tf { 361 + sumTF += f 362 + score += tfScore(k, b, L, f) 363 + } 372 364 373 - fileMatches[i].Score = score 365 + fileMatch.Score = score 374 366 375 - if opts.DebugScore { 376 - fileMatches[i].Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L) 377 - } 367 + if opts.DebugScore { 368 + fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L) 378 369 } 379 370 } 380 - 381 - // idf computes the inverse document frequency for a term. nq is the number of 382 - // documents that contain the term and documentCount is the total number of 383 - // documents in the corpus. 384 - func idf(nq, documentCount int) float64 { 385 - return math.Log(1.0 + ((float64(documentCount) - float64(nq) + 0.5) / (float64(nq) + 0.5))) 386 - }
+10 -10
internal/e2e/scoring_test.go
··· 79 79 query: &query.Substring{Pattern: "example"}, 80 80 content: exampleJava, 81 81 language: "Java", 82 - // bm25-score: 0.58 <- sum-termFrequencyScore: 14.00, length-ratio: 1.00 83 - wantScore: 0.58, 82 + // sum-termFrequencyScore: 14.00, length-ratio: 1.00 83 + wantScore: 2.02, 84 84 // line 5: private final int exampleField; 85 85 wantBestLineMatch: 5, 86 86 }, { ··· 93 93 }}, 94 94 content: exampleJava, 95 95 language: "Java", 96 - // bm25-score: 1.81 <- sum-termFrequencyScore: 116.00, length-ratio: 1.00 97 - wantScore: 1.81, 96 + // sum-termFrequencyScore: 116.00, length-ratio: 1.00 97 + wantScore: 6.30, 98 98 // line 54: private static <A, B> B runInnerInterface(InnerInterface<A, B> fn, A a) { 99 99 wantBestLineMatch: 54, 100 100 }, { ··· 106 106 }}, 107 107 content: exampleJava, 108 108 language: "Java", 109 - // bm25-score: 0.96 <- sum-termFrequencies: 12, length-ratio: 1.00 110 - wantScore: 0.96, 109 + // sum-termFrequencies: 12, length-ratio: 1.00 110 + wantScore: 3.33, 111 111 // line 59: if (System.nanoTime() > System.currentTimeMillis()) { 112 112 wantBestLineMatch: 59, 113 113 }, ··· 117 117 query: &query.Substring{Pattern: "java"}, 118 118 content: exampleJava, 119 119 language: "Java", 120 - // bm25-score: 0.51 <- sum-termFrequencyScore: 5.00, length-ratio: 1.00 121 - wantScore: 0.51, 120 + // sum-termFrequencyScore: 5.00, length-ratio: 1.00 121 + wantScore: 1.77, 122 122 }, 123 123 { 124 124 // Matches only on filename, and content is missing 125 125 fileName: "a/b/c/config.go", 126 126 query: &query.Substring{Pattern: "config.go"}, 127 127 language: "Go", 128 - // bm25-score: 0.60 <- sum-termFrequencyScore: 5.00, length-ratio: 0.00 129 - wantScore: 0.60, 128 + // sum-termFrequencyScore: 5.00, length-ratio: 0.00 129 + wantScore: 2.07, 130 130 }, 131 131 } 132 132