ranking: add phrase boosting to BM25 (#917) · boltless.me/zoekt@3d43fdf

+1 -2

index/eval.go

··· 316 316 } 317 317 318 318 if opts.UseBM25Scoring { 319 - tf := cp.calculateTermFrequency(finalCands) 320 - d.scoreFilesUsingBM25(&fileMatch, nextDoc, tf, opts) 319 + d.scoreFilesUsingBM25(&fileMatch, nextDoc, finalCands, cp, opts) 321 320 } else { 322 321 // Use the standard, non-experimental scoring method by default 323 322 d.scoreFile(&fileMatch, nextDoc, mt, known, opts)

+35 -6

index/score.go

··· 235 235 } 236 236 } 237 237 } 238 + 239 + score = boostScore(score, ms) 238 240 return score, symbolInfo 239 241 } 240 242 ··· 261 263 } 262 264 263 265 return termFreqs 266 + } 267 + 268 + // boostScore finds whether any of the matches are part of a boosted match tree, then applies 269 + // the boost to the final score. This follows precedent in other search engines like Lucene, where 270 + // boosts multiply an entire query clause's final score. 271 + // 272 + // As a heuristic, we use the maximum boost across matches to avoid applying the same boost multiple times. 273 + func boostScore(score float64, ms []*candidateMatch) float64 { 274 + maxScoreWeight := 1.0 275 + for _, m := range ms { 276 + if m.scoreWeight > maxScoreWeight { 277 + maxScoreWeight = m.scoreWeight 278 + } 279 + } 280 + 281 + if !epsilonEqualsOne(maxScoreWeight) { 282 + score = score * maxScoreWeight 283 + } 284 + return score 264 285 } 265 286 266 287 // scoreFile computes a score for the file match using various scoring signals, like ··· 324 345 // keywords too much, leading to a worse ranking. The intuition is that each keyword is important independently of how 325 346 // frequent it appears in the corpus. 326 347 // 327 - // Unlike standard file scoring, this scoring strategy ignores all other signals including document ranks. This keeps 328 - // things simple for now, since BM25 is not normalized and can be tricky to combine with other scoring signals. It also 329 - // ignores the individual LineMatch and ChunkMatch scores, instead calculating a score over all matches in the file. 330 - func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32, tf map[string]int, opts *zoekt.SearchOptions) { 348 + // Unlike standard file scoring, this scoring strategy ignores the individual LineMatch and ChunkMatch scores, instead 349 + // calculating a score over all matches in the file. 
350 + func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32, cands []*candidateMatch, cp *contentProvider, opts *zoekt.SearchOptions) { 351 + tf := cp.calculateTermFrequency(cands) 352 + 331 353 // Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html) 332 354 k, b := 1.2, 0.75 333 355 ··· 343 365 344 366 L := fileLength / averageFileLength 345 367 346 - score := 0.0 368 + bm25Score := 0.0 347 369 sumTF := 0 // Just for debugging 348 370 for _, f := range tf { 349 371 sumTF += f 350 - score += tfScore(k, b, L, f) 372 + bm25Score += tfScore(k, b, L, f) 351 373 } 374 + 375 + score := boostScore(bm25Score, cands) 376 + boosted := score != bm25Score 377 + 352 378 // 2 digits of precision 353 379 score = math.Trunc(score*100) / 100 354 380 ··· 370 396 if opts.DebugScore { 371 397 // To make the debug output easier to read, we split the score into the query dependent score and the tiebreaker 372 398 fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f (repo-rank: %d, file-rank: %.2f) <- sum-termFrequencies: %d, length-ratio: %.2f", score, md.Rank, fileOrderScore, sumTF, L) 399 + if boosted { 400 + fileMatch.Debug += fmt.Sprintf(" (boosted)") 401 + } 373 402 } 374 403 }

+17

internal/e2e/scoring_test.go

··· 110 110 wantScore: 3.33, 111 111 // line 59: if (System.nanoTime() > System.currentTimeMillis()) { 112 112 wantBestLineMatch: 59, 113 + }, { 114 + // phrase boosting 115 + fileName: "example.java", 116 + query: &query.Or{Children: []query.Q{ 117 + &query.Boost{Child: &query.Substring{Pattern: "public string apply"}, Boost: 20}, 118 + &query.And{Children: []query.Q{ 119 + &query.Substring{Pattern: "public"}, 120 + &query.Substring{Pattern: "string"}, 121 + &query.Substring{Pattern: "apply"}, 122 + }}, 123 + }}, 124 + content: exampleJava, 125 + language: "Java", 126 + // sum-termFrequencies: 40, length-ratio: 1.00 127 + wantScore: 140.80, 128 + // public String apply(String s) { 129 + wantBestLineMatch: 81, 113 130 }, 114 131 { 115 132 // Matches only on filename

Configure Feed

Configure Feed