ranking: add tiebreakers to BM25 (#914) · boltless.me/zoekt@914a27d

+32 -27

2 changed files

Expand all

index

score.go

internal

e2e

scoring_test.go

+30 -26

index/score.go

··· 25 25 ) 26 26 27 27 const ( 28 - ScoreOffset = 10_000_000 28 + ScoreOffset = 10_000_000 29 + ScoreOffsetBM25 = 1_000_000_000 29 30 ) 30 31 31 32 type chunkScore struct { ··· 299 300 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches)))) 300 301 } 301 302 302 - // Maintain ordering of input files. This 303 - // strictly dominates the in-file ordering of 304 - // the matches. 303 + // Maintain ordering of input files. This strictly dominates the in-file ordering of the matches. 305 304 addScore("fragment", maxFileScore) 306 305 307 - // Add tiebreakers 308 - // 309 - // ScoreOffset shifts the score 7 digits to the left. 310 - fileMatch.Score = math.Trunc(fileMatch.Score) * ScoreOffset 306 + // Truncate score to avoid overlap with the tiebreakers. 307 + fileMatch.Score = math.Trunc(fileMatch.Score) 311 308 312 - md := d.repoMetaData[d.repos[doc]] 313 - 314 - // md.Rank lies in the range [0, 65535]. Hence, we have to allocate 5 digits for 315 - // the rank. The scoreRepoRankFactor shifts the rank score 2 digits to the left, 316 - // reserving digits 3-7 for the repo rank. 317 - addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)) 318 - 319 - // digits 1-2 and the decimals are reserved for the doc order. Doc order 320 - // (without the scaling factor) lies in the range [0, 1]. The upper bound is 321 - // achieved for matches in the first document of a shard. 
322 - addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries)))) 309 + // Add tiebreakers 310 + repoRank := d.repoMetaData[d.repos[doc]].Rank // [0, 65535] 311 + docOrderScore := 1.0 - float64(doc)/float64(len(d.boundaries)) // [0, 1] 323 312 324 313 if opts.DebugScore { 325 - // To make the debug output easier to read, we split the score into the query 326 - // dependent score and the tiebreaker 327 - score := math.Trunc(fileMatch.Score / ScoreOffset) 328 - tiebreaker := fileMatch.Score - score*ScoreOffset 329 - fileMatch.Debug = fmt.Sprintf("score: %d (%.2f) <- %s", int(score), tiebreaker, strings.TrimSuffix(fileMatch.Debug, ", ")) 314 + // We log the score components individually for better readability. 315 + fileMatch.Debug = fmt.Sprintf("score: %d (repo-rank: %d, file-rank: %.2f) <- %s", int(fileMatch.Score), repoRank, docOrderScore, strings.TrimSuffix(fileMatch.Debug, ", ")) 330 316 } 317 + 318 + fileMatch.Score = ScoreOffset*fileMatch.Score + scoreRepoRankFactor*float64(repoRank) + scoreFileOrderFactor*docOrderScore 331 319 } 332 320 333 321 // scoreFilesUsingBM25 computes the score according to BM25, the most common scoring algorithm for text search: ··· 361 349 sumTF += f 362 350 score += tfScore(k, b, L, f) 363 351 } 352 + // 2 digits of precision 353 + score = math.Trunc(score*100) / 100 364 354 365 - fileMatch.Score = score 355 + md := d.repoMetaData[d.repos[doc]] 356 + fileOrderScore := 1.0 - float64(doc)/float64(len(d.boundaries)) 357 + 358 + // Offset score by 9 digits and add the tiebreaker. 
359 + // 360 + // Example: For a BM25 score of 1.23, a repo rank of 45678 and a file order score of 0.12, we have a final score of 361 + // 1234567801.2 362 + // ^^^ 363 + // bm25 364 + // ^^^^^ 365 + // repo rank 366 + // ^^^^ 367 + // doc order 368 + fileMatch.Score = score*ScoreOffsetBM25 + scoreRepoRankFactor*float64(md.Rank) + scoreFileOrderFactor*fileOrderScore 366 369 367 370 if opts.DebugScore { 368 - fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L) 371 + // To make the debug output easier to read, we split the score into the query dependent score and the tiebreaker 372 + fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f (repo-rank: %d, file-rank: %.2f) <- sum-termFrequencies: %d, length-ratio: %.2f", score, md.Rank, fileOrderScore, sumTF, L) 369 373 } 370 374 }

+2 -1

internal/e2e/scoring_test.go

··· 704 704 // helper to remove the tiebreaker from the score for easier comparison 705 705 func withoutTiebreaker(fullScore float64, useBM25 bool) float64 { 706 706 if useBM25 { 707 - return fullScore 707 + // Shift by ScoreOffsetBM25 and truncate to 2 decimal places 708 + return math.Trunc((fullScore/index.ScoreOffsetBM25)*100) / 100 708 709 } 709 710 return math.Trunc(fullScore / index.ScoreOffset) 710 711 }

Configure Feed

Configure Feed