fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

ranking: add tiebreakers to BM25 (#914)

This adds repo freshness and file order as tiebreakers to the final
BM25 score, just like we have for Zoekt's default scoring.

During testing I found that it is a lot less likely for the tiebreakers
to have an effect with BM25 because the score depends on qualities of the
document, such as the relative length and number of matches, which
usually differ even when the quality of the match is similar.

Test plan:
- Score tests still pass
- manual testing: see screenshots

+32 -27
+30 -26
index/score.go
··· 25 25 ) 26 26 27 27 const ( 28 - ScoreOffset = 10_000_000 28 + ScoreOffset = 10_000_000 29 + ScoreOffsetBM25 = 1_000_000_000 29 30 ) 30 31 31 32 type chunkScore struct { ··· 299 300 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches)))) 300 301 } 301 302 302 - // Maintain ordering of input files. This 303 - // strictly dominates the in-file ordering of 304 - // the matches. 303 + // Maintain ordering of input files. This strictly dominates the in-file ordering of the matches. 305 304 addScore("fragment", maxFileScore) 306 305 307 - // Add tiebreakers 308 - // 309 - // ScoreOffset shifts the score 7 digits to the left. 310 - fileMatch.Score = math.Trunc(fileMatch.Score) * ScoreOffset 306 + // Truncate score to avoid overlap with the tiebreakers. 307 + fileMatch.Score = math.Trunc(fileMatch.Score) 311 308 312 - md := d.repoMetaData[d.repos[doc]] 313 - 314 - // md.Rank lies in the range [0, 65535]. Hence, we have to allocate 5 digits for 315 - // the rank. The scoreRepoRankFactor shifts the rank score 2 digits to the left, 316 - // reserving digits 3-7 for the repo rank. 317 - addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)) 318 - 319 - // digits 1-2 and the decimals are reserved for the doc order. Doc order 320 - // (without the scaling factor) lies in the range [0, 1]. The upper bound is 321 - // achieved for matches in the first document of a shard. 
322 - addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries)))) 309 + // Add tiebreakers 310 + repoRank := d.repoMetaData[d.repos[doc]].Rank // [0, 65535] 311 + docOrderScore := 1.0 - float64(doc)/float64(len(d.boundaries)) // [0, 1] 323 312 324 313 if opts.DebugScore { 325 - // To make the debug output easier to read, we split the score into the query 326 - // dependent score and the tiebreaker 327 - score := math.Trunc(fileMatch.Score / ScoreOffset) 328 - tiebreaker := fileMatch.Score - score*ScoreOffset 329 - fileMatch.Debug = fmt.Sprintf("score: %d (%.2f) <- %s", int(score), tiebreaker, strings.TrimSuffix(fileMatch.Debug, ", ")) 314 + // We log the score components individually for better readability. 315 + fileMatch.Debug = fmt.Sprintf("score: %d (repo-rank: %d, file-rank: %.2f) <- %s", int(fileMatch.Score), repoRank, docOrderScore, strings.TrimSuffix(fileMatch.Debug, ", ")) 330 316 } 317 + 318 + fileMatch.Score = ScoreOffset*fileMatch.Score + scoreRepoRankFactor*float64(repoRank) + scoreFileOrderFactor*docOrderScore 331 319 } 332 320 333 321 // scoreFilesUsingBM25 computes the score according to BM25, the most common scoring algorithm for text search: ··· 361 349 sumTF += f 362 350 score += tfScore(k, b, L, f) 363 351 } 352 + // 2 digits of precision 353 + score = math.Trunc(score*100) / 100 364 354 365 - fileMatch.Score = score 355 + md := d.repoMetaData[d.repos[doc]] 356 + fileOrderScore := 1.0 - float64(doc)/float64(len(d.boundaries)) 357 + 358 + // Offset score by 9 digits and add the tiebreaker. 
359 + // 360 + // Example: For a BM25 score of 1.23, a repo rank of 456789 and a file order score of 0.12, we have a final score of 361 + // 12345678901.2 362 + // ^^^ 363 + // bm25 364 + // ^^^^^^ 365 + // repo rank 366 + // ^^^^ 367 + // doc order 368 + fileMatch.Score = score*ScoreOffsetBM25 + scoreRepoRankFactor*float64(md.Rank) + scoreFileOrderFactor*fileOrderScore 366 369 367 370 if opts.DebugScore { 368 - fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L) 371 + // To make the debug output easier to read, we split the score into the query dependent score and the tiebreaker 372 + fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f (repo-rank: %d, file-rank: %.2f) <- sum-termFrequencies: %d, length-ratio: %.2f", score, md.Rank, fileOrderScore, sumTF, L) 369 373 } 370 374 }
+2 -1
internal/e2e/scoring_test.go
··· 704 704 // helper to remove the tiebreaker from the score for easier comparison 705 705 func withoutTiebreaker(fullScore float64, useBM25 bool) float64 { 706 706 if useBM25 { 707 - return fullScore 707 + // Shift by ScoreOffsetBM25 and truncate to 2 decimal places 708 + return math.Trunc((fullScore/index.ScoreOffsetBM25)*100) / 100 708 709 } 709 710 return math.Trunc(fullScore / index.ScoreOffset) 710 711 }