Add experimental option for keyword scoring (#583)

This PR adds an experimental option `UseKeywordScoring` for scoring file matches using [BM25](https://opensourceconnections.com/blog/2015/10/16/bm25-the-next-generation-of-lucene-relevation/), the most common approach in keyword search. It treats each match in a file as a term and uses term frequencies to compute an approximation to BM25.

For now, it makes several simplifications:
* We drop inverse document frequency (idf) from the formula, since we don't have access to global term statistics
* There is no special handling for case sensitivity, filename matches, or symbols
* All other scoring signals are ignored, since BM25 scores are not normalized and are hard to combine with other signals

Author: Julie Tibshirani

Committer: GitHub

Date: May 23, 2023, 10:50 AM -0700 (3 years ago)

Commit: 5250e0e5 (5250e0e52a1bd6c2ffd2c74a91ffe090ba47e896)

Parent: d6d1e686 (d6d1e686095ef6730b7f371d33a20460d9ed0c9a)

+82 -2

3 changed files

Expand all

api.go

eval.go

shards

shards_test.go

api.go

··· 888 888 // will be used. This option is temporary and is only exposed for testing/ tuning purposes. 889 889 DocumentRanksWeight float64 890 890 891 + // EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula. 892 + // Currently, this treats each match in a file as a term and computes an approximation to BM25. 893 + // When enabled, all other scoring signals are ignored, including document ranks. 894 + UseKeywordScoring bool 895 + 891 896 // Trace turns on opentracing for this request if true and if the Jaeger address was provided as 892 897 // a command-line flag 893 898 Trace bool

+42 -2

eval.go

··· 329 329 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore) 330 330 } 331 331 332 - d.scoreFileMatch(&fileMatch, nextDoc, mt, known, opts) 332 + if opts.UseKeywordScoring { 333 + d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts) 334 + } else { 335 + // Use the standard, non-experimental scoring method by default 336 + d.scoreFile(&fileMatch, nextDoc, mt, known, opts) 337 + } 333 338 334 339 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known) 335 340 sortMatchesByScore(fileMatch.LineMatches) ··· 383 388 return &res, nil 384 389 } 385 390 386 - func (d *indexData) scoreFileMatch(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) { 391 + // scoreFile computes a score for the file match using various scoring signals, like 392 + // whether there's an exact match on a symbol, the number of query clauses that matched, etc. 393 + func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) { 387 394 atomMatchCount := 0 388 395 visitMatches(mt, known, func(mt matchTree) { 389 396 atomMatchCount++ ··· 447 454 md := d.repoMetaData[d.repos[doc]] 448 455 fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))), opts.DebugScore) 449 456 fileMatch.addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16, opts.DebugScore) 457 + } 458 + 459 + // scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring 460 + // algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula 461 + // except inverse document frequency (idf), since we don't have access to global term frequency statistics. 462 + // 463 + // This scoring strategy ignores all other signals including document ranks. 
This keeps things simple for now, 464 + // since BM25 is not normalized and can be tricky to combine with other scoring signals. 465 + func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) { 466 + // Treat each candidate match as a term and compute the frequencies. For now, ignore case 467 + // sensitivity and treat filenames and symbols the same as content. 468 + termFreqs := map[string]int{} 469 + for _, cand := range cands { 470 + term := string(cand.substrLowered) 471 + termFreqs[term]++ 472 + } 473 + 474 + // Compute the file length ratio. Usually the calculation would be based on terms, but using 475 + // bytes should work fine, as we're just computing a ratio. 476 + fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 477 + numFiles := len(d.boundaries) 478 + averageFileLength := float64(d.boundaries[numFiles - 1]) / float64(numFiles) 479 + L := fileLength / averageFileLength 480 + 481 + // Use standard parameter defaults (used in Lucene and academic papers) 482 + k, b := 1.2, 0.75 483 + score := 0.0 484 + for _, freq := range termFreqs { 485 + tf := float64(freq) 486 + score += ((k + 1.0) * tf) / (k * (1.0 - b + b * L) + tf) 487 + } 488 + 489 + fileMatch.addScore("keyword-score", score, opts.DebugScore) 450 490 } 451 491 452 492 func addRepo(res *SearchResult, repo *Repository) {

+35

shards/shards_test.go

··· 1108 1108 } 1109 1109 } 1110 1110 1111 + func TestUseKeywordScoring(t *testing.T) { 1112 + b := testIndexBuilder(t, 1113 + &zoekt.Repository{}, 1114 + zoekt.Document{Name: "f1", Content: []byte("one two two three")}, 1115 + zoekt.Document{Name: "f2", Content: []byte("one two one two")}, 1116 + zoekt.Document{Name: "f3", Content: []byte("one three three three")}) 1117 + 1118 + ss := newShardedSearcher(1) 1119 + searcher := searcherForTest(t, b) 1120 + ss.replace(map[string]zoekt.Searcher{"r1": searcher}) 1121 + 1122 + q := query.NewOr( 1123 + &query.Substring{Pattern: "one"}, 1124 + &query.Substring{Pattern: "three"}) 1125 + 1126 + opts := zoekt.SearchOptions{ 1127 + UseKeywordScoring: true, 1128 + } 1129 + 1130 + results, err := ss.Search(context.Background(), q, &opts) 1131 + if err != nil { 1132 + t.Fatal(err) 1133 + } 1134 + 1135 + var got []string 1136 + for _, f := range results.Files { 1137 + got = append(got, f.FileName) 1138 + } 1139 + 1140 + want := []string{"f3", "f1", "f2"} 1141 + if !reflect.DeepEqual(got, want) { 1142 + t.Errorf("got %v, want %v", got, want) 1143 + } 1144 + } 1145 + 1111 1146 func testShardedStreamSearch(t *testing.T, q query.Q, ib *zoekt.IndexBuilder, useDocumentRanks bool) []zoekt.FileMatch { 1112 1147 ss := newShardedSearcher(1) 1113 1148 searcher := searcherForTest(t, ib)

Configure Feed

Configure Feed