fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Ranking: include filename matches in bm25 (#757)

BM25 considers a file to be a better match when there are many occurrences of
terms in the file. It's important to count all term occurrences, including
those in other fields like the filename.

For historical reasons, Zoekt trims all filename matches from a result if there
are any content matches. This meant that in BM25 scoring, we didn't account for
filename matches.

This PR refactors the match code so that we only trim filename matches when
assembling the final `FileMatch`. We retain filename matches when creating
`candidateMatch`, which lets BM25 scoring use them. Even without the better
BM25 scoring, I think this refactor makes the code easier to follow.

+193 -118
+67 -12
build/scoring_test.go
··· 60 60 } 61 61 62 62 for _, c := range cases { 63 - checkScoring(t, c, ctags.UniversalCTags) 63 + checkScoring(t, c, false, ctags.UniversalCTags) 64 + } 65 + } 66 + 67 + func TestBM25(t *testing.T) { 68 + exampleJava, err := os.ReadFile("./testdata/example.java") 69 + if err != nil { 70 + t.Fatal(err) 71 + } 72 + 73 + cases := []scoreCase{ 74 + { 75 + // Matches on both filename and content 76 + fileName: "example.java", 77 + query: &query.Substring{Pattern: "example"}, 78 + content: exampleJava, 79 + language: "Java", 80 + // keyword-score:1.63 (sum-tf: 6.00, length-ratio: 2.00) 81 + wantScore: 1.63, 82 + }, { 83 + // Matches only on content 84 + fileName: "example.java", 85 + query: &query.And{Children: []query.Q{ 86 + &query.Substring{Pattern: "inner"}, 87 + &query.Substring{Pattern: "static"}, 88 + &query.Substring{Pattern: "interface"}, 89 + }}, 90 + content: exampleJava, 91 + language: "Java", 92 + // keyword-score:5.75 (sum-tf: 56.00, length-ratio: 2.00) 93 + wantScore: 5.75, 94 + }, 95 + { 96 + // Matches only on filename 97 + fileName: "example.java", 98 + query: &query.Substring{Pattern: "java"}, 99 + content: exampleJava, 100 + language: "Java", 101 + // keyword-score:1.07 (sum-tf: 2.00, length-ratio: 2.00) 102 + wantScore: 1.07, 103 + }, 104 + { 105 + // Matches only on filename, and content is missing 106 + fileName: "a/b/c/config.go", 107 + query: &query.Substring{Pattern: "config.go"}, 108 + language: "Go", 109 + // keyword-score:1.91 (sum-tf: 2.00, length-ratio: 0.00) 110 + wantScore: 1.91, 111 + }, 112 + } 113 + 114 + for _, c := range cases { 115 + checkScoring(t, c, true, ctags.UniversalCTags) 64 116 } 65 117 } 66 118 ··· 197 249 } 198 250 199 251 for _, c := range cases { 200 - checkScoring(t, c, ctags.UniversalCTags) 252 + checkScoring(t, c, false, ctags.UniversalCTags) 201 253 } 202 254 } 203 255 ··· 261 313 parserType := ctags.UniversalCTags 262 314 for _, c := range cases { 263 315 t.Run(c.language, func(t *testing.T) { 264 - checkScoring(t, c, parserType) 316 + checkScoring(t, c, false, parserType) 265 317 }) 266 318 } 267 319 } ··· 318 370 parserType := ctags.UniversalCTags 319 371 for _, c := range cases { 320 372 t.Run(c.language, func(t *testing.T) { 321 - checkScoring(t, c, parserType) 373 + checkScoring(t, c, false, parserType) 322 374 }) 323 375 } 324 376 } ··· 350 402 351 403 for _, parserType := range []ctags.CTagsParserType{ctags.UniversalCTags, ctags.ScipCTags} { 352 404 for _, c := range cases { 353 - checkScoring(t, c, parserType) 405 + checkScoring(t, c, false, parserType) 354 406 } 355 407 } 356 408 ··· 364 416 wantScore: 7860, 365 417 } 366 418 367 - checkScoring(t, scipOnlyCase, ctags.ScipCTags) 419 + checkScoring(t, scipOnlyCase, false, ctags.ScipCTags) 368 420 } 369 421 370 422 func TestRuby(t *testing.T) { ··· 402 454 403 455 for _, parserType := range []ctags.CTagsParserType{ctags.UniversalCTags, ctags.ScipCTags} { 404 456 for _, c := range cases { 405 - checkScoring(t, c, parserType) 457 + checkScoring(t, c, false, parserType) 406 458 } 407 459 } 408 460 } ··· 450 502 451 503 parserType := ctags.UniversalCTags 452 504 for _, c := range cases { 453 - checkScoring(t, c, parserType) 505 + checkScoring(t, c, false, parserType) 454 506 } 455 507 } 456 508 ··· 509 561 510 562 for _, parserType := range []ctags.CTagsParserType{ctags.UniversalCTags, ctags.ScipCTags} { 511 563 for _, c := range cases { 512 - checkScoring(t, c, parserType) 564 + checkScoring(t, c, false, parserType) 513 565 } 514 566 } 515 567 } ··· 532 584 } 533 585 } 534 586 535 - func checkScoring(t *testing.T, c scoreCase, parserType ctags.CTagsParserType) { 587 + func checkScoring(t *testing.T, c scoreCase, keywordScoring bool, parserType ctags.CTagsParserType) { 536 588 skipIfCTagsUnavailable(t, parserType) 537 589 538 590 name := c.language ··· 572 624 } 573 625 defer ss.Close() 574 626 575 - srs, err := ss.Search(context.Background(), c.query, &zoekt.SearchOptions{DebugScore: true}) 627 + srs, err := ss.Search(context.Background(), c.query, &zoekt.SearchOptions{ 628 + UseKeywordScoring: keywordScoring, 629 + ChunkMatches: true, 630 + DebugScore: true}) 576 631 if err != nil { 577 632 t.Fatal(err) 578 633 } ··· 582 637 } 583 638 584 639 if got := srs.Files[0].Score; math.Abs(got-c.wantScore) > epsilon { 585 - t.Fatalf("score: want %f, got %f\ndebug: %s\ndebugscore: %s", c.wantScore, got, srs.Files[0].Debug, srs.Files[0].LineMatches[0].DebugScore) 640 + t.Fatalf("score: want %f, got %f\ndebug: %s\ndebugscore: %s", c.wantScore, got, srs.Files[0].Debug, srs.Files[0].ChunkMatches[0].DebugScore) 586 641 } 587 642 588 643 if got := srs.Files[0].Language; got != c.language {
+79 -55
contentprovider.go
··· 137 137 return byteOff 138 138 } 139 139 140 + // fillMatches converts the internal candidateMatch slice into our API's LineMatch. 141 + // It only ever returns content XOR filename matches, not both. If there are any 142 + // content matches, these are always returned, and we omit filename matches. 143 + // 144 + // Performance invariant: ms is sorted and non-overlapping. 145 + // 146 + // Note: the byte slices may be backed by mmapped data, so before being 147 + // returned by the API it needs to be copied. 140 148 func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 141 - var result []LineMatch 142 - if ms[0].fileName { 143 - score, debugScore, _ := p.candidateMatchScore(ms, language, debug) 144 - 145 - // There is only "line" in a filename. 146 - res := LineMatch{ 147 - Line: p.id.fileName(p.idx), 148 - FileName: true, 149 + var filenameMatches []*candidateMatch 150 + contentMatches := ms[:0] 149 151 150 - Score: score, 151 - DebugScore: debugScore, 152 + for _, m := range ms { 153 + if m.fileName { 154 + filenameMatches = append(filenameMatches, m) 155 + } else { 156 + contentMatches = append(contentMatches, m) 152 157 } 158 + } 153 159 154 - for _, m := range ms { 155 - res.LineFragments = append(res.LineFragments, LineFragmentMatch{ 156 - LineOffset: int(m.byteOffset), 157 - MatchLength: int(m.byteMatchSz), 158 - Offset: m.byteOffset, 159 - }) 160 + // If there are any content matches, we only return these and skip filename matches. 161 + if len(contentMatches) > 0 { 162 + contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false)) 163 + return p.fillContentMatches(contentMatches, numContextLines, language, debug) 164 + } 165 + 166 + // Otherwise, we return a single line containing the filematch match. 167 + score, debugScore, _ := p.candidateMatchScore(filenameMatches, language, debug) 168 + res := LineMatch{ 169 + Line: p.id.fileName(p.idx), 170 + FileName: true, 171 + Score: score, 172 + DebugScore: debugScore, 173 + } 160 174 161 - result = []LineMatch{res} 162 - } 163 - } else { 164 - ms = breakMatchesOnNewlines(ms, p.data(false)) 165 - result = p.fillContentMatches(ms, numContextLines, language, debug) 175 + for _, m := range ms { 176 + res.LineFragments = append(res.LineFragments, LineFragmentMatch{ 177 + LineOffset: int(m.byteOffset), 178 + MatchLength: int(m.byteMatchSz), 179 + Offset: m.byteOffset, 180 + }) 166 181 } 167 182 168 - return result 183 + return []LineMatch{res} 184 + 169 185 } 170 186 171 - // fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch. 187 + // fillChunkMatches converts the internal candidateMatch slice into our API's ChunkMatch. 188 + // It only ever returns content XOR filename matches, not both. If there are any content 189 + // matches, these are always returned, and we omit filename matches. 172 190 // 173 191 // Performance invariant: ms is sorted and non-overlapping. 174 192 // 175 193 // Note: the byte slices may be backed by mmapped data, so before being 176 194 // returned by the API it needs to be copied. 177 195 func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 178 - var result []ChunkMatch 179 - if ms[0].fileName { 180 - // If the first match is a filename match, there will only be 181 - // one match and the matched content will be the filename. 182 - 183 - score, debugScore, _ := p.candidateMatchScore(ms, language, debug) 196 + var filenameMatches []*candidateMatch 197 + contentMatches := ms[:0] 184 198 185 - fileName := p.id.fileName(p.idx) 186 - ranges := make([]Range, 0, len(ms)) 187 - for _, m := range ms { 188 - ranges = append(ranges, Range{ 189 - Start: Location{ 190 - ByteOffset: m.byteOffset, 191 - LineNumber: 1, 192 - Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1), 193 - }, 194 - End: Location{ 195 - ByteOffset: m.byteOffset + m.byteMatchSz, 196 - LineNumber: 1, 197 - Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1), 198 - }, 199 - }) 199 + for _, m := range ms { 200 + if m.fileName { 201 + filenameMatches = append(filenameMatches, m) 202 + } else { 203 + contentMatches = append(contentMatches, m) 200 204 } 205 + } 201 206 202 - result = []ChunkMatch{{ 203 - Content: fileName, 204 - ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 205 - Ranges: ranges, 206 - FileName: true, 207 + // If there are any content matches, we only return these and skip filename matches. 208 + if len(contentMatches) > 0 { 209 + return p.fillContentChunkMatches(contentMatches, numContextLines, language, debug) 210 + } 207 211 208 - Score: score, 209 - DebugScore: debugScore, 210 - }} 211 - } else { 212 - result = p.fillContentChunkMatches(ms, numContextLines, language, debug) 212 + // Otherwise, we return a single chunk representing the filename match. 213 + score, debugScore, _ := p.candidateMatchScore(filenameMatches, language, debug) 214 + fileName := p.id.fileName(p.idx) 215 + ranges := make([]Range, 0, len(ms)) 216 + for _, m := range ms { 217 + ranges = append(ranges, Range{ 218 + Start: Location{ 219 + ByteOffset: m.byteOffset, 220 + LineNumber: 1, 221 + Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1), 222 + }, 223 + End: Location{ 224 + ByteOffset: m.byteOffset + m.byteMatchSz, 225 + LineNumber: 1, 226 + Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1), 227 + }, 228 + }) 213 229 } 214 230 215 - return result 231 + return []ChunkMatch{{ 232 + Content: fileName, 233 + ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 234 + Ranges: ranges, 235 + FileName: true, 236 + 237 + Score: score, 238 + DebugScore: debugScore, 239 + }} 216 240 } 217 241 218 242 func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
+33 -50
eval.go
··· 382 382 res.LineFragments[repo.Name] = repo.LineFragmentTemplate 383 383 } 384 384 385 - // Gather matches from this document. This never returns a mixture of 386 - // filename/content matches: if there are content matches, all 387 - // filename matches are trimmed from the result. The matches are 388 - // returned in document order and are non-overlapping. 385 + // Gather matches from this document. The matches are returned in document 386 + // order and are non-overlapping. All filename and content matches are 387 + // returned, with filename matches first. 389 388 // 390 389 // If `merge` is set, overlapping and adjacent matches will be merged 391 390 // into a single match. Otherwise, overlapping matches will be removed, ··· 407 406 } 408 407 }) 409 408 410 - // If there are content matches, trim all filename matches. 411 - foundContentMatch := false 412 - for _, c := range cands { 413 - if !c.fileName { 414 - foundContentMatch = true 415 - break 416 - } 417 - } 418 - 419 - res := cands[:0] 420 - for _, c := range cands { 421 - if !foundContentMatch || !c.fileName { 422 - res = append(res, c) 423 - } 424 - } 425 - cands = res 426 - 427 409 // If we found no candidate matches at all, assume there must have been a match on filename. 428 410 if len(cands) == 0 { 429 411 nm := d.fileName(nextDoc) ··· 439 421 }} 440 422 } 441 423 442 - if merge { 443 - // Merge adjacent candidates. This guarantees that the matches 444 - // are non-overlapping. 445 - sort.Sort((sortByOffsetSlice)(cands)) 446 - res = cands[:0] 447 - mergeRun := 1 448 - for i, c := range cands { 449 - if i == 0 { 450 - res = append(res, c) 451 - continue 452 - } 453 - last := res[len(res)-1] 424 + sort.Sort((sortByOffsetSlice)(cands)) 425 + res := cands[:0] 426 + mergeRun := 1 427 + for i, c := range cands { 428 + if i == 0 { 429 + res = append(res, c) 430 + continue 431 + } 432 + 433 + last := res[len(res)-1] 434 + 435 + // Never compare filename and content matches 436 + if last.fileName != c.fileName { 437 + res = append(res, c) 438 + continue 439 + } 440 + 441 + if merge { 442 + // Merge adjacent candidates. This guarantees that the matches 443 + // are non-overlapping. 454 444 lastEnd := last.byteOffset + last.byteMatchSz 455 445 end := c.byteOffset + c.byteMatchSz 456 446 if lastEnd >= c.byteOffset { 457 447 mergeRun++ 458 - 459 448 // Average out the score across the merged candidates. Only do it if 460 449 // we are boosting to avoid floating point funkiness in the normal 461 450 // case. ··· 472 461 } else { 473 462 mergeRun = 1 474 463 } 475 - 476 - res = append(res, c) 477 - } 478 - } else { 479 - // Remove overlapping candidates. This guarantees that the matches 480 - // are non-overlapping, but also preserves expected match counts. 481 - sort.Sort((sortByOffsetSlice)(cands)) 482 - res = cands[:0] 483 - for i, c := range cands { 484 - if i == 0 { 485 - res = append(res, c) 486 - continue 487 - } 488 - last := res[len(res)-1] 464 + } else { 465 + // Remove overlapping candidates. This guarantees that the matches 466 + // are non-overlapping, but also preserves expected match counts. 489 467 lastEnd := last.byteOffset + last.byteMatchSz 490 468 if lastEnd > c.byteOffset { 491 469 continue 492 470 } 493 - 494 - res = append(res, c) 495 471 } 472 + 473 + res = append(res, c) 496 474 } 497 475 return res 498 476 } ··· 502 480 func (m sortByOffsetSlice) Len() int { return len(m) } 503 481 func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 504 482 func (m sortByOffsetSlice) Less(i, j int) bool { 483 + // Sort all filename matches to the start 484 + if m[i].fileName != m[j].fileName { 485 + return m[i].fileName 486 + } 487 + 505 488 if m[i].byteOffset == m[j].byteOffset { // tie break if same offset 506 489 // Prefer longer candidates if starting at same position 507 490 return m[i].byteMatchSz > m[j].byteMatchSz
+14 -1
score.go
··· 119 119 // algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula 120 120 // except inverse document frequency (idf), since we don't have access to global term frequency statistics. 121 121 // 122 + // Filename matches count twice as much as content matches. This mimics a common text search strategy where you 123 + // 'boost' matches on document titles. 124 + // 122 125 // This scoring strategy ignores all other signals including document ranks. This keeps things simple for now, 123 126 // since BM25 is not normalized and can be tricky to combine with other scoring signals. 124 127 func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) { ··· 127 130 termFreqs := map[string]int{} 128 131 for _, cand := range cands { 129 132 term := string(cand.substrLowered) 130 - termFreqs[term]++ 133 + 134 + if cand.fileName { 135 + termFreqs[term] += 2 136 + } else { 137 + termFreqs[term]++ 138 + } 131 139 } 132 140 133 141 // Compute the file length ratio. Usually the calculation would be based on terms, but using ··· 135 143 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 136 144 numFiles := len(d.boundaries) 137 145 averageFileLength := float64(d.boundaries[numFiles-1]) / float64(numFiles) 146 + 147 + // This is very unlikely, but explicitly guard against division by zero. 148 + if averageFileLength == 0 { 149 + averageFileLength++ 150 + } 138 151 L := fileLength / averageFileLength 139 152 140 153 // Use standard parameter defaults (used in Lucene and academic papers)