Ranking: include filename matches in bm25 (#757) · boltless.me/zoekt@43b9225

+67 -12

build/scoring_test.go

··· 60 60 } 61 61 62 62 for _, c := range cases { 63 - checkScoring(t, c, ctags.UniversalCTags) 63 + checkScoring(t, c, false, ctags.UniversalCTags) 64 + } 65 + } 66 + 67 + func TestBM25(t *testing.T) { 68 + exampleJava, err := os.ReadFile("./testdata/example.java") 69 + if err != nil { 70 + t.Fatal(err) 71 + } 72 + 73 + cases := []scoreCase{ 74 + { 75 + // Matches on both filename and content 76 + fileName: "example.java", 77 + query: &query.Substring{Pattern: "example"}, 78 + content: exampleJava, 79 + language: "Java", 80 + // keyword-score:1.63 (sum-tf: 6.00, length-ratio: 2.00) 81 + wantScore: 1.63, 82 + }, { 83 + // Matches only on content 84 + fileName: "example.java", 85 + query: &query.And{Children: []query.Q{ 86 + &query.Substring{Pattern: "inner"}, 87 + &query.Substring{Pattern: "static"}, 88 + &query.Substring{Pattern: "interface"}, 89 + }}, 90 + content: exampleJava, 91 + language: "Java", 92 + // keyword-score:5.75 (sum-tf: 56.00, length-ratio: 2.00) 93 + wantScore: 5.75, 94 + }, 95 + { 96 + // Matches only on filename 97 + fileName: "example.java", 98 + query: &query.Substring{Pattern: "java"}, 99 + content: exampleJava, 100 + language: "Java", 101 + // keyword-score:1.07 (sum-tf: 2.00, length-ratio: 2.00) 102 + wantScore: 1.07, 103 + }, 104 + { 105 + // Matches only on filename, and content is missing 106 + fileName: "a/b/c/config.go", 107 + query: &query.Substring{Pattern: "config.go"}, 108 + language: "Go", 109 + // keyword-score:1.91 (sum-tf: 2.00, length-ratio: 0.00) 110 + wantScore: 1.91, 111 + }, 112 + } 113 + 114 + for _, c := range cases { 115 + checkScoring(t, c, true, ctags.UniversalCTags) 64 116 } 65 117 } 66 118 ··· 197 249 } 198 250 199 251 for _, c := range cases { 200 - checkScoring(t, c, ctags.UniversalCTags) 252 + checkScoring(t, c, false, ctags.UniversalCTags) 201 253 } 202 254 } 203 255 ··· 261 313 parserType := ctags.UniversalCTags 262 314 for _, c := range cases { 263 315 t.Run(c.language, func(t *testing.T) { 264 - checkScoring(t, c, parserType) 316 + checkScoring(t, c, false, parserType) 265 317 }) 266 318 } 267 319 } ··· 318 370 parserType := ctags.UniversalCTags 319 371 for _, c := range cases { 320 372 t.Run(c.language, func(t *testing.T) { 321 - checkScoring(t, c, parserType) 373 + checkScoring(t, c, false, parserType) 322 374 }) 323 375 } 324 376 } ··· 350 402 351 403 for _, parserType := range []ctags.CTagsParserType{ctags.UniversalCTags, ctags.ScipCTags} { 352 404 for _, c := range cases { 353 - checkScoring(t, c, parserType) 405 + checkScoring(t, c, false, parserType) 354 406 } 355 407 } 356 408 ··· 364 416 wantScore: 7860, 365 417 } 366 418 367 - checkScoring(t, scipOnlyCase, ctags.ScipCTags) 419 + checkScoring(t, scipOnlyCase, false, ctags.ScipCTags) 368 420 } 369 421 370 422 func TestRuby(t *testing.T) { ··· 402 454 403 455 for _, parserType := range []ctags.CTagsParserType{ctags.UniversalCTags, ctags.ScipCTags} { 404 456 for _, c := range cases { 405 - checkScoring(t, c, parserType) 457 + checkScoring(t, c, false, parserType) 406 458 } 407 459 } 408 460 } ··· 450 502 451 503 parserType := ctags.UniversalCTags 452 504 for _, c := range cases { 453 - checkScoring(t, c, parserType) 505 + checkScoring(t, c, false, parserType) 454 506 } 455 507 } 456 508 ··· 509 561 510 562 for _, parserType := range []ctags.CTagsParserType{ctags.UniversalCTags, ctags.ScipCTags} { 511 563 for _, c := range cases { 512 - checkScoring(t, c, parserType) 564 + checkScoring(t, c, false, parserType) 513 565 } 514 566 } 515 567 } ··· 532 584 } 533 585 } 534 586 535 - func checkScoring(t *testing.T, c scoreCase, parserType ctags.CTagsParserType) { 587 + func checkScoring(t *testing.T, c scoreCase, keywordScoring bool, parserType ctags.CTagsParserType) { 536 588 skipIfCTagsUnavailable(t, parserType) 537 589 538 590 name := c.language ··· 572 624 } 573 625 defer ss.Close() 574 626 575 - srs, err := ss.Search(context.Background(), c.query, &zoekt.SearchOptions{DebugScore: true}) 627 + srs, err := ss.Search(context.Background(), c.query, &zoekt.SearchOptions{ 628 + UseKeywordScoring: keywordScoring, 629 + ChunkMatches: true, 630 + DebugScore: true}) 576 631 if err != nil { 577 632 t.Fatal(err) 578 633 } ··· 582 637 } 583 638 584 639 if got := srs.Files[0].Score; math.Abs(got-c.wantScore) > epsilon { 585 - t.Fatalf("score: want %f, got %f\ndebug: %s\ndebugscore: %s", c.wantScore, got, srs.Files[0].Debug, srs.Files[0].LineMatches[0].DebugScore) 640 + t.Fatalf("score: want %f, got %f\ndebug: %s\ndebugscore: %s", c.wantScore, got, srs.Files[0].Debug, srs.Files[0].ChunkMatches[0].DebugScore) 586 641 } 587 642 588 643 if got := srs.Files[0].Language; got != c.language {

+79 -55

contentprovider.go

··· 137 137 return byteOff 138 138 } 139 139 140 + // fillMatches converts the internal candidateMatch slice into our API's LineMatch. 141 + // It only ever returns content XOR filename matches, not both. If there are any 142 + // content matches, these are always returned, and we omit filename matches. 143 + // 144 + // Performance invariant: ms is sorted and non-overlapping. 145 + // 146 + // Note: the byte slices may be backed by mmapped data, so before being 147 + // returned by the API it needs to be copied. 140 148 func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 141 - var result []LineMatch 142 - if ms[0].fileName { 143 - score, debugScore, _ := p.candidateMatchScore(ms, language, debug) 144 - 145 - // There is only "line" in a filename. 146 - res := LineMatch{ 147 - Line: p.id.fileName(p.idx), 148 - FileName: true, 149 + var filenameMatches []*candidateMatch 150 + contentMatches := ms[:0] 149 151 150 - Score: score, 151 - DebugScore: debugScore, 152 + for _, m := range ms { 153 + if m.fileName { 154 + filenameMatches = append(filenameMatches, m) 155 + } else { 156 + contentMatches = append(contentMatches, m) 152 157 } 158 + } 153 159 154 - for _, m := range ms { 155 - res.LineFragments = append(res.LineFragments, LineFragmentMatch{ 156 - LineOffset: int(m.byteOffset), 157 - MatchLength: int(m.byteMatchSz), 158 - Offset: m.byteOffset, 159 - }) 160 + // If there are any content matches, we only return these and skip filename matches. 161 + if len(contentMatches) > 0 { 162 + contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false)) 163 + return p.fillContentMatches(contentMatches, numContextLines, language, debug) 164 + } 165 + 166 + // Otherwise, we return a single line containing the filematch match. 167 + score, debugScore, _ := p.candidateMatchScore(filenameMatches, language, debug) 168 + res := LineMatch{ 169 + Line: p.id.fileName(p.idx), 170 + FileName: true, 171 + Score: score, 172 + DebugScore: debugScore, 173 + } 160 174 161 - result = []LineMatch{res} 162 - } 163 - } else { 164 - ms = breakMatchesOnNewlines(ms, p.data(false)) 165 - result = p.fillContentMatches(ms, numContextLines, language, debug) 175 + for _, m := range ms { 176 + res.LineFragments = append(res.LineFragments, LineFragmentMatch{ 177 + LineOffset: int(m.byteOffset), 178 + MatchLength: int(m.byteMatchSz), 179 + Offset: m.byteOffset, 180 + }) 166 181 } 167 182 168 - return result 183 + return []LineMatch{res} 184 + 169 185 } 170 186 171 - // fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch. 187 + // fillChunkMatches converts the internal candidateMatch slice into our API's ChunkMatch. 188 + // It only ever returns content XOR filename matches, not both. If there are any content 189 + // matches, these are always returned, and we omit filename matches. 172 190 // 173 191 // Performance invariant: ms is sorted and non-overlapping. 174 192 // 175 193 // Note: the byte slices may be backed by mmapped data, so before being 176 194 // returned by the API it needs to be copied. 177 195 func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 178 - var result []ChunkMatch 179 - if ms[0].fileName { 180 - // If the first match is a filename match, there will only be 181 - // one match and the matched content will be the filename. 182 - 183 - score, debugScore, _ := p.candidateMatchScore(ms, language, debug) 196 + var filenameMatches []*candidateMatch 197 + contentMatches := ms[:0] 184 198 185 - fileName := p.id.fileName(p.idx) 186 - ranges := make([]Range, 0, len(ms)) 187 - for _, m := range ms { 188 - ranges = append(ranges, Range{ 189 - Start: Location{ 190 - ByteOffset: m.byteOffset, 191 - LineNumber: 1, 192 - Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1), 193 - }, 194 - End: Location{ 195 - ByteOffset: m.byteOffset + m.byteMatchSz, 196 - LineNumber: 1, 197 - Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1), 198 - }, 199 - }) 199 + for _, m := range ms { 200 + if m.fileName { 201 + filenameMatches = append(filenameMatches, m) 202 + } else { 203 + contentMatches = append(contentMatches, m) 200 204 } 205 + } 201 206 202 - result = []ChunkMatch{{ 203 - Content: fileName, 204 - ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 205 - Ranges: ranges, 206 - FileName: true, 207 + // If there are any content matches, we only return these and skip filename matches. 208 + if len(contentMatches) > 0 { 209 + return p.fillContentChunkMatches(contentMatches, numContextLines, language, debug) 210 + } 207 211 208 - Score: score, 209 - DebugScore: debugScore, 210 - }} 211 - } else { 212 - result = p.fillContentChunkMatches(ms, numContextLines, language, debug) 212 + // Otherwise, we return a single chunk representing the filename match. 213 + score, debugScore, _ := p.candidateMatchScore(filenameMatches, language, debug) 214 + fileName := p.id.fileName(p.idx) 215 + ranges := make([]Range, 0, len(ms)) 216 + for _, m := range ms { 217 + ranges = append(ranges, Range{ 218 + Start: Location{ 219 + ByteOffset: m.byteOffset, 220 + LineNumber: 1, 221 + Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1), 222 + }, 223 + End: Location{ 224 + ByteOffset: m.byteOffset + m.byteMatchSz, 225 + LineNumber: 1, 226 + Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1), 227 + }, 228 + }) 213 229 } 214 230 215 - return result 231 + return []ChunkMatch{{ 232 + Content: fileName, 233 + ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 234 + Ranges: ranges, 235 + FileName: true, 236 + 237 + Score: score, 238 + DebugScore: debugScore, 239 + }} 216 240 } 217 241 218 242 func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {

+33 -50

eval.go

··· 382 382 res.LineFragments[repo.Name] = repo.LineFragmentTemplate 383 383 } 384 384 385 - // Gather matches from this document. This never returns a mixture of 386 - // filename/content matches: if there are content matches, all 387 - // filename matches are trimmed from the result. The matches are 388 - // returned in document order and are non-overlapping. 385 + // Gather matches from this document. The matches are returned in document 386 + // order and are non-overlapping. All filename and content matches are 387 + // returned, with filename matches first. 389 388 // 390 389 // If `merge` is set, overlapping and adjacent matches will be merged 391 390 // into a single match. Otherwise, overlapping matches will be removed, ··· 407 406 } 408 407 }) 409 408 410 - // If there are content matches, trim all filename matches. 411 - foundContentMatch := false 412 - for _, c := range cands { 413 - if !c.fileName { 414 - foundContentMatch = true 415 - break 416 - } 417 - } 418 - 419 - res := cands[:0] 420 - for _, c := range cands { 421 - if !foundContentMatch || !c.fileName { 422 - res = append(res, c) 423 - } 424 - } 425 - cands = res 426 - 427 409 // If we found no candidate matches at all, assume there must have been a match on filename. 428 410 if len(cands) == 0 { 429 411 nm := d.fileName(nextDoc) ··· 439 421 }} 440 422 } 441 423 442 - if merge { 443 - // Merge adjacent candidates. This guarantees that the matches 444 - // are non-overlapping. 445 - sort.Sort((sortByOffsetSlice)(cands)) 446 - res = cands[:0] 447 - mergeRun := 1 448 - for i, c := range cands { 449 - if i == 0 { 450 - res = append(res, c) 451 - continue 452 - } 453 - last := res[len(res)-1] 424 + sort.Sort((sortByOffsetSlice)(cands)) 425 + res := cands[:0] 426 + mergeRun := 1 427 + for i, c := range cands { 428 + if i == 0 { 429 + res = append(res, c) 430 + continue 431 + } 432 + 433 + last := res[len(res)-1] 434 + 435 + // Never compare filename and content matches 436 + if last.fileName != c.fileName { 437 + res = append(res, c) 438 + continue 439 + } 440 + 441 + if merge { 442 + // Merge adjacent candidates. This guarantees that the matches 443 + // are non-overlapping. 454 444 lastEnd := last.byteOffset + last.byteMatchSz 455 445 end := c.byteOffset + c.byteMatchSz 456 446 if lastEnd >= c.byteOffset { 457 447 mergeRun++ 458 - 459 448 // Average out the score across the merged candidates. Only do it if 460 449 // we are boosting to avoid floating point funkiness in the normal 461 450 // case. ··· 472 461 } else { 473 462 mergeRun = 1 474 463 } 475 - 476 - res = append(res, c) 477 - } 478 - } else { 479 - // Remove overlapping candidates. This guarantees that the matches 480 - // are non-overlapping, but also preserves expected match counts. 481 - sort.Sort((sortByOffsetSlice)(cands)) 482 - res = cands[:0] 483 - for i, c := range cands { 484 - if i == 0 { 485 - res = append(res, c) 486 - continue 487 - } 488 - last := res[len(res)-1] 464 + } else { 465 + // Remove overlapping candidates. This guarantees that the matches 466 + // are non-overlapping, but also preserves expected match counts. 489 467 lastEnd := last.byteOffset + last.byteMatchSz 490 468 if lastEnd > c.byteOffset { 491 469 continue 492 470 } 493 - 494 - res = append(res, c) 495 471 } 472 + 473 + res = append(res, c) 496 474 } 497 475 return res 498 476 } ··· 502 480 func (m sortByOffsetSlice) Len() int { return len(m) } 503 481 func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 504 482 func (m sortByOffsetSlice) Less(i, j int) bool { 483 + // Sort all filename matches to the start 484 + if m[i].fileName != m[j].fileName { 485 + return m[i].fileName 486 + } 487 + 505 488 if m[i].byteOffset == m[j].byteOffset { // tie break if same offset 506 489 // Prefer longer candidates if starting at same position 507 490 return m[i].byteMatchSz > m[j].byteMatchSz

+14 -1

score.go

··· 119 119 // algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula 120 120 // except inverse document frequency (idf), since we don't have access to global term frequency statistics. 121 121 // 122 + // Filename matches count twice as much as content matches. This mimics a common text search strategy where you 123 + // 'boost' matches on document titles. 124 + // 122 125 // This scoring strategy ignores all other signals including document ranks. This keeps things simple for now, 123 126 // since BM25 is not normalized and can be tricky to combine with other scoring signals. 124 127 func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) { ··· 127 130 termFreqs := map[string]int{} 128 131 for _, cand := range cands { 129 132 term := string(cand.substrLowered) 130 - termFreqs[term]++ 133 + 134 + if cand.fileName { 135 + termFreqs[term] += 2 136 + } else { 137 + termFreqs[term]++ 138 + } 131 139 } 132 140 133 141 // Compute the file length ratio. Usually the calculation would be based on terms, but using ··· 135 143 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 136 144 numFiles := len(d.boundaries) 137 145 averageFileLength := float64(d.boundaries[numFiles-1]) / float64(numFiles) 146 + 147 + // This is very unlikely, but explicitly guard against division by zero. 148 + if averageFileLength == 0 { 149 + averageFileLength++ 150 + } 138 151 L := fileLength / averageFileLength 139 152 140 153 // Use standard parameter defaults (used in Lucene and academic papers)

Configure Feed

Configure Feed