Remove special merging behavior for line matches (#888)

Usually, if there are candidate matches with overlapping ranges, then we just remove matches that overlap. However, when `opts.ChunkMatches = false`, we had special logic to merge overlapping matches.

This PR removes the overlapping logic to simplify the behavior. I couldn't see a good reason to keep this special handling. Plus, we are moving towards making `ChunkMatches` the default.

Another benefit of this change is that it makes the BM25 behavior easier to understand. If we merged together ranges, then we would be calculating term frequencies for spurious terms (like `new`, `queue`, `newqueue`, `queuenew`, etc.) Note: we currently only use BM25 with `ChunkMatches = true`, so there's not an active bug here.

author

Julie Tibshirani committer

GitHub date 1 year ago (Jan 15, 2025, 1:23 PM -0800) commit d301e838 d301e8380c5835ca738ac44d78ef5048e94c006a parent 4c8bb19e 4c8bb19e06668645fe2c55c861cf2426f53b9ade

+9 -36

1 changed file

Expand all

eval.go

+9 -36

eval.go

··· 323 323 // Important invariant for performance: finalCands is sorted by offset and 324 324 // non-overlapping. gatherMatches respects this invariant and all later 325 325 // transformations respect this. 326 - shouldMergeMatches := !opts.ChunkMatches 327 - finalCands := d.gatherMatches(nextDoc, mt, known, shouldMergeMatches) 326 + finalCands := d.gatherMatches(nextDoc, mt, known) 328 327 329 328 if opts.ChunkMatches { 330 329 fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts) ··· 417 416 // If `merge` is set, overlapping and adjacent matches will be merged 418 417 // into a single match. Otherwise, overlapping matches will be removed, 419 418 // but adjacent matches will remain. 420 - func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch { 419 + func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchTree]bool) []*candidateMatch { 421 420 var cands []*candidateMatch 422 421 visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) { 423 422 if smt, ok := mt.(*substrMatchTree); ok { ··· 449 448 }} 450 449 } 451 450 451 + // Remove overlapping candidates. This guarantees that the matches 452 + // are non-overlapping, but also preserves expected match counts. 452 453 sort.Sort((sortByOffsetSlice)(cands)) 453 454 res := cands[:0] 454 - mergeRun := 1 455 455 for i, c := range cands { 456 456 if i == 0 { 457 457 res = append(res, c) ··· 466 466 continue 467 467 } 468 468 469 - if merge { 470 - // Merge adjacent candidates. This guarantees that the matches 471 - // are non-overlapping. 472 - lastEnd := last.byteOffset + last.byteMatchSz 473 - end := c.byteOffset + c.byteMatchSz 474 - if lastEnd >= c.byteOffset { 475 - mergeRun++ 476 - // Average out the score across the merged candidates. Only do it if 477 - // we are boosting to avoid floating point funkiness in the normal 478 - // case. 479 - if !(epsilonEqualsOne(last.scoreWeight) && epsilonEqualsOne(c.scoreWeight)) { 480 - last.scoreWeight = ((last.scoreWeight * float64(mergeRun-1)) + c.scoreWeight) / float64(mergeRun) 481 - } 482 - 483 - // latest candidate goes further, update our end 484 - if end > lastEnd { 485 - last.byteMatchSz = end - last.byteOffset 486 - } 487 - 488 - continue 489 - } else { 490 - mergeRun = 1 491 - } 492 - } else { 493 - // Remove overlapping candidates. This guarantees that the matches 494 - // are non-overlapping, but also preserves expected match counts. 495 - lastEnd := last.byteOffset + last.byteMatchSz 496 - if lastEnd > c.byteOffset { 497 - continue 498 - } 469 + // Only add the match if its range doesn't overlap 470 + lastEnd := last.byteOffset + last.byteMatchSz 471 + if lastEnd <= c.byteOffset { 472 + res = append(res, c) 473 + continue 499 474 } 500 - 501 - res = append(res, c) 502 475 } 503 476 return res 504 477 }

Configure Feed

Configure Feed