fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Ranking: tidy up scoring code (#755)

As part of our context quality work, I plan to update BM25 scoring so it takes
into account both filename and content matches. (Right now we ignore all
filename matches if there are any content matches.)

This is a preliminary refactor of the scoring code:
* Pull the scoring logic into its own file, `score.go`, since `eval.go` is very
long
* Make `indexData.Search` shorter by moving the filename-match fallback into `gatherMatches`, and add comments

+190 -191
-17
contentprovider.go
··· 535 535 scoreLineOrderFactor = 1.0 536 536 ) 537 537 538 - // findSection checks whether a section defined by offset and size lies within 539 - // one of the sections in secs. 540 - func findSection(secs []DocumentSection, off, sz uint32) (uint32, bool) { 541 - j := sort.Search(len(secs), func(i int) bool { 542 - return secs[i].End >= off+sz 543 - }) 544 - 545 - if j == len(secs) { 546 - return 0, false 547 - } 548 - 549 - if secs[j].Start <= off && off+sz <= secs[j].End { 550 - return uint32(j), true 551 - } 552 - return 0, false 553 - } 554 - 555 538 // findMaxOverlappingSection returns the index of the section in secs that 556 539 // overlaps the most with the area defined by off and sz, relative to the size 557 540 // of the section. If no section overlaps, it returns 0, false. If multiple
+39 -174
eval.go
··· 18 18 "context" 19 19 "fmt" 20 20 "log" 21 - "math" 22 21 "regexp/syntax" 23 22 "sort" 24 - "strconv" 25 23 "strings" 26 24 "time" 27 25 ··· 30 28 31 29 "github.com/sourcegraph/zoekt/query" 32 30 ) 33 - 34 - const maxUInt16 = 0xffff 35 - 36 - // addScore increments the score of the FileMatch by the computed score. If 37 - // debugScore is true, it also adds a debug string to the FileMatch. If raw is 38 - // -1, it is ignored. Otherwise, it is added to the debug string. 39 - func (m *FileMatch) addScore(what string, computed float64, raw float64, debugScore bool) { 40 - if computed != 0 && debugScore { 41 - var b strings.Builder 42 - fmt.Fprintf(&b, "%s", what) 43 - if raw != -1 { 44 - fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64)) 45 - } 46 - fmt.Fprintf(&b, ":%.2f, ", computed) 47 - m.Debug += b.String() 48 - } 49 - m.Score += computed 50 - } 51 - 52 - func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) { 53 - if debugScore { 54 - m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L) 55 - } 56 - m.Score += score 57 - } 58 31 59 32 // simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all 60 33 // repository names fulfill the predicate, Const(false) if none of them do, and q ··· 336 309 // non-overlapping. gatherMatches respects this invariant and all later 337 310 // transformations respect this. 
338 311 shouldMergeMatches := !opts.ChunkMatches 339 - finalCands := gatherMatches(mt, known, shouldMergeMatches) 340 - 341 - if len(finalCands) == 0 { 342 - nm := d.fileName(nextDoc) 343 - finalCands = append(finalCands, 344 - &candidateMatch{ 345 - caseSensitive: false, 346 - fileName: true, 347 - substrBytes: nm, 348 - substrLowered: nm, 349 - file: nextDoc, 350 - runeOffset: 0, 351 - byteOffset: 0, 352 - byteMatchSz: uint32(len(nm)), 353 - }) 354 - } 312 + finalCands := d.gatherMatches(nextDoc, mt, known, shouldMergeMatches) 355 313 356 314 if opts.ChunkMatches { 357 315 fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore) ··· 391 349 res.Stats.FileCount++ 392 350 } 393 351 394 - // We do not sort Files here, instead we rely on the shards pkg to do file 395 - // ranking. If we sorted now, we would break the assumption that results 396 - // from the same repo in a shard appear next to each other. 397 - 398 352 for _, md := range d.repoMetaData { 399 353 r := md 400 354 addRepo(&res, &r) ··· 416 370 return &res, nil 417 371 } 418 372 419 - // scoreFile computes a score for the file match using various scoring signals, like 420 - // whether there's an exact match on a symbol, the number of query clauses that matched, etc. 421 - func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) { 422 - atomMatchCount := 0 423 - visitMatchAtoms(mt, known, func(mt matchTree) { 424 - atomMatchCount++ 425 - }) 426 - 427 - addScore := func(what string, computed float64) { 428 - fileMatch.addScore(what, computed, -1, opts.DebugScore) 429 - } 430 - 431 - // atom-count boosts files with matches from more than 1 atom. The 432 - // maximum boost is scoreFactorAtomMatch. 
433 - if atomMatchCount > 0 { 434 - fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore) 435 - } 436 - 437 - maxFileScore := 0.0 438 - for i := range fileMatch.LineMatches { 439 - if maxFileScore < fileMatch.LineMatches[i].Score { 440 - maxFileScore = fileMatch.LineMatches[i].Score 441 - } 442 - 443 - // Order by ordering in file. 444 - fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches)))) 445 - } 446 - 447 - for i := range fileMatch.ChunkMatches { 448 - if maxFileScore < fileMatch.ChunkMatches[i].Score { 449 - maxFileScore = fileMatch.ChunkMatches[i].Score 450 - } 451 - 452 - // Order by ordering in file. 453 - fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches)))) 454 - } 455 - 456 - // Maintain ordering of input files. This 457 - // strictly dominates the in-file ordering of 458 - // the matches. 459 - addScore("fragment", maxFileScore) 460 - 461 - if opts.UseDocumentRanks && len(d.ranks) > int(doc) { 462 - weight := scoreFileRankFactor 463 - if opts.DocumentRanksWeight > 0.0 { 464 - weight = opts.DocumentRanksWeight 465 - } 466 - 467 - ranks := d.ranks[doc] 468 - // The ranks slice always contains one entry representing the file rank (unless it's empty since the 469 - // file doesn't have a rank). This is left over from when documents could have multiple rank signals, 470 - // and we plan to clean this up. 471 - if len(ranks) > 0 { 472 - // The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we 473 - // cap it just in case to ensure it falls in the range [0, 1]. 
474 - normalized := math.Min(1.0, ranks[0]/32.0) 475 - addScore("file-rank", weight*normalized) 476 - } 477 - } 478 - 479 - md := d.repoMetaData[d.repos[doc]] 480 - addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries)))) 481 - addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16) 482 - 483 - if opts.DebugScore { 484 - fileMatch.Debug = strings.TrimSuffix(fileMatch.Debug, ", ") 485 - } 486 - } 487 - 488 - // scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring 489 - // algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula 490 - // except inverse document frequency (idf), since we don't have access to global term frequency statistics. 491 - // 492 - // This scoring strategy ignores all other signals including document ranks. This keeps things simple for now, 493 - // since BM25 is not normalized and can be tricky to combine with other scoring signals. 494 - func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) { 495 - // Treat each candidate match as a term and compute the frequencies. For now, ignore case 496 - // sensitivity and treat filenames and symbols the same as content. 497 - termFreqs := map[string]int{} 498 - for _, cand := range cands { 499 - term := string(cand.substrLowered) 500 - termFreqs[term]++ 501 - } 502 - 503 - // Compute the file length ratio. Usually the calculation would be based on terms, but using 504 - // bytes should work fine, as we're just computing a ratio. 
505 - fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 506 - numFiles := len(d.boundaries) 507 - averageFileLength := float64(d.boundaries[numFiles-1]) / float64(numFiles) 508 - L := fileLength / averageFileLength 509 - 510 - // Use standard parameter defaults (used in Lucene and academic papers) 511 - k, b := 1.2, 0.75 512 - sumTf := 0.0 // Just for debugging 513 - score := 0.0 514 - for _, freq := range termFreqs { 515 - tf := float64(freq) 516 - sumTf += tf 517 - score += ((k + 1.0) * tf) / (k*(1.0-b+b*L) + tf) 518 - } 519 - 520 - fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore) 521 - } 522 - 523 373 func addRepo(res *SearchResult, repo *Repository) { 524 374 if res.RepoURLs == nil { 525 375 res.RepoURLs = map[string]string{} ··· 532 382 res.LineFragments[repo.Name] = repo.LineFragmentTemplate 533 383 } 534 384 535 - type sortByOffsetSlice []*candidateMatch 536 - 537 - func (m sortByOffsetSlice) Len() int { return len(m) } 538 - func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 539 - func (m sortByOffsetSlice) Less(i, j int) bool { 540 - if m[i].byteOffset == m[j].byteOffset { // tie break if same offset 541 - // Prefer longer candidates if starting at same position 542 - return m[i].byteMatchSz > m[j].byteMatchSz 543 - } 544 - return m[i].byteOffset < m[j].byteOffset 545 - } 546 - 547 - // setScoreWeight is a helper used by gatherMatches to set the weight based on 548 - // the score weight of the matchTree. 549 - func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch { 550 - for _, m := range cm { 551 - m.scoreWeight = scoreWeight 552 - } 553 - return cm 554 - } 555 - 556 385 // Gather matches from this document. This never returns a mixture of 557 386 // filename/content matches: if there are content matches, all 558 387 // filename matches are trimmed from the result. 
The matches are ··· 561 390 // If `merge` is set, overlapping and adjacent matches will be merged 562 391 // into a single match. Otherwise, overlapping matches will be removed, 563 392 // but adjacent matches will remain. 564 - func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch { 393 + func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch { 565 394 var cands []*candidateMatch 566 395 visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) { 567 396 if smt, ok := mt.(*substrMatchTree); ok { ··· 578 407 } 579 408 }) 580 409 410 + // If there are content matches, trim all filename matches. 581 411 foundContentMatch := false 582 412 for _, c := range cands { 583 413 if !c.fileName { ··· 594 424 } 595 425 cands = res 596 426 427 + // If we found no candidate matches at all, assume there must have been a match on filename. 428 + if len(cands) == 0 { 429 + nm := d.fileName(nextDoc) 430 + return []*candidateMatch{{ 431 + caseSensitive: false, 432 + fileName: true, 433 + substrBytes: nm, 434 + substrLowered: nm, 435 + file: nextDoc, 436 + runeOffset: 0, 437 + byteOffset: 0, 438 + byteMatchSz: uint32(len(nm)), 439 + }} 440 + } 441 + 597 442 if merge { 598 443 // Merge adjacent candidates. This guarantees that the matches 599 444 // are non-overlapping. 
··· 649 494 res = append(res, c) 650 495 } 651 496 } 497 + return res 498 + } 652 499 653 - return res 500 + type sortByOffsetSlice []*candidateMatch 501 + 502 + func (m sortByOffsetSlice) Len() int { return len(m) } 503 + func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 504 + func (m sortByOffsetSlice) Less(i, j int) bool { 505 + if m[i].byteOffset == m[j].byteOffset { // tie break if same offset 506 + // Prefer longer candidates if starting at same position 507 + return m[i].byteMatchSz > m[j].byteMatchSz 508 + } 509 + return m[i].byteOffset < m[j].byteOffset 510 + } 511 + 512 + // setScoreWeight is a helper used by gatherMatches to set the weight based on 513 + // the score weight of the matchTree. 514 + func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch { 515 + for _, m := range cm { 516 + m.scoreWeight = scoreWeight 517 + } 518 + return cm 654 519 } 655 520 656 521 func (d *indexData) branchIndex(docID uint32) int {
+151
score.go
··· 1 + // Copyright 2016 Google Inc. All rights reserved. 2 + // 3 + // Licensed under the Apache License, Version 2.0 (the "License"); 4 + // you may not use this file except in compliance with the License. 5 + // You may obtain a copy of the License at 6 + // 7 + // http://www.apache.org/licenses/LICENSE-2.0 8 + // 9 + // Unless required by applicable law or agreed to in writing, software 10 + // distributed under the License is distributed on an "AS IS" BASIS, 11 + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 + // See the License for the specific language governing permissions and 13 + // limitations under the License. 14 + 15 + package zoekt 16 + 17 + import ( 18 + "fmt" 19 + "math" 20 + "strconv" 21 + "strings" 22 + ) 23 + 24 + const maxUInt16 = 0xffff 25 + 26 + // addScore increments the score of the FileMatch by the computed score. If 27 + // debugScore is true, it also adds a debug string to the FileMatch. If raw is 28 + // -1, it is ignored. Otherwise, it is added to the debug string. 29 + func (m *FileMatch) addScore(what string, computed float64, raw float64, debugScore bool) { 30 + if computed != 0 && debugScore { 31 + var b strings.Builder 32 + fmt.Fprintf(&b, "%s", what) 33 + if raw != -1 { 34 + fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64)) 35 + } 36 + fmt.Fprintf(&b, ":%.2f, ", computed) 37 + m.Debug += b.String() 38 + } 39 + m.Score += computed 40 + } 41 + 42 + func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) { 43 + if debugScore { 44 + m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L) 45 + } 46 + m.Score += score 47 + } 48 + 49 + // scoreFile computes a score for the file match using various scoring signals, like 50 + // whether there's an exact match on a symbol, the number of query clauses that matched, etc. 
51 + func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) { 52 + atomMatchCount := 0 53 + visitMatchAtoms(mt, known, func(mt matchTree) { 54 + atomMatchCount++ 55 + }) 56 + 57 + addScore := func(what string, computed float64) { 58 + fileMatch.addScore(what, computed, -1, opts.DebugScore) 59 + } 60 + 61 + // atom-count boosts files with matches from more than 1 atom. The 62 + // maximum boost is scoreFactorAtomMatch. 63 + if atomMatchCount > 0 { 64 + fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore) 65 + } 66 + 67 + maxFileScore := 0.0 68 + for i := range fileMatch.LineMatches { 69 + if maxFileScore < fileMatch.LineMatches[i].Score { 70 + maxFileScore = fileMatch.LineMatches[i].Score 71 + } 72 + 73 + // Order by ordering in file. 74 + fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches)))) 75 + } 76 + 77 + for i := range fileMatch.ChunkMatches { 78 + if maxFileScore < fileMatch.ChunkMatches[i].Score { 79 + maxFileScore = fileMatch.ChunkMatches[i].Score 80 + } 81 + 82 + // Order by ordering in file. 83 + fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches)))) 84 + } 85 + 86 + // Maintain ordering of input files. This 87 + // strictly dominates the in-file ordering of 88 + // the matches. 89 + addScore("fragment", maxFileScore) 90 + 91 + if opts.UseDocumentRanks && len(d.ranks) > int(doc) { 92 + weight := scoreFileRankFactor 93 + if opts.DocumentRanksWeight > 0.0 { 94 + weight = opts.DocumentRanksWeight 95 + } 96 + 97 + ranks := d.ranks[doc] 98 + // The ranks slice always contains one entry representing the file rank (unless it's empty since the 99 + // file doesn't have a rank). This is left over from when documents could have multiple rank signals, 100 + // and we plan to clean this up. 
101 + if len(ranks) > 0 { 102 + // The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we 103 + // cap it just in case to ensure it falls in the range [0, 1]. 104 + normalized := math.Min(1.0, ranks[0]/32.0) 105 + addScore("file-rank", weight*normalized) 106 + } 107 + } 108 + 109 + md := d.repoMetaData[d.repos[doc]] 110 + addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries)))) 111 + addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16) 112 + 113 + if opts.DebugScore { 114 + fileMatch.Debug = strings.TrimSuffix(fileMatch.Debug, ", ") 115 + } 116 + } 117 + 118 + // scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring 119 + // algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula 120 + // except inverse document frequency (idf), since we don't have access to global term frequency statistics. 121 + // 122 + // This scoring strategy ignores all other signals including document ranks. This keeps things simple for now, 123 + // since BM25 is not normalized and can be tricky to combine with other scoring signals. 124 + func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) { 125 + // Treat each candidate match as a term and compute the frequencies. For now, ignore case 126 + // sensitivity and treat filenames and symbols the same as content. 127 + termFreqs := map[string]int{} 128 + for _, cand := range cands { 129 + term := string(cand.substrLowered) 130 + termFreqs[term]++ 131 + } 132 + 133 + // Compute the file length ratio. Usually the calculation would be based on terms, but using 134 + // bytes should work fine, as we're just computing a ratio. 
135 + fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 136 + numFiles := len(d.boundaries) 137 + averageFileLength := float64(d.boundaries[numFiles-1]) / float64(numFiles) 138 + L := fileLength / averageFileLength 139 + 140 + // Use standard parameter defaults (used in Lucene and academic papers) 141 + k, b := 1.2, 0.75 142 + sumTf := 0.0 // Just for debugging 143 + score := 0.0 144 + for _, freq := range termFreqs { 145 + tf := float64(freq) 146 + sumTf += tf 147 + score += ((k + 1.0) * tf) / (k*(1.0-b+b*L) + tf) 148 + } 149 + 150 + fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore) 151 + }