fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 15 kB View raw
1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bytes" 19 "fmt" 20 "math" 21 "strings" 22 23 "github.com/go-enry/go-enry/v2" 24 25 "github.com/sourcegraph/zoekt" 26 "github.com/sourcegraph/zoekt/internal/ctags" 27) 28 29const ( 30 ScoreOffset = 10_000_000 31) 32 33type chunkScore struct { 34 score float64 35 debugScore string 36 bestLine int 37} 38 39// scoreChunk calculates the score for each line in the chunk based on its candidate matches, and returns the score of 40// the best-scoring line, along with its line number. 41// Invariant: there should be at least one input candidate, len(ms) > 0. 42func (p *contentProvider) scoreChunk(ms []*candidateMatch, language string, opts *zoekt.SearchOptions) (chunkScore, []*zoekt.Symbol) { 43 nl := p.newlines() 44 45 var bestScore lineScore 46 bestLine := 0 47 var symbolInfo []*zoekt.Symbol 48 49 start := 0 50 currentLine := -1 51 for i, m := range ms { 52 lineNumber := -1 53 if !m.fileName { 54 lineNumber = nl.atOffset(m.byteOffset) 55 } 56 57 // If this match represents a new line, then score the previous line and update 'start'. 58 if i != 0 && lineNumber != currentLine { 59 score, si := p.scoreLine(ms[start:i], language, currentLine, opts) 60 symbolInfo = append(symbolInfo, si...) 61 if score.score > bestScore.score { 62 bestScore = score 63 bestLine = currentLine 64 } 65 start = i 66 } 67 currentLine = lineNumber 68 } 69 70 // Make sure to score the last line 71 line, si := p.scoreLine(ms[start:], language, currentLine, opts) 72 symbolInfo = append(symbolInfo, si...) 73 if line.score > bestScore.score { 74 bestScore = line 75 bestLine = currentLine 76 } 77 78 cs := chunkScore{ 79 score: bestScore.score, 80 bestLine: bestLine, 81 } 82 if opts.DebugScore { 83 cs.debugScore = fmt.Sprintf("%s, (line: %d)", bestScore.debugScore, bestLine) 84 } 85 return cs, symbolInfo 86} 87 88type lineScore struct { 89 score float64 90 debugScore string 91} 92 93// scoreLine calculates a score for the line based on its candidate matches. 94// Invariants: 95// - All candidate matches are assumed to come from the same line in the content. 96// - If this line represents a filename, then lineNumber must be -1. 97// - There should be at least one input candidate, len(ms) > 0. 98func (p *contentProvider) scoreLine(ms []*candidateMatch, language string, lineNumber int, opts *zoekt.SearchOptions) (lineScore, []*zoekt.Symbol) { 99 if opts.UseBM25Scoring { 100 score, symbolInfo := p.scoreLineBM25(ms, lineNumber) 101 ls := lineScore{score: score} 102 if opts.DebugScore { 103 ls.debugScore = fmt.Sprintf("tfScore:%.2f, ", score) 104 } 105 return ls, symbolInfo 106 } 107 108 score := 0.0 109 what := "" 110 addScore := func(w string, s float64) { 111 if s != 0 && opts.DebugScore { 112 what += fmt.Sprintf("%s:%.2f, ", w, s) 113 } 114 score += s 115 } 116 117 filename := p.data(true) 118 var symbolInfo []*zoekt.Symbol 119 120 var bestScore lineScore 121 for i, m := range ms { 122 data := p.data(m.fileName) 123 124 endOffset := m.byteOffset + m.byteMatchSz 125 startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset])) 126 endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset])) 127 128 score = 0 129 what = "" 130 131 if startBoundary && endBoundary { 132 addScore("WordMatch", scoreWordMatch) 133 } else if startBoundary || endBoundary { 134 addScore("PartialWordMatch", scorePartialWordMatch) 135 } 136 137 if m.fileName { 138 sep := bytes.LastIndexByte(data, '/') 139 startMatch := int(m.byteOffset) == sep+1 140 endMatch := endOffset == uint32(len(data)) 141 if startMatch && endMatch { 142 addScore("Base", scoreBase) 143 } else if startMatch || endMatch { 144 addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 145 } else if sep < int(m.byteOffset) { 146 addScore("InnerBase", scorePartialBase) 147 } 148 } else if sec, si, ok := p.findSymbol(m); ok { 149 startMatch := sec.Start == m.byteOffset 150 endMatch := sec.End == endOffset 151 if startMatch && endMatch { 152 addScore("Symbol", scoreSymbol) 153 } else if startMatch || endMatch { 154 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) 155 } else { 156 addScore("OverlapSymbol", scorePartialSymbol) 157 } 158 159 // Score based on symbol data 160 if si != nil { 161 symbolKind := ctags.ParseSymbolKind(si.Kind) 162 sym := sectionSlice(data, sec) 163 164 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind)) 165 166 // This is from a symbol tree, so we need to store the symbol 167 // information. 168 if m.symbol { 169 if symbolInfo == nil { 170 symbolInfo = make([]*zoekt.Symbol, len(ms)) 171 } 172 // findSymbols does not hydrate in Sym. So we need to store it. 173 si.Sym = string(sym) 174 symbolInfo[i] = si 175 } 176 } 177 } 178 179 // scoreWeight != 1 means it affects score 180 if !epsilonEqualsOne(m.scoreWeight) { 181 score = score * m.scoreWeight 182 if opts.DebugScore { 183 what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight) 184 } 185 } 186 187 if score > bestScore.score { 188 bestScore.score = score 189 bestScore.debugScore = what 190 } 191 } 192 193 if opts.DebugScore { 194 bestScore.debugScore = fmt.Sprintf("score:%.2f <- %s", bestScore.score, strings.TrimSuffix(bestScore.debugScore, ", ")) 195 } 196 197 return bestScore, symbolInfo 198} 199 200// scoreLineBM25 computes the score of a line according to BM25, the most common scoring algorithm for text search: 201// https://en.wikipedia.org/wiki/Okapi_BM25. Compared to the standard scoreLine algorithm, this score rewards multiple 202// term matches on a line. 203// Notes: 204// - This BM25 calculation skips inverse document frequency (idf) to keep the implementation simple. 205// - It uses the same calculateTermFrequency method as BM25 file scoring, which boosts filename and symbol matches. 206func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (float64, []*zoekt.Symbol) { 207 // If this is a filename, then don't compute BM25. The score would not be comparable to line scores. 208 if lineNumber < 0 { 209 return 0, nil 210 } 211 212 // Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html) 213 k, b := 1.2, 0.75 214 215 // Calculate the length ratio of this line. As a heuristic, we assume an average line length of 100 characters. 216 // Usually the calculation would be based on terms, but using bytes should work fine, as we're just computing a ratio. 217 nl := p.newlines() 218 lineLength := nl.lineStart(lineNumber+1) - nl.lineStart(lineNumber) 219 L := float64(lineLength) / 100.0 220 221 score := 0.0 222 tfs := p.calculateTermFrequency(ms, false) // ignore file priority, since we're just scoring within a single file 223 for _, f := range tfs { 224 score += tfScore(k, b, L, f) 225 } 226 227 // Check if any index comes from a symbol match tree, and if so hydrate in symbol information 228 var symbolInfo []*zoekt.Symbol 229 for _, m := range ms { 230 if m.symbol { 231 if sec, si, ok := p.findSymbol(m); ok && si != nil { 232 // findSymbols does not hydrate in Sym. So we need to store it. 233 sym := sectionSlice(p.data(false), sec) 234 si.Sym = string(sym) 235 symbolInfo = append(symbolInfo, si) 236 } 237 } 238 } 239 240 score = boostScore(score, ms) 241 return score, symbolInfo 242} 243 244// tfScore is the term frequency score for BM25. 245func tfScore(k float64, b float64, L float64, f int) float64 { 246 return ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f)) 247} 248 249const importantTermBoost = 5 250const lowPriorityFilePenalty = 5 251 252// calculateTermFrequency computes the term frequency for the file match. 253// Notes: 254// - Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles. 255// - Symbol matches also count more than content matches, to reward matches on symbol definitions. 256// - "Low priority" files like tests, generated files, etc. have their term frequency down-weighted, to prioritize matches from 'regular' files 257func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, lowPriority bool) map[string]int { 258 // Treat each candidate match as a term and compute the frequencies. For now, ignore case sensitivity and 259 // ignore whether the index is a word boundary. 260 termFreqs := map[string]int{} 261 for _, m := range cands { 262 term := string(m.substrLowered) 263 if m.fileName || p.matchesSymbol(m) { 264 termFreqs[term] += importantTermBoost 265 } else { 266 termFreqs[term]++ 267 } 268 } 269 270 // If a file is a test, generated, etc., then down-weight its term frequency. The BM25F interpretation 271 // is that this data lives in a separate 'field' that is half the priority of regular content. 272 if lowPriority { 273 for term := range termFreqs { 274 termFreqs[term] = termFreqs[term] / lowPriorityFilePenalty 275 } 276 } 277 278 return termFreqs 279} 280 281// boostScore finds whether any of the matches are part of a boosted match tree, then applies 282// the boost to the final score. This follows precedent in other search engines like Lucene, where 283// boosts multiply an entire query clause's final score. 284// 285// As a heuristic, we use the maximum boost across matches to avoid applying the same boost multiple times. 286func boostScore(score float64, ms []*candidateMatch) float64 { 287 maxScoreWeight := 1.0 288 for _, m := range ms { 289 if m.scoreWeight > maxScoreWeight { 290 maxScoreWeight = m.scoreWeight 291 } 292 } 293 294 if !epsilonEqualsOne(maxScoreWeight) { 295 score = score * maxScoreWeight 296 } 297 return score 298} 299 300// scoreFile computes a score for the file match using various scoring signals, like 301// whether there's an exact match on a symbol, the number of query clauses that matched, etc. 302func (d *indexData) scoreFile(fileMatch *zoekt.FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *zoekt.SearchOptions) { 303 atomMatchCount := 0 304 visitMatchAtoms(mt, known, func(mt matchTree) { 305 atomMatchCount++ 306 }) 307 308 addScore := func(what string, computed float64) { 309 fileMatch.AddScore(what, computed, -1, opts.DebugScore) 310 } 311 312 // atom-count boosts files with matches from more than 1 atom. The 313 // maximum boost is scoreFactorAtomMatch. 314 if atomMatchCount > 0 { 315 fileMatch.AddScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore) 316 } 317 318 maxFileScore := 0.0 319 for i := range fileMatch.LineMatches { 320 if maxFileScore < fileMatch.LineMatches[i].Score { 321 maxFileScore = fileMatch.LineMatches[i].Score 322 } 323 324 // Order by ordering in file. 325 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches)))) 326 } 327 328 for i := range fileMatch.ChunkMatches { 329 if maxFileScore < fileMatch.ChunkMatches[i].Score { 330 maxFileScore = fileMatch.ChunkMatches[i].Score 331 } 332 333 // Order by ordering in file. 334 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches)))) 335 } 336 337 // Maintain ordering of input files. This strictly dominates the in-file ordering of the matches. 338 addScore("fragment", maxFileScore) 339 340 // Truncate score to avoid overlap with the tiebreakers. 341 fileMatch.Score = math.Trunc(fileMatch.Score) 342 343 // Add tiebreakers 344 repoRank := d.repoMetaData[d.repos[doc]].Rank // [0, 65535] 345 docOrderScore := 1.0 - float64(doc)/float64(len(d.boundaries)) // [0, 1] 346 347 if opts.DebugScore { 348 // We log the score components individually for better readability. 349 fileMatch.Debug = fmt.Sprintf("score: %d (repo-rank: %d, file-rank: %.2f) <- %s", int(fileMatch.Score), repoRank, docOrderScore, strings.TrimSuffix(fileMatch.Debug, ", ")) 350 } 351 352 fileMatch.Score = ScoreOffset*fileMatch.Score + scoreRepoRankFactor*float64(repoRank) + scoreFileOrderFactor*docOrderScore 353} 354 355// scoreFileBM25 computes the score according to BM25, the most common scoring algorithm for text search: 356// https://en.wikipedia.org/wiki/Okapi_BM25. Note that we treat the inverse document frequency (idf) as constant. This 357// is supported by our evaluations which showed that for keyword style queries, idf can down-weight the score of some 358// keywords too much, leading to a worse ranking. The intuition is that each keyword is important independently of how 359// frequent it appears in the corpus. 360// 361// Unlike standard file scoring, this scoring strategy ignores the individual LineMatch and ChunkMatch scores, instead 362// calculating a score over all matches in the file. 363func (d *indexData) scoreFileBM25(fileMatch *zoekt.FileMatch, doc uint32, cands []*candidateMatch, cp *contentProvider, opts *zoekt.SearchOptions) { 364 lowPriority := d.isLowPriority(fileMatch, doc) 365 tf := cp.calculateTermFrequency(cands, lowPriority) 366 367 // Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html) 368 k, b := 1.2, 0.75 369 370 averageFileLength := float64(d.boundaries[d.numDocs()]) / float64(d.numDocs()) 371 // This is very unlikely, but explicitly guard against division by zero. 372 if averageFileLength == 0 { 373 averageFileLength++ 374 } 375 376 // Compute the file length ratio. Usually the calculation would be based on terms, but using 377 // bytes should work fine, as we're just computing a ratio. 378 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 379 380 L := fileLength / averageFileLength 381 382 bm25Score := 0.0 383 sumTF := 0 // Just for debugging 384 for _, f := range tf { 385 sumTF += f 386 bm25Score += tfScore(k, b, L, f) 387 } 388 389 score := boostScore(bm25Score, cands) 390 boosted := score != bm25Score 391 fileMatch.Score = score 392 393 if opts.DebugScore { 394 // To make the debug output easier to read, we split the score into the query dependent score and the tiebreaker 395 fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f (low-priority: %t) <- sum-termFrequencies: %d, length-ratio: %.2f", score, lowPriority, sumTF, L) 396 if boosted { 397 fileMatch.Debug += fmt.Sprintf(" (boosted)") 398 } 399 } 400} 401 402func (d *indexData) isLowPriority(fileMatch *zoekt.FileMatch, doc uint32) bool { 403 category := d.getCategory(doc) 404 if category != FileCategoryMissing { 405 return category.lowPriority() 406 } else { 407 // The category may be missing from older index versions. In this case, 408 // perform a cheap, best-effort check against the filename. 409 path := fileMatch.FileName 410 return enry.IsTest(path) || enry.IsVendor(path) 411 } 412}