fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bytes" 19 "fmt" 20 "math" 21 "strings" 22 23 "github.com/sourcegraph/zoekt" 24 "github.com/sourcegraph/zoekt/internal/ctags" 25) 26 27const ( 28 ScoreOffset = 10_000_000 29 ScoreOffsetBM25 = 1_000_000_000 30) 31 32type chunkScore struct { 33 score float64 34 debugScore string 35 bestLine int 36} 37 38// scoreChunk calculates the score for each line in the chunk based on its candidate matches, and returns the score of 39// the best-scoring line, along with its line number. 40// Invariant: there should be at least one input candidate, len(ms) > 0. 41func (p *contentProvider) scoreChunk(ms []*candidateMatch, language string, opts *zoekt.SearchOptions) (chunkScore, []*zoekt.Symbol) { 42 nl := p.newlines() 43 44 var bestScore lineScore 45 bestLine := 0 46 var symbolInfo []*zoekt.Symbol 47 48 start := 0 49 currentLine := -1 50 for i, m := range ms { 51 lineNumber := -1 52 if !m.fileName { 53 lineNumber = nl.atOffset(m.byteOffset) 54 } 55 56 // If this match represents a new line, then score the previous line and update 'start'. 57 if i != 0 && lineNumber != currentLine { 58 score, si := p.scoreLine(ms[start:i], language, currentLine, opts) 59 symbolInfo = append(symbolInfo, si...) 60 if score.score > bestScore.score { 61 bestScore = score 62 bestLine = currentLine 63 } 64 start = i 65 } 66 currentLine = lineNumber 67 } 68 69 // Make sure to score the last line 70 line, si := p.scoreLine(ms[start:], language, currentLine, opts) 71 symbolInfo = append(symbolInfo, si...) 72 if line.score > bestScore.score { 73 bestScore = line 74 bestLine = currentLine 75 } 76 77 cs := chunkScore{ 78 score: bestScore.score, 79 bestLine: bestLine, 80 } 81 if opts.DebugScore { 82 cs.debugScore = fmt.Sprintf("%s, (line: %d)", bestScore.debugScore, bestLine) 83 } 84 return cs, symbolInfo 85} 86 87type lineScore struct { 88 score float64 89 debugScore string 90} 91 92// scoreLine calculates a score for the line based on its candidate matches. 93// Invariants: 94// - All candidate matches are assumed to come from the same line in the content. 95// - If this line represents a filename, then lineNumber must be -1. 96// - There should be at least one input candidate, len(ms) > 0. 97func (p *contentProvider) scoreLine(ms []*candidateMatch, language string, lineNumber int, opts *zoekt.SearchOptions) (lineScore, []*zoekt.Symbol) { 98 if opts.UseBM25Scoring { 99 score, symbolInfo := p.scoreLineBM25(ms, lineNumber) 100 ls := lineScore{score: score} 101 if opts.DebugScore { 102 ls.debugScore = fmt.Sprintf("tfScore:%.2f, ", score) 103 } 104 return ls, symbolInfo 105 } 106 107 score := 0.0 108 what := "" 109 addScore := func(w string, s float64) { 110 if s != 0 && opts.DebugScore { 111 what += fmt.Sprintf("%s:%.2f, ", w, s) 112 } 113 score += s 114 } 115 116 filename := p.data(true) 117 var symbolInfo []*zoekt.Symbol 118 119 var bestScore lineScore 120 for i, m := range ms { 121 data := p.data(m.fileName) 122 123 endOffset := m.byteOffset + m.byteMatchSz 124 startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset])) 125 endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset])) 126 127 score = 0 128 what = "" 129 130 if startBoundary && endBoundary { 131 addScore("WordMatch", scoreWordMatch) 132 } else if startBoundary || endBoundary { 133 addScore("PartialWordMatch", scorePartialWordMatch) 134 } 135 136 if m.fileName { 137 sep := bytes.LastIndexByte(data, '/') 138 startMatch := int(m.byteOffset) == sep+1 139 endMatch := endOffset == uint32(len(data)) 140 if startMatch && endMatch { 141 addScore("Base", scoreBase) 142 } else if startMatch || endMatch { 143 addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 144 } else if sep < int(m.byteOffset) { 145 addScore("InnerBase", scorePartialBase) 146 } 147 } else if sec, si, ok := p.findSymbol(m); ok { 148 startMatch := sec.Start == m.byteOffset 149 endMatch := sec.End == endOffset 150 if startMatch && endMatch { 151 addScore("Symbol", scoreSymbol) 152 } else if startMatch || endMatch { 153 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) 154 } else { 155 addScore("OverlapSymbol", scorePartialSymbol) 156 } 157 158 // Score based on symbol data 159 if si != nil { 160 symbolKind := ctags.ParseSymbolKind(si.Kind) 161 sym := sectionSlice(data, sec) 162 163 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind)) 164 165 // This is from a symbol tree, so we need to store the symbol 166 // information. 167 if m.symbol { 168 if symbolInfo == nil { 169 symbolInfo = make([]*zoekt.Symbol, len(ms)) 170 } 171 // findSymbols does not hydrate in Sym. So we need to store it. 172 si.Sym = string(sym) 173 symbolInfo[i] = si 174 } 175 } 176 } 177 178 // scoreWeight != 1 means it affects score 179 if !epsilonEqualsOne(m.scoreWeight) { 180 score = score * m.scoreWeight 181 if opts.DebugScore { 182 what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight) 183 } 184 } 185 186 if score > bestScore.score { 187 bestScore.score = score 188 bestScore.debugScore = what 189 } 190 } 191 192 if opts.DebugScore { 193 bestScore.debugScore = fmt.Sprintf("score:%.2f <- %s", bestScore.score, strings.TrimSuffix(bestScore.debugScore, ", ")) 194 } 195 196 return bestScore, symbolInfo 197} 198 199// scoreLineBM25 computes the score of a line according to BM25, the most common scoring algorithm for text search: 200// https://en.wikipedia.org/wiki/Okapi_BM25. Compared to the standard scoreLine algorithm, this score rewards multiple 201// term matches on a line. 202// Notes: 203// - This BM25 calculation skips inverse document frequency (idf) to keep the implementation simple. 204// - It uses the same calculateTermFrequency method as BM25 file scoring, which boosts filename and symbol matches. 205func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (float64, []*zoekt.Symbol) { 206 // If this is a filename, then don't compute BM25. The score would not be comparable to line scores. 207 if lineNumber < 0 { 208 return 0, nil 209 } 210 211 // Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html) 212 k, b := 1.2, 0.75 213 214 // Calculate the length ratio of this line. As a heuristic, we assume an average line length of 100 characters. 215 // Usually the calculation would be based on terms, but using bytes should work fine, as we're just computing a ratio. 216 nl := p.newlines() 217 lineLength := nl.lineStart(lineNumber+1) - nl.lineStart(lineNumber) 218 L := float64(lineLength) / 100.0 219 220 score := 0.0 221 tfs := p.calculateTermFrequency(ms) 222 for _, f := range tfs { 223 score += tfScore(k, b, L, f) 224 } 225 226 // Check if any index comes from a symbol match tree, and if so hydrate in symbol information 227 var symbolInfo []*zoekt.Symbol 228 for _, m := range ms { 229 if m.symbol { 230 if sec, si, ok := p.findSymbol(m); ok && si != nil { 231 // findSymbols does not hydrate in Sym. So we need to store it. 232 sym := sectionSlice(p.data(false), sec) 233 si.Sym = string(sym) 234 symbolInfo = append(symbolInfo, si) 235 } 236 } 237 } 238 return score, symbolInfo 239} 240 241// tfScore is the term frequency score for BM25. 242func tfScore(k float64, b float64, L float64, f int) float64 { 243 return ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f)) 244} 245 246// calculateTermFrequency computes the term frequency for the file match. 247// Notes: 248// - Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles. 249// - Symbol matches also count more than content matches, to reward matches on symbol definitions. 250func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch) map[string]int { 251 // Treat each candidate match as a term and compute the frequencies. For now, ignore case sensitivity and 252 // ignore whether the index is a word boundary. 253 termFreqs := map[string]int{} 254 for _, m := range cands { 255 term := string(m.substrLowered) 256 if m.fileName || p.matchesSymbol(m) { 257 termFreqs[term] += 5 258 } else { 259 termFreqs[term]++ 260 } 261 } 262 263 return termFreqs 264} 265 266// scoreFile computes a score for the file match using various scoring signals, like 267// whether there's an exact match on a symbol, the number of query clauses that matched, etc. 268func (d *indexData) scoreFile(fileMatch *zoekt.FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *zoekt.SearchOptions) { 269 atomMatchCount := 0 270 visitMatchAtoms(mt, known, func(mt matchTree) { 271 atomMatchCount++ 272 }) 273 274 addScore := func(what string, computed float64) { 275 fileMatch.AddScore(what, computed, -1, opts.DebugScore) 276 } 277 278 // atom-count boosts files with matches from more than 1 atom. The 279 // maximum boost is scoreFactorAtomMatch. 280 if atomMatchCount > 0 { 281 fileMatch.AddScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore) 282 } 283 284 maxFileScore := 0.0 285 for i := range fileMatch.LineMatches { 286 if maxFileScore < fileMatch.LineMatches[i].Score { 287 maxFileScore = fileMatch.LineMatches[i].Score 288 } 289 290 // Order by ordering in file. 291 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches)))) 292 } 293 294 for i := range fileMatch.ChunkMatches { 295 if maxFileScore < fileMatch.ChunkMatches[i].Score { 296 maxFileScore = fileMatch.ChunkMatches[i].Score 297 } 298 299 // Order by ordering in file. 300 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches)))) 301 } 302 303 // Maintain ordering of input files. This strictly dominates the in-file ordering of the matches. 304 addScore("fragment", maxFileScore) 305 306 // Truncate score to avoid overlap with the tiebreakers. 307 fileMatch.Score = math.Trunc(fileMatch.Score) 308 309 // Add tiebreakers 310 repoRank := d.repoMetaData[d.repos[doc]].Rank // [0, 65535] 311 docOrderScore := 1.0 - float64(doc)/float64(len(d.boundaries)) // [0, 1] 312 313 if opts.DebugScore { 314 // We log the score components individually for better readability. 315 fileMatch.Debug = fmt.Sprintf("score: %d (repo-rank: %d, file-rank: %.2f) <- %s", int(fileMatch.Score), repoRank, docOrderScore, strings.TrimSuffix(fileMatch.Debug, ", ")) 316 } 317 318 fileMatch.Score = ScoreOffset*fileMatch.Score + scoreRepoRankFactor*float64(repoRank) + scoreFileOrderFactor*docOrderScore 319} 320 321// scoreFilesUsingBM25 computes the score according to BM25, the most common scoring algorithm for text search: 322// https://en.wikipedia.org/wiki/Okapi_BM25. Note that we treat the inverse document frequency (idf) as constant. This 323// is supported by our evaluations which showed that for keyword style queries, idf can down-weight the score of some 324// keywords too much, leading to a worse ranking. The intuition is that each keyword is important independently of how 325// frequent it appears in the corpus. 326// 327// Unlike standard file scoring, this scoring strategy ignores all other signals including document ranks. This keeps 328// things simple for now, since BM25 is not normalized and can be tricky to combine with other scoring signals. It also 329// ignores the individual LineMatch and ChunkMatch scores, instead calculating a score over all matches in the file. 330func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32, tf map[string]int, opts *zoekt.SearchOptions) { 331 // Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html) 332 k, b := 1.2, 0.75 333 334 averageFileLength := float64(d.boundaries[d.numDocs()]) / float64(d.numDocs()) 335 // This is very unlikely, but explicitly guard against division by zero. 336 if averageFileLength == 0 { 337 averageFileLength++ 338 } 339 340 // Compute the file length ratio. Usually the calculation would be based on terms, but using 341 // bytes should work fine, as we're just computing a ratio. 342 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 343 344 L := fileLength / averageFileLength 345 346 score := 0.0 347 sumTF := 0 // Just for debugging 348 for _, f := range tf { 349 sumTF += f 350 score += tfScore(k, b, L, f) 351 } 352 // 2 digits of precision 353 score = math.Trunc(score*100) / 100 354 355 md := d.repoMetaData[d.repos[doc]] 356 fileOrderScore := 1.0 - float64(doc)/float64(len(d.boundaries)) 357 358 // Offset score by 9 digits and add the tiebreaker. 359 // 360 // Example: For a BM25 score of 1.23, a repo rank of 456789 and a file order score of 0.12, we have a final score of 361 // 12345678901.2 362 // ^^^ 363 // bm25 364 // ^^^^^^ 365 // repo rank 366 // ^^^^ 367 // doc order 368 fileMatch.Score = score*ScoreOffsetBM25 + scoreRepoRankFactor*float64(md.Rank) + scoreFileOrderFactor*fileOrderScore 369 370 if opts.DebugScore { 371 // To make the debug output easier to read, we split the score into the query dependent score and the tiebreaker 372 fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f (repo-rank: %d, file-rank: %.2f) <- sum-termFrequencies: %d, length-ratio: %.2f", score, md.Rank, fileOrderScore, sumTF, L) 373 } 374}