fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bytes" 19 "fmt" 20 "math" 21 "strings" 22 23 "github.com/sourcegraph/zoekt" 24 "github.com/sourcegraph/zoekt/internal/ctags" 25) 26 27const ( 28 ScoreOffset = 10_000_000 29) 30 31type chunkScore struct { 32 score float64 33 debugScore string 34 bestLine int 35} 36 37// scoreChunk calculates the score for each line in the chunk based on its candidate matches, and returns the score of 38// the best-scoring line, along with its line number. 39// Invariant: there should be at least one input candidate, len(ms) > 0. 40func (p *contentProvider) scoreChunk(ms []*candidateMatch, language string, opts *zoekt.SearchOptions) (chunkScore, []*zoekt.Symbol) { 41 nl := p.newlines() 42 43 var bestScore lineScore 44 bestLine := 0 45 var symbolInfo []*zoekt.Symbol 46 47 start := 0 48 currentLine := -1 49 for i, m := range ms { 50 lineNumber := -1 51 if !m.fileName { 52 lineNumber = nl.atOffset(m.byteOffset) 53 } 54 55 // If this match represents a new line, then score the previous line and update 'start'. 56 if i != 0 && lineNumber != currentLine { 57 score, si := p.scoreLine(ms[start:i], language, currentLine, opts) 58 symbolInfo = append(symbolInfo, si...) 59 if score.score > bestScore.score { 60 bestScore = score 61 bestLine = currentLine 62 } 63 start = i 64 } 65 currentLine = lineNumber 66 } 67 68 // Make sure to score the last line 69 line, si := p.scoreLine(ms[start:], language, currentLine, opts) 70 symbolInfo = append(symbolInfo, si...) 71 if line.score > bestScore.score { 72 bestScore = line 73 bestLine = currentLine 74 } 75 76 cs := chunkScore{ 77 score: bestScore.score, 78 bestLine: bestLine, 79 } 80 if opts.DebugScore { 81 cs.debugScore = fmt.Sprintf("%s, (line: %d)", bestScore.debugScore, bestLine) 82 } 83 return cs, symbolInfo 84} 85 86type lineScore struct { 87 score float64 88 debugScore string 89} 90 91// scoreLine calculates a score for the line based on its candidate matches. 92// Invariants: 93// - All candidate matches are assumed to come from the same line in the content. 94// - If this line represents a filename, then lineNumber must be -1. 95// - There should be at least one input candidate, len(ms) > 0. 96func (p *contentProvider) scoreLine(ms []*candidateMatch, language string, lineNumber int, opts *zoekt.SearchOptions) (lineScore, []*zoekt.Symbol) { 97 if opts.UseBM25Scoring { 98 score, symbolInfo := p.scoreLineBM25(ms, lineNumber) 99 ls := lineScore{score: score} 100 if opts.DebugScore { 101 ls.debugScore = fmt.Sprintf("tfScore:%.2f, ", score) 102 } 103 return ls, symbolInfo 104 } 105 106 score := 0.0 107 what := "" 108 addScore := func(w string, s float64) { 109 if s != 0 && opts.DebugScore { 110 what += fmt.Sprintf("%s:%.2f, ", w, s) 111 } 112 score += s 113 } 114 115 filename := p.data(true) 116 var symbolInfo []*zoekt.Symbol 117 118 var bestScore lineScore 119 for i, m := range ms { 120 data := p.data(m.fileName) 121 122 endOffset := m.byteOffset + m.byteMatchSz 123 startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset])) 124 endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset])) 125 126 score = 0 127 what = "" 128 129 if startBoundary && endBoundary { 130 addScore("WordMatch", scoreWordMatch) 131 } else if startBoundary || endBoundary { 132 addScore("PartialWordMatch", scorePartialWordMatch) 133 } 134 135 if m.fileName { 136 sep := bytes.LastIndexByte(data, '/') 137 startMatch := int(m.byteOffset) == sep+1 138 endMatch := endOffset == uint32(len(data)) 139 if startMatch && endMatch { 140 addScore("Base", scoreBase) 141 } else if startMatch || endMatch { 142 addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 143 } else if sep < int(m.byteOffset) { 144 addScore("InnerBase", scorePartialBase) 145 } 146 } else if sec, si, ok := p.findSymbol(m); ok { 147 startMatch := sec.Start == m.byteOffset 148 endMatch := sec.End == endOffset 149 if startMatch && endMatch { 150 addScore("Symbol", scoreSymbol) 151 } else if startMatch || endMatch { 152 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) 153 } else { 154 addScore("OverlapSymbol", scorePartialSymbol) 155 } 156 157 // Score based on symbol data 158 if si != nil { 159 symbolKind := ctags.ParseSymbolKind(si.Kind) 160 sym := sectionSlice(data, sec) 161 162 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind)) 163 164 // This is from a symbol tree, so we need to store the symbol 165 // information. 166 if m.symbol { 167 if symbolInfo == nil { 168 symbolInfo = make([]*zoekt.Symbol, len(ms)) 169 } 170 // findSymbols does not hydrate in Sym. So we need to store it. 171 si.Sym = string(sym) 172 symbolInfo[i] = si 173 } 174 } 175 } 176 177 // scoreWeight != 1 means it affects score 178 if !epsilonEqualsOne(m.scoreWeight) { 179 score = score * m.scoreWeight 180 if opts.DebugScore { 181 what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight) 182 } 183 } 184 185 if score > bestScore.score { 186 bestScore.score = score 187 bestScore.debugScore = what 188 } 189 } 190 191 if opts.DebugScore { 192 bestScore.debugScore = fmt.Sprintf("score:%.2f <- %s", bestScore.score, strings.TrimSuffix(bestScore.debugScore, ", ")) 193 } 194 195 return bestScore, symbolInfo 196} 197 198// scoreLineBM25 computes the score of a line according to BM25, the most common scoring algorithm for text search: 199// https://en.wikipedia.org/wiki/Okapi_BM25. Compared to the standard scoreLine algorithm, this score rewards multiple 200// term matches on a line. 201// Notes: 202// - This BM25 calculation skips inverse document frequency (idf) to keep the implementation simple. 203// - It uses the same calculateTermFrequency method as BM25 file scoring, which boosts filename and symbol matches. 204func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (float64, []*zoekt.Symbol) { 205 // If this is a filename, then don't compute BM25. The score would not be comparable to line scores. 206 if lineNumber < 0 { 207 return 0, nil 208 } 209 210 // Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html) 211 k, b := 1.2, 0.75 212 213 // Calculate the length ratio of this line. As a heuristic, we assume an average line length of 100 characters. 214 // Usually the calculation would be based on terms, but using bytes should work fine, as we're just computing a ratio. 215 nl := p.newlines() 216 lineLength := nl.lineStart(lineNumber+1) - nl.lineStart(lineNumber) 217 L := float64(lineLength) / 100.0 218 219 score := 0.0 220 tfs := p.calculateTermFrequency(ms, termDocumentFrequency{}) 221 for _, f := range tfs { 222 score += ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f)) 223 } 224 225 // Check if any index comes from a symbol match tree, and if so hydrate in symbol information 226 var symbolInfo []*zoekt.Symbol 227 for _, m := range ms { 228 if m.symbol { 229 if sec, si, ok := p.findSymbol(m); ok && si != nil { 230 // findSymbols does not hydrate in Sym. So we need to store it. 231 sym := sectionSlice(p.data(false), sec) 232 si.Sym = string(sym) 233 symbolInfo = append(symbolInfo, si) 234 } 235 } 236 } 237 return score, symbolInfo 238} 239 240// termDocumentFrequency is a map "term" -> "number of documents that contain the term" 241type termDocumentFrequency map[string]int 242 243// calculateTermFrequency computes the term frequency for the file match. 244// Notes: 245// - Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles. 246// - Symbol matches also count more than content matches, to reward matches on symbol definitions. 247func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int { 248 // Treat each candidate match as a term and compute the frequencies. For now, ignore case sensitivity and 249 // ignore whether the index is a word boundary. 250 termFreqs := map[string]int{} 251 for _, m := range cands { 252 term := string(m.substrLowered) 253 if m.fileName || p.matchesSymbol(m) { 254 termFreqs[term] += 5 255 } else { 256 termFreqs[term]++ 257 } 258 } 259 260 for term := range termFreqs { 261 df[term] += 1 262 } 263 return termFreqs 264} 265 266// scoreFile computes a score for the file match using various scoring signals, like 267// whether there's an exact match on a symbol, the number of query clauses that matched, etc. 268func (d *indexData) scoreFile(fileMatch *zoekt.FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *zoekt.SearchOptions) { 269 atomMatchCount := 0 270 visitMatchAtoms(mt, known, func(mt matchTree) { 271 atomMatchCount++ 272 }) 273 274 addScore := func(what string, computed float64) { 275 fileMatch.AddScore(what, computed, -1, opts.DebugScore) 276 } 277 278 // atom-count boosts files with matches from more than 1 atom. The 279 // maximum boost is scoreFactorAtomMatch. 280 if atomMatchCount > 0 { 281 fileMatch.AddScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore) 282 } 283 284 maxFileScore := 0.0 285 for i := range fileMatch.LineMatches { 286 if maxFileScore < fileMatch.LineMatches[i].Score { 287 maxFileScore = fileMatch.LineMatches[i].Score 288 } 289 290 // Order by ordering in file. 291 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches)))) 292 } 293 294 for i := range fileMatch.ChunkMatches { 295 if maxFileScore < fileMatch.ChunkMatches[i].Score { 296 maxFileScore = fileMatch.ChunkMatches[i].Score 297 } 298 299 // Order by ordering in file. 300 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches)))) 301 } 302 303 // Maintain ordering of input files. This 304 // strictly dominates the in-file ordering of 305 // the matches. 306 addScore("fragment", maxFileScore) 307 308 // Add tiebreakers 309 // 310 // ScoreOffset shifts the score 7 digits to the left. 311 fileMatch.Score = math.Trunc(fileMatch.Score) * ScoreOffset 312 313 md := d.repoMetaData[d.repos[doc]] 314 315 // md.Rank lies in the range [0, 65535]. Hence, we have to allocate 5 digits for 316 // the rank. The scoreRepoRankFactor shifts the rank score 2 digits to the left, 317 // reserving digits 3-7 for the repo rank. 318 addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)) 319 320 // digits 1-2 and the decimals are reserved for the doc order. Doc order 321 // (without the scaling factor) lies in the range [0, 1]. The upper bound is 322 // achieved for matches in the first document of a shard. 323 addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries)))) 324 325 if opts.DebugScore { 326 // To make the debug output easier to read, we split the score into the query 327 // dependent score and the tiebreaker 328 score := math.Trunc(fileMatch.Score / ScoreOffset) 329 tiebreaker := fileMatch.Score - score*ScoreOffset 330 fileMatch.Debug = fmt.Sprintf("score: %d (%.2f) <- %s", int(score), tiebreaker, strings.TrimSuffix(fileMatch.Debug, ", ")) 331 } 332} 333 334// termFrequency stores the term frequencies for doc. 335type termFrequency struct { 336 doc uint32 337 tf map[string]int 338} 339 340// scoreFilesUsingBM25 computes the score according to BM25, the most common scoring algorithm for text search: 341// https://en.wikipedia.org/wiki/Okapi_BM25. 342// 343// Unlike standard file scoring, this scoring strategy ignores all other signals including document ranks. This keeps 344// things simple for now, since BM25 is not normalized and can be tricky to combine with other scoring signals. It also 345// ignores the individual LineMatch and ChunkMatch scores, instead calculating a score over all matches in the file. 346func (d *indexData) scoreFilesUsingBM25(fileMatches []zoekt.FileMatch, tfs []termFrequency, df termDocumentFrequency, opts *zoekt.SearchOptions) { 347 // Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html) 348 k, b := 1.2, 0.75 349 350 averageFileLength := float64(d.boundaries[d.numDocs()]) / float64(d.numDocs()) 351 // This is very unlikely, but explicitly guard against division by zero. 352 if averageFileLength == 0 { 353 averageFileLength++ 354 } 355 356 for i := range tfs { 357 score := 0.0 358 359 // Compute the file length ratio. Usually the calculation would be based on terms, but using 360 // bytes should work fine, as we're just computing a ratio. 361 doc := tfs[i].doc 362 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 363 364 L := fileLength / averageFileLength 365 366 sumTF := 0 // Just for debugging 367 for term, f := range tfs[i].tf { 368 sumTF += f 369 tfScore := ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f)) 370 score += idf(df[term], int(d.numDocs())) * tfScore 371 } 372 373 fileMatches[i].Score = score 374 375 if opts.DebugScore { 376 fileMatches[i].Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L) 377 } 378 } 379} 380 381// idf computes the inverse document frequency for a term. nq is the number of 382// documents that contain the term and documentCount is the total number of 383// documents in the corpus. 384func idf(nq, documentCount int) float64 { 385 return math.Log(1.0 + ((float64(documentCount) - float64(nq) + 0.5) / (float64(nq) + 0.5))) 386}