fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "bytes"
19 "fmt"
20 "math"
21 "strings"
22
23 "github.com/sourcegraph/zoekt"
24 "github.com/sourcegraph/zoekt/internal/ctags"
25)
26
27const (
28 ScoreOffset = 10_000_000
29 ScoreOffsetBM25 = 1_000_000_000
30)
31
32type chunkScore struct {
33 score float64
34 debugScore string
35 bestLine int
36}
37
38// scoreChunk calculates the score for each line in the chunk based on its candidate matches, and returns the score of
39// the best-scoring line, along with its line number.
40// Invariant: there should be at least one input candidate, len(ms) > 0.
41func (p *contentProvider) scoreChunk(ms []*candidateMatch, language string, opts *zoekt.SearchOptions) (chunkScore, []*zoekt.Symbol) {
42 nl := p.newlines()
43
44 var bestScore lineScore
45 bestLine := 0
46 var symbolInfo []*zoekt.Symbol
47
48 start := 0
49 currentLine := -1
50 for i, m := range ms {
51 lineNumber := -1
52 if !m.fileName {
53 lineNumber = nl.atOffset(m.byteOffset)
54 }
55
56 // If this match represents a new line, then score the previous line and update 'start'.
57 if i != 0 && lineNumber != currentLine {
58 score, si := p.scoreLine(ms[start:i], language, currentLine, opts)
59 symbolInfo = append(symbolInfo, si...)
60 if score.score > bestScore.score {
61 bestScore = score
62 bestLine = currentLine
63 }
64 start = i
65 }
66 currentLine = lineNumber
67 }
68
69 // Make sure to score the last line
70 line, si := p.scoreLine(ms[start:], language, currentLine, opts)
71 symbolInfo = append(symbolInfo, si...)
72 if line.score > bestScore.score {
73 bestScore = line
74 bestLine = currentLine
75 }
76
77 cs := chunkScore{
78 score: bestScore.score,
79 bestLine: bestLine,
80 }
81 if opts.DebugScore {
82 cs.debugScore = fmt.Sprintf("%s, (line: %d)", bestScore.debugScore, bestLine)
83 }
84 return cs, symbolInfo
85}
86
87type lineScore struct {
88 score float64
89 debugScore string
90}
91
92// scoreLine calculates a score for the line based on its candidate matches.
93// Invariants:
94// - All candidate matches are assumed to come from the same line in the content.
95// - If this line represents a filename, then lineNumber must be -1.
96// - There should be at least one input candidate, len(ms) > 0.
97func (p *contentProvider) scoreLine(ms []*candidateMatch, language string, lineNumber int, opts *zoekt.SearchOptions) (lineScore, []*zoekt.Symbol) {
98 if opts.UseBM25Scoring {
99 score, symbolInfo := p.scoreLineBM25(ms, lineNumber)
100 ls := lineScore{score: score}
101 if opts.DebugScore {
102 ls.debugScore = fmt.Sprintf("tfScore:%.2f, ", score)
103 }
104 return ls, symbolInfo
105 }
106
107 score := 0.0
108 what := ""
109 addScore := func(w string, s float64) {
110 if s != 0 && opts.DebugScore {
111 what += fmt.Sprintf("%s:%.2f, ", w, s)
112 }
113 score += s
114 }
115
116 filename := p.data(true)
117 var symbolInfo []*zoekt.Symbol
118
119 var bestScore lineScore
120 for i, m := range ms {
121 data := p.data(m.fileName)
122
123 endOffset := m.byteOffset + m.byteMatchSz
124 startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset]))
125 endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset]))
126
127 score = 0
128 what = ""
129
130 if startBoundary && endBoundary {
131 addScore("WordMatch", scoreWordMatch)
132 } else if startBoundary || endBoundary {
133 addScore("PartialWordMatch", scorePartialWordMatch)
134 }
135
136 if m.fileName {
137 sep := bytes.LastIndexByte(data, '/')
138 startMatch := int(m.byteOffset) == sep+1
139 endMatch := endOffset == uint32(len(data))
140 if startMatch && endMatch {
141 addScore("Base", scoreBase)
142 } else if startMatch || endMatch {
143 addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
144 } else if sep < int(m.byteOffset) {
145 addScore("InnerBase", scorePartialBase)
146 }
147 } else if sec, si, ok := p.findSymbol(m); ok {
148 startMatch := sec.Start == m.byteOffset
149 endMatch := sec.End == endOffset
150 if startMatch && endMatch {
151 addScore("Symbol", scoreSymbol)
152 } else if startMatch || endMatch {
153 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
154 } else {
155 addScore("OverlapSymbol", scorePartialSymbol)
156 }
157
158 // Score based on symbol data
159 if si != nil {
160 symbolKind := ctags.ParseSymbolKind(si.Kind)
161 sym := sectionSlice(data, sec)
162
163 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
164
165 // This is from a symbol tree, so we need to store the symbol
166 // information.
167 if m.symbol {
168 if symbolInfo == nil {
169 symbolInfo = make([]*zoekt.Symbol, len(ms))
170 }
171 // findSymbols does not hydrate in Sym. So we need to store it.
172 si.Sym = string(sym)
173 symbolInfo[i] = si
174 }
175 }
176 }
177
178 // scoreWeight != 1 means it affects score
179 if !epsilonEqualsOne(m.scoreWeight) {
180 score = score * m.scoreWeight
181 if opts.DebugScore {
182 what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)
183 }
184 }
185
186 if score > bestScore.score {
187 bestScore.score = score
188 bestScore.debugScore = what
189 }
190 }
191
192 if opts.DebugScore {
193 bestScore.debugScore = fmt.Sprintf("score:%.2f <- %s", bestScore.score, strings.TrimSuffix(bestScore.debugScore, ", "))
194 }
195
196 return bestScore, symbolInfo
197}
198
199// scoreLineBM25 computes the score of a line according to BM25, the most common scoring algorithm for text search:
200// https://en.wikipedia.org/wiki/Okapi_BM25. Compared to the standard scoreLine algorithm, this score rewards multiple
201// term matches on a line.
202// Notes:
203// - This BM25 calculation skips inverse document frequency (idf) to keep the implementation simple.
204// - It uses the same calculateTermFrequency method as BM25 file scoring, which boosts filename and symbol matches.
205func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (float64, []*zoekt.Symbol) {
206 // If this is a filename, then don't compute BM25. The score would not be comparable to line scores.
207 if lineNumber < 0 {
208 return 0, nil
209 }
210
211 // Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html)
212 k, b := 1.2, 0.75
213
214 // Calculate the length ratio of this line. As a heuristic, we assume an average line length of 100 characters.
215 // Usually the calculation would be based on terms, but using bytes should work fine, as we're just computing a ratio.
216 nl := p.newlines()
217 lineLength := nl.lineStart(lineNumber+1) - nl.lineStart(lineNumber)
218 L := float64(lineLength) / 100.0
219
220 score := 0.0
221 tfs := p.calculateTermFrequency(ms)
222 for _, f := range tfs {
223 score += tfScore(k, b, L, f)
224 }
225
226 // Check if any index comes from a symbol match tree, and if so hydrate in symbol information
227 var symbolInfo []*zoekt.Symbol
228 for _, m := range ms {
229 if m.symbol {
230 if sec, si, ok := p.findSymbol(m); ok && si != nil {
231 // findSymbols does not hydrate in Sym. So we need to store it.
232 sym := sectionSlice(p.data(false), sec)
233 si.Sym = string(sym)
234 symbolInfo = append(symbolInfo, si)
235 }
236 }
237 }
238 return score, symbolInfo
239}
240
241// tfScore is the term frequency score for BM25.
242func tfScore(k float64, b float64, L float64, f int) float64 {
243 return ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f))
244}
245
246// calculateTermFrequency computes the term frequency for the file match.
247// Notes:
248// - Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles.
249// - Symbol matches also count more than content matches, to reward matches on symbol definitions.
250func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch) map[string]int {
251 // Treat each candidate match as a term and compute the frequencies. For now, ignore case sensitivity and
252 // ignore whether the index is a word boundary.
253 termFreqs := map[string]int{}
254 for _, m := range cands {
255 term := string(m.substrLowered)
256 if m.fileName || p.matchesSymbol(m) {
257 termFreqs[term] += 5
258 } else {
259 termFreqs[term]++
260 }
261 }
262
263 return termFreqs
264}
265
266// scoreFile computes a score for the file match using various scoring signals, like
267// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
268func (d *indexData) scoreFile(fileMatch *zoekt.FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *zoekt.SearchOptions) {
269 atomMatchCount := 0
270 visitMatchAtoms(mt, known, func(mt matchTree) {
271 atomMatchCount++
272 })
273
274 addScore := func(what string, computed float64) {
275 fileMatch.AddScore(what, computed, -1, opts.DebugScore)
276 }
277
278 // atom-count boosts files with matches from more than 1 atom. The
279 // maximum boost is scoreFactorAtomMatch.
280 if atomMatchCount > 0 {
281 fileMatch.AddScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore)
282 }
283
284 maxFileScore := 0.0
285 for i := range fileMatch.LineMatches {
286 if maxFileScore < fileMatch.LineMatches[i].Score {
287 maxFileScore = fileMatch.LineMatches[i].Score
288 }
289
290 // Order by ordering in file.
291 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
292 }
293
294 for i := range fileMatch.ChunkMatches {
295 if maxFileScore < fileMatch.ChunkMatches[i].Score {
296 maxFileScore = fileMatch.ChunkMatches[i].Score
297 }
298
299 // Order by ordering in file.
300 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
301 }
302
303 // Maintain ordering of input files. This strictly dominates the in-file ordering of the matches.
304 addScore("fragment", maxFileScore)
305
306 // Truncate score to avoid overlap with the tiebreakers.
307 fileMatch.Score = math.Trunc(fileMatch.Score)
308
309 // Add tiebreakers
310 repoRank := d.repoMetaData[d.repos[doc]].Rank // [0, 65535]
311 docOrderScore := 1.0 - float64(doc)/float64(len(d.boundaries)) // [0, 1]
312
313 if opts.DebugScore {
314 // We log the score components individually for better readability.
315 fileMatch.Debug = fmt.Sprintf("score: %d (repo-rank: %d, file-rank: %.2f) <- %s", int(fileMatch.Score), repoRank, docOrderScore, strings.TrimSuffix(fileMatch.Debug, ", "))
316 }
317
318 fileMatch.Score = ScoreOffset*fileMatch.Score + scoreRepoRankFactor*float64(repoRank) + scoreFileOrderFactor*docOrderScore
319}
320
321// scoreFilesUsingBM25 computes the score according to BM25, the most common scoring algorithm for text search:
322// https://en.wikipedia.org/wiki/Okapi_BM25. Note that we treat the inverse document frequency (idf) as constant. This
323// is supported by our evaluations which showed that for keyword style queries, idf can down-weight the score of some
324// keywords too much, leading to a worse ranking. The intuition is that each keyword is important independently of how
325// frequent it appears in the corpus.
326//
327// Unlike standard file scoring, this scoring strategy ignores all other signals including document ranks. This keeps
328// things simple for now, since BM25 is not normalized and can be tricky to combine with other scoring signals. It also
329// ignores the individual LineMatch and ChunkMatch scores, instead calculating a score over all matches in the file.
330func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32, tf map[string]int, opts *zoekt.SearchOptions) {
331 // Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html)
332 k, b := 1.2, 0.75
333
334 averageFileLength := float64(d.boundaries[d.numDocs()]) / float64(d.numDocs())
335 // This is very unlikely, but explicitly guard against division by zero.
336 if averageFileLength == 0 {
337 averageFileLength++
338 }
339
340 // Compute the file length ratio. Usually the calculation would be based on terms, but using
341 // bytes should work fine, as we're just computing a ratio.
342 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
343
344 L := fileLength / averageFileLength
345
346 score := 0.0
347 sumTF := 0 // Just for debugging
348 for _, f := range tf {
349 sumTF += f
350 score += tfScore(k, b, L, f)
351 }
352 // 2 digits of precision
353 score = math.Trunc(score*100) / 100
354
355 md := d.repoMetaData[d.repos[doc]]
356 fileOrderScore := 1.0 - float64(doc)/float64(len(d.boundaries))
357
358 // Offset score by 9 digits and add the tiebreaker.
359 //
360 // Example: For a BM25 score of 1.23, a repo rank of 456789 and a file order score of 0.12, we have a final score of
361 // 12345678901.2
362 // ^^^
363 // bm25
364 // ^^^^^^
365 // repo rank
366 // ^^^^
367 // doc order
368 fileMatch.Score = score*ScoreOffsetBM25 + scoreRepoRankFactor*float64(md.Rank) + scoreFileOrderFactor*fileOrderScore
369
370 if opts.DebugScore {
371 // To make the debug output easier to read, we split the score into the query dependent score and the tiebreaker
372 fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f (repo-rank: %d, file-rank: %.2f) <- sum-termFrequencies: %d, length-ratio: %.2f", score, md.Rank, fileOrderScore, sumTF, L)
373 }
374}