fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20 "math"
21 "strings"
22
23 "github.com/sourcegraph/zoekt/ctags"
24)
25
26const (
27 ScoreOffset = 10_000_000
28)
29
30type chunkScore struct {
31 score float64
32 debugScore string
33 bestLine int
34}
35
36// scoreChunk calculates the score for each line in the chunk based on its candidate matches, and returns the score of
37// the best-scoring line, along with its line number.
38// Invariant: there should be at least one input candidate, len(ms) > 0.
39func (p *contentProvider) scoreChunk(ms []*candidateMatch, language string, opts *SearchOptions) (chunkScore, []*Symbol) {
40 nl := p.newlines()
41
42 var bestScore lineScore
43 bestLine := 0
44 var symbolInfo []*Symbol
45
46 start := 0
47 currentLine := -1
48 for i, m := range ms {
49 lineNumber := -1
50 if !m.fileName {
51 lineNumber = nl.atOffset(m.byteOffset)
52 }
53
54 // If this match represents a new line, then score the previous line and update 'start'.
55 if i != 0 && lineNumber != currentLine {
56 score, si := p.scoreLine(ms[start:i], language, currentLine, opts)
57 symbolInfo = append(symbolInfo, si...)
58 if score.score > bestScore.score {
59 bestScore = score
60 bestLine = currentLine
61 }
62 start = i
63 }
64 currentLine = lineNumber
65 }
66
67 // Make sure to score the last line
68 line, si := p.scoreLine(ms[start:], language, currentLine, opts)
69 symbolInfo = append(symbolInfo, si...)
70 if line.score > bestScore.score {
71 bestScore = line
72 bestLine = currentLine
73 }
74
75 cs := chunkScore{
76 score: bestScore.score,
77 bestLine: bestLine,
78 }
79 if opts.DebugScore {
80 cs.debugScore = fmt.Sprintf("%s, (line: %d)", bestScore.debugScore, bestLine)
81 }
82 return cs, symbolInfo
83}
84
85type lineScore struct {
86 score float64
87 debugScore string
88}
89
90// scoreLine calculates a score for the line based on its candidate matches.
91// Invariants:
92// - All candidate matches are assumed to come from the same line in the content.
93// - If this line represents a filename, then lineNumber must be -1.
94// - There should be at least one input candidate, len(ms) > 0.
95func (p *contentProvider) scoreLine(ms []*candidateMatch, language string, lineNumber int, opts *SearchOptions) (lineScore, []*Symbol) {
96 if opts.UseBM25Scoring {
97 score, symbolInfo := p.scoreLineBM25(ms, lineNumber)
98 ls := lineScore{score: score}
99 if opts.DebugScore {
100 ls.debugScore = fmt.Sprintf("tfScore:%.2f, ", score)
101 }
102 return ls, symbolInfo
103 }
104
105 score := 0.0
106 what := ""
107 addScore := func(w string, s float64) {
108 if s != 0 && opts.DebugScore {
109 what += fmt.Sprintf("%s:%.2f, ", w, s)
110 }
111 score += s
112 }
113
114 filename := p.data(true)
115 var symbolInfo []*Symbol
116
117 var bestScore lineScore
118 for i, m := range ms {
119 data := p.data(m.fileName)
120
121 endOffset := m.byteOffset + m.byteMatchSz
122 startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset]))
123 endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset]))
124
125 score = 0
126 what = ""
127
128 if startBoundary && endBoundary {
129 addScore("WordMatch", scoreWordMatch)
130 } else if startBoundary || endBoundary {
131 addScore("PartialWordMatch", scorePartialWordMatch)
132 }
133
134 if m.fileName {
135 sep := bytes.LastIndexByte(data, '/')
136 startMatch := int(m.byteOffset) == sep+1
137 endMatch := endOffset == uint32(len(data))
138 if startMatch && endMatch {
139 addScore("Base", scoreBase)
140 } else if startMatch || endMatch {
141 addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
142 } else if sep < int(m.byteOffset) {
143 addScore("InnerBase", scorePartialBase)
144 }
145 } else if sec, si, ok := p.findSymbol(m); ok {
146 startMatch := sec.Start == m.byteOffset
147 endMatch := sec.End == endOffset
148 if startMatch && endMatch {
149 addScore("Symbol", scoreSymbol)
150 } else if startMatch || endMatch {
151 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
152 } else {
153 addScore("OverlapSymbol", scorePartialSymbol)
154 }
155
156 // Score based on symbol data
157 if si != nil {
158 symbolKind := ctags.ParseSymbolKind(si.Kind)
159 sym := sectionSlice(data, sec)
160
161 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
162
163 // This is from a symbol tree, so we need to store the symbol
164 // information.
165 if m.symbol {
166 if symbolInfo == nil {
167 symbolInfo = make([]*Symbol, len(ms))
168 }
169 // findSymbols does not hydrate in Sym. So we need to store it.
170 si.Sym = string(sym)
171 symbolInfo[i] = si
172 }
173 }
174 }
175
176 // scoreWeight != 1 means it affects score
177 if !epsilonEqualsOne(m.scoreWeight) {
178 score = score * m.scoreWeight
179 if opts.DebugScore {
180 what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)
181 }
182 }
183
184 if score > bestScore.score {
185 bestScore.score = score
186 bestScore.debugScore = what
187 }
188 }
189
190 if opts.DebugScore {
191 bestScore.debugScore = fmt.Sprintf("score:%.2f <- %s", bestScore.score, strings.TrimSuffix(bestScore.debugScore, ", "))
192 }
193
194 return bestScore, symbolInfo
195}
196
197// scoreLineBM25 computes the score of a line according to BM25, the most common scoring algorithm for text search:
198// https://en.wikipedia.org/wiki/Okapi_BM25. Compared to the standard scoreLine algorithm, this score rewards multiple
199// term matches on a line.
200// Notes:
201// - This BM25 calculation skips inverse document frequency (idf) to keep the implementation simple.
202// - It uses the same calculateTermFrequency method as BM25 file scoring, which boosts filename and symbol matches.
203func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (float64, []*Symbol) {
204 // If this is a filename, then don't compute BM25. The score would not be comparable to line scores.
205 if lineNumber < 0 {
206 return 0, nil
207 }
208
209 // Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html)
210 k, b := 1.2, 0.75
211
212 // Calculate the length ratio of this line. As a heuristic, we assume an average line length of 100 characters.
213 // Usually the calculation would be based on terms, but using bytes should work fine, as we're just computing a ratio.
214 nl := p.newlines()
215 lineLength := nl.lineStart(lineNumber+1) - nl.lineStart(lineNumber)
216 L := float64(lineLength) / 100.0
217
218 score := 0.0
219 tfs := p.calculateTermFrequency(ms, termDocumentFrequency{})
220 for _, f := range tfs {
221 score += ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f))
222 }
223
224 // Check if any match comes from a symbol match tree, and if so hydrate in symbol information
225 var symbolInfo []*Symbol
226 for _, m := range ms {
227 if m.symbol {
228 if sec, si, ok := p.findSymbol(m); ok && si != nil {
229 // findSymbols does not hydrate in Sym. So we need to store it.
230 sym := sectionSlice(p.data(false), sec)
231 si.Sym = string(sym)
232 symbolInfo = append(symbolInfo, si)
233 }
234 }
235 }
236 return score, symbolInfo
237}
238
239// termDocumentFrequency is a map "term" -> "number of documents that contain the term"
240type termDocumentFrequency map[string]int
241
242// calculateTermFrequency computes the term frequency for the file match.
243// Notes:
244// - Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles.
245// - Symbol matches also count more than content matches, to reward matches on symbol definitions.
246func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int {
247 // Treat each candidate match as a term and compute the frequencies. For now, ignore case sensitivity and
248 // ignore whether the match is a word boundary.
249 termFreqs := map[string]int{}
250 for _, m := range cands {
251 term := string(m.substrLowered)
252 if m.fileName || p.matchesSymbol(m) {
253 termFreqs[term] += 5
254 } else {
255 termFreqs[term]++
256 }
257 }
258
259 for term := range termFreqs {
260 df[term] += 1
261 }
262 return termFreqs
263}
264
265// scoreFile computes a score for the file match using various scoring signals, like
266// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
267func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
268 atomMatchCount := 0
269 visitMatchAtoms(mt, known, func(mt matchTree) {
270 atomMatchCount++
271 })
272
273 addScore := func(what string, computed float64) {
274 fileMatch.addScore(what, computed, -1, opts.DebugScore)
275 }
276
277 // atom-count boosts files with matches from more than 1 atom. The
278 // maximum boost is scoreFactorAtomMatch.
279 if atomMatchCount > 0 {
280 fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore)
281 }
282
283 maxFileScore := 0.0
284 for i := range fileMatch.LineMatches {
285 if maxFileScore < fileMatch.LineMatches[i].Score {
286 maxFileScore = fileMatch.LineMatches[i].Score
287 }
288
289 // Order by ordering in file.
290 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
291 }
292
293 for i := range fileMatch.ChunkMatches {
294 if maxFileScore < fileMatch.ChunkMatches[i].Score {
295 maxFileScore = fileMatch.ChunkMatches[i].Score
296 }
297
298 // Order by ordering in file.
299 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
300 }
301
302 // Maintain ordering of input files. This
303 // strictly dominates the in-file ordering of
304 // the matches.
305 addScore("fragment", maxFileScore)
306
307 // Add tiebreakers
308 //
309 // ScoreOffset shifts the score 7 digits to the left.
310 fileMatch.Score = math.Trunc(fileMatch.Score) * ScoreOffset
311
312 md := d.repoMetaData[d.repos[doc]]
313
314 // md.Rank lies in the range [0, 65535]. Hence, we have to allocate 5 digits for
315 // the rank. The scoreRepoRankFactor shifts the rank score 2 digits to the left,
316 // reserving digits 3-7 for the repo rank.
317 addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank))
318
319 // digits 1-2 and the decimals are reserved for the doc order. Doc order
320 // (without the scaling factor) lies in the range [0, 1]. The upper bound is
321 // achieved for matches in the first document of a shard.
322 addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))))
323
324 if opts.DebugScore {
325 // To make the debug output easier to read, we split the score into the query
326 // dependent score and the tiebreaker
327 score := math.Trunc(fileMatch.Score / ScoreOffset)
328 tiebreaker := fileMatch.Score - score*ScoreOffset
329 fileMatch.Debug = fmt.Sprintf("score: %d (%.2f) <- %s", int(score), tiebreaker, strings.TrimSuffix(fileMatch.Debug, ", "))
330 }
331}
332
333// termFrequency stores the term frequencies for doc.
334type termFrequency struct {
335 doc uint32
336 tf map[string]int
337}
338
339// scoreFilesUsingBM25 computes the score according to BM25, the most common scoring algorithm for text search:
340// https://en.wikipedia.org/wiki/Okapi_BM25.
341//
342// Unlike standard file scoring, this scoring strategy ignores all other signals including document ranks. This keeps
343// things simple for now, since BM25 is not normalized and can be tricky to combine with other scoring signals. It also
344// ignores the individual LineMatch and ChunkMatch scores, instead calculating a score over all matches in the file.
345func (d *indexData) scoreFilesUsingBM25(fileMatches []FileMatch, tfs []termFrequency, df termDocumentFrequency, opts *SearchOptions) {
346 // Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html)
347 k, b := 1.2, 0.75
348
349 averageFileLength := float64(d.boundaries[d.numDocs()]) / float64(d.numDocs())
350 // This is very unlikely, but explicitly guard against division by zero.
351 if averageFileLength == 0 {
352 averageFileLength++
353 }
354
355 for i := range tfs {
356 score := 0.0
357
358 // Compute the file length ratio. Usually the calculation would be based on terms, but using
359 // bytes should work fine, as we're just computing a ratio.
360 doc := tfs[i].doc
361 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
362
363 L := fileLength / averageFileLength
364
365 sumTF := 0 // Just for debugging
366 for term, f := range tfs[i].tf {
367 sumTF += f
368 tfScore := ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f))
369 score += idf(df[term], int(d.numDocs())) * tfScore
370 }
371
372 fileMatches[i].Score = score
373
374 if opts.DebugScore {
375 fileMatches[i].Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L)
376 }
377 }
378}
379
380// idf computes the inverse document frequency for a term. nq is the number of
381// documents that contain the term and documentCount is the total number of
382// documents in the corpus.
383func idf(nq, documentCount int) float64 {
384 return math.Log(1.0 + ((float64(documentCount) - float64(nq) + 0.5) / (float64(nq) + 0.5)))
385}