index/score.go at main · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / score.go
at main 15 kB View raw
Keegan Carruthers-Smith refactor(all): goimports -w -local github.com/sourcegraph/zoekt (#948) 1y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package index
 16
 17import (
 18	"bytes"
 19	"fmt"
 20	"math"
 21	"strings"
 22
 23	"github.com/go-enry/go-enry/v2"
 24
 25	"github.com/sourcegraph/zoekt"
 26	"github.com/sourcegraph/zoekt/internal/ctags"
 27)
 28
 29const (
 30	ScoreOffset = 10_000_000
 31)
 32
 33type chunkScore struct {
 34	score      float64
 35	debugScore string
 36	bestLine   int
 37}
 38
 39// scoreChunk calculates the score for each line in the chunk based on its candidate matches, and returns the score of
 40// the best-scoring line, along with its line number.
 41// Invariant: there should be at least one input candidate, len(ms) > 0.
 42func (p *contentProvider) scoreChunk(ms []*candidateMatch, language string, opts *zoekt.SearchOptions) (chunkScore, []*zoekt.Symbol) {
 43	nl := p.newlines()
 44
 45	var bestScore lineScore
 46	bestLine := 0
 47	var symbolInfo []*zoekt.Symbol
 48
 49	start := 0
 50	currentLine := -1
 51	for i, m := range ms {
 52		lineNumber := -1
 53		if !m.fileName {
 54			lineNumber = nl.atOffset(m.byteOffset)
 55		}
 56
 57		// If this match represents a new line, then score the previous line and update 'start'.
 58		if i != 0 && lineNumber != currentLine {
 59			score, si := p.scoreLine(ms[start:i], language, currentLine, opts)
 60			symbolInfo = append(symbolInfo, si...)
 61			if score.score > bestScore.score {
 62				bestScore = score
 63				bestLine = currentLine
 64			}
 65			start = i
 66		}
 67		currentLine = lineNumber
 68	}
 69
 70	// Make sure to score the last line
 71	line, si := p.scoreLine(ms[start:], language, currentLine, opts)
 72	symbolInfo = append(symbolInfo, si...)
 73	if line.score > bestScore.score {
 74		bestScore = line
 75		bestLine = currentLine
 76	}
 77
 78	cs := chunkScore{
 79		score:    bestScore.score,
 80		bestLine: bestLine,
 81	}
 82	if opts.DebugScore {
 83		cs.debugScore = fmt.Sprintf("%s, (line: %d)", bestScore.debugScore, bestLine)
 84	}
 85	return cs, symbolInfo
 86}
 87
 88type lineScore struct {
 89	score      float64
 90	debugScore string
 91}
 92
 93// scoreLine calculates a score for the line based on its candidate matches.
 94// Invariants:
 95// - All candidate matches are assumed to come from the same line in the content.
 96// - If this line represents a filename, then lineNumber must be -1.
 97// - There should be at least one input candidate, len(ms) > 0.
 98func (p *contentProvider) scoreLine(ms []*candidateMatch, language string, lineNumber int, opts *zoekt.SearchOptions) (lineScore, []*zoekt.Symbol) {
 99	if opts.UseBM25Scoring {
100		score, symbolInfo := p.scoreLineBM25(ms, lineNumber)
101		ls := lineScore{score: score}
102		if opts.DebugScore {
103			ls.debugScore = fmt.Sprintf("tfScore:%.2f, ", score)
104		}
105		return ls, symbolInfo
106	}
107
108	score := 0.0
109	what := ""
110	addScore := func(w string, s float64) {
111		if s != 0 && opts.DebugScore {
112			what += fmt.Sprintf("%s:%.2f, ", w, s)
113		}
114		score += s
115	}
116
117	filename := p.data(true)
118	var symbolInfo []*zoekt.Symbol
119
120	var bestScore lineScore
121	for i, m := range ms {
122		data := p.data(m.fileName)
123
124		endOffset := m.byteOffset + m.byteMatchSz
125		startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset]))
126		endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset]))
127
128		score = 0
129		what = ""
130
131		if startBoundary && endBoundary {
132			addScore("WordMatch", scoreWordMatch)
133		} else if startBoundary || endBoundary {
134			addScore("PartialWordMatch", scorePartialWordMatch)
135		}
136
137		if m.fileName {
138			sep := bytes.LastIndexByte(data, '/')
139			startMatch := int(m.byteOffset) == sep+1
140			endMatch := endOffset == uint32(len(data))
141			if startMatch && endMatch {
142				addScore("Base", scoreBase)
143			} else if startMatch || endMatch {
144				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
145			} else if sep < int(m.byteOffset) {
146				addScore("InnerBase", scorePartialBase)
147			}
148		} else if sec, si, ok := p.findSymbol(m); ok {
149			startMatch := sec.Start == m.byteOffset
150			endMatch := sec.End == endOffset
151			if startMatch && endMatch {
152				addScore("Symbol", scoreSymbol)
153			} else if startMatch || endMatch {
154				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
155			} else {
156				addScore("OverlapSymbol", scorePartialSymbol)
157			}
158
159			// Score based on symbol data
160			if si != nil {
161				symbolKind := ctags.ParseSymbolKind(si.Kind)
162				sym := sectionSlice(data, sec)
163
164				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
165
166				// This is from a symbol tree, so we need to store the symbol
167				// information.
168				if m.symbol {
169					if symbolInfo == nil {
170						symbolInfo = make([]*zoekt.Symbol, len(ms))
171					}
172					// findSymbols does not hydrate in Sym. So we need to store it.
173					si.Sym = string(sym)
174					symbolInfo[i] = si
175				}
176			}
177		}
178
179		// scoreWeight != 1 means it affects score
180		if !epsilonEqualsOne(m.scoreWeight) {
181			score = score * m.scoreWeight
182			if opts.DebugScore {
183				what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)
184			}
185		}
186
187		if score > bestScore.score {
188			bestScore.score = score
189			bestScore.debugScore = what
190		}
191	}
192
193	if opts.DebugScore {
194		bestScore.debugScore = fmt.Sprintf("score:%.2f <- %s", bestScore.score, strings.TrimSuffix(bestScore.debugScore, ", "))
195	}
196
197	return bestScore, symbolInfo
198}
199
200// scoreLineBM25 computes the score of a line according to BM25, the most common scoring algorithm for text search:
201// https://en.wikipedia.org/wiki/Okapi_BM25. Compared to the standard scoreLine algorithm, this score rewards multiple
202// term matches on a line.
203// Notes:
204// - This BM25 calculation skips inverse document frequency (idf) to keep the implementation simple.
205// - It uses the same calculateTermFrequency method as BM25 file scoring, which boosts filename and symbol matches.
206func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (float64, []*zoekt.Symbol) {
207	// If this is a filename, then don't compute BM25. The score would not be comparable to line scores.
208	if lineNumber < 0 {
209		return 0, nil
210	}
211
212	// Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html)
213	k, b := 1.2, 0.75
214
215	// Calculate the length ratio of this line. As a heuristic, we assume an average line length of 100 characters.
216	// Usually the calculation would be based on terms, but using bytes should work fine, as we're just computing a ratio.
217	nl := p.newlines()
218	lineLength := nl.lineStart(lineNumber+1) - nl.lineStart(lineNumber)
219	L := float64(lineLength) / 100.0
220
221	score := 0.0
222	tfs := p.calculateTermFrequency(ms, false) // ignore file priority, since we're just scoring within a single file
223	for _, f := range tfs {
224		score += tfScore(k, b, L, f)
225	}
226
227	// Check if any index comes from a symbol match tree, and if so hydrate in symbol information
228	var symbolInfo []*zoekt.Symbol
229	for _, m := range ms {
230		if m.symbol {
231			if sec, si, ok := p.findSymbol(m); ok && si != nil {
232				// findSymbols does not hydrate in Sym. So we need to store it.
233				sym := sectionSlice(p.data(false), sec)
234				si.Sym = string(sym)
235				symbolInfo = append(symbolInfo, si)
236			}
237		}
238	}
239
240	score = boostScore(score, ms)
241	return score, symbolInfo
242}
243
244// tfScore is the term frequency score for BM25.
245func tfScore(k float64, b float64, L float64, f int) float64 {
246	return ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f))
247}
248
249const importantTermBoost = 5
250const lowPriorityFilePenalty = 5
251
252// calculateTermFrequency computes the term frequency for the file match.
253// Notes:
254// - Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles.
255// - Symbol matches also count more than content matches, to reward matches on symbol definitions.
256// - "Low priority" files like tests, generated files, etc. have their term frequency down-weighted, to prioritize matches from 'regular' files
257func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, lowPriority bool) map[string]int {
258	// Treat each candidate match as a term and compute the frequencies. For now, ignore case sensitivity and
259	// ignore whether the index is a word boundary.
260	termFreqs := map[string]int{}
261	for _, m := range cands {
262		term := string(m.substrLowered)
263		if m.fileName || p.matchesSymbol(m) {
264			termFreqs[term] += importantTermBoost
265		} else {
266			termFreqs[term]++
267		}
268	}
269
270	// If a file is a test, generated, etc., then down-weight its term frequency. The BM25F interpretation
271	// is that this data lives in a separate 'field' that is half the priority of regular content.
272	if lowPriority {
273		for term := range termFreqs {
274			termFreqs[term] = termFreqs[term] / lowPriorityFilePenalty
275		}
276	}
277
278	return termFreqs
279}
280
281// boostScore finds whether any of the matches are part of a boosted match tree, then applies
282// the boost to the final score. This follows precedent in other search engines like Lucene, where
283// boosts multiply an entire query clause's final score.
284//
285// As a heuristic, we use the maximum boost across matches to avoid applying the same boost multiple times.
286func boostScore(score float64, ms []*candidateMatch) float64 {
287	maxScoreWeight := 1.0
288	for _, m := range ms {
289		if m.scoreWeight > maxScoreWeight {
290			maxScoreWeight = m.scoreWeight
291		}
292	}
293
294	if !epsilonEqualsOne(maxScoreWeight) {
295		score = score * maxScoreWeight
296	}
297	return score
298}
299
300// scoreFile computes a score for the file match using various scoring signals, like
301// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
302func (d *indexData) scoreFile(fileMatch *zoekt.FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *zoekt.SearchOptions) {
303	atomMatchCount := 0
304	visitMatchAtoms(mt, known, func(mt matchTree) {
305		atomMatchCount++
306	})
307
308	addScore := func(what string, computed float64) {
309		fileMatch.AddScore(what, computed, -1, opts.DebugScore)
310	}
311
312	// atom-count boosts files with matches from more than 1 atom. The
313	// maximum boost is scoreFactorAtomMatch.
314	if atomMatchCount > 0 {
315		fileMatch.AddScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore)
316	}
317
318	maxFileScore := 0.0
319	for i := range fileMatch.LineMatches {
320		if maxFileScore < fileMatch.LineMatches[i].Score {
321			maxFileScore = fileMatch.LineMatches[i].Score
322		}
323
324		// Order by ordering in file.
325		fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
326	}
327
328	for i := range fileMatch.ChunkMatches {
329		if maxFileScore < fileMatch.ChunkMatches[i].Score {
330			maxFileScore = fileMatch.ChunkMatches[i].Score
331		}
332
333		// Order by ordering in file.
334		fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
335	}
336
337	// Maintain ordering of input files. This strictly dominates the in-file ordering of the matches.
338	addScore("fragment", maxFileScore)
339
340	// Truncate score to avoid overlap with the tiebreakers.
341	fileMatch.Score = math.Trunc(fileMatch.Score)
342
343	// Add tiebreakers
344	repoRank := d.repoMetaData[d.repos[doc]].Rank                  // [0, 65535]
345	docOrderScore := 1.0 - float64(doc)/float64(len(d.boundaries)) // [0, 1]
346
347	if opts.DebugScore {
348		// We log the score components individually for better readability.
349		fileMatch.Debug = fmt.Sprintf("score: %d (repo-rank: %d, file-rank: %.2f) <- %s", int(fileMatch.Score), repoRank, docOrderScore, strings.TrimSuffix(fileMatch.Debug, ", "))
350	}
351
352	fileMatch.Score = ScoreOffset*fileMatch.Score + scoreRepoRankFactor*float64(repoRank) + scoreFileOrderFactor*docOrderScore
353}
354
355// scoreFileBM25 computes the score according to BM25, the most common scoring algorithm for text search:
356// https://en.wikipedia.org/wiki/Okapi_BM25. Note that we treat the inverse document frequency (idf) as constant. This
357// is supported by our evaluations which showed that for keyword style queries, idf can down-weight the score of some
358// keywords too much, leading to a worse ranking. The intuition is that each keyword is important independently of how
359// frequent it appears in the corpus.
360//
361// Unlike standard file scoring, this scoring strategy ignores the individual LineMatch and ChunkMatch scores, instead
362// calculating a score over all matches in the file.
363func (d *indexData) scoreFileBM25(fileMatch *zoekt.FileMatch, doc uint32, cands []*candidateMatch, cp *contentProvider, opts *zoekt.SearchOptions) {
364	lowPriority := d.isLowPriority(fileMatch, doc)
365	tf := cp.calculateTermFrequency(cands, lowPriority)
366
367	// Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html)
368	k, b := 1.2, 0.75
369
370	averageFileLength := float64(d.boundaries[d.numDocs()]) / float64(d.numDocs())
371	// This is very unlikely, but explicitly guard against division by zero.
372	if averageFileLength == 0 {
373		averageFileLength++
374	}
375
376	// Compute the file length ratio. Usually the calculation would be based on terms, but using
377	// bytes should work fine, as we're just computing a ratio.
378	fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
379
380	L := fileLength / averageFileLength
381
382	bm25Score := 0.0
383	sumTF := 0 // Just for debugging
384	for _, f := range tf {
385		sumTF += f
386		bm25Score += tfScore(k, b, L, f)
387	}
388
389	score := boostScore(bm25Score, cands)
390	boosted := score != bm25Score
391	fileMatch.Score = score
392
393	if opts.DebugScore {
394		// To make the debug output easier to read, we split the score into the query dependent score and the tiebreaker
395		fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f (low-priority: %t) <- sum-termFrequencies: %d, length-ratio: %.2f", score, lowPriority, sumTF, L)
396		if boosted {
397			fileMatch.Debug += fmt.Sprintf(" (boosted)")
398		}
399	}
400}
401
402func (d *indexData) isLowPriority(fileMatch *zoekt.FileMatch, doc uint32) bool {
403	category := d.getCategory(doc)
404	if category != FileCategoryMissing {
405		return category.lowPriority()
406	} else {
407		// The category may be missing from older index versions. In this case,
408		// perform a cheap, best-effort check against the filename.
409		path := fileMatch.FileName
410		return enry.IsTest(path) || enry.IsVendor(path)
411	}
412}
Configure Feed

Configure Feed