index/score.go at 858516b98ec252c033e9413b7351badfa8afe4e4 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / score.go
at 858516b98ec252c033e9413b7351badfa8afe4e4 15 kB View raw
Julie Tibshirani ranking: incorporate file signals into BM25F (#922) 1y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package index
 16
 17import (
 18	"bytes"
 19	"fmt"
 20	"math"
 21	"strings"
 22
 23	"github.com/go-enry/go-enry/v2"
 24	"github.com/sourcegraph/zoekt"
 25	"github.com/sourcegraph/zoekt/internal/ctags"
 26)
 27
 28const (
 29	ScoreOffset = 10_000_000
 30)
 31
// chunkScore is the result of scoreChunk: the score of the best-scoring line
// in a chunk, an optional debug description, and the line it came from.
type chunkScore struct {
	score      float64 // score of the best-scoring line in the chunk
	debugScore string  // debug description of the best line's score; only populated when opts.DebugScore is set
	bestLine   int     // line number of the best-scoring line; -1 for a filename match, 0 if no line scored above zero
}
 37
 38// scoreChunk calculates the score for each line in the chunk based on its candidate matches, and returns the score of
 39// the best-scoring line, along with its line number.
 40// Invariant: there should be at least one input candidate, len(ms) > 0.
 41func (p *contentProvider) scoreChunk(ms []*candidateMatch, language string, opts *zoekt.SearchOptions) (chunkScore, []*zoekt.Symbol) {
 42	nl := p.newlines()
 43
 44	var bestScore lineScore
 45	bestLine := 0
 46	var symbolInfo []*zoekt.Symbol
 47
 48	start := 0
 49	currentLine := -1
 50	for i, m := range ms {
 51		lineNumber := -1
 52		if !m.fileName {
 53			lineNumber = nl.atOffset(m.byteOffset)
 54		}
 55
 56		// If this match represents a new line, then score the previous line and update 'start'.
 57		if i != 0 && lineNumber != currentLine {
 58			score, si := p.scoreLine(ms[start:i], language, currentLine, opts)
 59			symbolInfo = append(symbolInfo, si...)
 60			if score.score > bestScore.score {
 61				bestScore = score
 62				bestLine = currentLine
 63			}
 64			start = i
 65		}
 66		currentLine = lineNumber
 67	}
 68
 69	// Make sure to score the last line
 70	line, si := p.scoreLine(ms[start:], language, currentLine, opts)
 71	symbolInfo = append(symbolInfo, si...)
 72	if line.score > bestScore.score {
 73		bestScore = line
 74		bestLine = currentLine
 75	}
 76
 77	cs := chunkScore{
 78		score:    bestScore.score,
 79		bestLine: bestLine,
 80	}
 81	if opts.DebugScore {
 82		cs.debugScore = fmt.Sprintf("%s, (line: %d)", bestScore.debugScore, bestLine)
 83	}
 84	return cs, symbolInfo
 85}
 86
// lineScore is the result of scoring a single line (or the filename) from its
// candidate matches.
type lineScore struct {
	score      float64 // score of the line
	debugScore string  // debug description of how score was reached; empty unless opts.DebugScore is set
}
 91
 92// scoreLine calculates a score for the line based on its candidate matches.
 93// Invariants:
 94// - All candidate matches are assumed to come from the same line in the content.
 95// - If this line represents a filename, then lineNumber must be -1.
 96// - There should be at least one input candidate, len(ms) > 0.
 97func (p *contentProvider) scoreLine(ms []*candidateMatch, language string, lineNumber int, opts *zoekt.SearchOptions) (lineScore, []*zoekt.Symbol) {
 98	if opts.UseBM25Scoring {
 99		score, symbolInfo := p.scoreLineBM25(ms, lineNumber)
100		ls := lineScore{score: score}
101		if opts.DebugScore {
102			ls.debugScore = fmt.Sprintf("tfScore:%.2f, ", score)
103		}
104		return ls, symbolInfo
105	}
106
107	score := 0.0
108	what := ""
109	addScore := func(w string, s float64) {
110		if s != 0 && opts.DebugScore {
111			what += fmt.Sprintf("%s:%.2f, ", w, s)
112		}
113		score += s
114	}
115
116	filename := p.data(true)
117	var symbolInfo []*zoekt.Symbol
118
119	var bestScore lineScore
120	for i, m := range ms {
121		data := p.data(m.fileName)
122
123		endOffset := m.byteOffset + m.byteMatchSz
124		startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset]))
125		endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset]))
126
127		score = 0
128		what = ""
129
130		if startBoundary && endBoundary {
131			addScore("WordMatch", scoreWordMatch)
132		} else if startBoundary || endBoundary {
133			addScore("PartialWordMatch", scorePartialWordMatch)
134		}
135
136		if m.fileName {
137			sep := bytes.LastIndexByte(data, '/')
138			startMatch := int(m.byteOffset) == sep+1
139			endMatch := endOffset == uint32(len(data))
140			if startMatch && endMatch {
141				addScore("Base", scoreBase)
142			} else if startMatch || endMatch {
143				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
144			} else if sep < int(m.byteOffset) {
145				addScore("InnerBase", scorePartialBase)
146			}
147		} else if sec, si, ok := p.findSymbol(m); ok {
148			startMatch := sec.Start == m.byteOffset
149			endMatch := sec.End == endOffset
150			if startMatch && endMatch {
151				addScore("Symbol", scoreSymbol)
152			} else if startMatch || endMatch {
153				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
154			} else {
155				addScore("OverlapSymbol", scorePartialSymbol)
156			}
157
158			// Score based on symbol data
159			if si != nil {
160				symbolKind := ctags.ParseSymbolKind(si.Kind)
161				sym := sectionSlice(data, sec)
162
163				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
164
165				// This is from a symbol tree, so we need to store the symbol
166				// information.
167				if m.symbol {
168					if symbolInfo == nil {
169						symbolInfo = make([]*zoekt.Symbol, len(ms))
170					}
171					// findSymbols does not hydrate in Sym. So we need to store it.
172					si.Sym = string(sym)
173					symbolInfo[i] = si
174				}
175			}
176		}
177
178		// scoreWeight != 1 means it affects score
179		if !epsilonEqualsOne(m.scoreWeight) {
180			score = score * m.scoreWeight
181			if opts.DebugScore {
182				what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)
183			}
184		}
185
186		if score > bestScore.score {
187			bestScore.score = score
188			bestScore.debugScore = what
189		}
190	}
191
192	if opts.DebugScore {
193		bestScore.debugScore = fmt.Sprintf("score:%.2f <- %s", bestScore.score, strings.TrimSuffix(bestScore.debugScore, ", "))
194	}
195
196	return bestScore, symbolInfo
197}
198
199// scoreLineBM25 computes the score of a line according to BM25, the most common scoring algorithm for text search:
200// https://en.wikipedia.org/wiki/Okapi_BM25. Compared to the standard scoreLine algorithm, this score rewards multiple
201// term matches on a line.
202// Notes:
203// - This BM25 calculation skips inverse document frequency (idf) to keep the implementation simple.
204// - It uses the same calculateTermFrequency method as BM25 file scoring, which boosts filename and symbol matches.
205func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (float64, []*zoekt.Symbol) {
206	// If this is a filename, then don't compute BM25. The score would not be comparable to line scores.
207	if lineNumber < 0 {
208		return 0, nil
209	}
210
211	// Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html)
212	k, b := 1.2, 0.75
213
214	// Calculate the length ratio of this line. As a heuristic, we assume an average line length of 100 characters.
215	// Usually the calculation would be based on terms, but using bytes should work fine, as we're just computing a ratio.
216	nl := p.newlines()
217	lineLength := nl.lineStart(lineNumber+1) - nl.lineStart(lineNumber)
218	L := float64(lineLength) / 100.0
219
220	score := 0.0
221	tfs := p.calculateTermFrequency(ms, false) // ignore file priority, since we're just scoring within a single file
222	for _, f := range tfs {
223		score += tfScore(k, b, L, f)
224	}
225
226	// Check if any index comes from a symbol match tree, and if so hydrate in symbol information
227	var symbolInfo []*zoekt.Symbol
228	for _, m := range ms {
229		if m.symbol {
230			if sec, si, ok := p.findSymbol(m); ok && si != nil {
231				// findSymbols does not hydrate in Sym. So we need to store it.
232				sym := sectionSlice(p.data(false), sec)
233				si.Sym = string(sym)
234				symbolInfo = append(symbolInfo, si)
235			}
236		}
237	}
238
239	score = boostScore(score, ms)
240	return score, symbolInfo
241}
242
// tfScore returns the BM25 term-frequency component for one term:
//
//	(k+1)*f / (k*(1-b+b*L) + f)
//
// k and b are the BM25 parameters, L is the length ratio of the document being
// scored, and f is the term's frequency.
func tfScore(k float64, b float64, L float64, f int) float64 {
	freq := float64(f)
	lengthNorm := 1.0 - b + b*L
	return (k + 1.0) * freq / (k*lengthNorm + freq)
}
247
const (
	// importantTermBoost is the amount a filename or symbol match adds to a
	// term's frequency, where a plain content match adds 1.
	importantTermBoost = 5

	// lowPriorityFilePenalty is the divisor applied to every term frequency of
	// a low-priority file.
	lowPriorityFilePenalty = 5
)
250
251// calculateTermFrequency computes the term frequency for the file match.
252// Notes:
253// - Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles.
254// - Symbol matches also count more than content matches, to reward matches on symbol definitions.
255// - "Low priority" files like tests, generated files, etc. have their term frequency down-weighted, to prioritize matches from 'regular' files
256func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, lowPriority bool) map[string]int {
257	// Treat each candidate match as a term and compute the frequencies. For now, ignore case sensitivity and
258	// ignore whether the index is a word boundary.
259	termFreqs := map[string]int{}
260	for _, m := range cands {
261		term := string(m.substrLowered)
262		if m.fileName || p.matchesSymbol(m) {
263			termFreqs[term] += importantTermBoost
264		} else {
265			termFreqs[term]++
266		}
267	}
268
269	// If a file is a test, generated, etc., then down-weight its term frequency. The BM25F interpretation
270	// is that this data lives in a separate 'field' that is half the priority of regular content.
271	if lowPriority {
272		for term := range termFreqs {
273			termFreqs[term] = termFreqs[term] / lowPriorityFilePenalty
274		}
275	}
276
277	return termFreqs
278}
279
280// boostScore finds whether any of the matches are part of a boosted match tree, then applies
281// the boost to the final score. This follows precedent in other search engines like Lucene, where
282// boosts multiply an entire query clause's final score.
283//
284// As a heuristic, we use the maximum boost across matches to avoid applying the same boost multiple times.
285func boostScore(score float64, ms []*candidateMatch) float64 {
286	maxScoreWeight := 1.0
287	for _, m := range ms {
288		if m.scoreWeight > maxScoreWeight {
289			maxScoreWeight = m.scoreWeight
290		}
291	}
292
293	if !epsilonEqualsOne(maxScoreWeight) {
294		score = score * maxScoreWeight
295	}
296	return score
297}
298
299// scoreFile computes a score for the file match using various scoring signals, like
300// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
301func (d *indexData) scoreFile(fileMatch *zoekt.FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *zoekt.SearchOptions) {
302	atomMatchCount := 0
303	visitMatchAtoms(mt, known, func(mt matchTree) {
304		atomMatchCount++
305	})
306
307	addScore := func(what string, computed float64) {
308		fileMatch.AddScore(what, computed, -1, opts.DebugScore)
309	}
310
311	// atom-count boosts files with matches from more than 1 atom. The
312	// maximum boost is scoreFactorAtomMatch.
313	if atomMatchCount > 0 {
314		fileMatch.AddScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore)
315	}
316
317	maxFileScore := 0.0
318	for i := range fileMatch.LineMatches {
319		if maxFileScore < fileMatch.LineMatches[i].Score {
320			maxFileScore = fileMatch.LineMatches[i].Score
321		}
322
323		// Order by ordering in file.
324		fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
325	}
326
327	for i := range fileMatch.ChunkMatches {
328		if maxFileScore < fileMatch.ChunkMatches[i].Score {
329			maxFileScore = fileMatch.ChunkMatches[i].Score
330		}
331
332		// Order by ordering in file.
333		fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
334	}
335
336	// Maintain ordering of input files. This strictly dominates the in-file ordering of the matches.
337	addScore("fragment", maxFileScore)
338
339	// Truncate score to avoid overlap with the tiebreakers.
340	fileMatch.Score = math.Trunc(fileMatch.Score)
341
342	// Add tiebreakers
343	repoRank := d.repoMetaData[d.repos[doc]].Rank                  // [0, 65535]
344	docOrderScore := 1.0 - float64(doc)/float64(len(d.boundaries)) // [0, 1]
345
346	if opts.DebugScore {
347		// We log the score components individually for better readability.
348		fileMatch.Debug = fmt.Sprintf("score: %d (repo-rank: %d, file-rank: %.2f) <- %s", int(fileMatch.Score), repoRank, docOrderScore, strings.TrimSuffix(fileMatch.Debug, ", "))
349	}
350
351	fileMatch.Score = ScoreOffset*fileMatch.Score + scoreRepoRankFactor*float64(repoRank) + scoreFileOrderFactor*docOrderScore
352}
353
354// scoreFileBM25 computes the score according to BM25, the most common scoring algorithm for text search:
355// https://en.wikipedia.org/wiki/Okapi_BM25. Note that we treat the inverse document frequency (idf) as constant. This
356// is supported by our evaluations which showed that for keyword style queries, idf can down-weight the score of some
357// keywords too much, leading to a worse ranking. The intuition is that each keyword is important independently of how
358// frequent it appears in the corpus.
359//
360// Unlike standard file scoring, this scoring strategy ignores the individual LineMatch and ChunkMatch scores, instead
361// calculating a score over all matches in the file.
362func (d *indexData) scoreFileBM25(fileMatch *zoekt.FileMatch, doc uint32, cands []*candidateMatch, cp *contentProvider, opts *zoekt.SearchOptions) {
363	lowPriority := d.isLowPriority(fileMatch, doc)
364	tf := cp.calculateTermFrequency(cands, lowPriority)
365
366	// Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html)
367	k, b := 1.2, 0.75
368
369	averageFileLength := float64(d.boundaries[d.numDocs()]) / float64(d.numDocs())
370	// This is very unlikely, but explicitly guard against division by zero.
371	if averageFileLength == 0 {
372		averageFileLength++
373	}
374
375	// Compute the file length ratio. Usually the calculation would be based on terms, but using
376	// bytes should work fine, as we're just computing a ratio.
377	fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
378
379	L := fileLength / averageFileLength
380
381	bm25Score := 0.0
382	sumTF := 0 // Just for debugging
383	for _, f := range tf {
384		sumTF += f
385		bm25Score += tfScore(k, b, L, f)
386	}
387
388	score := boostScore(bm25Score, cands)
389	boosted := score != bm25Score
390	fileMatch.Score = score
391
392	if opts.DebugScore {
393		// To make the debug output easier to read, we split the score into the query dependent score and the tiebreaker
394		fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f (low-priority: %t) <- sum-termFrequencies: %d, length-ratio: %.2f", score, lowPriority, sumTF, L)
395		if boosted {
396			fileMatch.Debug += fmt.Sprintf(" (boosted)")
397		}
398	}
399}
400
401func (d *indexData) isLowPriority(fileMatch *zoekt.FileMatch, doc uint32) bool {
402	category := d.getCategory(doc)
403	if category != FileCategoryMissing {
404		return category.lowPriority()
405	} else {
406		// The category may be missing from older index versions. In this case,
407		// perform a cheap, best-effort check against the filename.
408		path := fileMatch.FileName
409		return enry.IsTest(path) || enry.IsVendor(path)
410	}
411}
Configure Feed

Configure Feed