index/score.go at 914a27d7d2e2b489a70e0fc1c6ce64aae8e7964b · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / score.go
at 914a27d7d2e2b489a70e0fc1c6ce64aae8e7964b 13 kB View raw
Stefan Hengl ranking: add tiebreakers to BM25 (#914) 1y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package index
 16
 17import (
 18	"bytes"
 19	"fmt"
 20	"math"
 21	"strings"
 22
 23	"github.com/sourcegraph/zoekt"
 24	"github.com/sourcegraph/zoekt/internal/ctags"
 25)
 26
 27const (
 28	ScoreOffset     = 10_000_000
 29	ScoreOffsetBM25 = 1_000_000_000
 30)
 31
 32type chunkScore struct {
 33	score      float64
 34	debugScore string
 35	bestLine   int
 36}
 37
 38// scoreChunk calculates the score for each line in the chunk based on its candidate matches, and returns the score of
 39// the best-scoring line, along with its line number.
 40// Invariant: there should be at least one input candidate, len(ms) > 0.
 41func (p *contentProvider) scoreChunk(ms []*candidateMatch, language string, opts *zoekt.SearchOptions) (chunkScore, []*zoekt.Symbol) {
 42	nl := p.newlines()
 43
 44	var bestScore lineScore
 45	bestLine := 0
 46	var symbolInfo []*zoekt.Symbol
 47
 48	start := 0
 49	currentLine := -1
 50	for i, m := range ms {
 51		lineNumber := -1
 52		if !m.fileName {
 53			lineNumber = nl.atOffset(m.byteOffset)
 54		}
 55
 56		// If this match represents a new line, then score the previous line and update 'start'.
 57		if i != 0 && lineNumber != currentLine {
 58			score, si := p.scoreLine(ms[start:i], language, currentLine, opts)
 59			symbolInfo = append(symbolInfo, si...)
 60			if score.score > bestScore.score {
 61				bestScore = score
 62				bestLine = currentLine
 63			}
 64			start = i
 65		}
 66		currentLine = lineNumber
 67	}
 68
 69	// Make sure to score the last line
 70	line, si := p.scoreLine(ms[start:], language, currentLine, opts)
 71	symbolInfo = append(symbolInfo, si...)
 72	if line.score > bestScore.score {
 73		bestScore = line
 74		bestLine = currentLine
 75	}
 76
 77	cs := chunkScore{
 78		score:    bestScore.score,
 79		bestLine: bestLine,
 80	}
 81	if opts.DebugScore {
 82		cs.debugScore = fmt.Sprintf("%s, (line: %d)", bestScore.debugScore, bestLine)
 83	}
 84	return cs, symbolInfo
 85}
 86
 87type lineScore struct {
 88	score      float64
 89	debugScore string
 90}
 91
 92// scoreLine calculates a score for the line based on its candidate matches.
 93// Invariants:
 94// - All candidate matches are assumed to come from the same line in the content.
 95// - If this line represents a filename, then lineNumber must be -1.
 96// - There should be at least one input candidate, len(ms) > 0.
 97func (p *contentProvider) scoreLine(ms []*candidateMatch, language string, lineNumber int, opts *zoekt.SearchOptions) (lineScore, []*zoekt.Symbol) {
 98	if opts.UseBM25Scoring {
 99		score, symbolInfo := p.scoreLineBM25(ms, lineNumber)
100		ls := lineScore{score: score}
101		if opts.DebugScore {
102			ls.debugScore = fmt.Sprintf("tfScore:%.2f, ", score)
103		}
104		return ls, symbolInfo
105	}
106
107	score := 0.0
108	what := ""
109	addScore := func(w string, s float64) {
110		if s != 0 && opts.DebugScore {
111			what += fmt.Sprintf("%s:%.2f, ", w, s)
112		}
113		score += s
114	}
115
116	filename := p.data(true)
117	var symbolInfo []*zoekt.Symbol
118
119	var bestScore lineScore
120	for i, m := range ms {
121		data := p.data(m.fileName)
122
123		endOffset := m.byteOffset + m.byteMatchSz
124		startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset]))
125		endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset]))
126
127		score = 0
128		what = ""
129
130		if startBoundary && endBoundary {
131			addScore("WordMatch", scoreWordMatch)
132		} else if startBoundary || endBoundary {
133			addScore("PartialWordMatch", scorePartialWordMatch)
134		}
135
136		if m.fileName {
137			sep := bytes.LastIndexByte(data, '/')
138			startMatch := int(m.byteOffset) == sep+1
139			endMatch := endOffset == uint32(len(data))
140			if startMatch && endMatch {
141				addScore("Base", scoreBase)
142			} else if startMatch || endMatch {
143				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
144			} else if sep < int(m.byteOffset) {
145				addScore("InnerBase", scorePartialBase)
146			}
147		} else if sec, si, ok := p.findSymbol(m); ok {
148			startMatch := sec.Start == m.byteOffset
149			endMatch := sec.End == endOffset
150			if startMatch && endMatch {
151				addScore("Symbol", scoreSymbol)
152			} else if startMatch || endMatch {
153				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
154			} else {
155				addScore("OverlapSymbol", scorePartialSymbol)
156			}
157
158			// Score based on symbol data
159			if si != nil {
160				symbolKind := ctags.ParseSymbolKind(si.Kind)
161				sym := sectionSlice(data, sec)
162
163				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
164
165				// This is from a symbol tree, so we need to store the symbol
166				// information.
167				if m.symbol {
168					if symbolInfo == nil {
169						symbolInfo = make([]*zoekt.Symbol, len(ms))
170					}
171					// findSymbols does not hydrate in Sym. So we need to store it.
172					si.Sym = string(sym)
173					symbolInfo[i] = si
174				}
175			}
176		}
177
178		// scoreWeight != 1 means it affects score
179		if !epsilonEqualsOne(m.scoreWeight) {
180			score = score * m.scoreWeight
181			if opts.DebugScore {
182				what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)
183			}
184		}
185
186		if score > bestScore.score {
187			bestScore.score = score
188			bestScore.debugScore = what
189		}
190	}
191
192	if opts.DebugScore {
193		bestScore.debugScore = fmt.Sprintf("score:%.2f <- %s", bestScore.score, strings.TrimSuffix(bestScore.debugScore, ", "))
194	}
195
196	return bestScore, symbolInfo
197}
198
199// scoreLineBM25 computes the score of a line according to BM25, the most common scoring algorithm for text search:
200// https://en.wikipedia.org/wiki/Okapi_BM25. Compared to the standard scoreLine algorithm, this score rewards multiple
201// term matches on a line.
202// Notes:
203// - This BM25 calculation skips inverse document frequency (idf) to keep the implementation simple.
204// - It uses the same calculateTermFrequency method as BM25 file scoring, which boosts filename and symbol matches.
205func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (float64, []*zoekt.Symbol) {
206	// If this is a filename, then don't compute BM25. The score would not be comparable to line scores.
207	if lineNumber < 0 {
208		return 0, nil
209	}
210
211	// Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html)
212	k, b := 1.2, 0.75
213
214	// Calculate the length ratio of this line. As a heuristic, we assume an average line length of 100 characters.
215	// Usually the calculation would be based on terms, but using bytes should work fine, as we're just computing a ratio.
216	nl := p.newlines()
217	lineLength := nl.lineStart(lineNumber+1) - nl.lineStart(lineNumber)
218	L := float64(lineLength) / 100.0
219
220	score := 0.0
221	tfs := p.calculateTermFrequency(ms)
222	for _, f := range tfs {
223		score += tfScore(k, b, L, f)
224	}
225
226	// Check if any index comes from a symbol match tree, and if so hydrate in symbol information
227	var symbolInfo []*zoekt.Symbol
228	for _, m := range ms {
229		if m.symbol {
230			if sec, si, ok := p.findSymbol(m); ok && si != nil {
231				// findSymbols does not hydrate in Sym. So we need to store it.
232				sym := sectionSlice(p.data(false), sec)
233				si.Sym = string(sym)
234				symbolInfo = append(symbolInfo, si)
235			}
236		}
237	}
238	return score, symbolInfo
239}
240
241// tfScore is the term frequency score for BM25.
242func tfScore(k float64, b float64, L float64, f int) float64 {
243	return ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f))
244}
245
246// calculateTermFrequency computes the term frequency for the file match.
247// Notes:
248// - Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles.
249// - Symbol matches also count more than content matches, to reward matches on symbol definitions.
250func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch) map[string]int {
251	// Treat each candidate match as a term and compute the frequencies. For now, ignore case sensitivity and
252	// ignore whether the index is a word boundary.
253	termFreqs := map[string]int{}
254	for _, m := range cands {
255		term := string(m.substrLowered)
256		if m.fileName || p.matchesSymbol(m) {
257			termFreqs[term] += 5
258		} else {
259			termFreqs[term]++
260		}
261	}
262
263	return termFreqs
264}
265
266// scoreFile computes a score for the file match using various scoring signals, like
267// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
268func (d *indexData) scoreFile(fileMatch *zoekt.FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *zoekt.SearchOptions) {
269	atomMatchCount := 0
270	visitMatchAtoms(mt, known, func(mt matchTree) {
271		atomMatchCount++
272	})
273
274	addScore := func(what string, computed float64) {
275		fileMatch.AddScore(what, computed, -1, opts.DebugScore)
276	}
277
278	// atom-count boosts files with matches from more than 1 atom. The
279	// maximum boost is scoreFactorAtomMatch.
280	if atomMatchCount > 0 {
281		fileMatch.AddScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore)
282	}
283
284	maxFileScore := 0.0
285	for i := range fileMatch.LineMatches {
286		if maxFileScore < fileMatch.LineMatches[i].Score {
287			maxFileScore = fileMatch.LineMatches[i].Score
288		}
289
290		// Order by ordering in file.
291		fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
292	}
293
294	for i := range fileMatch.ChunkMatches {
295		if maxFileScore < fileMatch.ChunkMatches[i].Score {
296			maxFileScore = fileMatch.ChunkMatches[i].Score
297		}
298
299		// Order by ordering in file.
300		fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
301	}
302
303	// Maintain ordering of input files. This strictly dominates the in-file ordering of the matches.
304	addScore("fragment", maxFileScore)
305
306	// Truncate score to avoid overlap with the tiebreakers.
307	fileMatch.Score = math.Trunc(fileMatch.Score)
308
309	// Add tiebreakers
310	repoRank := d.repoMetaData[d.repos[doc]].Rank                  // [0, 65535]
311	docOrderScore := 1.0 - float64(doc)/float64(len(d.boundaries)) // [0, 1]
312
313	if opts.DebugScore {
314		// We log the score components individually for better readability.
315		fileMatch.Debug = fmt.Sprintf("score: %d (repo-rank: %d, file-rank: %.2f) <- %s", int(fileMatch.Score), repoRank, docOrderScore, strings.TrimSuffix(fileMatch.Debug, ", "))
316	}
317
318	fileMatch.Score = ScoreOffset*fileMatch.Score + scoreRepoRankFactor*float64(repoRank) + scoreFileOrderFactor*docOrderScore
319}
320
321// scoreFilesUsingBM25 computes the score according to BM25, the most common scoring algorithm for text search:
322// https://en.wikipedia.org/wiki/Okapi_BM25. Note that we treat the inverse document frequency (idf) as constant. This
323// is supported by our evaluations which showed that for keyword style queries, idf can down-weight the score of some
324// keywords too much, leading to a worse ranking. The intuition is that each keyword is important independently of how
325// frequent it appears in the corpus.
326//
327// Unlike standard file scoring, this scoring strategy ignores all other signals including document ranks. This keeps
328// things simple for now, since BM25 is not normalized and can be tricky to combine with other scoring signals. It also
329// ignores the individual LineMatch and ChunkMatch scores, instead calculating a score over all matches in the file.
330func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32, tf map[string]int, opts *zoekt.SearchOptions) {
331	// Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html)
332	k, b := 1.2, 0.75
333
334	averageFileLength := float64(d.boundaries[d.numDocs()]) / float64(d.numDocs())
335	// This is very unlikely, but explicitly guard against division by zero.
336	if averageFileLength == 0 {
337		averageFileLength++
338	}
339
340	// Compute the file length ratio. Usually the calculation would be based on terms, but using
341	// bytes should work fine, as we're just computing a ratio.
342	fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
343
344	L := fileLength / averageFileLength
345
346	score := 0.0
347	sumTF := 0 // Just for debugging
348	for _, f := range tf {
349		sumTF += f
350		score += tfScore(k, b, L, f)
351	}
352	// 2 digits of precision
353	score = math.Trunc(score*100) / 100
354
355	md := d.repoMetaData[d.repos[doc]]
356	fileOrderScore := 1.0 - float64(doc)/float64(len(d.boundaries))
357
358	// Offset score by 9 digits and add the tiebreaker.
359	//
360	// Example: For a BM25 score of 1.23, a repo rank of 456789 and a file order score of 0.12, we have a final score of
361	// 12345678901.2
362	// ^^^
363	// bm25
364	//    ^^^^^^
365	//	  repo rank
366	//          ^^^^
367	//          doc order
368	fileMatch.Score = score*ScoreOffsetBM25 + scoreRepoRankFactor*float64(md.Rank) + scoreFileOrderFactor*fileOrderScore
369
370	if opts.DebugScore {
371		// To make the debug output easier to read, we split the score into the query dependent score and the tiebreaker
372		fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f (repo-rank: %d, file-rank: %.2f) <- sum-termFrequencies: %d, length-ratio: %.2f", score, md.Rank, fileOrderScore, sumTF, L)
373	}
374}
Configure Feed

Configure Feed