index/score.go at 6b8e10c8e111626c506c3c76054f70af953d0d56 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / score.go
at 6b8e10c8e111626c506c3c76054f70af953d0d56 14 kB View raw
Julie Tibshirani Move root-level index code to index package (#902) 1y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package index
 16
 17import (
 18	"bytes"
 19	"fmt"
 20	"math"
 21	"strings"
 22
 23	"github.com/sourcegraph/zoekt"
 24	"github.com/sourcegraph/zoekt/internal/ctags"
 25)
 26
 27const (
 28	ScoreOffset = 10_000_000
 29)
 30
 31type chunkScore struct {
 32	score      float64
 33	debugScore string
 34	bestLine   int
 35}
 36
 37// scoreChunk calculates the score for each line in the chunk based on its candidate matches, and returns the score of
 38// the best-scoring line, along with its line number.
 39// Invariant: there should be at least one input candidate, len(ms) > 0.
 40func (p *contentProvider) scoreChunk(ms []*candidateMatch, language string, opts *zoekt.SearchOptions) (chunkScore, []*zoekt.Symbol) {
 41	nl := p.newlines()
 42
 43	var bestScore lineScore
 44	bestLine := 0
 45	var symbolInfo []*zoekt.Symbol
 46
 47	start := 0
 48	currentLine := -1
 49	for i, m := range ms {
 50		lineNumber := -1
 51		if !m.fileName {
 52			lineNumber = nl.atOffset(m.byteOffset)
 53		}
 54
 55		// If this match represents a new line, then score the previous line and update 'start'.
 56		if i != 0 && lineNumber != currentLine {
 57			score, si := p.scoreLine(ms[start:i], language, currentLine, opts)
 58			symbolInfo = append(symbolInfo, si...)
 59			if score.score > bestScore.score {
 60				bestScore = score
 61				bestLine = currentLine
 62			}
 63			start = i
 64		}
 65		currentLine = lineNumber
 66	}
 67
 68	// Make sure to score the last line
 69	line, si := p.scoreLine(ms[start:], language, currentLine, opts)
 70	symbolInfo = append(symbolInfo, si...)
 71	if line.score > bestScore.score {
 72		bestScore = line
 73		bestLine = currentLine
 74	}
 75
 76	cs := chunkScore{
 77		score:    bestScore.score,
 78		bestLine: bestLine,
 79	}
 80	if opts.DebugScore {
 81		cs.debugScore = fmt.Sprintf("%s, (line: %d)", bestScore.debugScore, bestLine)
 82	}
 83	return cs, symbolInfo
 84}
 85
 86type lineScore struct {
 87	score      float64
 88	debugScore string
 89}
 90
 91// scoreLine calculates a score for the line based on its candidate matches.
 92// Invariants:
 93// - All candidate matches are assumed to come from the same line in the content.
 94// - If this line represents a filename, then lineNumber must be -1.
 95// - There should be at least one input candidate, len(ms) > 0.
 96func (p *contentProvider) scoreLine(ms []*candidateMatch, language string, lineNumber int, opts *zoekt.SearchOptions) (lineScore, []*zoekt.Symbol) {
 97	if opts.UseBM25Scoring {
 98		score, symbolInfo := p.scoreLineBM25(ms, lineNumber)
 99		ls := lineScore{score: score}
100		if opts.DebugScore {
101			ls.debugScore = fmt.Sprintf("tfScore:%.2f, ", score)
102		}
103		return ls, symbolInfo
104	}
105
106	score := 0.0
107	what := ""
108	addScore := func(w string, s float64) {
109		if s != 0 && opts.DebugScore {
110			what += fmt.Sprintf("%s:%.2f, ", w, s)
111		}
112		score += s
113	}
114
115	filename := p.data(true)
116	var symbolInfo []*zoekt.Symbol
117
118	var bestScore lineScore
119	for i, m := range ms {
120		data := p.data(m.fileName)
121
122		endOffset := m.byteOffset + m.byteMatchSz
123		startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset]))
124		endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset]))
125
126		score = 0
127		what = ""
128
129		if startBoundary && endBoundary {
130			addScore("WordMatch", scoreWordMatch)
131		} else if startBoundary || endBoundary {
132			addScore("PartialWordMatch", scorePartialWordMatch)
133		}
134
135		if m.fileName {
136			sep := bytes.LastIndexByte(data, '/')
137			startMatch := int(m.byteOffset) == sep+1
138			endMatch := endOffset == uint32(len(data))
139			if startMatch && endMatch {
140				addScore("Base", scoreBase)
141			} else if startMatch || endMatch {
142				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
143			} else if sep < int(m.byteOffset) {
144				addScore("InnerBase", scorePartialBase)
145			}
146		} else if sec, si, ok := p.findSymbol(m); ok {
147			startMatch := sec.Start == m.byteOffset
148			endMatch := sec.End == endOffset
149			if startMatch && endMatch {
150				addScore("Symbol", scoreSymbol)
151			} else if startMatch || endMatch {
152				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
153			} else {
154				addScore("OverlapSymbol", scorePartialSymbol)
155			}
156
157			// Score based on symbol data
158			if si != nil {
159				symbolKind := ctags.ParseSymbolKind(si.Kind)
160				sym := sectionSlice(data, sec)
161
162				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
163
164				// This is from a symbol tree, so we need to store the symbol
165				// information.
166				if m.symbol {
167					if symbolInfo == nil {
168						symbolInfo = make([]*zoekt.Symbol, len(ms))
169					}
170					// findSymbols does not hydrate in Sym. So we need to store it.
171					si.Sym = string(sym)
172					symbolInfo[i] = si
173				}
174			}
175		}
176
177		// scoreWeight != 1 means it affects score
178		if !epsilonEqualsOne(m.scoreWeight) {
179			score = score * m.scoreWeight
180			if opts.DebugScore {
181				what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)
182			}
183		}
184
185		if score > bestScore.score {
186			bestScore.score = score
187			bestScore.debugScore = what
188		}
189	}
190
191	if opts.DebugScore {
192		bestScore.debugScore = fmt.Sprintf("score:%.2f <- %s", bestScore.score, strings.TrimSuffix(bestScore.debugScore, ", "))
193	}
194
195	return bestScore, symbolInfo
196}
197
198// scoreLineBM25 computes the score of a line according to BM25, the most common scoring algorithm for text search:
199// https://en.wikipedia.org/wiki/Okapi_BM25. Compared to the standard scoreLine algorithm, this score rewards multiple
200// term matches on a line.
201// Notes:
202// - This BM25 calculation skips inverse document frequency (idf) to keep the implementation simple.
203// - It uses the same calculateTermFrequency method as BM25 file scoring, which boosts filename and symbol matches.
204func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (float64, []*zoekt.Symbol) {
205	// If this is a filename, then don't compute BM25. The score would not be comparable to line scores.
206	if lineNumber < 0 {
207		return 0, nil
208	}
209
210	// Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html)
211	k, b := 1.2, 0.75
212
213	// Calculate the length ratio of this line. As a heuristic, we assume an average line length of 100 characters.
214	// Usually the calculation would be based on terms, but using bytes should work fine, as we're just computing a ratio.
215	nl := p.newlines()
216	lineLength := nl.lineStart(lineNumber+1) - nl.lineStart(lineNumber)
217	L := float64(lineLength) / 100.0
218
219	score := 0.0
220	tfs := p.calculateTermFrequency(ms, termDocumentFrequency{})
221	for _, f := range tfs {
222		score += ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f))
223	}
224
225	// Check if any index comes from a symbol match tree, and if so hydrate in symbol information
226	var symbolInfo []*zoekt.Symbol
227	for _, m := range ms {
228		if m.symbol {
229			if sec, si, ok := p.findSymbol(m); ok && si != nil {
230				// findSymbols does not hydrate in Sym. So we need to store it.
231				sym := sectionSlice(p.data(false), sec)
232				si.Sym = string(sym)
233				symbolInfo = append(symbolInfo, si)
234			}
235		}
236	}
237	return score, symbolInfo
238}
239
240// termDocumentFrequency is a map "term" -> "number of documents that contain the term"
241type termDocumentFrequency map[string]int
242
243// calculateTermFrequency computes the term frequency for the file match.
244// Notes:
245// - Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles.
246// - Symbol matches also count more than content matches, to reward matches on symbol definitions.
247func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int {
248	// Treat each candidate match as a term and compute the frequencies. For now, ignore case sensitivity and
249	// ignore whether the index is a word boundary.
250	termFreqs := map[string]int{}
251	for _, m := range cands {
252		term := string(m.substrLowered)
253		if m.fileName || p.matchesSymbol(m) {
254			termFreqs[term] += 5
255		} else {
256			termFreqs[term]++
257		}
258	}
259
260	for term := range termFreqs {
261		df[term] += 1
262	}
263	return termFreqs
264}
265
266// scoreFile computes a score for the file match using various scoring signals, like
267// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
268func (d *indexData) scoreFile(fileMatch *zoekt.FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *zoekt.SearchOptions) {
269	atomMatchCount := 0
270	visitMatchAtoms(mt, known, func(mt matchTree) {
271		atomMatchCount++
272	})
273
274	addScore := func(what string, computed float64) {
275		fileMatch.AddScore(what, computed, -1, opts.DebugScore)
276	}
277
278	// atom-count boosts files with matches from more than 1 atom. The
279	// maximum boost is scoreFactorAtomMatch.
280	if atomMatchCount > 0 {
281		fileMatch.AddScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore)
282	}
283
284	maxFileScore := 0.0
285	for i := range fileMatch.LineMatches {
286		if maxFileScore < fileMatch.LineMatches[i].Score {
287			maxFileScore = fileMatch.LineMatches[i].Score
288		}
289
290		// Order by ordering in file.
291		fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
292	}
293
294	for i := range fileMatch.ChunkMatches {
295		if maxFileScore < fileMatch.ChunkMatches[i].Score {
296			maxFileScore = fileMatch.ChunkMatches[i].Score
297		}
298
299		// Order by ordering in file.
300		fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
301	}
302
303	// Maintain ordering of input files. This
304	// strictly dominates the in-file ordering of
305	// the matches.
306	addScore("fragment", maxFileScore)
307
308	// Add tiebreakers
309	//
310	// ScoreOffset shifts the score 7 digits to the left.
311	fileMatch.Score = math.Trunc(fileMatch.Score) * ScoreOffset
312
313	md := d.repoMetaData[d.repos[doc]]
314
315	// md.Rank lies in the range [0, 65535]. Hence, we have to allocate 5 digits for
316	// the rank. The scoreRepoRankFactor shifts the rank score 2 digits to the left,
317	// reserving digits 3-7 for the repo rank.
318	addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank))
319
320	// digits 1-2 and the decimals are reserved for the doc order. Doc order
321	// (without the scaling factor) lies in the range [0, 1]. The upper bound is
322	// achieved for matches in the first document of a shard.
323	addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))))
324
325	if opts.DebugScore {
326		// To make the debug output easier to read, we split the score into the query
327		// dependent score and the tiebreaker
328		score := math.Trunc(fileMatch.Score / ScoreOffset)
329		tiebreaker := fileMatch.Score - score*ScoreOffset
330		fileMatch.Debug = fmt.Sprintf("score: %d (%.2f) <- %s", int(score), tiebreaker, strings.TrimSuffix(fileMatch.Debug, ", "))
331	}
332}
333
334// termFrequency stores the term frequencies for doc.
335type termFrequency struct {
336	doc uint32
337	tf  map[string]int
338}
339
340// scoreFilesUsingBM25 computes the score according to BM25, the most common scoring algorithm for text search:
341// https://en.wikipedia.org/wiki/Okapi_BM25.
342//
343// Unlike standard file scoring, this scoring strategy ignores all other signals including document ranks. This keeps
344// things simple for now, since BM25 is not normalized and can be  tricky to combine with other scoring signals. It also
345// ignores the individual LineMatch and ChunkMatch scores, instead calculating a score over all matches in the file.
346func (d *indexData) scoreFilesUsingBM25(fileMatches []zoekt.FileMatch, tfs []termFrequency, df termDocumentFrequency, opts *zoekt.SearchOptions) {
347	// Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html)
348	k, b := 1.2, 0.75
349
350	averageFileLength := float64(d.boundaries[d.numDocs()]) / float64(d.numDocs())
351	// This is very unlikely, but explicitly guard against division by zero.
352	if averageFileLength == 0 {
353		averageFileLength++
354	}
355
356	for i := range tfs {
357		score := 0.0
358
359		// Compute the file length ratio. Usually the calculation would be based on terms, but using
360		// bytes should work fine, as we're just computing a ratio.
361		doc := tfs[i].doc
362		fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
363
364		L := fileLength / averageFileLength
365
366		sumTF := 0 // Just for debugging
367		for term, f := range tfs[i].tf {
368			sumTF += f
369			tfScore := ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f))
370			score += idf(df[term], int(d.numDocs())) * tfScore
371		}
372
373		fileMatches[i].Score = score
374
375		if opts.DebugScore {
376			fileMatches[i].Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L)
377		}
378	}
379}
380
381// idf computes the inverse document frequency for a term. nq is the number of
382// documents that contain the term and documentCount is the total number of
383// documents in the corpus.
384func idf(nq, documentCount int) float64 {
385	return math.Log(1.0 + ((float64(documentCount) - float64(nq) + 0.5) / (float64(nq) + 0.5)))
386}
Configure Feed

Configure Feed