score.go at 4c8bb19e06668645fe2c55c861cf2426f53b9ade · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / score.go
at 4c8bb19e06668645fe2c55c861cf2426f53b9ade 13 kB View raw
Julie Tibshirani Support BM25 scoring for chunk matches (#889) 1y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"bytes"
 19	"fmt"
 20	"math"
 21	"strings"
 22
 23	"github.com/sourcegraph/zoekt/ctags"
 24)
 25
 26const (
 27	ScoreOffset = 10_000_000
 28)
 29
 30type chunkScore struct {
 31	score      float64
 32	debugScore string
 33	bestLine   int
 34}
 35
 36// scoreChunk calculates the score for each line in the chunk based on its candidate matches, and returns the score of
 37// the best-scoring line, along with its line number.
 38// Invariant: there should be at least one input candidate, len(ms) > 0.
 39func (p *contentProvider) scoreChunk(ms []*candidateMatch, language string, opts *SearchOptions) (chunkScore, []*Symbol) {
 40	nl := p.newlines()
 41
 42	var bestScore lineScore
 43	bestLine := 0
 44	var symbolInfo []*Symbol
 45
 46	start := 0
 47	currentLine := -1
 48	for i, m := range ms {
 49		lineNumber := -1
 50		if !m.fileName {
 51			lineNumber = nl.atOffset(m.byteOffset)
 52		}
 53
 54		// If this match represents a new line, then score the previous line and update 'start'.
 55		if i != 0 && lineNumber != currentLine {
 56			score, si := p.scoreLine(ms[start:i], language, currentLine, opts)
 57			symbolInfo = append(symbolInfo, si...)
 58			if score.score > bestScore.score {
 59				bestScore = score
 60				bestLine = currentLine
 61			}
 62			start = i
 63		}
 64		currentLine = lineNumber
 65	}
 66
 67	// Make sure to score the last line
 68	line, si := p.scoreLine(ms[start:], language, currentLine, opts)
 69	symbolInfo = append(symbolInfo, si...)
 70	if line.score > bestScore.score {
 71		bestScore = line
 72		bestLine = currentLine
 73	}
 74
 75	cs := chunkScore{
 76		score:    bestScore.score,
 77		bestLine: bestLine,
 78	}
 79	if opts.DebugScore {
 80		cs.debugScore = fmt.Sprintf("%s, (line: %d)", bestScore.debugScore, bestLine)
 81	}
 82	return cs, symbolInfo
 83}
 84
 85type lineScore struct {
 86	score      float64
 87	debugScore string
 88}
 89
 90// scoreLine calculates a score for the line based on its candidate matches.
 91// Invariants:
 92// - All candidate matches are assumed to come from the same line in the content.
 93// - If this line represents a filename, then lineNumber must be -1.
 94// - There should be at least one input candidate, len(ms) > 0.
 95func (p *contentProvider) scoreLine(ms []*candidateMatch, language string, lineNumber int, opts *SearchOptions) (lineScore, []*Symbol) {
 96	if opts.UseBM25Scoring {
 97		score, symbolInfo := p.scoreLineBM25(ms, lineNumber)
 98		ls := lineScore{score: score}
 99		if opts.DebugScore {
100			ls.debugScore = fmt.Sprintf("tfScore:%.2f, ", score)
101		}
102		return ls, symbolInfo
103	}
104
105	score := 0.0
106	what := ""
107	addScore := func(w string, s float64) {
108		if s != 0 && opts.DebugScore {
109			what += fmt.Sprintf("%s:%.2f, ", w, s)
110		}
111		score += s
112	}
113
114	filename := p.data(true)
115	var symbolInfo []*Symbol
116
117	var bestScore lineScore
118	for i, m := range ms {
119		data := p.data(m.fileName)
120
121		endOffset := m.byteOffset + m.byteMatchSz
122		startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset]))
123		endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset]))
124
125		score = 0
126		what = ""
127
128		if startBoundary && endBoundary {
129			addScore("WordMatch", scoreWordMatch)
130		} else if startBoundary || endBoundary {
131			addScore("PartialWordMatch", scorePartialWordMatch)
132		}
133
134		if m.fileName {
135			sep := bytes.LastIndexByte(data, '/')
136			startMatch := int(m.byteOffset) == sep+1
137			endMatch := endOffset == uint32(len(data))
138			if startMatch && endMatch {
139				addScore("Base", scoreBase)
140			} else if startMatch || endMatch {
141				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
142			} else if sep < int(m.byteOffset) {
143				addScore("InnerBase", scorePartialBase)
144			}
145		} else if sec, si, ok := p.findSymbol(m); ok {
146			startMatch := sec.Start == m.byteOffset
147			endMatch := sec.End == endOffset
148			if startMatch && endMatch {
149				addScore("Symbol", scoreSymbol)
150			} else if startMatch || endMatch {
151				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
152			} else {
153				addScore("OverlapSymbol", scorePartialSymbol)
154			}
155
156			// Score based on symbol data
157			if si != nil {
158				symbolKind := ctags.ParseSymbolKind(si.Kind)
159				sym := sectionSlice(data, sec)
160
161				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
162
163				// This is from a symbol tree, so we need to store the symbol
164				// information.
165				if m.symbol {
166					if symbolInfo == nil {
167						symbolInfo = make([]*Symbol, len(ms))
168					}
169					// findSymbols does not hydrate in Sym. So we need to store it.
170					si.Sym = string(sym)
171					symbolInfo[i] = si
172				}
173			}
174		}
175
176		// scoreWeight != 1 means it affects score
177		if !epsilonEqualsOne(m.scoreWeight) {
178			score = score * m.scoreWeight
179			if opts.DebugScore {
180				what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)
181			}
182		}
183
184		if score > bestScore.score {
185			bestScore.score = score
186			bestScore.debugScore = what
187		}
188	}
189
190	if opts.DebugScore {
191		bestScore.debugScore = fmt.Sprintf("score:%.2f <- %s", bestScore.score, strings.TrimSuffix(bestScore.debugScore, ", "))
192	}
193
194	return bestScore, symbolInfo
195}
196
197// scoreLineBM25 computes the score of a line according to BM25, the most common scoring algorithm for text search:
198// https://en.wikipedia.org/wiki/Okapi_BM25. Compared to the standard scoreLine algorithm, this score rewards multiple
199// term matches on a line.
200// Notes:
201// - This BM25 calculation skips inverse document frequency (idf) to keep the implementation simple.
202// - It uses the same calculateTermFrequency method as BM25 file scoring, which boosts filename and symbol matches.
203func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (float64, []*Symbol) {
204	// If this is a filename, then don't compute BM25. The score would not be comparable to line scores.
205	if lineNumber < 0 {
206		return 0, nil
207	}
208
209	// Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html)
210	k, b := 1.2, 0.75
211
212	// Calculate the length ratio of this line. As a heuristic, we assume an average line length of 100 characters.
213	// Usually the calculation would be based on terms, but using bytes should work fine, as we're just computing a ratio.
214	nl := p.newlines()
215	lineLength := nl.lineStart(lineNumber+1) - nl.lineStart(lineNumber)
216	L := float64(lineLength) / 100.0
217
218	score := 0.0
219	tfs := p.calculateTermFrequency(ms, termDocumentFrequency{})
220	for _, f := range tfs {
221		score += ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f))
222	}
223
224	// Check if any match comes from a symbol match tree, and if so hydrate in symbol information
225	var symbolInfo []*Symbol
226	for _, m := range ms {
227		if m.symbol {
228			if sec, si, ok := p.findSymbol(m); ok && si != nil {
229				// findSymbols does not hydrate in Sym. So we need to store it.
230				sym := sectionSlice(p.data(false), sec)
231				si.Sym = string(sym)
232				symbolInfo = append(symbolInfo, si)
233			}
234		}
235	}
236	return score, symbolInfo
237}
238
239// termDocumentFrequency is a map "term" -> "number of documents that contain the term"
240type termDocumentFrequency map[string]int
241
242// calculateTermFrequency computes the term frequency for the file match.
243// Notes:
244// - Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles.
245// - Symbol matches also count more than content matches, to reward matches on symbol definitions.
246func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int {
247	// Treat each candidate match as a term and compute the frequencies. For now, ignore case sensitivity and
248	// ignore whether the match is a word boundary.
249	termFreqs := map[string]int{}
250	for _, m := range cands {
251		term := string(m.substrLowered)
252		if m.fileName || p.matchesSymbol(m) {
253			termFreqs[term] += 5
254		} else {
255			termFreqs[term]++
256		}
257	}
258
259	for term := range termFreqs {
260		df[term] += 1
261	}
262	return termFreqs
263}
264
265// scoreFile computes a score for the file match using various scoring signals, like
266// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
267func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
268	atomMatchCount := 0
269	visitMatchAtoms(mt, known, func(mt matchTree) {
270		atomMatchCount++
271	})
272
273	addScore := func(what string, computed float64) {
274		fileMatch.addScore(what, computed, -1, opts.DebugScore)
275	}
276
277	// atom-count boosts files with matches from more than 1 atom. The
278	// maximum boost is scoreFactorAtomMatch.
279	if atomMatchCount > 0 {
280		fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore)
281	}
282
283	maxFileScore := 0.0
284	for i := range fileMatch.LineMatches {
285		if maxFileScore < fileMatch.LineMatches[i].Score {
286			maxFileScore = fileMatch.LineMatches[i].Score
287		}
288
289		// Order by ordering in file.
290		fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
291	}
292
293	for i := range fileMatch.ChunkMatches {
294		if maxFileScore < fileMatch.ChunkMatches[i].Score {
295			maxFileScore = fileMatch.ChunkMatches[i].Score
296		}
297
298		// Order by ordering in file.
299		fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
300	}
301
302	// Maintain ordering of input files. This
303	// strictly dominates the in-file ordering of
304	// the matches.
305	addScore("fragment", maxFileScore)
306
307	// Add tiebreakers
308	//
309	// ScoreOffset shifts the score 7 digits to the left.
310	fileMatch.Score = math.Trunc(fileMatch.Score) * ScoreOffset
311
312	md := d.repoMetaData[d.repos[doc]]
313
314	// md.Rank lies in the range [0, 65535]. Hence, we have to allocate 5 digits for
315	// the rank. The scoreRepoRankFactor shifts the rank score 2 digits to the left,
316	// reserving digits 3-7 for the repo rank.
317	addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank))
318
319	// digits 1-2 and the decimals are reserved for the doc order. Doc order
320	// (without the scaling factor) lies in the range [0, 1]. The upper bound is
321	// achieved for matches in the first document of a shard.
322	addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))))
323
324	if opts.DebugScore {
325		// To make the debug output easier to read, we split the score into the query
326		// dependent score and the tiebreaker
327		score := math.Trunc(fileMatch.Score / ScoreOffset)
328		tiebreaker := fileMatch.Score - score*ScoreOffset
329		fileMatch.Debug = fmt.Sprintf("score: %d (%.2f) <- %s", int(score), tiebreaker, strings.TrimSuffix(fileMatch.Debug, ", "))
330	}
331}
332
333// termFrequency stores the term frequencies for doc.
334type termFrequency struct {
335	doc uint32
336	tf  map[string]int
337}
338
339// scoreFilesUsingBM25 computes the score according to BM25, the most common scoring algorithm for text search:
340// https://en.wikipedia.org/wiki/Okapi_BM25.
341//
342// Unlike standard file scoring, this scoring strategy ignores all other signals including document ranks. This keeps
343// things simple for now, since BM25 is not normalized and can be  tricky to combine with other scoring signals. It also
344// ignores the individual LineMatch and ChunkMatch scores, instead calculating a score over all matches in the file.
345func (d *indexData) scoreFilesUsingBM25(fileMatches []FileMatch, tfs []termFrequency, df termDocumentFrequency, opts *SearchOptions) {
346	// Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html)
347	k, b := 1.2, 0.75
348
349	averageFileLength := float64(d.boundaries[d.numDocs()]) / float64(d.numDocs())
350	// This is very unlikely, but explicitly guard against division by zero.
351	if averageFileLength == 0 {
352		averageFileLength++
353	}
354
355	for i := range tfs {
356		score := 0.0
357
358		// Compute the file length ratio. Usually the calculation would be based on terms, but using
359		// bytes should work fine, as we're just computing a ratio.
360		doc := tfs[i].doc
361		fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
362
363		L := fileLength / averageFileLength
364
365		sumTF := 0 // Just for debugging
366		for term, f := range tfs[i].tf {
367			sumTF += f
368			tfScore := ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f))
369			score += idf(df[term], int(d.numDocs())) * tfScore
370		}
371
372		fileMatches[i].Score = score
373
374		if opts.DebugScore {
375			fileMatches[i].Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L)
376		}
377	}
378}
379
380// idf computes the inverse document frequency for a term. nq is the number of
381// documents that contain the term and documentCount is the total number of
382// documents in the corpus.
383func idf(nq, documentCount int) float64 {
384	return math.Log(1.0 + ((float64(documentCount) - float64(nq) + 0.5) / (float64(nq) + 0.5)))
385}
Configure Feed

Configure Feed