score.go at 20c496e3680fb34554434062b032a0b07454d052 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / score.go
at 20c496e3680fb34554434062b032a0b07454d052 6.7 kB View raw
Stefan Hengl ranking: add IDF to BM25 score calculation (#788) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"fmt"
 19	"math"
 20	"strconv"
 21	"strings"
 22)
 23
 24const maxUInt16 = 0xffff
 25
 26// addScore increments the score of the FileMatch by the computed score. If
 27// debugScore is true, it also adds a debug string to the FileMatch. If raw is
 28// -1, it is ignored. Otherwise, it is added to the debug string.
 29func (m *FileMatch) addScore(what string, computed float64, raw float64, debugScore bool) {
 30	if computed != 0 && debugScore {
 31		var b strings.Builder
 32		fmt.Fprintf(&b, "%s", what)
 33		if raw != -1 {
 34			fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64))
 35		}
 36		fmt.Fprintf(&b, ":%.2f, ", computed)
 37		m.Debug += b.String()
 38	}
 39	m.Score += computed
 40}
 41
 42// scoreFile computes a score for the file match using various scoring signals, like
 43// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
 44func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
 45	atomMatchCount := 0
 46	visitMatchAtoms(mt, known, func(mt matchTree) {
 47		atomMatchCount++
 48	})
 49
 50	addScore := func(what string, computed float64) {
 51		fileMatch.addScore(what, computed, -1, opts.DebugScore)
 52	}
 53
 54	// atom-count boosts files with matches from more than 1 atom. The
 55	// maximum boost is scoreFactorAtomMatch.
 56	if atomMatchCount > 0 {
 57		fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore)
 58	}
 59
 60	maxFileScore := 0.0
 61	for i := range fileMatch.LineMatches {
 62		if maxFileScore < fileMatch.LineMatches[i].Score {
 63			maxFileScore = fileMatch.LineMatches[i].Score
 64		}
 65
 66		// Order by ordering in file.
 67		fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
 68	}
 69
 70	for i := range fileMatch.ChunkMatches {
 71		if maxFileScore < fileMatch.ChunkMatches[i].Score {
 72			maxFileScore = fileMatch.ChunkMatches[i].Score
 73		}
 74
 75		// Order by ordering in file.
 76		fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
 77	}
 78
 79	// Maintain ordering of input files. This
 80	// strictly dominates the in-file ordering of
 81	// the matches.
 82	addScore("fragment", maxFileScore)
 83
 84	if opts.UseDocumentRanks && len(d.ranks) > int(doc) {
 85		weight := scoreFileRankFactor
 86		if opts.DocumentRanksWeight > 0.0 {
 87			weight = opts.DocumentRanksWeight
 88		}
 89
 90		ranks := d.ranks[doc]
 91		// The ranks slice always contains one entry representing the file rank (unless it's empty since the
 92		// file doesn't have a rank). This is left over from when documents could have multiple rank signals,
 93		// and we plan to clean this up.
 94		if len(ranks) > 0 {
 95			// The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we
 96			// cap it just in case to ensure it falls in the range [0, 1].
 97			normalized := math.Min(1.0, ranks[0]/32.0)
 98			addScore("file-rank", weight*normalized)
 99		}
100	}
101
102	md := d.repoMetaData[d.repos[doc]]
103	addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))))
104	addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16)
105
106	if opts.DebugScore {
107		fileMatch.Debug = fmt.Sprintf("score: %.2f <- %s", fileMatch.Score, strings.TrimSuffix(fileMatch.Debug, ", "))
108	}
109}
110
111// calculateTermFrequency computes the term frequency for the file match.
112//
113// Filename matches count more than content matches. This mimics a common text
114// search strategy where you 'boost' matches on document titles.
115func calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int {
116	// Treat each candidate match as a term and compute the frequencies. For now, ignore case
117	// sensitivity and treat filenames and symbols the same as content.
118	termFreqs := map[string]int{}
119	for _, cand := range cands {
120		term := string(cand.substrLowered)
121		if cand.fileName {
122			termFreqs[term] += 5
123		} else {
124			termFreqs[term]++
125		}
126	}
127
128	for term := range termFreqs {
129		df[term] += 1
130	}
131
132	return termFreqs
133}
134
135// idf computes the inverse document frequency for a term. nq is the number of
136// documents that contain the term and documentCount is the total number of
137// documents in the corpus.
138func idf(nq, documentCount int) float64 {
139	return math.Log(1.0 + ((float64(documentCount) - float64(nq) + 0.5) / (float64(nq) + 0.5)))
140}
141
142// termDocumentFrequency is a map "term" -> "number of documents that contain the term"
143type termDocumentFrequency map[string]int
144
145// termFrequency stores the term frequencies for doc.
146type termFrequency struct {
147	doc uint32
148	tf  map[string]int
149}
150
151// scoreFilesUsingBM25 computes the score according to BM25, the most common
152// scoring algorithm for text search: https://en.wikipedia.org/wiki/Okapi_BM25.
153//
154// This scoring strategy ignores all other signals including document ranks.
155// This keeps things simple for now, since BM25 is not normalized and can be
156// tricky to combine with other scoring signals.
157func (d *indexData) scoreFilesUsingBM25(fileMatches []FileMatch, tfs []termFrequency, df termDocumentFrequency, opts *SearchOptions) {
158	// Use standard parameter defaults (used in Lucene and academic papers)
159	k, b := 1.2, 0.75
160
161	averageFileLength := float64(d.boundaries[d.numDocs()]) / float64(d.numDocs())
162	// This is very unlikely, but explicitly guard against division by zero.
163	if averageFileLength == 0 {
164		averageFileLength++
165	}
166
167	for i := range tfs {
168		score := 0.0
169
170		// Compute the file length ratio. Usually the calculation would be based on terms, but using
171		// bytes should work fine, as we're just computing a ratio.
172		doc := tfs[i].doc
173		fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
174
175		L := fileLength / averageFileLength
176
177		sumTF := 0 // Just for debugging
178		for term, f := range tfs[i].tf {
179			sumTF += f
180			tfScore := ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f))
181			score += idf(df[term], int(d.numDocs())) * tfScore
182		}
183
184		fileMatches[i].Score = score
185
186		if opts.DebugScore {
187			fileMatches[i].Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L)
188		}
189	}
190}
Configure Feed

Configure Feed