score.go at 59884de44b986c8e4d74e1bad5bf49d7b9093c8d · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / score.go
at 59884de44b986c8e4d74e1bad5bf49d7b9093c8d 6.8 kB View raw
Stefan Hengl ranking: removing document ranks (#853) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"fmt"
 19	"math"
 20	"strconv"
 21	"strings"
 22)
 23
 24const (
 25	maxUInt16   = 0xffff
 26	ScoreOffset = 10_000_000
 27)
 28
 29// addScore increments the score of the FileMatch by the computed score. If
 30// debugScore is true, it also adds a debug string to the FileMatch. If raw is
 31// -1, it is ignored. Otherwise, it is added to the debug string.
 32func (m *FileMatch) addScore(what string, computed float64, raw float64, debugScore bool) {
 33	if computed != 0 && debugScore {
 34		var b strings.Builder
 35		fmt.Fprintf(&b, "%s", what)
 36		if raw != -1 {
 37			fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64))
 38		}
 39		fmt.Fprintf(&b, ":%.2f, ", computed)
 40		m.Debug += b.String()
 41	}
 42	m.Score += computed
 43}
 44
 45// scoreFile computes a score for the file match using various scoring signals, like
 46// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
 47func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
 48	atomMatchCount := 0
 49	visitMatchAtoms(mt, known, func(mt matchTree) {
 50		atomMatchCount++
 51	})
 52
 53	addScore := func(what string, computed float64) {
 54		fileMatch.addScore(what, computed, -1, opts.DebugScore)
 55	}
 56
 57	// atom-count boosts files with matches from more than 1 atom. The
 58	// maximum boost is scoreFactorAtomMatch.
 59	if atomMatchCount > 0 {
 60		fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore)
 61	}
 62
 63	maxFileScore := 0.0
 64	for i := range fileMatch.LineMatches {
 65		if maxFileScore < fileMatch.LineMatches[i].Score {
 66			maxFileScore = fileMatch.LineMatches[i].Score
 67		}
 68
 69		// Order by ordering in file.
 70		fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
 71	}
 72
 73	for i := range fileMatch.ChunkMatches {
 74		if maxFileScore < fileMatch.ChunkMatches[i].Score {
 75			maxFileScore = fileMatch.ChunkMatches[i].Score
 76		}
 77
 78		// Order by ordering in file.
 79		fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
 80	}
 81
 82	// Maintain ordering of input files. This
 83	// strictly dominates the in-file ordering of
 84	// the matches.
 85	addScore("fragment", maxFileScore)
 86
 87	// Add tiebreakers
 88	//
 89	// ScoreOffset shifts the score 7 digits to the left.
 90	fileMatch.Score = math.Trunc(fileMatch.Score) * ScoreOffset
 91
 92	md := d.repoMetaData[d.repos[doc]]
 93
 94	// md.Rank lies in the range [0, 65535]. Hence, we have to allocate 5 digits for
 95	// the rank. The scoreRepoRankFactor shifts the rank score 2 digits to the left,
 96	// reserving digits 3-7 for the repo rank.
 97	addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank))
 98
 99	// digits 1-2 and the decimals are reserved for the doc order. Doc order
100	// (without the scaling factor) lies in the range [0, 1]. The upper bound is
101	// achieved for matches in the first document of a shard.
102	addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))))
103
104	if opts.DebugScore {
105		// To make the debug output easier to read, we split the score into the query
106		// dependent score and the tiebreaker
107		score := math.Trunc(fileMatch.Score / ScoreOffset)
108		tiebreaker := fileMatch.Score - score*ScoreOffset
109		fileMatch.Debug = fmt.Sprintf("score: %d (%.2f) <- %s", int(score), tiebreaker, strings.TrimSuffix(fileMatch.Debug, ", "))
110	}
111}
112
113// calculateTermFrequency computes the term frequency for the file match.
114//
115// Filename matches count more than content matches. This mimics a common text
116// search strategy where you 'boost' matches on document titles.
117func calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int {
118	// Treat each candidate match as a term and compute the frequencies. For now, ignore case
119	// sensitivity and treat filenames and symbols the same as content.
120	termFreqs := map[string]int{}
121	for _, cand := range cands {
122		term := string(cand.substrLowered)
123		if cand.fileName {
124			termFreqs[term] += 5
125		} else {
126			termFreqs[term]++
127		}
128	}
129
130	for term := range termFreqs {
131		df[term] += 1
132	}
133
134	return termFreqs
135}
136
137// idf computes the inverse document frequency for a term. nq is the number of
138// documents that contain the term and documentCount is the total number of
139// documents in the corpus.
140func idf(nq, documentCount int) float64 {
141	return math.Log(1.0 + ((float64(documentCount) - float64(nq) + 0.5) / (float64(nq) + 0.5)))
142}
143
144// termDocumentFrequency is a map "term" -> "number of documents that contain the term"
145type termDocumentFrequency map[string]int
146
147// termFrequency stores the term frequencies for doc.
148type termFrequency struct {
149	doc uint32
150	tf  map[string]int
151}
152
153// scoreFilesUsingBM25 computes the score according to BM25, the most common
154// scoring algorithm for text search: https://en.wikipedia.org/wiki/Okapi_BM25.
155//
156// This scoring strategy ignores all other signals including document ranks.
157// This keeps things simple for now, since BM25 is not normalized and can be
158// tricky to combine with other scoring signals.
159func (d *indexData) scoreFilesUsingBM25(fileMatches []FileMatch, tfs []termFrequency, df termDocumentFrequency, opts *SearchOptions) {
160	// Use standard parameter defaults (used in Lucene and academic papers)
161	k, b := 1.2, 0.75
162
163	averageFileLength := float64(d.boundaries[d.numDocs()]) / float64(d.numDocs())
164	// This is very unlikely, but explicitly guard against division by zero.
165	if averageFileLength == 0 {
166		averageFileLength++
167	}
168
169	for i := range tfs {
170		score := 0.0
171
172		// Compute the file length ratio. Usually the calculation would be based on terms, but using
173		// bytes should work fine, as we're just computing a ratio.
174		doc := tfs[i].doc
175		fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
176
177		L := fileLength / averageFileLength
178
179		sumTF := 0 // Just for debugging
180		for term, f := range tfs[i].tf {
181			sumTF += f
182			tfScore := ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f))
183			score += idf(df[term], int(d.numDocs())) * tfScore
184		}
185
186		fileMatches[i].Score = score
187
188		if opts.DebugScore {
189			fileMatches[i].Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L)
190		}
191	}
192}
Configure Feed

Configure Feed