fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "fmt"
19 "math"
20 "strconv"
21 "strings"
22)
23
const (
	// maxUInt16 is the largest value representable in an unsigned 16-bit
	// integer (0xffff).
	maxUInt16 = 1<<16 - 1

	// ScoreOffset is the factor scoreFile multiplies the truncated,
	// query-dependent part of a file score with. It moves that part 7 decimal
	// digits to the left, which leaves the lower digits and the decimals free
	// for the tiebreakers (repo rank and doc order).
	ScoreOffset = 10_000_000
)
28
29// addScore increments the score of the FileMatch by the computed score. If
30// debugScore is true, it also adds a debug string to the FileMatch. If raw is
31// -1, it is ignored. Otherwise, it is added to the debug string.
32func (m *FileMatch) addScore(what string, computed float64, raw float64, debugScore bool) {
33 if computed != 0 && debugScore {
34 var b strings.Builder
35 fmt.Fprintf(&b, "%s", what)
36 if raw != -1 {
37 fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64))
38 }
39 fmt.Fprintf(&b, ":%.2f, ", computed)
40 m.Debug += b.String()
41 }
42 m.Score += computed
43}
44
45// scoreFile computes a score for the file match using various scoring signals, like
46// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
47func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
48 atomMatchCount := 0
49 visitMatchAtoms(mt, known, func(mt matchTree) {
50 atomMatchCount++
51 })
52
53 addScore := func(what string, computed float64) {
54 fileMatch.addScore(what, computed, -1, opts.DebugScore)
55 }
56
57 // atom-count boosts files with matches from more than 1 atom. The
58 // maximum boost is scoreFactorAtomMatch.
59 if atomMatchCount > 0 {
60 fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore)
61 }
62
63 maxFileScore := 0.0
64 for i := range fileMatch.LineMatches {
65 if maxFileScore < fileMatch.LineMatches[i].Score {
66 maxFileScore = fileMatch.LineMatches[i].Score
67 }
68
69 // Order by ordering in file.
70 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
71 }
72
73 for i := range fileMatch.ChunkMatches {
74 if maxFileScore < fileMatch.ChunkMatches[i].Score {
75 maxFileScore = fileMatch.ChunkMatches[i].Score
76 }
77
78 // Order by ordering in file.
79 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
80 }
81
82 // Maintain ordering of input files. This
83 // strictly dominates the in-file ordering of
84 // the matches.
85 addScore("fragment", maxFileScore)
86
87 if opts.UseDocumentRanks && len(d.ranks) > int(doc) {
88 weight := scoreFileRankFactor
89 if opts.DocumentRanksWeight > 0.0 {
90 weight = opts.DocumentRanksWeight
91 }
92
93 ranks := d.ranks[doc]
94 // The ranks slice always contains one entry representing the file rank (unless it's empty since the
95 // file doesn't have a rank). This is left over from when documents could have multiple rank signals,
96 // and we plan to clean this up.
97 if len(ranks) > 0 {
98 // The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we
99 // cap it just in case to ensure it falls in the range [0, 1].
100 normalized := math.Min(1.0, ranks[0]/32.0)
101 addScore("file-rank", weight*normalized)
102 }
103 }
104
105 // Add tiebreakers
106 //
107 // ScoreOffset shifts the score 7 digits to the left.
108 fileMatch.Score = math.Trunc(fileMatch.Score) * ScoreOffset
109
110 md := d.repoMetaData[d.repos[doc]]
111
112 // md.Rank lies in the range [0, 65535]. Hence, we have to allocate 5 digits for
113 // the rank. The scoreRepoRankFactor shifts the rank score 2 digits to the left,
114 // reserving digits 3-7 for the repo rank.
115 addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank))
116
117 // digits 1-2 and the decimals are reserved for the doc order. Doc order
118 // (without the scaling factor) lies in the range [0, 1]. The upper bound is
119 // achieved for matches in the first document of a shard.
120 addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))))
121
122 if opts.DebugScore {
123 // To make the debug output easier to read, we split the score into the query
124 // dependent score and the tiebreaker
125 score := math.Trunc(fileMatch.Score / ScoreOffset)
126 tiebreaker := fileMatch.Score - score*ScoreOffset
127 fileMatch.Debug = fmt.Sprintf("score: %d (%.2f) <- %s", int(score), tiebreaker, strings.TrimSuffix(fileMatch.Debug, ", "))
128 }
129}
130
131// calculateTermFrequency computes the term frequency for the file match.
132//
133// Filename matches count more than content matches. This mimics a common text
134// search strategy where you 'boost' matches on document titles.
135func calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int {
136 // Treat each candidate match as a term and compute the frequencies. For now, ignore case
137 // sensitivity and treat filenames and symbols the same as content.
138 termFreqs := map[string]int{}
139 for _, cand := range cands {
140 term := string(cand.substrLowered)
141 if cand.fileName {
142 termFreqs[term] += 5
143 } else {
144 termFreqs[term]++
145 }
146 }
147
148 for term := range termFreqs {
149 df[term] += 1
150 }
151
152 return termFreqs
153}
154
// idf computes the inverse document frequency of a term as
//
//	log(1 + (documentCount - nq + 0.5) / (nq + 0.5))
//
// where nq is the number of documents that contain the term and documentCount
// is the total number of documents in the corpus.
func idf(nq, documentCount int) float64 {
	containing := float64(nq)
	total := float64(documentCount)

	ratio := (total - containing + 0.5) / (containing + 0.5)
	return math.Log(1.0 + ratio)
}
161
type (
	// termDocumentFrequency maps a term to the number of documents that
	// contain the term.
	termDocumentFrequency map[string]int

	// termFrequency holds the term frequencies of a single document.
	termFrequency struct {
		// doc is the document the frequencies belong to.
		doc uint32
		// tf maps a term to its frequency within doc.
		tf map[string]int
	}
)
170
171// scoreFilesUsingBM25 computes the score according to BM25, the most common
172// scoring algorithm for text search: https://en.wikipedia.org/wiki/Okapi_BM25.
173//
174// This scoring strategy ignores all other signals including document ranks.
175// This keeps things simple for now, since BM25 is not normalized and can be
176// tricky to combine with other scoring signals.
177func (d *indexData) scoreFilesUsingBM25(fileMatches []FileMatch, tfs []termFrequency, df termDocumentFrequency, opts *SearchOptions) {
178 // Use standard parameter defaults (used in Lucene and academic papers)
179 k, b := 1.2, 0.75
180
181 averageFileLength := float64(d.boundaries[d.numDocs()]) / float64(d.numDocs())
182 // This is very unlikely, but explicitly guard against division by zero.
183 if averageFileLength == 0 {
184 averageFileLength++
185 }
186
187 for i := range tfs {
188 score := 0.0
189
190 // Compute the file length ratio. Usually the calculation would be based on terms, but using
191 // bytes should work fine, as we're just computing a ratio.
192 doc := tfs[i].doc
193 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
194
195 L := fileLength / averageFileLength
196
197 sumTF := 0 // Just for debugging
198 for term, f := range tfs[i].tf {
199 sumTF += f
200 tfScore := ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f))
201 score += idf(df[term], int(d.numDocs())) * tfScore
202 }
203
204 fileMatches[i].Score = score
205
206 if opts.DebugScore {
207 fileMatches[i].Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L)
208 }
209 }
210}