fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "fmt"
19 "math"
20 "strconv"
21 "strings"
22)
23
24const maxUInt16 = 0xffff
25
26// addScore increments the score of the FileMatch by the computed score. If
27// debugScore is true, it also adds a debug string to the FileMatch. If raw is
28// -1, it is ignored. Otherwise, it is added to the debug string.
29func (m *FileMatch) addScore(what string, computed float64, raw float64, debugScore bool) {
30 if computed != 0 && debugScore {
31 var b strings.Builder
32 fmt.Fprintf(&b, "%s", what)
33 if raw != -1 {
34 fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64))
35 }
36 fmt.Fprintf(&b, ":%.2f, ", computed)
37 m.Debug += b.String()
38 }
39 m.Score += computed
40}
41
42func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) {
43 if debugScore {
44 m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L)
45 }
46 m.Score += score
47}
48
49// scoreFile computes a score for the file match using various scoring signals, like
50// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
51func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
52 atomMatchCount := 0
53 visitMatchAtoms(mt, known, func(mt matchTree) {
54 atomMatchCount++
55 })
56
57 addScore := func(what string, computed float64) {
58 fileMatch.addScore(what, computed, -1, opts.DebugScore)
59 }
60
61 // atom-count boosts files with matches from more than 1 atom. The
62 // maximum boost is scoreFactorAtomMatch.
63 if atomMatchCount > 0 {
64 fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore)
65 }
66
67 maxFileScore := 0.0
68 for i := range fileMatch.LineMatches {
69 if maxFileScore < fileMatch.LineMatches[i].Score {
70 maxFileScore = fileMatch.LineMatches[i].Score
71 }
72
73 // Order by ordering in file.
74 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
75 }
76
77 for i := range fileMatch.ChunkMatches {
78 if maxFileScore < fileMatch.ChunkMatches[i].Score {
79 maxFileScore = fileMatch.ChunkMatches[i].Score
80 }
81
82 // Order by ordering in file.
83 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
84 }
85
86 // Maintain ordering of input files. This
87 // strictly dominates the in-file ordering of
88 // the matches.
89 addScore("fragment", maxFileScore)
90
91 if opts.UseDocumentRanks && len(d.ranks) > int(doc) {
92 weight := scoreFileRankFactor
93 if opts.DocumentRanksWeight > 0.0 {
94 weight = opts.DocumentRanksWeight
95 }
96
97 ranks := d.ranks[doc]
98 // The ranks slice always contains one entry representing the file rank (unless it's empty since the
99 // file doesn't have a rank). This is left over from when documents could have multiple rank signals,
100 // and we plan to clean this up.
101 if len(ranks) > 0 {
102 // The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we
103 // cap it just in case to ensure it falls in the range [0, 1].
104 normalized := math.Min(1.0, ranks[0]/32.0)
105 addScore("file-rank", weight*normalized)
106 }
107 }
108
109 md := d.repoMetaData[d.repos[doc]]
110 addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))))
111 addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16)
112
113 if opts.DebugScore {
114 fileMatch.Debug = strings.TrimSuffix(fileMatch.Debug, ", ")
115 }
116}
117
118// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring
119// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
120// except inverse document frequency (idf), since we don't have access to global term frequency statistics.
121//
122// Filename matches count twice as much as content matches. This mimics a common text search strategy where you
123// 'boost' matches on document titles.
124//
125// This scoring strategy ignores all other signals including document ranks. This keeps things simple for now,
126// since BM25 is not normalized and can be tricky to combine with other scoring signals.
127func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) {
128 // Treat each candidate match as a term and compute the frequencies. For now, ignore case
129 // sensitivity and treat filenames and symbols the same as content.
130 termFreqs := map[string]int{}
131 for _, cand := range cands {
132 term := string(cand.substrLowered)
133
134 if cand.fileName {
135 termFreqs[term] += 2
136 } else {
137 termFreqs[term]++
138 }
139 }
140
141 // Compute the file length ratio. Usually the calculation would be based on terms, but using
142 // bytes should work fine, as we're just computing a ratio.
143 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
144 numFiles := len(d.boundaries)
145 averageFileLength := float64(d.boundaries[numFiles-1]) / float64(numFiles)
146
147 // This is very unlikely, but explicitly guard against division by zero.
148 if averageFileLength == 0 {
149 averageFileLength++
150 }
151 L := fileLength / averageFileLength
152
153 // Use standard parameter defaults (used in Lucene and academic papers)
154 k, b := 1.2, 0.75
155 sumTf := 0.0 // Just for debugging
156 score := 0.0
157 for _, freq := range termFreqs {
158 tf := float64(freq)
159 sumTf += tf
160 score += ((k + 1.0) * tf) / (k*(1.0-b+b*L) + tf)
161 }
162
163 fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore)
164}