fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "fmt" 19 "math" 20 "strconv" 21 "strings" 22) 23 24const maxUInt16 = 0xffff 25 26// addScore increments the score of the FileMatch by the computed score. If 27// debugScore is true, it also adds a debug string to the FileMatch. If raw is 28// -1, it is ignored. Otherwise, it is added to the debug string. 29func (m *FileMatch) addScore(what string, computed float64, raw float64, debugScore bool) { 30 if computed != 0 && debugScore { 31 var b strings.Builder 32 fmt.Fprintf(&b, "%s", what) 33 if raw != -1 { 34 fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64)) 35 } 36 fmt.Fprintf(&b, ":%.2f, ", computed) 37 m.Debug += b.String() 38 } 39 m.Score += computed 40} 41 42func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) { 43 if debugScore { 44 m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L) 45 } 46 m.Score += score 47} 48 49// scoreFile computes a score for the file match using various scoring signals, like 50// whether there's an exact match on a symbol, the number of query clauses that matched, etc. 51func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) { 52 atomMatchCount := 0 53 visitMatchAtoms(mt, known, func(mt matchTree) { 54 atomMatchCount++ 55 }) 56 57 addScore := func(what string, computed float64) { 58 fileMatch.addScore(what, computed, -1, opts.DebugScore) 59 } 60 61 // atom-count boosts files with matches from more than 1 atom. The 62 // maximum boost is scoreFactorAtomMatch. 63 if atomMatchCount > 0 { 64 fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore) 65 } 66 67 maxFileScore := 0.0 68 for i := range fileMatch.LineMatches { 69 if maxFileScore < fileMatch.LineMatches[i].Score { 70 maxFileScore = fileMatch.LineMatches[i].Score 71 } 72 73 // Order by ordering in file. 74 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches)))) 75 } 76 77 for i := range fileMatch.ChunkMatches { 78 if maxFileScore < fileMatch.ChunkMatches[i].Score { 79 maxFileScore = fileMatch.ChunkMatches[i].Score 80 } 81 82 // Order by ordering in file. 83 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches)))) 84 } 85 86 // Maintain ordering of input files. This 87 // strictly dominates the in-file ordering of 88 // the matches. 89 addScore("fragment", maxFileScore) 90 91 if opts.UseDocumentRanks && len(d.ranks) > int(doc) { 92 weight := scoreFileRankFactor 93 if opts.DocumentRanksWeight > 0.0 { 94 weight = opts.DocumentRanksWeight 95 } 96 97 ranks := d.ranks[doc] 98 // The ranks slice always contains one entry representing the file rank (unless it's empty since the 99 // file doesn't have a rank). This is left over from when documents could have multiple rank signals, 100 // and we plan to clean this up. 101 if len(ranks) > 0 { 102 // The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we 103 // cap it just in case to ensure it falls in the range [0, 1]. 104 normalized := math.Min(1.0, ranks[0]/32.0) 105 addScore("file-rank", weight*normalized) 106 } 107 } 108 109 md := d.repoMetaData[d.repos[doc]] 110 addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries)))) 111 addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16) 112 113 if opts.DebugScore { 114 fileMatch.Debug = strings.TrimSuffix(fileMatch.Debug, ", ") 115 } 116} 117 118// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring 119// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula 120// except inverse document frequency (idf), since we don't have access to global term frequency statistics. 121// 122// Filename matches count twice as much as content matches. This mimics a common text search strategy where you 123// 'boost' matches on document titles. 124// 125// This scoring strategy ignores all other signals including document ranks. This keeps things simple for now, 126// since BM25 is not normalized and can be tricky to combine with other scoring signals. 127func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) { 128 // Treat each candidate match as a term and compute the frequencies. For now, ignore case 129 // sensitivity and treat filenames and symbols the same as content. 130 termFreqs := map[string]int{} 131 for _, cand := range cands { 132 term := string(cand.substrLowered) 133 134 if cand.fileName { 135 termFreqs[term] += 2 136 } else { 137 termFreqs[term]++ 138 } 139 } 140 141 // Compute the file length ratio. Usually the calculation would be based on terms, but using 142 // bytes should work fine, as we're just computing a ratio. 143 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 144 numFiles := len(d.boundaries) 145 averageFileLength := float64(d.boundaries[numFiles-1]) / float64(numFiles) 146 147 // This is very unlikely, but explicitly guard against division by zero. 148 if averageFileLength == 0 { 149 averageFileLength++ 150 } 151 L := fileLength / averageFileLength 152 153 // Use standard parameter defaults (used in Lucene and academic papers) 154 k, b := 1.2, 0.75 155 sumTf := 0.0 // Just for debugging 156 score := 0.0 157 for _, freq := range termFreqs { 158 tf := float64(freq) 159 sumTf += tf 160 score += ((k + 1.0) * tf) / (k*(1.0-b+b*L) + tf) 161 } 162 163 fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore) 164}