fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "fmt" 19 "math" 20 "strconv" 21 "strings" 22) 23 24const maxUInt16 = 0xffff 25 26// addScore increments the score of the FileMatch by the computed score. If 27// debugScore is true, it also adds a debug string to the FileMatch. If raw is 28// -1, it is ignored. Otherwise, it is added to the debug string. 29func (m *FileMatch) addScore(what string, computed float64, raw float64, debugScore bool) { 30 if computed != 0 && debugScore { 31 var b strings.Builder 32 fmt.Fprintf(&b, "%s", what) 33 if raw != -1 { 34 fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64)) 35 } 36 fmt.Fprintf(&b, ":%.2f, ", computed) 37 m.Debug += b.String() 38 } 39 m.Score += computed 40} 41 42// scoreFile computes a score for the file match using various scoring signals, like 43// whether there's an exact match on a symbol, the number of query clauses that matched, etc. 44func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) { 45 atomMatchCount := 0 46 visitMatchAtoms(mt, known, func(mt matchTree) { 47 atomMatchCount++ 48 }) 49 50 addScore := func(what string, computed float64) { 51 fileMatch.addScore(what, computed, -1, opts.DebugScore) 52 } 53 54 // atom-count boosts files with matches from more than 1 atom. The 55 // maximum boost is scoreFactorAtomMatch. 56 if atomMatchCount > 0 { 57 fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore) 58 } 59 60 maxFileScore := 0.0 61 for i := range fileMatch.LineMatches { 62 if maxFileScore < fileMatch.LineMatches[i].Score { 63 maxFileScore = fileMatch.LineMatches[i].Score 64 } 65 66 // Order by ordering in file. 67 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches)))) 68 } 69 70 for i := range fileMatch.ChunkMatches { 71 if maxFileScore < fileMatch.ChunkMatches[i].Score { 72 maxFileScore = fileMatch.ChunkMatches[i].Score 73 } 74 75 // Order by ordering in file. 76 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches)))) 77 } 78 79 // Maintain ordering of input files. This 80 // strictly dominates the in-file ordering of 81 // the matches. 82 addScore("fragment", maxFileScore) 83 84 if opts.UseDocumentRanks && len(d.ranks) > int(doc) { 85 weight := scoreFileRankFactor 86 if opts.DocumentRanksWeight > 0.0 { 87 weight = opts.DocumentRanksWeight 88 } 89 90 ranks := d.ranks[doc] 91 // The ranks slice always contains one entry representing the file rank (unless it's empty since the 92 // file doesn't have a rank). This is left over from when documents could have multiple rank signals, 93 // and we plan to clean this up. 94 if len(ranks) > 0 { 95 // The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we 96 // cap it just in case to ensure it falls in the range [0, 1]. 97 normalized := math.Min(1.0, ranks[0]/32.0) 98 addScore("file-rank", weight*normalized) 99 } 100 } 101 102 md := d.repoMetaData[d.repos[doc]] 103 addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries)))) 104 addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16) 105 106 if opts.DebugScore { 107 fileMatch.Debug = fmt.Sprintf("score: %.2f <- %s", fileMatch.Score, strings.TrimSuffix(fileMatch.Debug, ", ")) 108 } 109} 110 111// calculateTermFrequency computes the term frequency for the file match. 112// 113// Filename matches count more than content matches. This mimics a common text 114// search strategy where you 'boost' matches on document titles. 115func calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int { 116 // Treat each candidate match as a term and compute the frequencies. For now, ignore case 117 // sensitivity and treat filenames and symbols the same as content. 118 termFreqs := map[string]int{} 119 for _, cand := range cands { 120 term := string(cand.substrLowered) 121 if cand.fileName { 122 termFreqs[term] += 5 123 } else { 124 termFreqs[term]++ 125 } 126 } 127 128 for term := range termFreqs { 129 df[term] += 1 130 } 131 132 return termFreqs 133} 134 135// idf computes the inverse document frequency for a term. nq is the number of 136// documents that contain the term and documentCount is the total number of 137// documents in the corpus. 138func idf(nq, documentCount int) float64 { 139 return math.Log(1.0 + ((float64(documentCount) - float64(nq) + 0.5) / (float64(nq) + 0.5))) 140} 141 142// termDocumentFrequency is a map "term" -> "number of documents that contain the term" 143type termDocumentFrequency map[string]int 144 145// termFrequency stores the term frequencies for doc. 146type termFrequency struct { 147 doc uint32 148 tf map[string]int 149} 150 151// scoreFilesUsingBM25 computes the score according to BM25, the most common 152// scoring algorithm for text search: https://en.wikipedia.org/wiki/Okapi_BM25. 153// 154// This scoring strategy ignores all other signals including document ranks. 155// This keeps things simple for now, since BM25 is not normalized and can be 156// tricky to combine with other scoring signals. 157func (d *indexData) scoreFilesUsingBM25(fileMatches []FileMatch, tfs []termFrequency, df termDocumentFrequency, opts *SearchOptions) { 158 // Use standard parameter defaults (used in Lucene and academic papers) 159 k, b := 1.2, 0.75 160 161 averageFileLength := float64(d.boundaries[d.numDocs()]) / float64(d.numDocs()) 162 // This is very unlikely, but explicitly guard against division by zero. 163 if averageFileLength == 0 { 164 averageFileLength++ 165 } 166 167 for i := range tfs { 168 score := 0.0 169 170 // Compute the file length ratio. Usually the calculation would be based on terms, but using 171 // bytes should work fine, as we're just computing a ratio. 172 doc := tfs[i].doc 173 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 174 175 L := fileLength / averageFileLength 176 177 sumTF := 0 // Just for debugging 178 for term, f := range tfs[i].tf { 179 sumTF += f 180 tfScore := ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f)) 181 score += idf(df[term], int(d.numDocs())) * tfScore 182 } 183 184 fileMatches[i].Score = score 185 186 if opts.DebugScore { 187 fileMatches[i].Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L) 188 } 189 } 190}