fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "fmt" 19 "math" 20 "strconv" 21 "strings" 22) 23 24const ( 25 maxUInt16 = 0xffff 26 ScoreOffset = 10_000_000 27) 28 29// addScore increments the score of the FileMatch by the computed score. If 30// debugScore is true, it also adds a debug string to the FileMatch. If raw is 31// -1, it is ignored. Otherwise, it is added to the debug string. 32func (m *FileMatch) addScore(what string, computed float64, raw float64, debugScore bool) { 33 if computed != 0 && debugScore { 34 var b strings.Builder 35 fmt.Fprintf(&b, "%s", what) 36 if raw != -1 { 37 fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64)) 38 } 39 fmt.Fprintf(&b, ":%.2f, ", computed) 40 m.Debug += b.String() 41 } 42 m.Score += computed 43} 44 45// scoreFile computes a score for the file match using various scoring signals, like 46// whether there's an exact match on a symbol, the number of query clauses that matched, etc. 47func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) { 48 atomMatchCount := 0 49 visitMatchAtoms(mt, known, func(mt matchTree) { 50 atomMatchCount++ 51 }) 52 53 addScore := func(what string, computed float64) { 54 fileMatch.addScore(what, computed, -1, opts.DebugScore) 55 } 56 57 // atom-count boosts files with matches from more than 1 atom. The 58 // maximum boost is scoreFactorAtomMatch. 59 if atomMatchCount > 0 { 60 fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore) 61 } 62 63 maxFileScore := 0.0 64 for i := range fileMatch.LineMatches { 65 if maxFileScore < fileMatch.LineMatches[i].Score { 66 maxFileScore = fileMatch.LineMatches[i].Score 67 } 68 69 // Order by ordering in file. 70 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches)))) 71 } 72 73 for i := range fileMatch.ChunkMatches { 74 if maxFileScore < fileMatch.ChunkMatches[i].Score { 75 maxFileScore = fileMatch.ChunkMatches[i].Score 76 } 77 78 // Order by ordering in file. 79 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches)))) 80 } 81 82 // Maintain ordering of input files. This 83 // strictly dominates the in-file ordering of 84 // the matches. 85 addScore("fragment", maxFileScore) 86 87 // Add tiebreakers 88 // 89 // ScoreOffset shifts the score 7 digits to the left. 90 fileMatch.Score = math.Trunc(fileMatch.Score) * ScoreOffset 91 92 md := d.repoMetaData[d.repos[doc]] 93 94 // md.Rank lies in the range [0, 65535]. Hence, we have to allocate 5 digits for 95 // the rank. The scoreRepoRankFactor shifts the rank score 2 digits to the left, 96 // reserving digits 3-7 for the repo rank. 97 addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)) 98 99 // digits 1-2 and the decimals are reserved for the doc order. Doc order 100 // (without the scaling factor) lies in the range [0, 1]. The upper bound is 101 // achieved for matches in the first document of a shard. 102 addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries)))) 103 104 if opts.DebugScore { 105 // To make the debug output easier to read, we split the score into the query 106 // dependent score and the tiebreaker 107 score := math.Trunc(fileMatch.Score / ScoreOffset) 108 tiebreaker := fileMatch.Score - score*ScoreOffset 109 fileMatch.Debug = fmt.Sprintf("score: %d (%.2f) <- %s", int(score), tiebreaker, strings.TrimSuffix(fileMatch.Debug, ", ")) 110 } 111} 112 113// calculateTermFrequency computes the term frequency for the file match. 114// 115// Filename matches count more than content matches. This mimics a common text 116// search strategy where you 'boost' matches on document titles. 117func calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int { 118 // Treat each candidate match as a term and compute the frequencies. For now, ignore case 119 // sensitivity and treat filenames and symbols the same as content. 120 termFreqs := map[string]int{} 121 for _, cand := range cands { 122 term := string(cand.substrLowered) 123 if cand.fileName { 124 termFreqs[term] += 5 125 } else { 126 termFreqs[term]++ 127 } 128 } 129 130 for term := range termFreqs { 131 df[term] += 1 132 } 133 134 return termFreqs 135} 136 137// idf computes the inverse document frequency for a term. nq is the number of 138// documents that contain the term and documentCount is the total number of 139// documents in the corpus. 140func idf(nq, documentCount int) float64 { 141 return math.Log(1.0 + ((float64(documentCount) - float64(nq) + 0.5) / (float64(nq) + 0.5))) 142} 143 144// termDocumentFrequency is a map "term" -> "number of documents that contain the term" 145type termDocumentFrequency map[string]int 146 147// termFrequency stores the term frequencies for doc. 148type termFrequency struct { 149 doc uint32 150 tf map[string]int 151} 152 153// scoreFilesUsingBM25 computes the score according to BM25, the most common 154// scoring algorithm for text search: https://en.wikipedia.org/wiki/Okapi_BM25. 155// 156// This scoring strategy ignores all other signals including document ranks. 157// This keeps things simple for now, since BM25 is not normalized and can be 158// tricky to combine with other scoring signals. 159func (d *indexData) scoreFilesUsingBM25(fileMatches []FileMatch, tfs []termFrequency, df termDocumentFrequency, opts *SearchOptions) { 160 // Use standard parameter defaults (used in Lucene and academic papers) 161 k, b := 1.2, 0.75 162 163 averageFileLength := float64(d.boundaries[d.numDocs()]) / float64(d.numDocs()) 164 // This is very unlikely, but explicitly guard against division by zero. 165 if averageFileLength == 0 { 166 averageFileLength++ 167 } 168 169 for i := range tfs { 170 score := 0.0 171 172 // Compute the file length ratio. Usually the calculation would be based on terms, but using 173 // bytes should work fine, as we're just computing a ratio. 174 doc := tfs[i].doc 175 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 176 177 L := fileLength / averageFileLength 178 179 sumTF := 0 // Just for debugging 180 for term, f := range tfs[i].tf { 181 sumTF += f 182 tfScore := ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f)) 183 score += idf(df[term], int(d.numDocs())) * tfScore 184 } 185 186 fileMatches[i].Score = score 187 188 if opts.DebugScore { 189 fileMatches[i].Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L) 190 } 191 } 192}