score.go at 43b92256ba7117270d7cc8ca434cc14faf2cc241 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / score.go
at 43b92256ba7117270d7cc8ca434cc14faf2cc241 6.1 kB View raw
Julie Tibshirani Ranking: include filename matches in bm25 (#757) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"fmt"
 19	"math"
 20	"strconv"
 21	"strings"
 22)
 23
 24const maxUInt16 = 0xffff
 25
 26// addScore increments the score of the FileMatch by the computed score. If
 27// debugScore is true, it also adds a debug string to the FileMatch. If raw is
 28// -1, it is ignored. Otherwise, it is added to the debug string.
 29func (m *FileMatch) addScore(what string, computed float64, raw float64, debugScore bool) {
 30	if computed != 0 && debugScore {
 31		var b strings.Builder
 32		fmt.Fprintf(&b, "%s", what)
 33		if raw != -1 {
 34			fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64))
 35		}
 36		fmt.Fprintf(&b, ":%.2f, ", computed)
 37		m.Debug += b.String()
 38	}
 39	m.Score += computed
 40}
 41
 42func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) {
 43	if debugScore {
 44		m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L)
 45	}
 46	m.Score += score
 47}
 48
 49// scoreFile computes a score for the file match using various scoring signals, like
 50// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
 51func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
 52	atomMatchCount := 0
 53	visitMatchAtoms(mt, known, func(mt matchTree) {
 54		atomMatchCount++
 55	})
 56
 57	addScore := func(what string, computed float64) {
 58		fileMatch.addScore(what, computed, -1, opts.DebugScore)
 59	}
 60
 61	// atom-count boosts files with matches from more than 1 atom. The
 62	// maximum boost is scoreFactorAtomMatch.
 63	if atomMatchCount > 0 {
 64		fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore)
 65	}
 66
 67	maxFileScore := 0.0
 68	for i := range fileMatch.LineMatches {
 69		if maxFileScore < fileMatch.LineMatches[i].Score {
 70			maxFileScore = fileMatch.LineMatches[i].Score
 71		}
 72
 73		// Order by ordering in file.
 74		fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
 75	}
 76
 77	for i := range fileMatch.ChunkMatches {
 78		if maxFileScore < fileMatch.ChunkMatches[i].Score {
 79			maxFileScore = fileMatch.ChunkMatches[i].Score
 80		}
 81
 82		// Order by ordering in file.
 83		fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
 84	}
 85
 86	// Maintain ordering of input files. This
 87	// strictly dominates the in-file ordering of
 88	// the matches.
 89	addScore("fragment", maxFileScore)
 90
 91	if opts.UseDocumentRanks && len(d.ranks) > int(doc) {
 92		weight := scoreFileRankFactor
 93		if opts.DocumentRanksWeight > 0.0 {
 94			weight = opts.DocumentRanksWeight
 95		}
 96
 97		ranks := d.ranks[doc]
 98		// The ranks slice always contains one entry representing the file rank (unless it's empty since the
 99		// file doesn't have a rank). This is left over from when documents could have multiple rank signals,
100		// and we plan to clean this up.
101		if len(ranks) > 0 {
102			// The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we
103			// cap it just in case to ensure it falls in the range [0, 1].
104			normalized := math.Min(1.0, ranks[0]/32.0)
105			addScore("file-rank", weight*normalized)
106		}
107	}
108
109	md := d.repoMetaData[d.repos[doc]]
110	addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))))
111	addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16)
112
113	if opts.DebugScore {
114		fileMatch.Debug = strings.TrimSuffix(fileMatch.Debug, ", ")
115	}
116}
117
118// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring
119// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
120// except inverse document frequency (idf), since we don't have access to global term frequency statistics.
121//
122// Filename matches count twice as much as content matches. This mimics a common text search strategy where you
123// 'boost' matches on document titles.
124//
125// This scoring strategy ignores all other signals including document ranks. This keeps things simple for now,
126// since BM25 is not normalized and can be tricky to combine with other scoring signals.
127func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) {
128	// Treat each candidate match as a term and compute the frequencies. For now, ignore case
129	// sensitivity and treat filenames and symbols the same as content.
130	termFreqs := map[string]int{}
131	for _, cand := range cands {
132		term := string(cand.substrLowered)
133
134		if cand.fileName {
135			termFreqs[term] += 2
136		} else {
137			termFreqs[term]++
138		}
139	}
140
141	// Compute the file length ratio. Usually the calculation would be based on terms, but using
142	// bytes should work fine, as we're just computing a ratio.
143	fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
144	numFiles := len(d.boundaries)
145	averageFileLength := float64(d.boundaries[numFiles-1]) / float64(numFiles)
146
147	// This is very unlikely, but explicitly guard against division by zero.
148	if averageFileLength == 0 {
149		averageFileLength++
150	}
151	L := fileLength / averageFileLength
152
153	// Use standard parameter defaults (used in Lucene and academic papers)
154	k, b := 1.2, 0.75
155	sumTf := 0.0 // Just for debugging
156	score := 0.0
157	for _, freq := range termFreqs {
158		tf := float64(freq)
159		sumTf += tf
160		score += ((k + 1.0) * tf) / (k*(1.0-b+b*L) + tf)
161	}
162
163	fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore)
164}
Configure Feed

Configure Feed