index/eval.go at 388cba654f9ede9d576ba3591f9df10820215df2 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / eval.go
at 388cba654f9ede9d576ba3591f9df10820215df2 19 kB View raw
Stefan Hengl scoring: remove IDF from BM25 scoring (#912) 1y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package index
 16
 17import (
 18	"context"
 19	"fmt"
 20	"log"
 21	"regexp/syntax"
 22	"sort"
 23	"strings"
 24	"time"
 25
 26	enry_data "github.com/go-enry/go-enry/v2/data"
 27	"github.com/grafana/regexp"
 28
 29	"github.com/sourcegraph/zoekt"
 30	"github.com/sourcegraph/zoekt/internal/tenant"
 31	"github.com/sourcegraph/zoekt/query"
 32)
 33
 34// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all
 35// repository names fulfill the predicate, Const(false) if none of them do, and q
 36// otherwise.
 37func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*zoekt.Repository) bool) query.Q {
 38	count := 0
 39	alive := len(d.repoMetaData)
 40	for i := range d.repoMetaData {
 41		if d.repoMetaData[i].Tombstone {
 42			alive--
 43		} else if predicate(&d.repoMetaData[i]) {
 44			count++
 45		}
 46	}
 47	if count == alive {
 48		return &query.Const{Value: true}
 49	}
 50	if count > 0 {
 51		return q
 52	}
 53	return &query.Const{Value: false}
 54}
 55
 56func (d *indexData) simplify(in query.Q) query.Q {
 57	eval := query.Map(in, func(q query.Q) query.Q {
 58		switch r := q.(type) {
 59		case *query.Repo:
 60			return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool {
 61				return r.Regexp.MatchString(repo.Name)
 62			})
 63		case *query.RepoRegexp:
 64			return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool {
 65				return r.Regexp.MatchString(repo.Name)
 66			})
 67		case *query.BranchesRepos:
 68			for i := range d.repoMetaData {
 69				for _, br := range r.List {
 70					if br.Repos.Contains(d.repoMetaData[i].ID) {
 71						return q
 72					}
 73				}
 74			}
 75			return &query.Const{Value: false}
 76		case *query.RepoSet:
 77			return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool {
 78				return r.Set[repo.Name]
 79			})
 80		case query.RawConfig:
 81			return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool { return uint8(r)&encodeRawConfig(repo.RawConfig) == uint8(r) })
 82		case *query.RepoIDs:
 83			return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool {
 84				return r.Repos.Contains(repo.ID)
 85			})
 86		case *query.Language:
 87			_, has := d.metaData.LanguageMap[r.Language]
 88			if !has && d.metaData.IndexFeatureVersion < 12 {
 89				// For index files that haven't been re-indexed by go-enry,
 90				// fall back to file-based matching and continue even if this
 91				// repo doesn't have the specific language present.
 92				extsForLang := enry_data.ExtensionsByLanguage[r.Language]
 93				if extsForLang != nil {
 94					extFrags := make([]string, 0, len(extsForLang))
 95					for _, ext := range extsForLang {
 96						extFrags = append(extFrags, regexp.QuoteMeta(ext))
 97					}
 98					if len(extFrags) > 0 {
 99						pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|"))
100						// inlined copy of query.regexpQuery
101						re, err := syntax.Parse(pattern, syntax.Perl)
102						if err != nil {
103							return &query.Const{Value: false}
104						}
105						if re.Op == syntax.OpLiteral {
106							return &query.Substring{
107								Pattern:  string(re.Rune),
108								FileName: true,
109							}
110						}
111						return &query.Regexp{
112							Regexp:   re,
113							FileName: true,
114						}
115					}
116				}
117			}
118			if !has {
119				return &query.Const{Value: false}
120			}
121		}
122		return q
123	})
124	return query.Simplify(eval)
125}
126
127func (d *indexData) Search(ctx context.Context, q query.Q, opts *zoekt.SearchOptions) (sr *zoekt.SearchResult, err error) {
128	timer := newTimer()
129
130	copyOpts := *opts
131	opts = &copyOpts
132	opts.SetDefaults()
133
134	var res zoekt.SearchResult
135	if len(d.fileNameIndex) == 0 {
136		return &res, nil
137	}
138
139	select {
140	case <-ctx.Done():
141		res.Stats.ShardsSkipped++
142		return &res, nil
143	default:
144	}
145
146	q = d.simplify(q)
147	if c, ok := q.(*query.Const); ok && !c.Value {
148		return &res, nil
149	}
150
151	if opts.EstimateDocCount {
152		res.Stats.ShardFilesConsidered = len(d.fileBranchMasks)
153		return &res, nil
154	}
155
156	q = query.Map(q, query.ExpandFileContent)
157
158	mt, err := d.newMatchTree(q, matchTreeOpt{})
159	if err != nil {
160		return nil, err
161	}
162
163	// Capture the costs of construction before pruning
164	updateMatchTreeStats(mt, &res.Stats)
165
166	mt, err = pruneMatchTree(mt)
167	if err != nil {
168		return nil, err
169	}
170	res.Stats.MatchTreeConstruction = timer.Elapsed()
171	if mt == nil {
172		res.Stats.ShardsSkippedFilter++
173		return &res, nil
174	}
175
176	res.Stats.ShardsScanned++
177
178	cp := &contentProvider{
179		id:    d,
180		stats: &res.Stats,
181	}
182
183	// Track the number of documents found in a repository for
184	// ShardRepoMaxMatchCount
185	var (
186		lastRepoID     uint16
187		repoMatchCount int
188	)
189
190	docCount := uint32(len(d.fileBranchMasks))
191	lastDoc := int(-1)
192
193nextFileMatch:
194	for {
195		canceled := false
196		select {
197		case <-ctx.Done():
198			canceled = true
199		default:
200		}
201
202		nextDoc := mt.nextDoc()
203		if int(nextDoc) <= lastDoc {
204			nextDoc = uint32(lastDoc + 1)
205		}
206
207		for ; nextDoc < docCount; nextDoc++ {
208			repoID := d.repos[nextDoc]
209			repoMetadata := &d.repoMetaData[repoID]
210
211			// Skip tombstoned repositories
212			if repoMetadata.Tombstone {
213				continue
214			}
215
216			// 🚨 SECURITY: Skip documents that don't belong to the tenant. This check is
217			// necessary to prevent leaking data across tenants.
218			if !tenant.HasAccess(ctx, repoMetadata.TenantID) {
219				continue
220			}
221
222			// Skip documents that are tombstoned
223			if len(repoMetadata.FileTombstones) > 0 {
224				if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned {
225					continue
226				}
227			}
228
229			// Skip documents over ShardRepoMaxMatchCount if specified.
230			if opts.ShardRepoMaxMatchCount > 0 {
231				if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID {
232					res.Stats.FilesSkipped++
233					continue
234				}
235			}
236
237			break
238		}
239
240		if nextDoc >= docCount {
241			break
242		}
243
244		lastDoc = int(nextDoc)
245
246		// We track lastRepoID for ShardRepoMaxMatchCount
247		if lastRepoID != d.repos[nextDoc] {
248			lastRepoID = d.repos[nextDoc]
249			repoMatchCount = 0
250		}
251
252		if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) {
253			res.Stats.FilesSkipped += int(docCount - nextDoc)
254			break
255		}
256
257		res.Stats.FilesConsidered++
258		mt.prepare(nextDoc)
259
260		cp.setDocument(nextDoc)
261
262		known := make(map[matchTree]bool)
263		md := d.repoMetaData[d.repos[nextDoc]]
264
265		for cost := costMin; cost <= costMax; cost++ {
266			switch evalMatchTree(cp, cost, known, mt) {
267			case matchesRequiresHigherCost:
268				if cost == costMax {
269					log.Panicf("did not decide. Repo %s, doc %d, known %v",
270						md.Name, nextDoc, known)
271				}
272			case matchesFound:
273				// could short-circuit now, but we want to run higher costs to
274				// potentially find higher ranked matches.
275			case matchesNone:
276				continue nextFileMatch
277			}
278		}
279
280		fileMatch := zoekt.FileMatch{
281			Repository:         md.Name,
282			RepositoryID:       md.ID,
283			RepositoryPriority: md.GetPriority(),
284			FileName:           string(d.fileName(nextDoc)),
285			Checksum:           d.getChecksum(nextDoc),
286			Language:           d.languageMap[d.getLanguage(nextDoc)],
287		}
288
289		if s := d.subRepos[nextDoc]; s > 0 {
290			if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) {
291				log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths)
292			}
293			path := d.subRepoPaths[d.repos[nextDoc]][s]
294			fileMatch.SubRepositoryPath = path
295			sr := md.SubRepoMap[path]
296			fileMatch.SubRepositoryName = sr.Name
297			if idx := d.branchIndex(nextDoc); idx >= 0 {
298				fileMatch.Version = sr.Branches[idx].Version
299			}
300		} else {
301			idx := d.branchIndex(nextDoc)
302			if idx >= 0 {
303				fileMatch.Version = md.Branches[idx].Version
304			}
305		}
306
307		// Important invariant for performance: finalCands is sorted by offset and
308		// non-overlapping. gatherMatches respects this invariant and all later
309		// transformations respect this.
310		finalCands := d.gatherMatches(nextDoc, mt, known)
311
312		if opts.ChunkMatches {
313			fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts)
314		} else {
315			fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts)
316		}
317
318		if opts.UseBM25Scoring {
319			tf := cp.calculateTermFrequency(finalCands)
320			d.scoreFilesUsingBM25(&fileMatch, nextDoc, tf, opts)
321		} else {
322			// Use the standard, non-experimental scoring method by default
323			d.scoreFile(&fileMatch, nextDoc, mt, known, opts)
324		}
325
326		fileMatch.Branches = d.gatherBranches(nextDoc, mt, known)
327		sortMatchesByScore(fileMatch.LineMatches)
328		sortChunkMatchesByScore(fileMatch.ChunkMatches)
329		if opts.Whole {
330			fileMatch.Content = cp.data(false)
331		}
332
333		matchedChunkRanges := 0
334		for _, cm := range fileMatch.ChunkMatches {
335			matchedChunkRanges += len(cm.Ranges)
336		}
337
338		repoMatchCount += len(fileMatch.LineMatches)
339		repoMatchCount += matchedChunkRanges
340
341		res.Files = append(res.Files, fileMatch)
342
343		res.Stats.MatchCount += len(fileMatch.LineMatches)
344		res.Stats.MatchCount += matchedChunkRanges
345		res.Stats.FileCount++
346	}
347
348	for _, md := range d.repoMetaData {
349		r := md
350		addRepo(&res, &r)
351		for _, v := range r.SubRepoMap {
352			addRepo(&res, v)
353		}
354	}
355
356	// Update stats based on work done during document search.
357	updateMatchTreeStats(mt, &res.Stats)
358
359	res.Stats.MatchTreeSearch = timer.Elapsed()
360
361	return &res, nil
362}
363
364func addRepo(res *zoekt.SearchResult, repo *zoekt.Repository) {
365	if res.RepoURLs == nil {
366		res.RepoURLs = map[string]string{}
367	}
368	res.RepoURLs[repo.Name] = repo.FileURLTemplate
369
370	if res.LineFragments == nil {
371		res.LineFragments = map[string]string{}
372	}
373	res.LineFragments[repo.Name] = repo.LineFragmentTemplate
374}
375
376// Gather matches from this document. The matches are returned in document
377// order and are non-overlapping. All filename and content matches are
378// returned, with filename matches first.
379//
380// If `merge` is set, overlapping and adjacent matches will be merged
381// into a single index. Otherwise, overlapping matches will be removed,
382// but adjacent matches will remain.
383func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchTree]bool) []*candidateMatch {
384	var cands []*candidateMatch
385	visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) {
386		if smt, ok := mt.(*substrMatchTree); ok {
387			cands = append(cands, setScoreWeight(scoreWeight, smt.current)...)
388		}
389		if rmt, ok := mt.(*regexpMatchTree); ok {
390			cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
391		}
392		if rmt, ok := mt.(*wordMatchTree); ok {
393			cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
394		}
395		if smt, ok := mt.(*symbolRegexpMatchTree); ok {
396			cands = append(cands, setScoreWeight(scoreWeight, smt.found)...)
397		}
398	})
399
400	// If we found no candidate matches at all, assume there must have been a match on filename.
401	if len(cands) == 0 {
402		nm := d.fileName(nextDoc)
403		return []*candidateMatch{{
404			caseSensitive: false,
405			fileName:      true,
406			substrBytes:   nm,
407			substrLowered: nm,
408			file:          nextDoc,
409			runeOffset:    0,
410			byteOffset:    0,
411			byteMatchSz:   uint32(len(nm)),
412		}}
413	}
414
415	// Remove overlapping candidates. This guarantees that the matches
416	// are non-overlapping, but also preserves expected match counts.
417	sort.Sort((sortByOffsetSlice)(cands))
418	res := cands[:0]
419	for i, c := range cands {
420		if i == 0 {
421			res = append(res, c)
422			continue
423		}
424
425		last := res[len(res)-1]
426
427		// Never compare filename and content matches
428		if last.fileName != c.fileName {
429			res = append(res, c)
430			continue
431		}
432
433		// Only add the match if its range doesn't overlap
434		lastEnd := last.byteOffset + last.byteMatchSz
435		if lastEnd <= c.byteOffset {
436			res = append(res, c)
437			continue
438		}
439	}
440	return res
441}
442
443type sortByOffsetSlice []*candidateMatch
444
445func (m sortByOffsetSlice) Len() int      { return len(m) }
446func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
447func (m sortByOffsetSlice) Less(i, j int) bool {
448	// Sort all filename matches to the start
449	if m[i].fileName != m[j].fileName {
450		return m[i].fileName
451	}
452
453	if m[i].byteOffset == m[j].byteOffset { // tie break if same offset
454		// Prefer longer candidates if starting at same position
455		return m[i].byteMatchSz > m[j].byteMatchSz
456	}
457	return m[i].byteOffset < m[j].byteOffset
458}
459
460// setScoreWeight is a helper used by gatherMatches to set the weight based on
461// the score weight of the matchTree.
462func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch {
463	for _, m := range cm {
464		m.scoreWeight = scoreWeight
465	}
466	return cm
467}
468
469func (d *indexData) branchIndex(docID uint32) int {
470	mask := d.fileBranchMasks[docID]
471	idx := 0
472	for mask != 0 {
473		if mask&0x1 != 0 {
474			return idx
475		}
476		idx++
477		mask >>= 1
478	}
479	return -1
480}
481
482// gatherBranches returns a list of branch names taking into account any branch
483// filters in the query. If the query contains a branch filter, it returns all
484// branches containing the docID and matching the branch filter. Otherwise, it
485// returns all branches containing docID.
486func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string {
487	var mask uint64
488	visitMatchAtoms(mt, known, func(mt matchTree) {
489		bq, ok := mt.(*branchQueryMatchTree)
490		if !ok {
491			return
492		}
493
494		mask = mask | bq.branchMask()
495	})
496
497	if mask == 0 {
498		mask = d.fileBranchMasks[docID]
499	}
500
501	var branches []string
502	id := uint32(1)
503	branchNames := d.branchNames[d.repos[docID]]
504	for mask != 0 {
505		if mask&0x1 != 0 {
506			branches = append(branches, branchNames[uint(id)])
507		}
508		id <<= 1
509		mask >>= 1
510	}
511
512	return branches
513}
514
515func (d *indexData) List(ctx context.Context, q query.Q, opts *zoekt.ListOptions) (rl *zoekt.RepoList, err error) {
516	var include func(rle *zoekt.RepoListEntry) bool
517
518	q = d.simplify(q)
519	if c, ok := q.(*query.Const); ok {
520		if !c.Value {
521			return &zoekt.RepoList{}, nil
522		}
523		include = func(rle *zoekt.RepoListEntry) bool {
524			return true
525		}
526	} else {
527		sr, err := d.Search(ctx, q, &zoekt.SearchOptions{
528			ShardRepoMaxMatchCount: 1,
529		})
530		if err != nil {
531			return nil, err
532		}
533
534		foundRepos := make(map[string]struct{}, len(sr.Files))
535		for _, file := range sr.Files {
536			foundRepos[file.Repository] = struct{}{}
537		}
538
539		include = func(rle *zoekt.RepoListEntry) bool {
540			_, ok := foundRepos[rle.Repository.Name]
541			return ok
542		}
543	}
544
545	var l zoekt.RepoList
546
547	field, err := opts.GetField()
548	if err != nil {
549		return nil, err
550	}
551	switch field {
552	case zoekt.RepoListFieldRepos:
553		l.Repos = make([]*zoekt.RepoListEntry, 0, len(d.repoListEntry))
554	case zoekt.RepoListFieldReposMap:
555		l.ReposMap = make(zoekt.ReposMap, len(d.repoListEntry))
556	}
557
558	for i := range d.repoListEntry {
559		if d.repoMetaData[i].Tombstone {
560			continue
561		}
562		// 🚨 SECURITY: Skip documents that don't belong to the tenant. This check is
563		// necessary to prevent leaking data across tenants.
564		if !tenant.HasAccess(ctx, d.repoMetaData[i].TenantID) {
565			continue
566		}
567		rle := &d.repoListEntry[i]
568		if !include(rle) {
569			continue
570		}
571
572		l.Stats.Add(&rle.Stats)
573
574		// Backwards compat for when ID is missing
575		if rle.Repository.ID == 0 {
576			l.Repos = append(l.Repos, rle)
577			continue
578		}
579
580		switch field {
581		case zoekt.RepoListFieldRepos:
582			l.Repos = append(l.Repos, rle)
583		case zoekt.RepoListFieldReposMap:
584			l.ReposMap[rle.Repository.ID] = zoekt.MinimalRepoListEntry{
585				HasSymbols:    rle.Repository.HasSymbols,
586				Branches:      rle.Repository.Branches,
587				IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(),
588			}
589		}
590
591	}
592
593	// Only one of these fields is populated and in all cases the size of that
594	// field is the number of Repos in this shard.
595	l.Stats.Repos = len(l.Repos) + len(l.ReposMap)
596
597	return &l, nil
598}
599
600// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If
601// mt is equivalent to the input r, isEqual = true and the matchTree can be used
602// in place of the regex r. If singleLine = true, then the matchTree and all
603// its children only match terms on the same line. singleLine is used during
604// recursion to decide whether to return an andLineMatchTree (singleLine = true)
605// or a andMatchTree (singleLine = false).
606func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) {
607	// TODO - we could perhaps transform Begin/EndText in '\n'?
608	// TODO - we could perhaps transform CharClass in (OrQuery )
609	// if there are just a few runes, and part of a OpConcat?
610	switch r.Op {
611	case syntax.OpLiteral:
612		s := string(r.Rune)
613		if len(s) >= minTextSize {
614			ignoreCase := syntax.FoldCase == (r.Flags & syntax.FoldCase)
615			mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: !ignoreCase && caseSensitive})
616			return mt, true, !strings.Contains(s, "\n"), err
617		}
618	case syntax.OpCapture:
619		return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
620
621	case syntax.OpPlus:
622		return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
623
624	case syntax.OpRepeat:
625		if r.Min == 1 {
626			return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
627		} else if r.Min > 1 {
628			// (x){2,} can't be expressed precisely by the matchTree
629			mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
630			return mt, false, singleLine, err
631		}
632	case syntax.OpConcat, syntax.OpAlternate:
633		var qs []matchTree
634		isEq := true
635		singleLine = true
636		for _, sr := range r.Sub {
637			if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil {
638				if err != nil {
639					return nil, false, false, err
640				}
641				isEq = isEq && subIsEq
642				singleLine = singleLine && subSingleLine
643				qs = append(qs, sq)
644			}
645		}
646		if r.Op == syntax.OpConcat {
647			if len(qs) > 1 {
648				isEq = false
649			}
650			newQs := make([]matchTree, 0, len(qs))
651			for _, q := range qs {
652				if _, ok := q.(*bruteForceMatchTree); ok {
653					continue
654				}
655				newQs = append(newQs, q)
656			}
657			if len(newQs) == 1 {
658				return newQs[0], isEq, singleLine, nil
659			}
660			if len(newQs) == 0 {
661				return &bruteForceMatchTree{}, isEq, singleLine, nil
662			}
663			if singleLine {
664				return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil
665			}
666			return &andMatchTree{newQs}, isEq, singleLine, nil
667		}
668		for _, q := range qs {
669			if _, ok := q.(*bruteForceMatchTree); ok {
670				return q, isEq, false, nil
671			}
672		}
673		if len(qs) == 0 {
674			return &noMatchTree{Why: "const"}, isEq, false, nil
675		}
676		return &orMatchTree{qs}, isEq, false, nil
677	case syntax.OpStar:
678		if r.Sub[0].Op == syntax.OpAnyCharNotNL {
679			return &bruteForceMatchTree{}, false, true, nil
680		}
681	}
682	return &bruteForceMatchTree{}, false, false, nil
683}
684
685type timer struct {
686	last time.Time
687}
688
689func newTimer() *timer {
690	return &timer{
691		last: time.Now(),
692	}
693}
694
695func (t *timer) Elapsed() time.Duration {
696	now := time.Now()
697	d := now.Sub(t.last)
698	t.last = now
699	return d
700}
Configure Feed

Configure Feed