eval.go at b227501acf82ca21a07a6cf1d7b36616a2b21327 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / eval.go
at b227501acf82ca21a07a6cf1d7b36616a2b21327 25 kB View raw
Keegan Carruthers-Smith score: introduce query.Boost to scale score (#728) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"context"
 19	"fmt"
 20	"log"
 21	"math"
 22	"regexp/syntax"
 23	"sort"
 24	"strconv"
 25	"strings"
 26	"time"
 27
 28	enry_data "github.com/go-enry/go-enry/v2/data"
 29	"github.com/grafana/regexp"
 30
 31	"github.com/sourcegraph/zoekt/query"
 32)
 33
 34const maxUInt16 = 0xffff
 35
 36// addScore increments the score of the FileMatch by the computed score. If
 37// debugScore is true, it also adds a debug string to the FileMatch. If raw is
 38// -1, it is ignored. Otherwise, it is added to the debug string.
 39func (m *FileMatch) addScore(what string, computed float64, raw float64, debugScore bool) {
 40	if computed != 0 && debugScore {
 41		var b strings.Builder
 42		fmt.Fprintf(&b, "%s", what)
 43		if raw != -1 {
 44			fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64))
 45		}
 46		fmt.Fprintf(&b, ":%.2f, ", computed)
 47		m.Debug += b.String()
 48	}
 49	m.Score += computed
 50}
 51
 52func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) {
 53	if debugScore {
 54		m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L)
 55	}
 56	m.Score += score
 57}
 58
 59// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all
 60// repository names fulfill the predicate, Const(false) if none of them do, and q
 61// otherwise.
 62func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*Repository) bool) query.Q {
 63	count := 0
 64	alive := len(d.repoMetaData)
 65	for i := range d.repoMetaData {
 66		if d.repoMetaData[i].Tombstone {
 67			alive--
 68		} else if predicate(&d.repoMetaData[i]) {
 69			count++
 70		}
 71	}
 72	if count == alive {
 73		return &query.Const{Value: true}
 74	}
 75	if count > 0 {
 76		return q
 77	}
 78	return &query.Const{Value: false}
 79}
 80
 81func (d *indexData) simplify(in query.Q) query.Q {
 82	eval := query.Map(in, func(q query.Q) query.Q {
 83		switch r := q.(type) {
 84		case *query.Repo:
 85			return d.simplifyMultiRepo(q, func(repo *Repository) bool {
 86				return r.Regexp.MatchString(repo.Name)
 87			})
 88		case *query.RepoRegexp:
 89			return d.simplifyMultiRepo(q, func(repo *Repository) bool {
 90				return r.Regexp.MatchString(repo.Name)
 91			})
 92		case *query.BranchesRepos:
 93			for i := range d.repoMetaData {
 94				for _, br := range r.List {
 95					if br.Repos.Contains(d.repoMetaData[i].ID) {
 96						return q
 97					}
 98				}
 99			}
100			return &query.Const{Value: false}
101		case *query.RepoSet:
102			return d.simplifyMultiRepo(q, func(repo *Repository) bool {
103				return r.Set[repo.Name]
104			})
105		case *query.RepoIDs:
106			return d.simplifyMultiRepo(q, func(repo *Repository) bool {
107				return r.Repos.Contains(repo.ID)
108			})
109		case *query.Language:
110			_, has := d.metaData.LanguageMap[r.Language]
111			if !has && d.metaData.IndexFeatureVersion < 12 {
112				// For index files that haven't been re-indexed by go-enry,
113				// fall back to file-based matching and continue even if this
114				// repo doesn't have the specific language present.
115				extsForLang := enry_data.ExtensionsByLanguage[r.Language]
116				if extsForLang != nil {
117					extFrags := make([]string, 0, len(extsForLang))
118					for _, ext := range extsForLang {
119						extFrags = append(extFrags, regexp.QuoteMeta(ext))
120					}
121					if len(extFrags) > 0 {
122						pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|"))
123						// inlined copy of query.regexpQuery
124						re, err := syntax.Parse(pattern, syntax.Perl)
125						if err != nil {
126							return &query.Const{Value: false}
127						}
128						if re.Op == syntax.OpLiteral {
129							return &query.Substring{
130								Pattern:  string(re.Rune),
131								FileName: true,
132							}
133						}
134						return &query.Regexp{
135							Regexp:   re,
136							FileName: true,
137						}
138					}
139				}
140			}
141			if !has {
142				return &query.Const{Value: false}
143			}
144		}
145		return q
146	})
147	return query.Simplify(eval)
148}
149
150func (o *SearchOptions) SetDefaults() {
151	if o.ShardMaxMatchCount == 0 {
152		// We cap the total number of matches, so overly broad
153		// searches don't crash the machine.
154		o.ShardMaxMatchCount = 100000
155	}
156	if o.TotalMaxMatchCount == 0 {
157		o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount
158	}
159}
160
161func (d *indexData) Search(ctx context.Context, q query.Q, opts *SearchOptions) (sr *SearchResult, err error) {
162	timer := newTimer()
163
164	copyOpts := *opts
165	opts = &copyOpts
166	opts.SetDefaults()
167
168	var res SearchResult
169	if len(d.fileNameIndex) == 0 {
170		return &res, nil
171	}
172
173	select {
174	case <-ctx.Done():
175		res.Stats.ShardsSkipped++
176		return &res, nil
177	default:
178	}
179
180	q = d.simplify(q)
181	if c, ok := q.(*query.Const); ok && !c.Value {
182		return &res, nil
183	}
184
185	if opts.EstimateDocCount {
186		res.Stats.ShardFilesConsidered = len(d.fileBranchMasks)
187		return &res, nil
188	}
189
190	q = query.Map(q, query.ExpandFileContent)
191
192	mt, err := d.newMatchTree(q, matchTreeOpt{})
193	if err != nil {
194		return nil, err
195	}
196
197	// Capture the costs of construction before pruning
198	updateMatchTreeStats(mt, &res.Stats)
199
200	mt, err = pruneMatchTree(mt)
201	if err != nil {
202		return nil, err
203	}
204	res.Stats.MatchTreeConstruction = timer.Elapsed()
205	if mt == nil {
206		res.Stats.ShardsSkippedFilter++
207		return &res, nil
208	}
209
210	res.Stats.ShardsScanned++
211
212	cp := &contentProvider{
213		id:    d,
214		stats: &res.Stats,
215	}
216
217	// Track the number of documents found in a repository for
218	// ShardRepoMaxMatchCount
219	var (
220		lastRepoID     uint16
221		repoMatchCount int
222	)
223
224	docCount := uint32(len(d.fileBranchMasks))
225	lastDoc := int(-1)
226
227nextFileMatch:
228	for {
229		canceled := false
230		select {
231		case <-ctx.Done():
232			canceled = true
233		default:
234		}
235
236		nextDoc := mt.nextDoc()
237		if int(nextDoc) <= lastDoc {
238			nextDoc = uint32(lastDoc + 1)
239		}
240
241		for ; nextDoc < docCount; nextDoc++ {
242			repoID := d.repos[nextDoc]
243			repoMetadata := &d.repoMetaData[repoID]
244
245			// Skip tombstoned repositories
246			if repoMetadata.Tombstone {
247				continue
248			}
249
250			// Skip documents that are tombstoned
251			if len(repoMetadata.FileTombstones) > 0 {
252				if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned {
253					continue
254				}
255			}
256
257			// Skip documents over ShardRepoMaxMatchCount if specified.
258			if opts.ShardRepoMaxMatchCount > 0 {
259				if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID {
260					res.Stats.FilesSkipped++
261					continue
262				}
263			}
264
265			break
266		}
267
268		if nextDoc >= docCount {
269			break
270		}
271
272		lastDoc = int(nextDoc)
273
274		// We track lastRepoID for ShardRepoMaxMatchCount
275		if lastRepoID != d.repos[nextDoc] {
276			lastRepoID = d.repos[nextDoc]
277			repoMatchCount = 0
278		}
279
280		if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) {
281			res.Stats.FilesSkipped += int(docCount - nextDoc)
282			break
283		}
284
285		res.Stats.FilesConsidered++
286		mt.prepare(nextDoc)
287
288		cp.setDocument(nextDoc)
289
290		known := make(map[matchTree]bool)
291		md := d.repoMetaData[d.repos[nextDoc]]
292
293		for cost := costMin; cost <= costMax; cost++ {
294			switch evalMatchTree(cp, cost, known, mt) {
295			case matchesRequiresHigherCost:
296				if cost == costMax {
297					log.Panicf("did not decide. Repo %s, doc %d, known %v",
298						md.Name, nextDoc, known)
299				}
300			case matchesFound:
301				// could short-circuit now, but we want to run higher costs to
302				// potentially find higher ranked matches.
303			case matchesNone:
304				continue nextFileMatch
305			}
306		}
307
308		fileMatch := FileMatch{
309			Repository:         md.Name,
310			RepositoryID:       md.ID,
311			RepositoryPriority: md.priority,
312			FileName:           string(d.fileName(nextDoc)),
313			Checksum:           d.getChecksum(nextDoc),
314			Language:           d.languageMap[d.getLanguage(nextDoc)],
315		}
316
317		if s := d.subRepos[nextDoc]; s > 0 {
318			if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) {
319				log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths)
320			}
321			path := d.subRepoPaths[d.repos[nextDoc]][s]
322			fileMatch.SubRepositoryPath = path
323			sr := md.SubRepoMap[path]
324			fileMatch.SubRepositoryName = sr.Name
325			if idx := d.branchIndex(nextDoc); idx >= 0 {
326				fileMatch.Version = sr.Branches[idx].Version
327			}
328		} else {
329			idx := d.branchIndex(nextDoc)
330			if idx >= 0 {
331				fileMatch.Version = md.Branches[idx].Version
332			}
333		}
334
335		// Important invariant for performance: finalCands is sorted by offset and
336		// non-overlapping. gatherMatches respects this invariant and all later
337		// transformations respect this.
338		shouldMergeMatches := !opts.ChunkMatches
339		finalCands := gatherMatches(mt, known, shouldMergeMatches)
340
341		if len(finalCands) == 0 {
342			nm := d.fileName(nextDoc)
343			finalCands = append(finalCands,
344				&candidateMatch{
345					caseSensitive: false,
346					fileName:      true,
347					substrBytes:   nm,
348					substrLowered: nm,
349					file:          nextDoc,
350					runeOffset:    0,
351					byteOffset:    0,
352					byteMatchSz:   uint32(len(nm)),
353				})
354		}
355
356		if opts.ChunkMatches {
357			fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
358		} else {
359			fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
360		}
361
362		if opts.UseKeywordScoring {
363			d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts)
364		} else {
365			// Use the standard, non-experimental scoring method by default
366			d.scoreFile(&fileMatch, nextDoc, mt, known, opts)
367		}
368
369		fileMatch.Branches = d.gatherBranches(nextDoc, mt, known)
370		sortMatchesByScore(fileMatch.LineMatches)
371		sortChunkMatchesByScore(fileMatch.ChunkMatches)
372		if opts.Whole {
373			fileMatch.Content = cp.data(false)
374		}
375
376		matchedChunkRanges := 0
377		for _, cm := range fileMatch.ChunkMatches {
378			matchedChunkRanges += len(cm.Ranges)
379		}
380
381		repoMatchCount += len(fileMatch.LineMatches)
382		repoMatchCount += matchedChunkRanges
383
384		if opts.DebugScore {
385			fileMatch.Debug = fmt.Sprintf("score:%.2f <- %s", fileMatch.Score, fileMatch.Debug)
386		}
387
388		res.Files = append(res.Files, fileMatch)
389		res.Stats.MatchCount += len(fileMatch.LineMatches)
390		res.Stats.MatchCount += matchedChunkRanges
391		res.Stats.FileCount++
392	}
393
394	// We do not sort Files here, instead we rely on the shards pkg to do file
395	// ranking. If we sorted now, we would break the assumption that results
396	// from the same repo in a shard appear next to each other.
397
398	for _, md := range d.repoMetaData {
399		r := md
400		addRepo(&res, &r)
401		for _, v := range r.SubRepoMap {
402			addRepo(&res, v)
403		}
404	}
405
406	// Update stats based on work done during document search.
407	updateMatchTreeStats(mt, &res.Stats)
408
409	// If document ranking is enabled, then we can rank and truncate the files to save memory.
410	if opts.UseDocumentRanks {
411		res.Files = SortAndTruncateFiles(res.Files, opts)
412	}
413
414	res.Stats.MatchTreeSearch = timer.Elapsed()
415
416	return &res, nil
417}
418
419// scoreFile computes a score for the file match using various scoring signals, like
420// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
421func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
422	atomMatchCount := 0
423	visitMatchAtoms(mt, known, func(mt matchTree) {
424		atomMatchCount++
425	})
426
427	addScore := func(what string, computed float64) {
428		fileMatch.addScore(what, computed, -1, opts.DebugScore)
429	}
430
431	// atom-count boosts files with matches from more than 1 atom. The
432	// maximum boost is scoreFactorAtomMatch.
433	if atomMatchCount > 0 {
434		fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore)
435	}
436
437	maxFileScore := 0.0
438	for i := range fileMatch.LineMatches {
439		if maxFileScore < fileMatch.LineMatches[i].Score {
440			maxFileScore = fileMatch.LineMatches[i].Score
441		}
442
443		// Order by ordering in file.
444		fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
445	}
446
447	for i := range fileMatch.ChunkMatches {
448		if maxFileScore < fileMatch.ChunkMatches[i].Score {
449			maxFileScore = fileMatch.ChunkMatches[i].Score
450		}
451
452		// Order by ordering in file.
453		fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
454	}
455
456	// Maintain ordering of input files. This
457	// strictly dominates the in-file ordering of
458	// the matches.
459	addScore("fragment", maxFileScore)
460
461	if opts.UseDocumentRanks && len(d.ranks) > int(doc) {
462		weight := scoreFileRankFactor
463		if opts.DocumentRanksWeight > 0.0 {
464			weight = opts.DocumentRanksWeight
465		}
466
467		ranks := d.ranks[doc]
468		// The ranks slice always contains one entry representing the file rank (unless it's empty since the
469		// file doesn't have a rank). This is left over from when documents could have multiple rank signals,
470		// and we plan to clean this up.
471		if len(ranks) > 0 {
472			// The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we
473			// cap it just in case to ensure it falls in the range [0, 1].
474			normalized := math.Min(1.0, ranks[0]/32.0)
475			addScore("file-rank", weight*normalized)
476		}
477	}
478
479	md := d.repoMetaData[d.repos[doc]]
480	addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))))
481	addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16)
482
483	if opts.DebugScore {
484		fileMatch.Debug = strings.TrimSuffix(fileMatch.Debug, ", ")
485	}
486}
487
488// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring
489// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
490// except inverse document frequency (idf), since we don't have access to global term frequency statistics.
491//
492// This scoring strategy ignores all other signals including document ranks. This keeps things simple for now,
493// since BM25 is not normalized and can be tricky to combine with other scoring signals.
494func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) {
495	// Treat each candidate match as a term and compute the frequencies. For now, ignore case
496	// sensitivity and treat filenames and symbols the same as content.
497	termFreqs := map[string]int{}
498	for _, cand := range cands {
499		term := string(cand.substrLowered)
500		termFreqs[term]++
501	}
502
503	// Compute the file length ratio. Usually the calculation would be based on terms, but using
504	// bytes should work fine, as we're just computing a ratio.
505	fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
506	numFiles := len(d.boundaries)
507	averageFileLength := float64(d.boundaries[numFiles-1]) / float64(numFiles)
508	L := fileLength / averageFileLength
509
510	// Use standard parameter defaults (used in Lucene and academic papers)
511	k, b := 1.2, 0.75
512	sumTf := 0.0 // Just for debugging
513	score := 0.0
514	for _, freq := range termFreqs {
515		tf := float64(freq)
516		sumTf += tf
517		score += ((k + 1.0) * tf) / (k*(1.0-b+b*L) + tf)
518	}
519
520	fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore)
521}
522
523func addRepo(res *SearchResult, repo *Repository) {
524	if res.RepoURLs == nil {
525		res.RepoURLs = map[string]string{}
526	}
527	res.RepoURLs[repo.Name] = repo.FileURLTemplate
528
529	if res.LineFragments == nil {
530		res.LineFragments = map[string]string{}
531	}
532	res.LineFragments[repo.Name] = repo.LineFragmentTemplate
533}
534
535type sortByOffsetSlice []*candidateMatch
536
537func (m sortByOffsetSlice) Len() int      { return len(m) }
538func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
539func (m sortByOffsetSlice) Less(i, j int) bool {
540	if m[i].byteOffset == m[j].byteOffset { // tie break if same offset
541		// Prefer longer candidates if starting at same position
542		return m[i].byteMatchSz > m[j].byteMatchSz
543	}
544	return m[i].byteOffset < m[j].byteOffset
545}
546
547// setScoreWeight is a helper used by gatherMatches to set the weight based on
548// the score weight of the matchTree.
549func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch {
550	for _, m := range cm {
551		m.scoreWeight = scoreWeight
552	}
553	return cm
554}
555
556// Gather matches from this document. This never returns a mixture of
557// filename/content matches: if there are content matches, all
558// filename matches are trimmed from the result. The matches are
559// returned in document order and are non-overlapping.
560//
561// If `merge` is set, overlapping and adjacent matches will be merged
562// into a single match. Otherwise, overlapping matches will be removed,
563// but adjacent matches will remain.
564func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch {
565	var cands []*candidateMatch
566	visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) {
567		if smt, ok := mt.(*substrMatchTree); ok {
568			cands = append(cands, setScoreWeight(scoreWeight, smt.current)...)
569		}
570		if rmt, ok := mt.(*regexpMatchTree); ok {
571			cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
572		}
573		if rmt, ok := mt.(*wordMatchTree); ok {
574			cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
575		}
576		if smt, ok := mt.(*symbolRegexpMatchTree); ok {
577			cands = append(cands, setScoreWeight(scoreWeight, smt.found)...)
578		}
579	})
580
581	foundContentMatch := false
582	for _, c := range cands {
583		if !c.fileName {
584			foundContentMatch = true
585			break
586		}
587	}
588
589	res := cands[:0]
590	for _, c := range cands {
591		if !foundContentMatch || !c.fileName {
592			res = append(res, c)
593		}
594	}
595	cands = res
596
597	if merge {
598		// Merge adjacent candidates. This guarantees that the matches
599		// are non-overlapping.
600		sort.Sort((sortByOffsetSlice)(cands))
601		res = cands[:0]
602		mergeRun := 1
603		for i, c := range cands {
604			if i == 0 {
605				res = append(res, c)
606				continue
607			}
608			last := res[len(res)-1]
609			lastEnd := last.byteOffset + last.byteMatchSz
610			end := c.byteOffset + c.byteMatchSz
611			if lastEnd >= c.byteOffset {
612				mergeRun++
613
614				// Average out the score across the merged candidates. Only do it if
615				// we are boosting to avoid floating point funkiness in the normal
616				// case.
617				if !(epsilonEqualsOne(last.scoreWeight) && epsilonEqualsOne(c.scoreWeight)) {
618					last.scoreWeight = ((last.scoreWeight * float64(mergeRun-1)) + c.scoreWeight) / float64(mergeRun)
619				}
620
621				// latest candidate goes further, update our end
622				if end > lastEnd {
623					last.byteMatchSz = end - last.byteOffset
624				}
625
626				continue
627			} else {
628				mergeRun = 1
629			}
630
631			res = append(res, c)
632		}
633	} else {
634		// Remove overlapping candidates. This guarantees that the matches
635		// are non-overlapping, but also preserves expected match counts.
636		sort.Sort((sortByOffsetSlice)(cands))
637		res = cands[:0]
638		for i, c := range cands {
639			if i == 0 {
640				res = append(res, c)
641				continue
642			}
643			last := res[len(res)-1]
644			lastEnd := last.byteOffset + last.byteMatchSz
645			if lastEnd > c.byteOffset {
646				continue
647			}
648
649			res = append(res, c)
650		}
651	}
652
653	return res
654}
655
656func (d *indexData) branchIndex(docID uint32) int {
657	mask := d.fileBranchMasks[docID]
658	idx := 0
659	for mask != 0 {
660		if mask&0x1 != 0 {
661			return idx
662		}
663		idx++
664		mask >>= 1
665	}
666	return -1
667}
668
669// gatherBranches returns a list of branch names taking into account any branch
670// filters in the query. If the query contains a branch filter, it returns all
671// branches containing the docID and matching the branch filter. Otherwise, it
672// returns all branches containing docID.
673func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string {
674	var mask uint64
675	visitMatchAtoms(mt, known, func(mt matchTree) {
676		bq, ok := mt.(*branchQueryMatchTree)
677		if !ok {
678			return
679		}
680
681		mask = mask | bq.branchMask()
682	})
683
684	if mask == 0 {
685		mask = d.fileBranchMasks[docID]
686	}
687
688	var branches []string
689	id := uint32(1)
690	branchNames := d.branchNames[d.repos[docID]]
691	for mask != 0 {
692		if mask&0x1 != 0 {
693			branches = append(branches, branchNames[uint(id)])
694		}
695		id <<= 1
696		mask >>= 1
697	}
698
699	return branches
700}
701
702func (d *indexData) List(ctx context.Context, q query.Q, opts *ListOptions) (rl *RepoList, err error) {
703	var include func(rle *RepoListEntry) bool
704
705	q = d.simplify(q)
706	if c, ok := q.(*query.Const); ok {
707		if !c.Value {
708			return &RepoList{}, nil
709		}
710		include = func(rle *RepoListEntry) bool {
711			return true
712		}
713	} else {
714		sr, err := d.Search(ctx, q, &SearchOptions{
715			ShardRepoMaxMatchCount: 1,
716		})
717		if err != nil {
718			return nil, err
719		}
720
721		foundRepos := make(map[string]struct{}, len(sr.Files))
722		for _, file := range sr.Files {
723			foundRepos[file.Repository] = struct{}{}
724		}
725
726		include = func(rle *RepoListEntry) bool {
727			_, ok := foundRepos[rle.Repository.Name]
728			return ok
729		}
730	}
731
732	var l RepoList
733
734	field, err := opts.GetField()
735	if err != nil {
736		return nil, err
737	}
738	switch field {
739	case RepoListFieldRepos:
740		l.Repos = make([]*RepoListEntry, 0, len(d.repoListEntry))
741	case RepoListFieldReposMap:
742		l.ReposMap = make(ReposMap, len(d.repoListEntry))
743	}
744
745	for i := range d.repoListEntry {
746		if d.repoMetaData[i].Tombstone {
747			continue
748		}
749		rle := &d.repoListEntry[i]
750		if !include(rle) {
751			continue
752		}
753
754		l.Stats.Add(&rle.Stats)
755
756		// Backwards compat for when ID is missing
757		if rle.Repository.ID == 0 {
758			l.Repos = append(l.Repos, rle)
759			continue
760		}
761
762		switch field {
763		case RepoListFieldRepos:
764			l.Repos = append(l.Repos, rle)
765		case RepoListFieldReposMap:
766			l.ReposMap[rle.Repository.ID] = MinimalRepoListEntry{
767				HasSymbols:    rle.Repository.HasSymbols,
768				Branches:      rle.Repository.Branches,
769				IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(),
770			}
771		}
772
773	}
774
775	// Only one of these fields is populated and in all cases the size of that
776	// field is the number of Repos in this shard.
777	l.Stats.Repos = len(l.Repos) + len(l.ReposMap)
778
779	return &l, nil
780}
781
782// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If
783// mt is equivalent to the input r, isEqual = true and the matchTree can be used
784// in place of the regex r. If singleLine = true, then the matchTree and all
785// its children only match terms on the same line. singleLine is used during
786// recursion to decide whether to return an andLineMatchTree (singleLine = true)
787// or a andMatchTree (singleLine = false).
788func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) {
789	// TODO - we could perhaps transform Begin/EndText in '\n'?
790	// TODO - we could perhaps transform CharClass in (OrQuery )
791	// if there are just a few runes, and part of a OpConcat?
792	switch r.Op {
793	case syntax.OpLiteral:
794		s := string(r.Rune)
795		if len(s) >= minTextSize {
796			mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: caseSensitive})
797			return mt, true, !strings.Contains(s, "\n"), err
798		}
799	case syntax.OpCapture:
800		return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
801
802	case syntax.OpPlus:
803		return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
804
805	case syntax.OpRepeat:
806		if r.Min == 1 {
807			return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
808		} else if r.Min > 1 {
809			// (x){2,} can't be expressed precisely by the matchTree
810			mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
811			return mt, false, singleLine, err
812		}
813	case syntax.OpConcat, syntax.OpAlternate:
814		var qs []matchTree
815		isEq := true
816		singleLine = true
817		for _, sr := range r.Sub {
818			if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil {
819				if err != nil {
820					return nil, false, false, err
821				}
822				isEq = isEq && subIsEq
823				singleLine = singleLine && subSingleLine
824				qs = append(qs, sq)
825			}
826		}
827		if r.Op == syntax.OpConcat {
828			if len(qs) > 1 {
829				isEq = false
830			}
831			newQs := make([]matchTree, 0, len(qs))
832			for _, q := range qs {
833				if _, ok := q.(*bruteForceMatchTree); ok {
834					continue
835				}
836				newQs = append(newQs, q)
837			}
838			if len(newQs) == 1 {
839				return newQs[0], isEq, singleLine, nil
840			}
841			if len(newQs) == 0 {
842				return &bruteForceMatchTree{}, isEq, singleLine, nil
843			}
844			if singleLine {
845				return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil
846			}
847			return &andMatchTree{newQs}, isEq, singleLine, nil
848		}
849		for _, q := range qs {
850			if _, ok := q.(*bruteForceMatchTree); ok {
851				return q, isEq, false, nil
852			}
853		}
854		if len(qs) == 0 {
855			return &noMatchTree{Why: "const"}, isEq, false, nil
856		}
857		return &orMatchTree{qs}, isEq, false, nil
858	case syntax.OpStar:
859		if r.Sub[0].Op == syntax.OpAnyCharNotNL {
860			return &bruteForceMatchTree{}, false, true, nil
861		}
862	}
863	return &bruteForceMatchTree{}, false, false, nil
864}
865
866type timer struct {
867	last time.Time
868}
869
870func newTimer() *timer {
871	return &timer{
872		last: time.Now(),
873	}
874}
875
876func (t *timer) Elapsed() time.Duration {
877	now := time.Now()
878	d := now.Sub(t.last)
879	t.last = now
880	return d
881}
Configure Feed

Configure Feed