eval.go at f75df3d8a3f8de3003da19567bf13429563036cb · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / eval.go
at f75df3d8a3f8de3003da19567bf13429563036cb 24 kB View raw
Keegan Carruthers-Smith stat: introduce timing stats around shard search (#633) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"context"
 19	"fmt"
 20	"log"
 21	"math"
 22	"regexp/syntax"
 23	"sort"
 24	"strings"
 25	"time"
 26
 27	enry_data "github.com/go-enry/go-enry/v2/data"
 28	"github.com/grafana/regexp"
 29
 30	"github.com/sourcegraph/zoekt/query"
 31)
 32
 33const maxUInt16 = 0xffff
 34
 35func (m *FileMatch) addScore(what string, s float64, debugScore bool) {
 36	if debugScore {
 37		m.Debug += fmt.Sprintf("%s:%.2f, ", what, s)
 38	}
 39	m.Score += s
 40}
 41
 42func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) {
 43	if debugScore {
 44		m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L)
 45	}
 46	m.Score += score
 47}
 48
 49// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all
 50// repository names fulfill the predicate, Const(false) if none of them do, and q
 51// otherwise.
 52func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*Repository) bool) query.Q {
 53	count := 0
 54	alive := len(d.repoMetaData)
 55	for i := range d.repoMetaData {
 56		if d.repoMetaData[i].Tombstone {
 57			alive--
 58		} else if predicate(&d.repoMetaData[i]) {
 59			count++
 60		}
 61	}
 62	if count == alive {
 63		return &query.Const{Value: true}
 64	}
 65	if count > 0 {
 66		return q
 67	}
 68	return &query.Const{Value: false}
 69}
 70
 71func (d *indexData) simplify(in query.Q) query.Q {
 72	eval := query.Map(in, func(q query.Q) query.Q {
 73		switch r := q.(type) {
 74		case *query.Repo:
 75			return d.simplifyMultiRepo(q, func(repo *Repository) bool {
 76				return r.Regexp.MatchString(repo.Name)
 77			})
 78		case *query.RepoRegexp:
 79			return d.simplifyMultiRepo(q, func(repo *Repository) bool {
 80				return r.Regexp.MatchString(repo.Name)
 81			})
 82		case *query.BranchesRepos:
 83			for i := range d.repoMetaData {
 84				for _, br := range r.List {
 85					if br.Repos.Contains(d.repoMetaData[i].ID) {
 86						return q
 87					}
 88				}
 89			}
 90			return &query.Const{Value: false}
 91		case *query.RepoSet:
 92			return d.simplifyMultiRepo(q, func(repo *Repository) bool {
 93				return r.Set[repo.Name]
 94			})
 95		case *query.RepoIDs:
 96			return d.simplifyMultiRepo(q, func(repo *Repository) bool {
 97				return r.Repos.Contains(repo.ID)
 98			})
 99		case *query.Language:
100			_, has := d.metaData.LanguageMap[r.Language]
101			if !has && d.metaData.IndexFeatureVersion < 12 {
102				// For index files that haven't been re-indexed by go-enry,
103				// fall back to file-based matching and continue even if this
104				// repo doesn't have the specific language present.
105				extsForLang := enry_data.ExtensionsByLanguage[r.Language]
106				if extsForLang != nil {
107					extFrags := make([]string, 0, len(extsForLang))
108					for _, ext := range extsForLang {
109						extFrags = append(extFrags, regexp.QuoteMeta(ext))
110					}
111					if len(extFrags) > 0 {
112						pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|"))
113						// inlined copy of query.regexpQuery
114						re, err := syntax.Parse(pattern, syntax.Perl)
115						if err != nil {
116							return &query.Const{Value: false}
117						}
118						if re.Op == syntax.OpLiteral {
119							return &query.Substring{
120								Pattern:  string(re.Rune),
121								FileName: true,
122							}
123						}
124						return &query.Regexp{
125							Regexp:   re,
126							FileName: true,
127						}
128					}
129				}
130			}
131			if !has {
132				return &query.Const{Value: false}
133			}
134		}
135		return q
136	})
137	return query.Simplify(eval)
138}
139
140func (o *SearchOptions) SetDefaults() {
141	if o.ShardMaxMatchCount == 0 {
142		// We cap the total number of matches, so overly broad
143		// searches don't crash the machine.
144		o.ShardMaxMatchCount = 100000
145	}
146	if o.TotalMaxMatchCount == 0 {
147		o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount
148	}
149}
150
151func (d *indexData) Search(ctx context.Context, q query.Q, opts *SearchOptions) (sr *SearchResult, err error) {
152	timer := newTimer()
153
154	copyOpts := *opts
155	opts = &copyOpts
156	opts.SetDefaults()
157
158	var res SearchResult
159	if len(d.fileNameIndex) == 0 {
160		return &res, nil
161	}
162
163	select {
164	case <-ctx.Done():
165		res.Stats.ShardsSkipped++
166		return &res, nil
167	default:
168	}
169
170	q = d.simplify(q)
171	if c, ok := q.(*query.Const); ok && !c.Value {
172		return &res, nil
173	}
174
175	if opts.EstimateDocCount {
176		res.Stats.ShardFilesConsidered = len(d.fileBranchMasks)
177		return &res, nil
178	}
179
180	q = query.Map(q, query.ExpandFileContent)
181
182	mt, err := d.newMatchTree(q, matchTreeOpt{})
183	if err != nil {
184		return nil, err
185	}
186
187	// Capture the costs of construction before pruning
188	updateMatchTreeStats(mt, &res.Stats)
189
190	mt, err = pruneMatchTree(mt)
191	if err != nil {
192		return nil, err
193	}
194	res.Stats.MatchTreeConstruction = timer.Elapsed()
195	if mt == nil {
196		res.Stats.ShardsSkippedFilter++
197		return &res, nil
198	}
199
200	res.Stats.ShardsScanned++
201
202	cp := &contentProvider{
203		id:    d,
204		stats: &res.Stats,
205	}
206
207	// Track the number of documents found in a repository for
208	// ShardRepoMaxMatchCount
209	var (
210		lastRepoID     uint16
211		repoMatchCount int
212	)
213
214	docCount := uint32(len(d.fileBranchMasks))
215	lastDoc := int(-1)
216
217nextFileMatch:
218	for {
219		canceled := false
220		select {
221		case <-ctx.Done():
222			canceled = true
223		default:
224		}
225
226		nextDoc := mt.nextDoc()
227		if int(nextDoc) <= lastDoc {
228			nextDoc = uint32(lastDoc + 1)
229		}
230
231		for ; nextDoc < docCount; nextDoc++ {
232			repoID := d.repos[nextDoc]
233			repoMetadata := &d.repoMetaData[repoID]
234
235			// Skip tombstoned repositories
236			if repoMetadata.Tombstone {
237				continue
238			}
239
240			// Skip documents that are tombstoned
241			if len(repoMetadata.FileTombstones) > 0 {
242				if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned {
243					continue
244				}
245			}
246
247			// Skip documents over ShardRepoMaxMatchCount if specified.
248			if opts.ShardRepoMaxMatchCount > 0 {
249				if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID {
250					res.Stats.FilesSkipped++
251					continue
252				}
253			}
254
255			break
256		}
257
258		if nextDoc >= docCount {
259			break
260		}
261
262		lastDoc = int(nextDoc)
263
264		// We track lastRepoID for ShardRepoMaxMatchCount
265		if lastRepoID != d.repos[nextDoc] {
266			lastRepoID = d.repos[nextDoc]
267			repoMatchCount = 0
268		}
269
270		if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) {
271			res.Stats.FilesSkipped += int(docCount - nextDoc)
272			break
273		}
274
275		res.Stats.FilesConsidered++
276		mt.prepare(nextDoc)
277
278		cp.setDocument(nextDoc)
279
280		known := make(map[matchTree]bool)
281		md := d.repoMetaData[d.repos[nextDoc]]
282
283		for cost := costMin; cost <= costMax; cost++ {
284			v, ok := mt.matches(cp, cost, known)
285			if ok && !v {
286				continue nextFileMatch
287			}
288
289			if cost == costMax && !ok {
290				log.Panicf("did not decide. Repo %s, doc %d, known %v",
291					md.Name, nextDoc, known)
292			}
293		}
294
295		fileMatch := FileMatch{
296			Repository:         md.Name,
297			RepositoryID:       md.ID,
298			RepositoryPriority: md.priority,
299			FileName:           string(d.fileName(nextDoc)),
300			Checksum:           d.getChecksum(nextDoc),
301			Language:           d.languageMap[d.getLanguage(nextDoc)],
302		}
303
304		if s := d.subRepos[nextDoc]; s > 0 {
305			if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) {
306				log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths)
307			}
308			path := d.subRepoPaths[d.repos[nextDoc]][s]
309			fileMatch.SubRepositoryPath = path
310			sr := md.SubRepoMap[path]
311			fileMatch.SubRepositoryName = sr.Name
312			if idx := d.branchIndex(nextDoc); idx >= 0 {
313				fileMatch.Version = sr.Branches[idx].Version
314			}
315		} else {
316			idx := d.branchIndex(nextDoc)
317			if idx >= 0 {
318				fileMatch.Version = md.Branches[idx].Version
319			}
320		}
321
322		shouldMergeMatches := !opts.ChunkMatches
323		finalCands := gatherMatches(mt, known, shouldMergeMatches)
324
325		if len(finalCands) == 0 {
326			nm := d.fileName(nextDoc)
327			finalCands = append(finalCands,
328				&candidateMatch{
329					caseSensitive: false,
330					fileName:      true,
331					substrBytes:   nm,
332					substrLowered: nm,
333					file:          nextDoc,
334					runeOffset:    0,
335					byteOffset:    0,
336					byteMatchSz:   uint32(len(nm)),
337				})
338		}
339
340		if opts.ChunkMatches {
341			fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
342		} else {
343			fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
344		}
345
346		if opts.UseKeywordScoring {
347			d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts)
348		} else {
349			// Use the standard, non-experimental scoring method by default
350			d.scoreFile(&fileMatch, nextDoc, mt, known, opts)
351		}
352
353		fileMatch.Branches = d.gatherBranches(nextDoc, mt, known)
354		sortMatchesByScore(fileMatch.LineMatches)
355		sortChunkMatchesByScore(fileMatch.ChunkMatches)
356		if opts.Whole {
357			fileMatch.Content = cp.data(false)
358		}
359
360		matchedChunkRanges := 0
361		for _, cm := range fileMatch.ChunkMatches {
362			matchedChunkRanges += len(cm.Ranges)
363		}
364
365		repoMatchCount += len(fileMatch.LineMatches)
366		repoMatchCount += matchedChunkRanges
367
368		if opts.DebugScore {
369			fileMatch.Debug = fmt.Sprintf("score:%.2f <- %s", fileMatch.Score, fileMatch.Debug)
370		}
371
372		res.Files = append(res.Files, fileMatch)
373		res.Stats.MatchCount += len(fileMatch.LineMatches)
374		res.Stats.MatchCount += matchedChunkRanges
375		res.Stats.FileCount++
376	}
377
378	// We do not sort Files here, instead we rely on the shards pkg to do file
379	// ranking. If we sorted now, we would break the assumption that results
380	// from the same repo in a shard appear next to each other.
381
382	for _, md := range d.repoMetaData {
383		r := md
384		addRepo(&res, &r)
385		for _, v := range r.SubRepoMap {
386			addRepo(&res, v)
387		}
388	}
389
390	// Update stats based on work done during document search.
391	updateMatchTreeStats(mt, &res.Stats)
392
393	// If document ranking is enabled, then we can rank and truncate the files to save memory.
394	if opts.UseDocumentRanks {
395		res.Files = SortAndTruncateFiles(res.Files, opts)
396	}
397
398	res.Stats.MatchTreeSearch = timer.Elapsed()
399
400	return &res, nil
401}
402
403// scoreFile computes a score for the file match using various scoring signals, like
404// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
405func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
406	atomMatchCount := 0
407	visitMatches(mt, known, func(mt matchTree) {
408		atomMatchCount++
409	})
410
411	// atom-count boosts files with matches from more than 1 atom. The
412	// maximum boost is scoreFactorAtomMatch.
413	if atomMatchCount > 0 {
414		fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, opts.DebugScore)
415	}
416
417	maxFileScore := 0.0
418	repetitions := 0
419	for i := range fileMatch.LineMatches {
420		if maxFileScore < fileMatch.LineMatches[i].Score {
421			maxFileScore = fileMatch.LineMatches[i].Score
422			repetitions = 0
423		} else if maxFileScore == fileMatch.LineMatches[i].Score {
424			repetitions += 1
425		}
426
427		// Order by ordering in file.
428		fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
429	}
430
431	for i := range fileMatch.ChunkMatches {
432		if maxFileScore < fileMatch.ChunkMatches[i].Score {
433			maxFileScore = fileMatch.ChunkMatches[i].Score
434		}
435
436		// Order by ordering in file.
437		fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
438	}
439
440	// Maintain ordering of input files. This
441	// strictly dominates the in-file ordering of
442	// the matches.
443	fileMatch.addScore("fragment", maxFileScore, opts.DebugScore)
444
445	// Prefer docs with several top-scored matches.
446	fileMatch.addScore("repetition-boost", scoreRepetitionFactor*float64(repetitions), opts.DebugScore)
447
448	if opts.UseDocumentRanks && len(d.ranks) > int(doc) {
449		weight := scoreFileRankFactor
450		if opts.DocumentRanksWeight > 0.0 {
451			weight = opts.DocumentRanksWeight
452		}
453
454		ranks := d.ranks[doc]
455		// The ranks slice always contains one entry representing the file rank (unless it's empty since the
456		// file doesn't have a rank). This is left over from when documents could have multiple rank signals,
457		// and we plan to clean this up.
458		if len(ranks) > 0 {
459			// The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we
460			// cap it just in case to ensure it falls in the range [0, 1].
461			normalized := math.Min(1.0, ranks[0]/32.0)
462			fileMatch.addScore("file-rank", weight*normalized, opts.DebugScore)
463		}
464	}
465
466	md := d.repoMetaData[d.repos[doc]]
467	fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))), opts.DebugScore)
468	fileMatch.addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16, opts.DebugScore)
469}
470
471// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring
472// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
473// except inverse document frequency (idf), since we don't have access to global term frequency statistics.
474//
475// This scoring strategy ignores all other signals including document ranks. This keeps things simple for now,
476// since BM25 is not normalized and can be tricky to combine with other scoring signals.
477func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) {
478	// Treat each candidate match as a term and compute the frequencies. For now, ignore case
479	// sensitivity and treat filenames and symbols the same as content.
480	termFreqs := map[string]int{}
481	for _, cand := range cands {
482		term := string(cand.substrLowered)
483		termFreqs[term]++
484	}
485
486	// Compute the file length ratio. Usually the calculation would be based on terms, but using
487	// bytes should work fine, as we're just computing a ratio.
488	fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
489	numFiles := len(d.boundaries)
490	averageFileLength := float64(d.boundaries[numFiles-1]) / float64(numFiles)
491	L := fileLength / averageFileLength
492
493	// Use standard parameter defaults (used in Lucene and academic papers)
494	k, b := 1.2, 0.75
495	sumTf := 0.0 // Just for debugging
496	score := 0.0
497	for _, freq := range termFreqs {
498		tf := float64(freq)
499		sumTf += tf
500		score += ((k + 1.0) * tf) / (k*(1.0-b+b*L) + tf)
501	}
502
503	fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore)
504}
505
506func addRepo(res *SearchResult, repo *Repository) {
507	if res.RepoURLs == nil {
508		res.RepoURLs = map[string]string{}
509	}
510	res.RepoURLs[repo.Name] = repo.FileURLTemplate
511
512	if res.LineFragments == nil {
513		res.LineFragments = map[string]string{}
514	}
515	res.LineFragments[repo.Name] = repo.LineFragmentTemplate
516}
517
518type sortByOffsetSlice []*candidateMatch
519
520func (m sortByOffsetSlice) Len() int      { return len(m) }
521func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
522func (m sortByOffsetSlice) Less(i, j int) bool {
523	return m[i].byteOffset < m[j].byteOffset
524}
525
526// Gather matches from this document. This never returns a mixture of
527// filename/content matches: if there are content matches, all
528// filename matches are trimmed from the result. The matches are
529// returned in document order and are non-overlapping.
530//
531// If `merge` is set, overlapping and adjacent matches will be merged
532// into a single match. Otherwise, overlapping matches will be removed,
533// but adjacent matches will remain.
534func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch {
535	var cands []*candidateMatch
536	visitMatches(mt, known, func(mt matchTree) {
537		if smt, ok := mt.(*substrMatchTree); ok {
538			cands = append(cands, smt.current...)
539		}
540		if rmt, ok := mt.(*regexpMatchTree); ok {
541			cands = append(cands, rmt.found...)
542		}
543		if rmt, ok := mt.(*wordMatchTree); ok {
544			cands = append(cands, rmt.found...)
545		}
546		if smt, ok := mt.(*symbolRegexpMatchTree); ok {
547			cands = append(cands, smt.found...)
548		}
549	})
550
551	foundContentMatch := false
552	for _, c := range cands {
553		if !c.fileName {
554			foundContentMatch = true
555			break
556		}
557	}
558
559	res := cands[:0]
560	for _, c := range cands {
561		if !foundContentMatch || !c.fileName {
562			res = append(res, c)
563		}
564	}
565	cands = res
566
567	if merge {
568		// Merge adjacent candidates. This guarantees that the matches
569		// are non-overlapping.
570		sort.Sort((sortByOffsetSlice)(cands))
571		res = cands[:0]
572		for i, c := range cands {
573			if i == 0 {
574				res = append(res, c)
575				continue
576			}
577			last := res[len(res)-1]
578			lastEnd := last.byteOffset + last.byteMatchSz
579			end := c.byteOffset + c.byteMatchSz
580			if lastEnd >= c.byteOffset {
581				if end > lastEnd {
582					last.byteMatchSz = end - last.byteOffset
583				}
584				continue
585			}
586
587			res = append(res, c)
588		}
589	} else {
590		// Remove overlapping candidates. This guarantees that the matches
591		// are non-overlapping, but also preserves expected match counts.
592		sort.Sort((sortByOffsetSlice)(cands))
593		res = cands[:0]
594		for i, c := range cands {
595			if i == 0 {
596				res = append(res, c)
597				continue
598			}
599			last := res[len(res)-1]
600			lastEnd := last.byteOffset + last.byteMatchSz
601			if lastEnd > c.byteOffset {
602				continue
603			}
604
605			res = append(res, c)
606		}
607	}
608
609	return res
610}
611
612func (d *indexData) branchIndex(docID uint32) int {
613	mask := d.fileBranchMasks[docID]
614	idx := 0
615	for mask != 0 {
616		if mask&0x1 != 0 {
617			return idx
618		}
619		idx++
620		mask >>= 1
621	}
622	return -1
623}
624
625// gatherBranches returns a list of branch names taking into account any branch
626// filters in the query. If the query contains a branch filter, it returns all
627// branches containing the docID and matching the branch filter. Otherwise, it
628// returns all branches containing docID.
629func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string {
630	var mask uint64
631	visitMatches(mt, known, func(mt matchTree) {
632		bq, ok := mt.(*branchQueryMatchTree)
633		if !ok {
634			return
635		}
636
637		mask = mask | bq.branchMask()
638	})
639
640	if mask == 0 {
641		mask = d.fileBranchMasks[docID]
642	}
643
644	var branches []string
645	id := uint32(1)
646	branchNames := d.branchNames[d.repos[docID]]
647	for mask != 0 {
648		if mask&0x1 != 0 {
649			branches = append(branches, branchNames[uint(id)])
650		}
651		id <<= 1
652		mask >>= 1
653	}
654
655	return branches
656}
657
658func (d *indexData) List(ctx context.Context, q query.Q, opts *ListOptions) (rl *RepoList, err error) {
659	var include func(rle *RepoListEntry) bool
660
661	q = d.simplify(q)
662	if c, ok := q.(*query.Const); ok {
663		if !c.Value {
664			return &RepoList{}, nil
665		}
666		include = func(rle *RepoListEntry) bool {
667			return true
668		}
669	} else {
670		sr, err := d.Search(ctx, q, &SearchOptions{
671			ShardRepoMaxMatchCount: 1,
672		})
673		if err != nil {
674			return nil, err
675		}
676
677		foundRepos := make(map[string]struct{}, len(sr.Files))
678		for _, file := range sr.Files {
679			foundRepos[file.Repository] = struct{}{}
680		}
681
682		include = func(rle *RepoListEntry) bool {
683			_, ok := foundRepos[rle.Repository.Name]
684			return ok
685		}
686	}
687
688	var l RepoList
689
690	field, err := opts.GetField()
691	if err != nil {
692		return nil, err
693	}
694	switch field {
695	case RepoListFieldRepos:
696		l.Repos = make([]*RepoListEntry, 0, len(d.repoListEntry))
697	case RepoListFieldMinimal:
698		l.Minimal = make(map[uint32]*MinimalRepoListEntry, len(d.repoListEntry))
699	case RepoListFieldReposMap:
700		l.ReposMap = make(ReposMap, len(d.repoListEntry))
701	}
702
703	for i := range d.repoListEntry {
704		if d.repoMetaData[i].Tombstone {
705			continue
706		}
707		rle := &d.repoListEntry[i]
708		if !include(rle) {
709			continue
710		}
711
712		l.Stats.Add(&rle.Stats)
713
714		// Backwards compat for when ID is missing
715		if rle.Repository.ID == 0 {
716			l.Repos = append(l.Repos, rle)
717			continue
718		}
719
720		switch field {
721		case RepoListFieldRepos:
722			l.Repos = append(l.Repos, rle)
723		case RepoListFieldMinimal:
724			l.Minimal[rle.Repository.ID] = &MinimalRepoListEntry{
725				HasSymbols:    rle.Repository.HasSymbols,
726				Branches:      rle.Repository.Branches,
727				IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(),
728			}
729		case RepoListFieldReposMap:
730			l.ReposMap[rle.Repository.ID] = MinimalRepoListEntry{
731				HasSymbols:    rle.Repository.HasSymbols,
732				Branches:      rle.Repository.Branches,
733				IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(),
734			}
735		}
736
737	}
738
739	// Only one of these fields is populated and in all cases the size of that
740	// field is the number of Repos in this shard.
741	l.Stats.Repos = len(l.Repos) + len(l.Minimal) + len(l.ReposMap)
742
743	return &l, nil
744}
745
746// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If
747// mt is equivalent to the input r, isEqual = true and the matchTree can be used
748// in place of the regex r. If singleLine = true, then the matchTree and all
749// its children only match terms on the same line. singleLine is used during
750// recursion to decide whether to return an andLineMatchTree (singleLine = true)
751// or a andMatchTree (singleLine = false).
752func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) {
753	// TODO - we could perhaps transform Begin/EndText in '\n'?
754	// TODO - we could perhaps transform CharClass in (OrQuery )
755	// if there are just a few runes, and part of a OpConcat?
756	switch r.Op {
757	case syntax.OpLiteral:
758		s := string(r.Rune)
759		if len(s) >= minTextSize {
760			mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: caseSensitive})
761			return mt, true, !strings.Contains(s, "\n"), err
762		}
763	case syntax.OpCapture:
764		return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
765
766	case syntax.OpPlus:
767		return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
768
769	case syntax.OpRepeat:
770		if r.Min == 1 {
771			return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
772		} else if r.Min > 1 {
773			// (x){2,} can't be expressed precisely by the matchTree
774			mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
775			return mt, false, singleLine, err
776		}
777	case syntax.OpConcat, syntax.OpAlternate:
778		var qs []matchTree
779		isEq := true
780		singleLine = true
781		for _, sr := range r.Sub {
782			if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil {
783				if err != nil {
784					return nil, false, false, err
785				}
786				isEq = isEq && subIsEq
787				singleLine = singleLine && subSingleLine
788				qs = append(qs, sq)
789			}
790		}
791		if r.Op == syntax.OpConcat {
792			if len(qs) > 1 {
793				isEq = false
794			}
795			newQs := make([]matchTree, 0, len(qs))
796			for _, q := range qs {
797				if _, ok := q.(*bruteForceMatchTree); ok {
798					continue
799				}
800				newQs = append(newQs, q)
801			}
802			if len(newQs) == 1 {
803				return newQs[0], isEq, singleLine, nil
804			}
805			if len(newQs) == 0 {
806				return &bruteForceMatchTree{}, isEq, singleLine, nil
807			}
808			if singleLine {
809				return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil
810			}
811			return &andMatchTree{newQs}, isEq, singleLine, nil
812		}
813		for _, q := range qs {
814			if _, ok := q.(*bruteForceMatchTree); ok {
815				return q, isEq, false, nil
816			}
817		}
818		if len(qs) == 0 {
819			return &noMatchTree{Why: "const"}, isEq, false, nil
820		}
821		return &orMatchTree{qs}, isEq, false, nil
822	case syntax.OpStar:
823		if r.Sub[0].Op == syntax.OpAnyCharNotNL {
824			return &bruteForceMatchTree{}, false, true, nil
825		}
826	}
827	return &bruteForceMatchTree{}, false, false, nil
828}
829
830type timer struct {
831	last time.Time
832}
833
834func newTimer() *timer {
835	return &timer{
836		last: time.Now(),
837	}
838}
839
840func (t *timer) Elapsed() time.Duration {
841	now := time.Now()
842	d := now.Sub(t.last)
843	t.last = now
844	return d
845}
Configure Feed

Configure Feed