eval.go at b9e6d9433e2e4438be6727ac3f1ae0a00e283220 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / eval.go
at b9e6d9433e2e4438be6727ac3f1ae0a00e283220 24 kB View raw
Keegan Carruthers-Smith shards: populate RepoList.Stats.Repos (#605) 3y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"context"
 19	"fmt"
 20	"log"
 21	"math"
 22	"regexp/syntax"
 23	"sort"
 24	"strings"
 25
 26	enry_data "github.com/go-enry/go-enry/v2/data"
 27	"github.com/grafana/regexp"
 28
 29	"github.com/sourcegraph/zoekt/query"
 30)
 31
 32const maxUInt16 = 0xffff
 33
 34func (m *FileMatch) addScore(what string, s float64, debugScore bool) {
 35	if debugScore {
 36		m.Debug += fmt.Sprintf("%s:%.2f, ", what, s)
 37	}
 38	m.Score += s
 39}
 40
 41func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) {
 42	if debugScore {
 43		m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L)
 44	}
 45	m.Score += score
 46}
 47
 48// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all
 49// repository names fulfill the predicate, Const(false) if none of them do, and q
 50// otherwise.
 51func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*Repository) bool) query.Q {
 52	count := 0
 53	alive := len(d.repoMetaData)
 54	for i := range d.repoMetaData {
 55		if d.repoMetaData[i].Tombstone {
 56			alive--
 57		} else if predicate(&d.repoMetaData[i]) {
 58			count++
 59		}
 60	}
 61	if count == alive {
 62		return &query.Const{Value: true}
 63	}
 64	if count > 0 {
 65		return q
 66	}
 67	return &query.Const{Value: false}
 68}
 69
 70func (d *indexData) simplify(in query.Q) query.Q {
 71	eval := query.Map(in, func(q query.Q) query.Q {
 72		switch r := q.(type) {
 73		case *query.Repo:
 74			return d.simplifyMultiRepo(q, func(repo *Repository) bool {
 75				return r.Regexp.MatchString(repo.Name)
 76			})
 77		case *query.RepoRegexp:
 78			return d.simplifyMultiRepo(q, func(repo *Repository) bool {
 79				return r.Regexp.MatchString(repo.Name)
 80			})
 81		case *query.BranchesRepos:
 82			for i := range d.repoMetaData {
 83				for _, br := range r.List {
 84					if br.Repos.Contains(d.repoMetaData[i].ID) {
 85						return q
 86					}
 87				}
 88			}
 89			return &query.Const{Value: false}
 90		case *query.RepoSet:
 91			return d.simplifyMultiRepo(q, func(repo *Repository) bool {
 92				return r.Set[repo.Name]
 93			})
 94		case *query.RepoIDs:
 95			return d.simplifyMultiRepo(q, func(repo *Repository) bool {
 96				return r.Repos.Contains(repo.ID)
 97			})
 98		case *query.Language:
 99			_, has := d.metaData.LanguageMap[r.Language]
100			if !has && d.metaData.IndexFeatureVersion < 12 {
101				// For index files that haven't been re-indexed by go-enry,
102				// fall back to file-based matching and continue even if this
103				// repo doesn't have the specific language present.
104				extsForLang := enry_data.ExtensionsByLanguage[r.Language]
105				if extsForLang != nil {
106					extFrags := make([]string, 0, len(extsForLang))
107					for _, ext := range extsForLang {
108						extFrags = append(extFrags, regexp.QuoteMeta(ext))
109					}
110					if len(extFrags) > 0 {
111						pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|"))
112						// inlined copy of query.regexpQuery
113						re, err := syntax.Parse(pattern, syntax.Perl)
114						if err != nil {
115							return &query.Const{Value: false}
116						}
117						if re.Op == syntax.OpLiteral {
118							return &query.Substring{
119								Pattern:  string(re.Rune),
120								FileName: true,
121							}
122						}
123						return &query.Regexp{
124							Regexp:   re,
125							FileName: true,
126						}
127					}
128				}
129			}
130			if !has {
131				return &query.Const{Value: false}
132			}
133		}
134		return q
135	})
136	return query.Simplify(eval)
137}
138
139func (o *SearchOptions) SetDefaults() {
140	if o.ShardMaxMatchCount == 0 {
141		// We cap the total number of matches, so overly broad
142		// searches don't crash the machine.
143		o.ShardMaxMatchCount = 100000
144	}
145	if o.TotalMaxMatchCount == 0 {
146		o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount
147	}
148}
149
150func (d *indexData) Search(ctx context.Context, q query.Q, opts *SearchOptions) (sr *SearchResult, err error) {
151	copyOpts := *opts
152	opts = &copyOpts
153	opts.SetDefaults()
154
155	var res SearchResult
156	if len(d.fileNameIndex) == 0 {
157		return &res, nil
158	}
159
160	select {
161	case <-ctx.Done():
162		res.Stats.ShardsSkipped++
163		return &res, nil
164	default:
165	}
166
167	q = d.simplify(q)
168	if c, ok := q.(*query.Const); ok && !c.Value {
169		return &res, nil
170	}
171
172	if opts.EstimateDocCount {
173		res.Stats.ShardFilesConsidered = len(d.fileBranchMasks)
174		return &res, nil
175	}
176
177	q = query.Map(q, query.ExpandFileContent)
178
179	mt, err := d.newMatchTree(q, matchTreeOpt{})
180	if err != nil {
181		return nil, err
182	}
183
184	mt, err = pruneMatchTree(mt)
185	if err != nil {
186		return nil, err
187	}
188	if mt == nil {
189		res.Stats.ShardsSkippedFilter++
190		return &res, nil
191	}
192
193	res.Stats.ShardsScanned++
194
195	cp := &contentProvider{
196		id:    d,
197		stats: &res.Stats,
198	}
199
200	// Track the number of documents found in a repository for
201	// ShardRepoMaxMatchCount
202	var (
203		lastRepoID     uint16
204		repoMatchCount int
205	)
206
207	docCount := uint32(len(d.fileBranchMasks))
208	lastDoc := int(-1)
209
210nextFileMatch:
211	for {
212		canceled := false
213		select {
214		case <-ctx.Done():
215			canceled = true
216		default:
217		}
218
219		nextDoc := mt.nextDoc()
220		if int(nextDoc) <= lastDoc {
221			nextDoc = uint32(lastDoc + 1)
222		}
223
224		for ; nextDoc < docCount; nextDoc++ {
225			repoID := d.repos[nextDoc]
226			repoMetadata := &d.repoMetaData[repoID]
227
228			// Skip tombstoned repositories
229			if repoMetadata.Tombstone {
230				continue
231			}
232
233			// Skip documents that are tombstoned
234			if len(repoMetadata.FileTombstones) > 0 {
235				if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned {
236					continue
237				}
238			}
239
240			// Skip documents over ShardRepoMaxMatchCount if specified.
241			if opts.ShardRepoMaxMatchCount > 0 {
242				if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID {
243					res.Stats.FilesSkipped++
244					continue
245				}
246			}
247
248			break
249		}
250
251		if nextDoc >= docCount {
252			break
253		}
254
255		lastDoc = int(nextDoc)
256
257		// We track lastRepoID for ShardRepoMaxMatchCount
258		if lastRepoID != d.repos[nextDoc] {
259			lastRepoID = d.repos[nextDoc]
260			repoMatchCount = 0
261		}
262
263		if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) {
264			res.Stats.FilesSkipped += int(docCount - nextDoc)
265			break
266		}
267
268		res.Stats.FilesConsidered++
269		mt.prepare(nextDoc)
270
271		cp.setDocument(nextDoc)
272
273		known := make(map[matchTree]bool)
274		md := d.repoMetaData[d.repos[nextDoc]]
275
276		for cost := costMin; cost <= costMax; cost++ {
277			v, ok := mt.matches(cp, cost, known)
278			if ok && !v {
279				continue nextFileMatch
280			}
281
282			if cost == costMax && !ok {
283				log.Panicf("did not decide. Repo %s, doc %d, known %v",
284					md.Name, nextDoc, known)
285			}
286		}
287
288		fileMatch := FileMatch{
289			Repository:         md.Name,
290			RepositoryID:       md.ID,
291			RepositoryPriority: md.priority,
292			FileName:           string(d.fileName(nextDoc)),
293			Checksum:           d.getChecksum(nextDoc),
294			Language:           d.languageMap[d.getLanguage(nextDoc)],
295		}
296
297		if s := d.subRepos[nextDoc]; s > 0 {
298			if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) {
299				log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths)
300			}
301			path := d.subRepoPaths[d.repos[nextDoc]][s]
302			fileMatch.SubRepositoryPath = path
303			sr := md.SubRepoMap[path]
304			fileMatch.SubRepositoryName = sr.Name
305			if idx := d.branchIndex(nextDoc); idx >= 0 {
306				fileMatch.Version = sr.Branches[idx].Version
307			}
308		} else {
309			idx := d.branchIndex(nextDoc)
310			if idx >= 0 {
311				fileMatch.Version = md.Branches[idx].Version
312			}
313		}
314
315		shouldMergeMatches := !opts.ChunkMatches
316		finalCands := gatherMatches(mt, known, shouldMergeMatches)
317
318		if len(finalCands) == 0 {
319			nm := d.fileName(nextDoc)
320			finalCands = append(finalCands,
321				&candidateMatch{
322					caseSensitive: false,
323					fileName:      true,
324					substrBytes:   nm,
325					substrLowered: nm,
326					file:          nextDoc,
327					runeOffset:    0,
328					byteOffset:    0,
329					byteMatchSz:   uint32(len(nm)),
330				})
331		}
332
333		if opts.ChunkMatches {
334			fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
335		} else {
336			fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
337		}
338
339		if opts.UseKeywordScoring {
340			d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts)
341		} else {
342			// Use the standard, non-experimental scoring method by default
343			d.scoreFile(&fileMatch, nextDoc, mt, known, opts)
344		}
345
346		fileMatch.Branches = d.gatherBranches(nextDoc, mt, known)
347		sortMatchesByScore(fileMatch.LineMatches)
348		sortChunkMatchesByScore(fileMatch.ChunkMatches)
349		if opts.Whole {
350			fileMatch.Content = cp.data(false)
351		}
352
353		matchedChunkRanges := 0
354		for _, cm := range fileMatch.ChunkMatches {
355			matchedChunkRanges += len(cm.Ranges)
356		}
357
358		repoMatchCount += len(fileMatch.LineMatches)
359		repoMatchCount += matchedChunkRanges
360
361		if opts.DebugScore {
362			fileMatch.Debug = fmt.Sprintf("score:%.2f <- %s", fileMatch.Score, fileMatch.Debug)
363		}
364
365		res.Files = append(res.Files, fileMatch)
366		res.Stats.MatchCount += len(fileMatch.LineMatches)
367		res.Stats.MatchCount += matchedChunkRanges
368		res.Stats.FileCount++
369	}
370
371	// We do not sort Files here, instead we rely on the shards pkg to do file
372	// ranking. If we sorted now, we would break the assumption that results
373	// from the same repo in a shard appear next to each other.
374
375	for _, md := range d.repoMetaData {
376		r := md
377		addRepo(&res, &r)
378		for _, v := range r.SubRepoMap {
379			addRepo(&res, v)
380		}
381	}
382
383	visitMatchTree(mt, func(mt matchTree) {
384		if atom, ok := mt.(interface{ updateStats(*Stats) }); ok {
385			atom.updateStats(&res.Stats)
386		}
387	})
388
389	// If document ranking is enabled, then we can rank and truncate the files to save memory.
390	if limit := opts.MaxDocDisplayCount; opts.UseDocumentRanks && limit > 0 && limit < len(res.Files) {
391		SortFiles(res.Files)
392		res.Files = res.Files[:limit]
393	}
394
395	return &res, nil
396}
397
398// scoreFile computes a score for the file match using various scoring signals, like
399// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
400func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
401	atomMatchCount := 0
402	visitMatches(mt, known, func(mt matchTree) {
403		atomMatchCount++
404	})
405
406	// atom-count boosts files with matches from more than 1 atom. The
407	// maximum boost is scoreFactorAtomMatch.
408	if atomMatchCount > 0 {
409		fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, opts.DebugScore)
410	}
411
412	maxFileScore := 0.0
413	repetitions := 0
414	for i := range fileMatch.LineMatches {
415		if maxFileScore < fileMatch.LineMatches[i].Score {
416			maxFileScore = fileMatch.LineMatches[i].Score
417			repetitions = 0
418		} else if maxFileScore == fileMatch.LineMatches[i].Score {
419			repetitions += 1
420		}
421
422		// Order by ordering in file.
423		fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
424	}
425
426	for i := range fileMatch.ChunkMatches {
427		if maxFileScore < fileMatch.ChunkMatches[i].Score {
428			maxFileScore = fileMatch.ChunkMatches[i].Score
429		}
430
431		// Order by ordering in file.
432		fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
433	}
434
435	// Maintain ordering of input files. This
436	// strictly dominates the in-file ordering of
437	// the matches.
438	fileMatch.addScore("fragment", maxFileScore, opts.DebugScore)
439
440	// Prefer docs with several top-scored matches.
441	fileMatch.addScore("repetition-boost", scoreRepetitionFactor*float64(repetitions), opts.DebugScore)
442
443	if opts.UseDocumentRanks && len(d.ranks) > int(doc) {
444		weight := scoreFileRankFactor
445		if opts.DocumentRanksWeight > 0.0 {
446			weight = opts.DocumentRanksWeight
447		}
448
449		ranks := d.ranks[doc]
450		// The ranks slice always contains one entry representing the file rank (unless it's empty since the
451		// file doesn't have a rank). This is left over from when documents could have multiple rank signals,
452		// and we plan to clean this up.
453		if len(ranks) > 0 {
454			// The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we
455			// cap it just in case to ensure it falls in the range [0, 1].
456			normalized := math.Min(1.0, ranks[0]/32.0)
457			fileMatch.addScore("file-rank", weight*normalized, opts.DebugScore)
458		}
459	}
460
461	md := d.repoMetaData[d.repos[doc]]
462	fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))), opts.DebugScore)
463	fileMatch.addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16, opts.DebugScore)
464}
465
466// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring
467// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
468// except inverse document frequency (idf), since we don't have access to global term frequency statistics.
469//
470// This scoring strategy ignores all other signals including document ranks. This keeps things simple for now,
471// since BM25 is not normalized and can be tricky to combine with other scoring signals.
472func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) {
473	// Treat each candidate match as a term and compute the frequencies. For now, ignore case
474	// sensitivity and treat filenames and symbols the same as content.
475	termFreqs := map[string]int{}
476	for _, cand := range cands {
477		term := string(cand.substrLowered)
478		termFreqs[term]++
479	}
480
481	// Compute the file length ratio. Usually the calculation would be based on terms, but using
482	// bytes should work fine, as we're just computing a ratio.
483	fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
484	numFiles := len(d.boundaries)
485	averageFileLength := float64(d.boundaries[numFiles - 1]) / float64(numFiles)
486	L := fileLength / averageFileLength
487
488	// Use standard parameter defaults (used in Lucene and academic papers)
489	k, b := 1.2, 0.75
490	sumTf := 0.0 // Just for debugging
491	score := 0.0
492	for _, freq := range termFreqs {
493		tf := float64(freq)
494		sumTf += tf
495		score += ((k + 1.0) * tf) / (k * (1.0 - b + b * L) + tf)
496	}
497
498	fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore)
499}
500
501func addRepo(res *SearchResult, repo *Repository) {
502	if res.RepoURLs == nil {
503		res.RepoURLs = map[string]string{}
504	}
505	res.RepoURLs[repo.Name] = repo.FileURLTemplate
506
507	if res.LineFragments == nil {
508		res.LineFragments = map[string]string{}
509	}
510	res.LineFragments[repo.Name] = repo.LineFragmentTemplate
511}
512
513type sortByOffsetSlice []*candidateMatch
514
515func (m sortByOffsetSlice) Len() int      { return len(m) }
516func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
517func (m sortByOffsetSlice) Less(i, j int) bool {
518	return m[i].byteOffset < m[j].byteOffset
519}
520
521// Gather matches from this document. This never returns a mixture of
522// filename/content matches: if there are content matches, all
523// filename matches are trimmed from the result. The matches are
524// returned in document order and are non-overlapping.
525//
526// If `merge` is set, overlapping and adjacent matches will be merged
527// into a single match. Otherwise, overlapping matches will be removed,
528// but adjacent matches will remain.
529func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch {
530	var cands []*candidateMatch
531	visitMatches(mt, known, func(mt matchTree) {
532		if smt, ok := mt.(*substrMatchTree); ok {
533			cands = append(cands, smt.current...)
534		}
535		if rmt, ok := mt.(*regexpMatchTree); ok {
536			cands = append(cands, rmt.found...)
537		}
538		if rmt, ok := mt.(*wordMatchTree); ok {
539			cands = append(cands, rmt.found...)
540		}
541		if smt, ok := mt.(*symbolRegexpMatchTree); ok {
542			cands = append(cands, smt.found...)
543		}
544	})
545
546	foundContentMatch := false
547	for _, c := range cands {
548		if !c.fileName {
549			foundContentMatch = true
550			break
551		}
552	}
553
554	res := cands[:0]
555	for _, c := range cands {
556		if !foundContentMatch || !c.fileName {
557			res = append(res, c)
558		}
559	}
560	cands = res
561
562	if merge {
563		// Merge adjacent candidates. This guarantees that the matches
564		// are non-overlapping.
565		sort.Sort((sortByOffsetSlice)(cands))
566		res = cands[:0]
567		for i, c := range cands {
568			if i == 0 {
569				res = append(res, c)
570				continue
571			}
572			last := res[len(res)-1]
573			lastEnd := last.byteOffset + last.byteMatchSz
574			end := c.byteOffset + c.byteMatchSz
575			if lastEnd >= c.byteOffset {
576				if end > lastEnd {
577					last.byteMatchSz = end - last.byteOffset
578				}
579				continue
580			}
581
582			res = append(res, c)
583		}
584	} else {
585		// Remove overlapping candidates. This guarantees that the matches
586		// are non-overlapping, but also preserves expected match counts.
587		sort.Sort((sortByOffsetSlice)(cands))
588		res = cands[:0]
589		for i, c := range cands {
590			if i == 0 {
591				res = append(res, c)
592				continue
593			}
594			last := res[len(res)-1]
595			lastEnd := last.byteOffset + last.byteMatchSz
596			if lastEnd > c.byteOffset {
597				continue
598			}
599
600			res = append(res, c)
601		}
602	}
603
604	return res
605}
606
607func (d *indexData) branchIndex(docID uint32) int {
608	mask := d.fileBranchMasks[docID]
609	idx := 0
610	for mask != 0 {
611		if mask&0x1 != 0 {
612			return idx
613		}
614		idx++
615		mask >>= 1
616	}
617	return -1
618}
619
620// gatherBranches returns a list of branch names taking into account any branch
621// filters in the query. If the query contains a branch filter, it returns all
622// branches containing the docID and matching the branch filter. Otherwise, it
623// returns all branches containing docID.
624func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string {
625	var mask uint64
626	visitMatches(mt, known, func(mt matchTree) {
627		bq, ok := mt.(*branchQueryMatchTree)
628		if !ok {
629			return
630		}
631
632		mask = mask | bq.branchMask()
633	})
634
635	if mask == 0 {
636		mask = d.fileBranchMasks[docID]
637	}
638
639	var branches []string
640	id := uint32(1)
641	branchNames := d.branchNames[d.repos[docID]]
642	for mask != 0 {
643		if mask&0x1 != 0 {
644			branches = append(branches, branchNames[uint(id)])
645		}
646		id <<= 1
647		mask >>= 1
648	}
649
650	return branches
651}
652
653func (d *indexData) List(ctx context.Context, q query.Q, opts *ListOptions) (rl *RepoList, err error) {
654	var include func(rle *RepoListEntry) bool
655
656	q = d.simplify(q)
657	if c, ok := q.(*query.Const); ok {
658		if !c.Value {
659			return &RepoList{}, nil
660		}
661		include = func(rle *RepoListEntry) bool {
662			return true
663		}
664	} else {
665		sr, err := d.Search(ctx, q, &SearchOptions{
666			ShardRepoMaxMatchCount: 1,
667		})
668		if err != nil {
669			return nil, err
670		}
671
672		foundRepos := make(map[string]struct{}, len(sr.Files))
673		for _, file := range sr.Files {
674			foundRepos[file.Repository] = struct{}{}
675		}
676
677		include = func(rle *RepoListEntry) bool {
678			_, ok := foundRepos[rle.Repository.Name]
679			return ok
680		}
681	}
682
683	var l RepoList
684
685	field, err := opts.GetField()
686	if err != nil {
687		return nil, err
688	}
689	switch field {
690	case RepoListFieldRepos:
691		l.Repos = make([]*RepoListEntry, 0, len(d.repoListEntry))
692	case RepoListFieldMinimal:
693		l.Minimal = make(map[uint32]*MinimalRepoListEntry, len(d.repoListEntry))
694	case RepoListFieldReposMap:
695		l.ReposMap = make(ReposMap, len(d.repoListEntry))
696	}
697
698	for i := range d.repoListEntry {
699		if d.repoMetaData[i].Tombstone {
700			continue
701		}
702		rle := &d.repoListEntry[i]
703		if !include(rle) {
704			continue
705		}
706
707		l.Stats.Add(&rle.Stats)
708
709		// Backwards compat for when ID is missing
710		if rle.Repository.ID == 0 {
711			l.Repos = append(l.Repos, rle)
712			continue
713		}
714
715		switch field {
716		case RepoListFieldRepos:
717			l.Repos = append(l.Repos, rle)
718		case RepoListFieldMinimal:
719			l.Minimal[rle.Repository.ID] = &MinimalRepoListEntry{
720				HasSymbols:    rle.Repository.HasSymbols,
721				Branches:      rle.Repository.Branches,
722				IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(),
723			}
724		case RepoListFieldReposMap:
725			l.ReposMap[rle.Repository.ID] = MinimalRepoListEntry{
726				HasSymbols:    rle.Repository.HasSymbols,
727				Branches:      rle.Repository.Branches,
728				IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(),
729			}
730		}
731
732	}
733
734	// Only one of these fields is populated and in all cases the size of that
735	// field is the number of Repos in this shard.
736	l.Stats.Repos = len(l.Repos) + len(l.Minimal) + len(l.ReposMap)
737
738	return &l, nil
739}
740
741// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If
742// mt is equivalent to the input r, isEqual = true and the matchTree can be used
743// in place of the regex r. If singleLine = true, then the matchTree and all
744// its children only match terms on the same line. singleLine is used during
745// recursion to decide whether to return an andLineMatchTree (singleLine = true)
746// or a andMatchTree (singleLine = false).
747func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) {
748	// TODO - we could perhaps transform Begin/EndText in '\n'?
749	// TODO - we could perhaps transform CharClass in (OrQuery )
750	// if there are just a few runes, and part of a OpConcat?
751	switch r.Op {
752	case syntax.OpLiteral:
753		s := string(r.Rune)
754		if len(s) >= minTextSize {
755			mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: caseSensitive})
756			return mt, true, !strings.Contains(s, "\n"), err
757		}
758	case syntax.OpCapture:
759		return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
760
761	case syntax.OpPlus:
762		return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
763
764	case syntax.OpRepeat:
765		if r.Min == 1 {
766			return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
767		} else if r.Min > 1 {
768			// (x){2,} can't be expressed precisely by the matchTree
769			mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
770			return mt, false, singleLine, err
771		}
772	case syntax.OpConcat, syntax.OpAlternate:
773		var qs []matchTree
774		isEq := true
775		singleLine = true
776		for _, sr := range r.Sub {
777			if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil {
778				if err != nil {
779					return nil, false, false, err
780				}
781				isEq = isEq && subIsEq
782				singleLine = singleLine && subSingleLine
783				qs = append(qs, sq)
784			}
785		}
786		if r.Op == syntax.OpConcat {
787			if len(qs) > 1 {
788				isEq = false
789			}
790			newQs := make([]matchTree, 0, len(qs))
791			for _, q := range qs {
792				if _, ok := q.(*bruteForceMatchTree); ok {
793					continue
794				}
795				newQs = append(newQs, q)
796			}
797			if len(newQs) == 1 {
798				return newQs[0], isEq, singleLine, nil
799			}
800			if len(newQs) == 0 {
801				return &bruteForceMatchTree{}, isEq, singleLine, nil
802			}
803			if singleLine {
804				return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil
805			}
806			return &andMatchTree{newQs}, isEq, singleLine, nil
807		}
808		for _, q := range qs {
809			if _, ok := q.(*bruteForceMatchTree); ok {
810				return q, isEq, false, nil
811			}
812		}
813		if len(qs) == 0 {
814			return &noMatchTree{"const"}, isEq, false, nil
815		}
816		return &orMatchTree{qs}, isEq, false, nil
817	case syntax.OpStar:
818		if r.Sub[0].Op == syntax.OpAnyCharNotNL {
819			return &bruteForceMatchTree{}, false, true, nil
820		}
821	}
822	return &bruteForceMatchTree{}, false, false, nil
823}
Configure Feed

Configure Feed