eval.go at b45da91240d3a8d97acef8605e7d4f8ce25f81b2 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / eval.go
at b45da91240d3a8d97acef8605e7d4f8ce25f81b2 23 kB View raw
Julie Tibshirani Add experimental option for keyword scoring (#583) 3y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"context"
 19	"fmt"
 20	"log"
 21	"math"
 22	"regexp/syntax"
 23	"sort"
 24	"strings"
 25
 26	enry_data "github.com/go-enry/go-enry/v2/data"
 27	"github.com/grafana/regexp"
 28
 29	"github.com/sourcegraph/zoekt/query"
 30)
 31
 32const maxUInt16 = 0xffff
 33
 34func (m *FileMatch) addScore(what string, s float64, debugScore bool) {
 35	if debugScore {
 36		m.Debug += fmt.Sprintf("%s:%.2f, ", what, s)
 37	}
 38	m.Score += s
 39}
 40
 41// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all
 42// repository names fulfill the predicate, Const(false) if none of them do, and q
 43// otherwise.
 44func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*Repository) bool) query.Q {
 45	count := 0
 46	alive := len(d.repoMetaData)
 47	for i := range d.repoMetaData {
 48		if d.repoMetaData[i].Tombstone {
 49			alive--
 50		} else if predicate(&d.repoMetaData[i]) {
 51			count++
 52		}
 53	}
 54	if count == alive {
 55		return &query.Const{Value: true}
 56	}
 57	if count > 0 {
 58		return q
 59	}
 60	return &query.Const{Value: false}
 61}
 62
 63func (d *indexData) simplify(in query.Q) query.Q {
 64	eval := query.Map(in, func(q query.Q) query.Q {
 65		switch r := q.(type) {
 66		case *query.Repo:
 67			return d.simplifyMultiRepo(q, func(repo *Repository) bool {
 68				return r.Regexp.MatchString(repo.Name)
 69			})
 70		case *query.RepoRegexp:
 71			return d.simplifyMultiRepo(q, func(repo *Repository) bool {
 72				return r.Regexp.MatchString(repo.Name)
 73			})
 74		case *query.BranchesRepos:
 75			for i := range d.repoMetaData {
 76				for _, br := range r.List {
 77					if br.Repos.Contains(d.repoMetaData[i].ID) {
 78						return q
 79					}
 80				}
 81			}
 82			return &query.Const{Value: false}
 83		case *query.RepoSet:
 84			return d.simplifyMultiRepo(q, func(repo *Repository) bool {
 85				return r.Set[repo.Name]
 86			})
 87		case *query.RepoIDs:
 88			return d.simplifyMultiRepo(q, func(repo *Repository) bool {
 89				return r.Repos.Contains(repo.ID)
 90			})
 91		case *query.Language:
 92			_, has := d.metaData.LanguageMap[r.Language]
 93			if !has && d.metaData.IndexFeatureVersion < 12 {
 94				// For index files that haven't been re-indexed by go-enry,
 95				// fall back to file-based matching and continue even if this
 96				// repo doesn't have the specific language present.
 97				extsForLang := enry_data.ExtensionsByLanguage[r.Language]
 98				if extsForLang != nil {
 99					extFrags := make([]string, 0, len(extsForLang))
100					for _, ext := range extsForLang {
101						extFrags = append(extFrags, regexp.QuoteMeta(ext))
102					}
103					if len(extFrags) > 0 {
104						pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|"))
105						// inlined copy of query.regexpQuery
106						re, err := syntax.Parse(pattern, syntax.Perl)
107						if err != nil {
108							return &query.Const{Value: false}
109						}
110						if re.Op == syntax.OpLiteral {
111							return &query.Substring{
112								Pattern:  string(re.Rune),
113								FileName: true,
114							}
115						}
116						return &query.Regexp{
117							Regexp:   re,
118							FileName: true,
119						}
120					}
121				}
122			}
123			if !has {
124				return &query.Const{Value: false}
125			}
126		}
127		return q
128	})
129	return query.Simplify(eval)
130}
131
132func (o *SearchOptions) SetDefaults() {
133	if o.ShardMaxMatchCount == 0 {
134		// We cap the total number of matches, so overly broad
135		// searches don't crash the machine.
136		o.ShardMaxMatchCount = 100000
137	}
138	if o.TotalMaxMatchCount == 0 {
139		o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount
140	}
141}
142
143func (d *indexData) Search(ctx context.Context, q query.Q, opts *SearchOptions) (sr *SearchResult, err error) {
144	copyOpts := *opts
145	opts = &copyOpts
146	opts.SetDefaults()
147
148	var res SearchResult
149	if len(d.fileNameIndex) == 0 {
150		return &res, nil
151	}
152
153	select {
154	case <-ctx.Done():
155		res.Stats.ShardsSkipped++
156		return &res, nil
157	default:
158	}
159
160	q = d.simplify(q)
161	if c, ok := q.(*query.Const); ok && !c.Value {
162		return &res, nil
163	}
164
165	if opts.EstimateDocCount {
166		res.Stats.ShardFilesConsidered = len(d.fileBranchMasks)
167		return &res, nil
168	}
169
170	q = query.Map(q, query.ExpandFileContent)
171
172	mt, err := d.newMatchTree(q, matchTreeOpt{})
173	if err != nil {
174		return nil, err
175	}
176
177	mt, err = pruneMatchTree(mt)
178	if err != nil {
179		return nil, err
180	}
181	if mt == nil {
182		res.Stats.ShardsSkippedFilter++
183		return &res, nil
184	}
185
186	res.Stats.ShardsScanned++
187
188	cp := &contentProvider{
189		id:    d,
190		stats: &res.Stats,
191	}
192
193	// Track the number of documents found in a repository for
194	// ShardRepoMaxMatchCount
195	var (
196		lastRepoID     uint16
197		repoMatchCount int
198	)
199
200	docCount := uint32(len(d.fileBranchMasks))
201	lastDoc := int(-1)
202
203nextFileMatch:
204	for {
205		canceled := false
206		select {
207		case <-ctx.Done():
208			canceled = true
209		default:
210		}
211
212		nextDoc := mt.nextDoc()
213		if int(nextDoc) <= lastDoc {
214			nextDoc = uint32(lastDoc + 1)
215		}
216
217		for ; nextDoc < docCount; nextDoc++ {
218			repoID := d.repos[nextDoc]
219			repoMetadata := &d.repoMetaData[repoID]
220
221			// Skip tombstoned repositories
222			if repoMetadata.Tombstone {
223				continue
224			}
225
226			// Skip documents that are tombstoned
227			if len(repoMetadata.FileTombstones) > 0 {
228				if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned {
229					continue
230				}
231			}
232
233			// Skip documents over ShardRepoMaxMatchCount if specified.
234			if opts.ShardRepoMaxMatchCount > 0 {
235				if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID {
236					res.Stats.FilesSkipped++
237					continue
238				}
239			}
240
241			break
242		}
243
244		if nextDoc >= docCount {
245			break
246		}
247
248		lastDoc = int(nextDoc)
249
250		// We track lastRepoID for ShardRepoMaxMatchCount
251		if lastRepoID != d.repos[nextDoc] {
252			lastRepoID = d.repos[nextDoc]
253			repoMatchCount = 0
254		}
255
256		if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) {
257			res.Stats.FilesSkipped += int(docCount - nextDoc)
258			break
259		}
260
261		res.Stats.FilesConsidered++
262		mt.prepare(nextDoc)
263
264		cp.setDocument(nextDoc)
265
266		known := make(map[matchTree]bool)
267		md := d.repoMetaData[d.repos[nextDoc]]
268
269		for cost := costMin; cost <= costMax; cost++ {
270			v, ok := mt.matches(cp, cost, known)
271			if ok && !v {
272				continue nextFileMatch
273			}
274
275			if cost == costMax && !ok {
276				log.Panicf("did not decide. Repo %s, doc %d, known %v",
277					md.Name, nextDoc, known)
278			}
279		}
280
281		fileMatch := FileMatch{
282			Repository:         md.Name,
283			RepositoryID:       md.ID,
284			RepositoryPriority: md.priority,
285			FileName:           string(d.fileName(nextDoc)),
286			Checksum:           d.getChecksum(nextDoc),
287			Language:           d.languageMap[d.getLanguage(nextDoc)],
288		}
289
290		if s := d.subRepos[nextDoc]; s > 0 {
291			if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) {
292				log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths)
293			}
294			path := d.subRepoPaths[d.repos[nextDoc]][s]
295			fileMatch.SubRepositoryPath = path
296			sr := md.SubRepoMap[path]
297			fileMatch.SubRepositoryName = sr.Name
298			if idx := d.branchIndex(nextDoc); idx >= 0 {
299				fileMatch.Version = sr.Branches[idx].Version
300			}
301		} else {
302			idx := d.branchIndex(nextDoc)
303			if idx >= 0 {
304				fileMatch.Version = md.Branches[idx].Version
305			}
306		}
307
308		shouldMergeMatches := !opts.ChunkMatches
309		finalCands := gatherMatches(mt, known, shouldMergeMatches)
310
311		if len(finalCands) == 0 {
312			nm := d.fileName(nextDoc)
313			finalCands = append(finalCands,
314				&candidateMatch{
315					caseSensitive: false,
316					fileName:      true,
317					substrBytes:   nm,
318					substrLowered: nm,
319					file:          nextDoc,
320					runeOffset:    0,
321					byteOffset:    0,
322					byteMatchSz:   uint32(len(nm)),
323				})
324		}
325
326		if opts.ChunkMatches {
327			fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
328		} else {
329			fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
330		}
331
332		if opts.UseKeywordScoring {
333			d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts)
334		} else {
335			// Use the standard, non-experimental scoring method by default
336			d.scoreFile(&fileMatch, nextDoc, mt, known, opts)
337		}
338
339		fileMatch.Branches = d.gatherBranches(nextDoc, mt, known)
340		sortMatchesByScore(fileMatch.LineMatches)
341		sortChunkMatchesByScore(fileMatch.ChunkMatches)
342		if opts.Whole {
343			fileMatch.Content = cp.data(false)
344		}
345
346		matchedChunkRanges := 0
347		for _, cm := range fileMatch.ChunkMatches {
348			matchedChunkRanges += len(cm.Ranges)
349		}
350
351		repoMatchCount += len(fileMatch.LineMatches)
352		repoMatchCount += matchedChunkRanges
353
354		if opts.DebugScore {
355			fileMatch.Debug = fmt.Sprintf("score:%.2f <- %s", fileMatch.Score, fileMatch.Debug)
356		}
357
358		res.Files = append(res.Files, fileMatch)
359		res.Stats.MatchCount += len(fileMatch.LineMatches)
360		res.Stats.MatchCount += matchedChunkRanges
361		res.Stats.FileCount++
362	}
363
364	// We do not sort Files here, instead we rely on the shards pkg to do file
365	// ranking. If we sorted now, we would break the assumption that results
366	// from the same repo in a shard appear next to each other.
367
368	for _, md := range d.repoMetaData {
369		r := md
370		addRepo(&res, &r)
371		for _, v := range r.SubRepoMap {
372			addRepo(&res, v)
373		}
374	}
375
376	visitMatchTree(mt, func(mt matchTree) {
377		if atom, ok := mt.(interface{ updateStats(*Stats) }); ok {
378			atom.updateStats(&res.Stats)
379		}
380	})
381
382	// If document ranking is enabled, then we can rank and truncate the files to save memory.
383	if limit := opts.MaxDocDisplayCount; opts.UseDocumentRanks && limit > 0 && limit < len(res.Files) {
384		SortFiles(res.Files)
385		res.Files = res.Files[:limit]
386	}
387
388	return &res, nil
389}
390
391// scoreFile computes a score for the file match using various scoring signals, like
392// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
393func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
394	atomMatchCount := 0
395	visitMatches(mt, known, func(mt matchTree) {
396		atomMatchCount++
397	})
398
399	// atom-count boosts files with matches from more than 1 atom. The
400	// maximum boost is scoreFactorAtomMatch.
401	if atomMatchCount > 0 {
402		fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, opts.DebugScore)
403	}
404
405	maxFileScore := 0.0
406	repetitions := 0
407	for i := range fileMatch.LineMatches {
408		if maxFileScore < fileMatch.LineMatches[i].Score {
409			maxFileScore = fileMatch.LineMatches[i].Score
410			repetitions = 0
411		} else if maxFileScore == fileMatch.LineMatches[i].Score {
412			repetitions += 1
413		}
414
415		// Order by ordering in file.
416		fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
417	}
418
419	for i := range fileMatch.ChunkMatches {
420		if maxFileScore < fileMatch.ChunkMatches[i].Score {
421			maxFileScore = fileMatch.ChunkMatches[i].Score
422		}
423
424		// Order by ordering in file.
425		fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
426	}
427
428	// Maintain ordering of input files. This
429	// strictly dominates the in-file ordering of
430	// the matches.
431	fileMatch.addScore("fragment", maxFileScore, opts.DebugScore)
432
433	// Prefer docs with several top-scored matches.
434	fileMatch.addScore("repetition-boost", scoreRepetitionFactor*float64(repetitions), opts.DebugScore)
435
436	if opts.UseDocumentRanks && len(d.ranks) > int(doc) {
437		weight := scoreFileRankFactor
438		if opts.DocumentRanksWeight > 0.0 {
439			weight = opts.DocumentRanksWeight
440		}
441
442		ranks := d.ranks[doc]
443		// The ranks slice always contains one entry representing the file rank (unless it's empty since the
444		// file doesn't have a rank). This is left over from when documents could have multiple rank signals,
445		// and we plan to clean this up.
446		if len(ranks) > 0 {
447			// The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we
448			// cap it just in case to ensure it falls in the range [0, 1].
449			normalized := math.Min(1.0, ranks[0]/32.0)
450			fileMatch.addScore("file-rank", weight*normalized, opts.DebugScore)
451		}
452	}
453
454	md := d.repoMetaData[d.repos[doc]]
455	fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))), opts.DebugScore)
456	fileMatch.addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16, opts.DebugScore)
457}
458
459// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring
460// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
461// except inverse document frequency (idf), since we don't have access to global term frequency statistics.
462//
463// This scoring strategy ignores all other signals including document ranks. This keeps things simple for now,
464// since BM25 is not normalized and can be tricky to combine with other scoring signals.
465func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) {
466	// Treat each candidate match as a term and compute the frequencies. For now, ignore case
467	// sensitivity and treat filenames and symbols the same as content.
468	termFreqs := map[string]int{}
469	for _, cand := range cands {
470		term := string(cand.substrLowered)
471		termFreqs[term]++
472	}
473
474	// Compute the file length ratio. Usually the calculation would be based on terms, but using
475	// bytes should work fine, as we're just computing a ratio.
476	fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
477	numFiles := len(d.boundaries)
478	averageFileLength := float64(d.boundaries[numFiles - 1]) / float64(numFiles)
479	L := fileLength / averageFileLength
480
481	// Use standard parameter defaults (used in Lucene and academic papers)
482	k, b := 1.2, 0.75
483	score := 0.0
484	for _, freq := range termFreqs {
485		tf := float64(freq)
486		score += ((k + 1.0) * tf) / (k * (1.0 - b + b * L) + tf)
487	}
488
489	fileMatch.addScore("keyword-score", score, opts.DebugScore)
490}
491
492func addRepo(res *SearchResult, repo *Repository) {
493	if res.RepoURLs == nil {
494		res.RepoURLs = map[string]string{}
495	}
496	res.RepoURLs[repo.Name] = repo.FileURLTemplate
497
498	if res.LineFragments == nil {
499		res.LineFragments = map[string]string{}
500	}
501	res.LineFragments[repo.Name] = repo.LineFragmentTemplate
502}
503
504type sortByOffsetSlice []*candidateMatch
505
506func (m sortByOffsetSlice) Len() int      { return len(m) }
507func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
508func (m sortByOffsetSlice) Less(i, j int) bool {
509	return m[i].byteOffset < m[j].byteOffset
510}
511
512// Gather matches from this document. This never returns a mixture of
513// filename/content matches: if there are content matches, all
514// filename matches are trimmed from the result. The matches are
515// returned in document order and are non-overlapping.
516//
517// If `merge` is set, overlapping and adjacent matches will be merged
518// into a single match. Otherwise, overlapping matches will be removed,
519// but adjacent matches will remain.
520func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch {
521	var cands []*candidateMatch
522	visitMatches(mt, known, func(mt matchTree) {
523		if smt, ok := mt.(*substrMatchTree); ok {
524			cands = append(cands, smt.current...)
525		}
526		if rmt, ok := mt.(*regexpMatchTree); ok {
527			cands = append(cands, rmt.found...)
528		}
529		if rmt, ok := mt.(*wordMatchTree); ok {
530			cands = append(cands, rmt.found...)
531		}
532		if smt, ok := mt.(*symbolRegexpMatchTree); ok {
533			cands = append(cands, smt.found...)
534		}
535	})
536
537	foundContentMatch := false
538	for _, c := range cands {
539		if !c.fileName {
540			foundContentMatch = true
541			break
542		}
543	}
544
545	res := cands[:0]
546	for _, c := range cands {
547		if !foundContentMatch || !c.fileName {
548			res = append(res, c)
549		}
550	}
551	cands = res
552
553	if merge {
554		// Merge adjacent candidates. This guarantees that the matches
555		// are non-overlapping.
556		sort.Sort((sortByOffsetSlice)(cands))
557		res = cands[:0]
558		for i, c := range cands {
559			if i == 0 {
560				res = append(res, c)
561				continue
562			}
563			last := res[len(res)-1]
564			lastEnd := last.byteOffset + last.byteMatchSz
565			end := c.byteOffset + c.byteMatchSz
566			if lastEnd >= c.byteOffset {
567				if end > lastEnd {
568					last.byteMatchSz = end - last.byteOffset
569				}
570				continue
571			}
572
573			res = append(res, c)
574		}
575	} else {
576		// Remove overlapping candidates. This guarantees that the matches
577		// are non-overlapping, but also preserves expected match counts.
578		sort.Sort((sortByOffsetSlice)(cands))
579		res = cands[:0]
580		for i, c := range cands {
581			if i == 0 {
582				res = append(res, c)
583				continue
584			}
585			last := res[len(res)-1]
586			lastEnd := last.byteOffset + last.byteMatchSz
587			if lastEnd > c.byteOffset {
588				continue
589			}
590
591			res = append(res, c)
592		}
593	}
594
595	return res
596}
597
598func (d *indexData) branchIndex(docID uint32) int {
599	mask := d.fileBranchMasks[docID]
600	idx := 0
601	for mask != 0 {
602		if mask&0x1 != 0 {
603			return idx
604		}
605		idx++
606		mask >>= 1
607	}
608	return -1
609}
610
611// gatherBranches returns a list of branch names taking into account any branch
612// filters in the query. If the query contains a branch filter, it returns all
613// branches containing the docID and matching the branch filter. Otherwise, it
614// returns all branches containing docID.
615func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string {
616	var mask uint64
617	visitMatches(mt, known, func(mt matchTree) {
618		bq, ok := mt.(*branchQueryMatchTree)
619		if !ok {
620			return
621		}
622
623		mask = mask | bq.branchMask()
624	})
625
626	if mask == 0 {
627		mask = d.fileBranchMasks[docID]
628	}
629
630	var branches []string
631	id := uint32(1)
632	branchNames := d.branchNames[d.repos[docID]]
633	for mask != 0 {
634		if mask&0x1 != 0 {
635			branches = append(branches, branchNames[uint(id)])
636		}
637		id <<= 1
638		mask >>= 1
639	}
640
641	return branches
642}
643
644func (d *indexData) List(ctx context.Context, q query.Q, opts *ListOptions) (rl *RepoList, err error) {
645	var include func(rle *RepoListEntry) bool
646
647	q = d.simplify(q)
648	if c, ok := q.(*query.Const); ok {
649		if !c.Value {
650			return &RepoList{}, nil
651		}
652		include = func(rle *RepoListEntry) bool {
653			return true
654		}
655	} else {
656		sr, err := d.Search(ctx, q, &SearchOptions{
657			ShardRepoMaxMatchCount: 1,
658		})
659		if err != nil {
660			return nil, err
661		}
662
663		foundRepos := make(map[string]struct{}, len(sr.Files))
664		for _, file := range sr.Files {
665			foundRepos[file.Repository] = struct{}{}
666		}
667
668		include = func(rle *RepoListEntry) bool {
669			_, ok := foundRepos[rle.Repository.Name]
670			return ok
671		}
672	}
673
674	var l RepoList
675
676	field, err := opts.GetField()
677	if err != nil {
678		return nil, err
679	}
680	switch field {
681	case RepoListFieldRepos:
682		l.Repos = make([]*RepoListEntry, 0, len(d.repoListEntry))
683	case RepoListFieldMinimal:
684		l.Minimal = make(map[uint32]*MinimalRepoListEntry, len(d.repoListEntry))
685	case RepoListFieldReposMap:
686		l.ReposMap = make(ReposMap, len(d.repoListEntry))
687	}
688
689	for i := range d.repoListEntry {
690		if d.repoMetaData[i].Tombstone {
691			continue
692		}
693		rle := &d.repoListEntry[i]
694		if !include(rle) {
695			continue
696		}
697
698		l.Stats.Add(&rle.Stats)
699
700		// Backwards compat for when ID is missing
701		if rle.Repository.ID == 0 {
702			l.Repos = append(l.Repos, rle)
703			continue
704		}
705
706		switch field {
707		case RepoListFieldRepos:
708			l.Repos = append(l.Repos, rle)
709		case RepoListFieldMinimal:
710			l.Minimal[rle.Repository.ID] = &MinimalRepoListEntry{
711				HasSymbols: rle.Repository.HasSymbols,
712				Branches:   rle.Repository.Branches,
713			}
714		case RepoListFieldReposMap:
715			l.ReposMap[rle.Repository.ID] = MinimalRepoListEntry{
716				HasSymbols: rle.Repository.HasSymbols,
717				Branches:   rle.Repository.Branches,
718			}
719		}
720
721	}
722
723	return &l, nil
724}
725
726// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If
727// mt is equivalent to the input r, isEqual = true and the matchTree can be used
728// in place of the regex r. If singleLine = true, then the matchTree and all
729// its children only match terms on the same line. singleLine is used during
730// recursion to decide whether to return an andLineMatchTree (singleLine = true)
731// or a andMatchTree (singleLine = false).
732func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) {
733	// TODO - we could perhaps transform Begin/EndText in '\n'?
734	// TODO - we could perhaps transform CharClass in (OrQuery )
735	// if there are just a few runes, and part of a OpConcat?
736	switch r.Op {
737	case syntax.OpLiteral:
738		s := string(r.Rune)
739		if len(s) >= minTextSize {
740			mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: caseSensitive})
741			return mt, true, !strings.Contains(s, "\n"), err
742		}
743	case syntax.OpCapture:
744		return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
745
746	case syntax.OpPlus:
747		return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
748
749	case syntax.OpRepeat:
750		if r.Min == 1 {
751			return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
752		} else if r.Min > 1 {
753			// (x){2,} can't be expressed precisely by the matchTree
754			mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
755			return mt, false, singleLine, err
756		}
757	case syntax.OpConcat, syntax.OpAlternate:
758		var qs []matchTree
759		isEq := true
760		singleLine = true
761		for _, sr := range r.Sub {
762			if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil {
763				if err != nil {
764					return nil, false, false, err
765				}
766				isEq = isEq && subIsEq
767				singleLine = singleLine && subSingleLine
768				qs = append(qs, sq)
769			}
770		}
771		if r.Op == syntax.OpConcat {
772			if len(qs) > 1 {
773				isEq = false
774			}
775			newQs := make([]matchTree, 0, len(qs))
776			for _, q := range qs {
777				if _, ok := q.(*bruteForceMatchTree); ok {
778					continue
779				}
780				newQs = append(newQs, q)
781			}
782			if len(newQs) == 1 {
783				return newQs[0], isEq, singleLine, nil
784			}
785			if len(newQs) == 0 {
786				return &bruteForceMatchTree{}, isEq, singleLine, nil
787			}
788			if singleLine {
789				return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil
790			}
791			return &andMatchTree{newQs}, isEq, singleLine, nil
792		}
793		for _, q := range qs {
794			if _, ok := q.(*bruteForceMatchTree); ok {
795				return q, isEq, false, nil
796			}
797		}
798		if len(qs) == 0 {
799			return &noMatchTree{"const"}, isEq, false, nil
800		}
801		return &orMatchTree{qs}, isEq, false, nil
802	case syntax.OpStar:
803		if r.Sub[0].Op == syntax.OpAnyCharNotNL {
804			return &bruteForceMatchTree{}, false, true, nil
805		}
806	}
807	return &bruteForceMatchTree{}, false, false, nil
808}
Configure Feed

Configure Feed