fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "context"
19 "fmt"
20 "log"
21 "math"
22 "regexp/syntax"
23 "sort"
24 "strings"
25
26 enry_data "github.com/go-enry/go-enry/v2/data"
27 "github.com/grafana/regexp"
28
29 "github.com/sourcegraph/zoekt/query"
30)
31
32const maxUInt16 = 0xffff
33
34func (m *FileMatch) addScore(what string, s float64, debugScore bool) {
35 if debugScore {
36 m.Debug += fmt.Sprintf("%s:%.2f, ", what, s)
37 }
38 m.Score += s
39}
40
41// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all
42// repository names fulfill the predicate, Const(false) if none of them do, and q
43// otherwise.
44func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*Repository) bool) query.Q {
45 count := 0
46 alive := len(d.repoMetaData)
47 for i := range d.repoMetaData {
48 if d.repoMetaData[i].Tombstone {
49 alive--
50 } else if predicate(&d.repoMetaData[i]) {
51 count++
52 }
53 }
54 if count == alive {
55 return &query.Const{Value: true}
56 }
57 if count > 0 {
58 return q
59 }
60 return &query.Const{Value: false}
61}
62
63func (d *indexData) simplify(in query.Q) query.Q {
64 eval := query.Map(in, func(q query.Q) query.Q {
65 switch r := q.(type) {
66 case *query.Repo:
67 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
68 return r.Regexp.MatchString(repo.Name)
69 })
70 case *query.RepoRegexp:
71 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
72 return r.Regexp.MatchString(repo.Name)
73 })
74 case *query.BranchesRepos:
75 for i := range d.repoMetaData {
76 for _, br := range r.List {
77 if br.Repos.Contains(d.repoMetaData[i].ID) {
78 return q
79 }
80 }
81 }
82 return &query.Const{Value: false}
83 case *query.RepoSet:
84 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
85 return r.Set[repo.Name]
86 })
87 case *query.RepoIDs:
88 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
89 return r.Repos.Contains(repo.ID)
90 })
91 case *query.Language:
92 _, has := d.metaData.LanguageMap[r.Language]
93 if !has && d.metaData.IndexFeatureVersion < 12 {
94 // For index files that haven't been re-indexed by go-enry,
95 // fall back to file-based matching and continue even if this
96 // repo doesn't have the specific language present.
97 extsForLang := enry_data.ExtensionsByLanguage[r.Language]
98 if extsForLang != nil {
99 extFrags := make([]string, 0, len(extsForLang))
100 for _, ext := range extsForLang {
101 extFrags = append(extFrags, regexp.QuoteMeta(ext))
102 }
103 if len(extFrags) > 0 {
104 pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|"))
105 // inlined copy of query.regexpQuery
106 re, err := syntax.Parse(pattern, syntax.Perl)
107 if err != nil {
108 return &query.Const{Value: false}
109 }
110 if re.Op == syntax.OpLiteral {
111 return &query.Substring{
112 Pattern: string(re.Rune),
113 FileName: true,
114 }
115 }
116 return &query.Regexp{
117 Regexp: re,
118 FileName: true,
119 }
120 }
121 }
122 }
123 if !has {
124 return &query.Const{Value: false}
125 }
126 }
127 return q
128 })
129 return query.Simplify(eval)
130}
131
132func (o *SearchOptions) SetDefaults() {
133 if o.ShardMaxMatchCount == 0 {
134 // We cap the total number of matches, so overly broad
135 // searches don't crash the machine.
136 o.ShardMaxMatchCount = 100000
137 }
138 if o.TotalMaxMatchCount == 0 {
139 o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount
140 }
141}
142
143func (d *indexData) Search(ctx context.Context, q query.Q, opts *SearchOptions) (sr *SearchResult, err error) {
144 copyOpts := *opts
145 opts = ©Opts
146 opts.SetDefaults()
147
148 var res SearchResult
149 if len(d.fileNameIndex) == 0 {
150 return &res, nil
151 }
152
153 select {
154 case <-ctx.Done():
155 res.Stats.ShardsSkipped++
156 return &res, nil
157 default:
158 }
159
160 q = d.simplify(q)
161 if c, ok := q.(*query.Const); ok && !c.Value {
162 return &res, nil
163 }
164
165 if opts.EstimateDocCount {
166 res.Stats.ShardFilesConsidered = len(d.fileBranchMasks)
167 return &res, nil
168 }
169
170 q = query.Map(q, query.ExpandFileContent)
171
172 mt, err := d.newMatchTree(q, matchTreeOpt{})
173 if err != nil {
174 return nil, err
175 }
176
177 mt, err = pruneMatchTree(mt)
178 if err != nil {
179 return nil, err
180 }
181 if mt == nil {
182 res.Stats.ShardsSkippedFilter++
183 return &res, nil
184 }
185
186 res.Stats.ShardsScanned++
187
188 cp := &contentProvider{
189 id: d,
190 stats: &res.Stats,
191 }
192
193 // Track the number of documents found in a repository for
194 // ShardRepoMaxMatchCount
195 var (
196 lastRepoID uint16
197 repoMatchCount int
198 )
199
200 docCount := uint32(len(d.fileBranchMasks))
201 lastDoc := int(-1)
202
203nextFileMatch:
204 for {
205 canceled := false
206 select {
207 case <-ctx.Done():
208 canceled = true
209 default:
210 }
211
212 nextDoc := mt.nextDoc()
213 if int(nextDoc) <= lastDoc {
214 nextDoc = uint32(lastDoc + 1)
215 }
216
217 for ; nextDoc < docCount; nextDoc++ {
218 repoID := d.repos[nextDoc]
219 repoMetadata := &d.repoMetaData[repoID]
220
221 // Skip tombstoned repositories
222 if repoMetadata.Tombstone {
223 continue
224 }
225
226 // Skip documents that are tombstoned
227 if len(repoMetadata.FileTombstones) > 0 {
228 if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned {
229 continue
230 }
231 }
232
233 // Skip documents over ShardRepoMaxMatchCount if specified.
234 if opts.ShardRepoMaxMatchCount > 0 {
235 if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID {
236 res.Stats.FilesSkipped++
237 continue
238 }
239 }
240
241 break
242 }
243
244 if nextDoc >= docCount {
245 break
246 }
247
248 lastDoc = int(nextDoc)
249
250 // We track lastRepoID for ShardRepoMaxMatchCount
251 if lastRepoID != d.repos[nextDoc] {
252 lastRepoID = d.repos[nextDoc]
253 repoMatchCount = 0
254 }
255
256 if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) {
257 res.Stats.FilesSkipped += int(docCount - nextDoc)
258 break
259 }
260
261 res.Stats.FilesConsidered++
262 mt.prepare(nextDoc)
263
264 cp.setDocument(nextDoc)
265
266 known := make(map[matchTree]bool)
267 md := d.repoMetaData[d.repos[nextDoc]]
268
269 for cost := costMin; cost <= costMax; cost++ {
270 v, ok := mt.matches(cp, cost, known)
271 if ok && !v {
272 continue nextFileMatch
273 }
274
275 if cost == costMax && !ok {
276 log.Panicf("did not decide. Repo %s, doc %d, known %v",
277 md.Name, nextDoc, known)
278 }
279 }
280
281 fileMatch := FileMatch{
282 Repository: md.Name,
283 RepositoryID: md.ID,
284 RepositoryPriority: md.priority,
285 FileName: string(d.fileName(nextDoc)),
286 Checksum: d.getChecksum(nextDoc),
287 Language: d.languageMap[d.getLanguage(nextDoc)],
288 }
289
290 if s := d.subRepos[nextDoc]; s > 0 {
291 if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) {
292 log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths)
293 }
294 path := d.subRepoPaths[d.repos[nextDoc]][s]
295 fileMatch.SubRepositoryPath = path
296 sr := md.SubRepoMap[path]
297 fileMatch.SubRepositoryName = sr.Name
298 if idx := d.branchIndex(nextDoc); idx >= 0 {
299 fileMatch.Version = sr.Branches[idx].Version
300 }
301 } else {
302 idx := d.branchIndex(nextDoc)
303 if idx >= 0 {
304 fileMatch.Version = md.Branches[idx].Version
305 }
306 }
307
308 shouldMergeMatches := !opts.ChunkMatches
309 finalCands := gatherMatches(mt, known, shouldMergeMatches)
310
311 if len(finalCands) == 0 {
312 nm := d.fileName(nextDoc)
313 finalCands = append(finalCands,
314 &candidateMatch{
315 caseSensitive: false,
316 fileName: true,
317 substrBytes: nm,
318 substrLowered: nm,
319 file: nextDoc,
320 runeOffset: 0,
321 byteOffset: 0,
322 byteMatchSz: uint32(len(nm)),
323 })
324 }
325
326 if opts.ChunkMatches {
327 fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
328 } else {
329 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
330 }
331
332 if opts.UseKeywordScoring {
333 d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts)
334 } else {
335 // Use the standard, non-experimental scoring method by default
336 d.scoreFile(&fileMatch, nextDoc, mt, known, opts)
337 }
338
339 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known)
340 sortMatchesByScore(fileMatch.LineMatches)
341 sortChunkMatchesByScore(fileMatch.ChunkMatches)
342 if opts.Whole {
343 fileMatch.Content = cp.data(false)
344 }
345
346 matchedChunkRanges := 0
347 for _, cm := range fileMatch.ChunkMatches {
348 matchedChunkRanges += len(cm.Ranges)
349 }
350
351 repoMatchCount += len(fileMatch.LineMatches)
352 repoMatchCount += matchedChunkRanges
353
354 if opts.DebugScore {
355 fileMatch.Debug = fmt.Sprintf("score:%.2f <- %s", fileMatch.Score, fileMatch.Debug)
356 }
357
358 res.Files = append(res.Files, fileMatch)
359 res.Stats.MatchCount += len(fileMatch.LineMatches)
360 res.Stats.MatchCount += matchedChunkRanges
361 res.Stats.FileCount++
362 }
363
364 // We do not sort Files here, instead we rely on the shards pkg to do file
365 // ranking. If we sorted now, we would break the assumption that results
366 // from the same repo in a shard appear next to each other.
367
368 for _, md := range d.repoMetaData {
369 r := md
370 addRepo(&res, &r)
371 for _, v := range r.SubRepoMap {
372 addRepo(&res, v)
373 }
374 }
375
376 visitMatchTree(mt, func(mt matchTree) {
377 if atom, ok := mt.(interface{ updateStats(*Stats) }); ok {
378 atom.updateStats(&res.Stats)
379 }
380 })
381
382 // If document ranking is enabled, then we can rank and truncate the files to save memory.
383 if limit := opts.MaxDocDisplayCount; opts.UseDocumentRanks && limit > 0 && limit < len(res.Files) {
384 SortFiles(res.Files)
385 res.Files = res.Files[:limit]
386 }
387
388 return &res, nil
389}
390
391// scoreFile computes a score for the file match using various scoring signals, like
392// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
393func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
394 atomMatchCount := 0
395 visitMatches(mt, known, func(mt matchTree) {
396 atomMatchCount++
397 })
398
399 // atom-count boosts files with matches from more than 1 atom. The
400 // maximum boost is scoreFactorAtomMatch.
401 if atomMatchCount > 0 {
402 fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, opts.DebugScore)
403 }
404
405 maxFileScore := 0.0
406 repetitions := 0
407 for i := range fileMatch.LineMatches {
408 if maxFileScore < fileMatch.LineMatches[i].Score {
409 maxFileScore = fileMatch.LineMatches[i].Score
410 repetitions = 0
411 } else if maxFileScore == fileMatch.LineMatches[i].Score {
412 repetitions += 1
413 }
414
415 // Order by ordering in file.
416 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
417 }
418
419 for i := range fileMatch.ChunkMatches {
420 if maxFileScore < fileMatch.ChunkMatches[i].Score {
421 maxFileScore = fileMatch.ChunkMatches[i].Score
422 }
423
424 // Order by ordering in file.
425 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
426 }
427
428 // Maintain ordering of input files. This
429 // strictly dominates the in-file ordering of
430 // the matches.
431 fileMatch.addScore("fragment", maxFileScore, opts.DebugScore)
432
433 // Prefer docs with several top-scored matches.
434 fileMatch.addScore("repetition-boost", scoreRepetitionFactor*float64(repetitions), opts.DebugScore)
435
436 if opts.UseDocumentRanks && len(d.ranks) > int(doc) {
437 weight := scoreFileRankFactor
438 if opts.DocumentRanksWeight > 0.0 {
439 weight = opts.DocumentRanksWeight
440 }
441
442 ranks := d.ranks[doc]
443 // The ranks slice always contains one entry representing the file rank (unless it's empty since the
444 // file doesn't have a rank). This is left over from when documents could have multiple rank signals,
445 // and we plan to clean this up.
446 if len(ranks) > 0 {
447 // The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we
448 // cap it just in case to ensure it falls in the range [0, 1].
449 normalized := math.Min(1.0, ranks[0]/32.0)
450 fileMatch.addScore("file-rank", weight*normalized, opts.DebugScore)
451 }
452 }
453
454 md := d.repoMetaData[d.repos[doc]]
455 fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))), opts.DebugScore)
456 fileMatch.addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16, opts.DebugScore)
457}
458
459// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring
460// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
461// except inverse document frequency (idf), since we don't have access to global term frequency statistics.
462//
463// This scoring strategy ignores all other signals including document ranks. This keeps things simple for now,
464// since BM25 is not normalized and can be tricky to combine with other scoring signals.
465func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) {
466 // Treat each candidate match as a term and compute the frequencies. For now, ignore case
467 // sensitivity and treat filenames and symbols the same as content.
468 termFreqs := map[string]int{}
469 for _, cand := range cands {
470 term := string(cand.substrLowered)
471 termFreqs[term]++
472 }
473
474 // Compute the file length ratio. Usually the calculation would be based on terms, but using
475 // bytes should work fine, as we're just computing a ratio.
476 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
477 numFiles := len(d.boundaries)
478 averageFileLength := float64(d.boundaries[numFiles - 1]) / float64(numFiles)
479 L := fileLength / averageFileLength
480
481 // Use standard parameter defaults (used in Lucene and academic papers)
482 k, b := 1.2, 0.75
483 score := 0.0
484 for _, freq := range termFreqs {
485 tf := float64(freq)
486 score += ((k + 1.0) * tf) / (k * (1.0 - b + b * L) + tf)
487 }
488
489 fileMatch.addScore("keyword-score", score, opts.DebugScore)
490}
491
492func addRepo(res *SearchResult, repo *Repository) {
493 if res.RepoURLs == nil {
494 res.RepoURLs = map[string]string{}
495 }
496 res.RepoURLs[repo.Name] = repo.FileURLTemplate
497
498 if res.LineFragments == nil {
499 res.LineFragments = map[string]string{}
500 }
501 res.LineFragments[repo.Name] = repo.LineFragmentTemplate
502}
503
504type sortByOffsetSlice []*candidateMatch
505
506func (m sortByOffsetSlice) Len() int { return len(m) }
507func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
508func (m sortByOffsetSlice) Less(i, j int) bool {
509 return m[i].byteOffset < m[j].byteOffset
510}
511
512// Gather matches from this document. This never returns a mixture of
513// filename/content matches: if there are content matches, all
514// filename matches are trimmed from the result. The matches are
515// returned in document order and are non-overlapping.
516//
517// If `merge` is set, overlapping and adjacent matches will be merged
518// into a single match. Otherwise, overlapping matches will be removed,
519// but adjacent matches will remain.
520func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch {
521 var cands []*candidateMatch
522 visitMatches(mt, known, func(mt matchTree) {
523 if smt, ok := mt.(*substrMatchTree); ok {
524 cands = append(cands, smt.current...)
525 }
526 if rmt, ok := mt.(*regexpMatchTree); ok {
527 cands = append(cands, rmt.found...)
528 }
529 if rmt, ok := mt.(*wordMatchTree); ok {
530 cands = append(cands, rmt.found...)
531 }
532 if smt, ok := mt.(*symbolRegexpMatchTree); ok {
533 cands = append(cands, smt.found...)
534 }
535 })
536
537 foundContentMatch := false
538 for _, c := range cands {
539 if !c.fileName {
540 foundContentMatch = true
541 break
542 }
543 }
544
545 res := cands[:0]
546 for _, c := range cands {
547 if !foundContentMatch || !c.fileName {
548 res = append(res, c)
549 }
550 }
551 cands = res
552
553 if merge {
554 // Merge adjacent candidates. This guarantees that the matches
555 // are non-overlapping.
556 sort.Sort((sortByOffsetSlice)(cands))
557 res = cands[:0]
558 for i, c := range cands {
559 if i == 0 {
560 res = append(res, c)
561 continue
562 }
563 last := res[len(res)-1]
564 lastEnd := last.byteOffset + last.byteMatchSz
565 end := c.byteOffset + c.byteMatchSz
566 if lastEnd >= c.byteOffset {
567 if end > lastEnd {
568 last.byteMatchSz = end - last.byteOffset
569 }
570 continue
571 }
572
573 res = append(res, c)
574 }
575 } else {
576 // Remove overlapping candidates. This guarantees that the matches
577 // are non-overlapping, but also preserves expected match counts.
578 sort.Sort((sortByOffsetSlice)(cands))
579 res = cands[:0]
580 for i, c := range cands {
581 if i == 0 {
582 res = append(res, c)
583 continue
584 }
585 last := res[len(res)-1]
586 lastEnd := last.byteOffset + last.byteMatchSz
587 if lastEnd > c.byteOffset {
588 continue
589 }
590
591 res = append(res, c)
592 }
593 }
594
595 return res
596}
597
598func (d *indexData) branchIndex(docID uint32) int {
599 mask := d.fileBranchMasks[docID]
600 idx := 0
601 for mask != 0 {
602 if mask&0x1 != 0 {
603 return idx
604 }
605 idx++
606 mask >>= 1
607 }
608 return -1
609}
610
611// gatherBranches returns a list of branch names taking into account any branch
612// filters in the query. If the query contains a branch filter, it returns all
613// branches containing the docID and matching the branch filter. Otherwise, it
614// returns all branches containing docID.
615func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string {
616 var mask uint64
617 visitMatches(mt, known, func(mt matchTree) {
618 bq, ok := mt.(*branchQueryMatchTree)
619 if !ok {
620 return
621 }
622
623 mask = mask | bq.branchMask()
624 })
625
626 if mask == 0 {
627 mask = d.fileBranchMasks[docID]
628 }
629
630 var branches []string
631 id := uint32(1)
632 branchNames := d.branchNames[d.repos[docID]]
633 for mask != 0 {
634 if mask&0x1 != 0 {
635 branches = append(branches, branchNames[uint(id)])
636 }
637 id <<= 1
638 mask >>= 1
639 }
640
641 return branches
642}
643
644func (d *indexData) List(ctx context.Context, q query.Q, opts *ListOptions) (rl *RepoList, err error) {
645 var include func(rle *RepoListEntry) bool
646
647 q = d.simplify(q)
648 if c, ok := q.(*query.Const); ok {
649 if !c.Value {
650 return &RepoList{}, nil
651 }
652 include = func(rle *RepoListEntry) bool {
653 return true
654 }
655 } else {
656 sr, err := d.Search(ctx, q, &SearchOptions{
657 ShardRepoMaxMatchCount: 1,
658 })
659 if err != nil {
660 return nil, err
661 }
662
663 foundRepos := make(map[string]struct{}, len(sr.Files))
664 for _, file := range sr.Files {
665 foundRepos[file.Repository] = struct{}{}
666 }
667
668 include = func(rle *RepoListEntry) bool {
669 _, ok := foundRepos[rle.Repository.Name]
670 return ok
671 }
672 }
673
674 var l RepoList
675
676 field, err := opts.GetField()
677 if err != nil {
678 return nil, err
679 }
680 switch field {
681 case RepoListFieldRepos:
682 l.Repos = make([]*RepoListEntry, 0, len(d.repoListEntry))
683 case RepoListFieldMinimal:
684 l.Minimal = make(map[uint32]*MinimalRepoListEntry, len(d.repoListEntry))
685 case RepoListFieldReposMap:
686 l.ReposMap = make(ReposMap, len(d.repoListEntry))
687 }
688
689 for i := range d.repoListEntry {
690 if d.repoMetaData[i].Tombstone {
691 continue
692 }
693 rle := &d.repoListEntry[i]
694 if !include(rle) {
695 continue
696 }
697
698 l.Stats.Add(&rle.Stats)
699
700 // Backwards compat for when ID is missing
701 if rle.Repository.ID == 0 {
702 l.Repos = append(l.Repos, rle)
703 continue
704 }
705
706 switch field {
707 case RepoListFieldRepos:
708 l.Repos = append(l.Repos, rle)
709 case RepoListFieldMinimal:
710 l.Minimal[rle.Repository.ID] = &MinimalRepoListEntry{
711 HasSymbols: rle.Repository.HasSymbols,
712 Branches: rle.Repository.Branches,
713 }
714 case RepoListFieldReposMap:
715 l.ReposMap[rle.Repository.ID] = MinimalRepoListEntry{
716 HasSymbols: rle.Repository.HasSymbols,
717 Branches: rle.Repository.Branches,
718 }
719 }
720
721 }
722
723 return &l, nil
724}
725
726// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If
727// mt is equivalent to the input r, isEqual = true and the matchTree can be used
728// in place of the regex r. If singleLine = true, then the matchTree and all
729// its children only match terms on the same line. singleLine is used during
730// recursion to decide whether to return an andLineMatchTree (singleLine = true)
731// or a andMatchTree (singleLine = false).
732func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) {
733 // TODO - we could perhaps transform Begin/EndText in '\n'?
734 // TODO - we could perhaps transform CharClass in (OrQuery )
735 // if there are just a few runes, and part of a OpConcat?
736 switch r.Op {
737 case syntax.OpLiteral:
738 s := string(r.Rune)
739 if len(s) >= minTextSize {
740 mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: caseSensitive})
741 return mt, true, !strings.Contains(s, "\n"), err
742 }
743 case syntax.OpCapture:
744 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
745
746 case syntax.OpPlus:
747 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
748
749 case syntax.OpRepeat:
750 if r.Min == 1 {
751 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
752 } else if r.Min > 1 {
753 // (x){2,} can't be expressed precisely by the matchTree
754 mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
755 return mt, false, singleLine, err
756 }
757 case syntax.OpConcat, syntax.OpAlternate:
758 var qs []matchTree
759 isEq := true
760 singleLine = true
761 for _, sr := range r.Sub {
762 if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil {
763 if err != nil {
764 return nil, false, false, err
765 }
766 isEq = isEq && subIsEq
767 singleLine = singleLine && subSingleLine
768 qs = append(qs, sq)
769 }
770 }
771 if r.Op == syntax.OpConcat {
772 if len(qs) > 1 {
773 isEq = false
774 }
775 newQs := make([]matchTree, 0, len(qs))
776 for _, q := range qs {
777 if _, ok := q.(*bruteForceMatchTree); ok {
778 continue
779 }
780 newQs = append(newQs, q)
781 }
782 if len(newQs) == 1 {
783 return newQs[0], isEq, singleLine, nil
784 }
785 if len(newQs) == 0 {
786 return &bruteForceMatchTree{}, isEq, singleLine, nil
787 }
788 if singleLine {
789 return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil
790 }
791 return &andMatchTree{newQs}, isEq, singleLine, nil
792 }
793 for _, q := range qs {
794 if _, ok := q.(*bruteForceMatchTree); ok {
795 return q, isEq, false, nil
796 }
797 }
798 if len(qs) == 0 {
799 return &noMatchTree{"const"}, isEq, false, nil
800 }
801 return &orMatchTree{qs}, isEq, false, nil
802 case syntax.OpStar:
803 if r.Sub[0].Op == syntax.OpAnyCharNotNL {
804 return &bruteForceMatchTree{}, false, true, nil
805 }
806 }
807 return &bruteForceMatchTree{}, false, false, nil
808}