fork of https://github.com/sourcegraph/zoekt
1// Copyright 2018 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "fmt"
19 "log"
20 "strings"
21 "unicode/utf8"
22
23 "github.com/grafana/regexp"
24 "github.com/sourcegraph/zoekt/query"
25)
26
27// A docIterator iterates over documents in order.
28type docIterator interface {
29 // provide the next document where we can may find something
30 // interesting.
31 nextDoc() uint32
32
33 // clears any per-document state of the docIterator, and
34 // prepares for evaluating the given doc. The argument is
35 // strictly increasing over time.
36 prepare(nextDoc uint32)
37}
38
39const (
40 costConst = 0
41 costMemory = 1
42 costContent = 2
43 costRegexp = 3
44)
45
46const (
47 costMin = costConst
48 costMax = costRegexp
49)
50
51// An expression tree coupled with matches. The matchtree has two
52// functions:
53//
54// * it implements boolean combinations (and, or, not)
55//
56// * it implements shortcuts, where we skip documents (for example: if
57// there are no trigram matches, we can be sure there are no substring
58// matches). The matchtree iterates over the documents as they are
59// ordered in the shard.
60//
61// The general process for a given (shard, query) is:
62//
63// - construct matchTree for the query
64//
65// - find all different leaf matchTrees (substring, regexp, etc.)
66//
67// in a loop:
68//
69// - find next doc to process using nextDoc
70//
71// - evaluate atoms (leaf expressions that match text)
72//
73// - evaluate the tree using matches(), storing the result in map.
74//
75// - if the complete tree returns (matches() == true) for the document,
76// collect all text matches by looking at leaf matchTrees
77type matchTree interface {
78 docIterator
79
80 // returns whether this matches, and if we are sure.
81 matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool)
82}
83
84// docMatchTree iterates over documents for which predicate(docID) returns true.
85type docMatchTree struct {
86 // the number of documents in a shard.
87 numDocs uint32
88
89 predicate func(docID uint32) bool
90
91 // provides additional information about the reason why the docMatchTree was
92 // created.
93 reason string
94
95 // mutable
96 firstDone bool
97 docID uint32
98}
99
100type bruteForceMatchTree struct {
101 // mutable
102 firstDone bool
103 docID uint32
104}
105
106type andLineMatchTree struct {
107 andMatchTree
108}
109
110type andMatchTree struct {
111 children []matchTree
112}
113
114type orMatchTree struct {
115 children []matchTree
116}
117
118type notMatchTree struct {
119 child matchTree
120}
121
122// Returns only the filename of child matches.
123type fileNameMatchTree struct {
124 child matchTree
125}
126
127// Don't visit this subtree for collecting matches.
128type noVisitMatchTree struct {
129 matchTree
130}
131
132type regexpMatchTree struct {
133 regexp *regexp.Regexp
134
135 fileName bool
136
137 // mutable
138 reEvaluated bool
139 found []*candidateMatch
140
141 // nextDoc, prepare.
142 bruteForceMatchTree
143}
144
145type substrMatchTree struct {
146 matchIterator
147
148 query *query.Substring
149 caseSensitive bool
150 fileName bool
151
152 // mutable
153 current []*candidateMatch
154 contEvaluated bool
155}
156
157type branchQueryMatchTree struct {
158 fileMasks []uint64
159 masks []uint64
160 repos []uint16
161
162 // mutable
163 firstDone bool
164 docID uint32
165}
166
167type symbolRegexpMatchTree struct {
168 matchTree
169 regexp *regexp.Regexp
170 all bool // skips regex match if .*
171
172 reEvaluated bool
173 found []*candidateMatch
174}
175
176func (t *symbolRegexpMatchTree) prepare(doc uint32) {
177 t.reEvaluated = false
178}
179
180func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
181 if t.reEvaluated {
182 return len(t.found) > 0, true
183 }
184
185 if cost < costRegexp {
186 return false, false
187 }
188
189 sections := cp.docSections()
190 content := cp.data(false)
191
192 found := t.found[:0]
193 for i, sec := range sections {
194 var idx []int
195 if t.all {
196 idx = []int{0, int(sec.End - sec.Start)}
197 } else {
198 idx = t.regexp.FindIndex(content[sec.Start:sec.End])
199 if idx == nil {
200 continue
201 }
202 }
203
204 cm := &candidateMatch{
205 byteOffset: sec.Start + uint32(idx[0]),
206 byteMatchSz: uint32(idx[1] - idx[0]),
207 symbol: true,
208 symbolIdx: uint32(i),
209 }
210 found = append(found, cm)
211 }
212 t.found = found
213 t.reEvaluated = true
214
215 return len(t.found) > 0, true
216}
217
218type symbolSubstrMatchTree struct {
219 *substrMatchTree
220
221 patternSize uint32
222 fileEndRunes []uint32
223 fileEndSymbol []uint32
224
225 doc uint32
226 sections []DocumentSection
227
228 secID uint32
229}
230
231func (t *symbolSubstrMatchTree) prepare(doc uint32) {
232 t.substrMatchTree.prepare(doc)
233 t.doc = doc
234
235 var fileStart uint32
236 if doc > 0 {
237 fileStart = t.fileEndRunes[doc-1]
238 }
239
240 var sections []DocumentSection
241 if len(t.sections) > 0 {
242 most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
243 if most == uint32(len(t.sections)) {
244 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
245 } else {
246 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
247 t.secID++
248 }
249
250 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
251 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
252 symbolEnd++
253 }
254
255 sections = t.sections[t.secID:symbolEnd]
256 }
257 }
258
259 secIdx := 0
260 trimmed := t.current[:0]
261 for len(sections) > secIdx && len(t.current) > 0 {
262 start := fileStart + t.current[0].runeOffset
263 end := start + t.patternSize
264 if start >= sections[secIdx].End {
265 secIdx++
266 continue
267 }
268
269 if start < sections[secIdx].Start {
270 t.current = t.current[1:]
271 continue
272 }
273
274 if end <= sections[secIdx].End {
275 t.current[0].symbol = true
276 t.current[0].symbolIdx = uint32(secIdx)
277 trimmed = append(trimmed, t.current[0])
278 }
279
280 t.current = t.current[1:]
281 }
282 t.current = trimmed
283}
284
285// all prepare methods
286
287func (t *bruteForceMatchTree) prepare(doc uint32) {
288 t.docID = doc
289 t.firstDone = true
290}
291
292func (t *docMatchTree) prepare(doc uint32) {
293 t.docID = doc
294 t.firstDone = true
295}
296
297func (t *andMatchTree) prepare(doc uint32) {
298 for _, c := range t.children {
299 c.prepare(doc)
300 }
301}
302
303func (t *regexpMatchTree) prepare(doc uint32) {
304 t.found = t.found[:0]
305 t.reEvaluated = false
306 t.bruteForceMatchTree.prepare(doc)
307}
308
309func (t *orMatchTree) prepare(doc uint32) {
310 for _, c := range t.children {
311 c.prepare(doc)
312 }
313}
314
315func (t *notMatchTree) prepare(doc uint32) {
316 t.child.prepare(doc)
317}
318
319func (t *fileNameMatchTree) prepare(doc uint32) {
320 t.child.prepare(doc)
321}
322
323func (t *substrMatchTree) prepare(nextDoc uint32) {
324 t.matchIterator.prepare(nextDoc)
325 t.current = t.matchIterator.candidates()
326 t.contEvaluated = false
327}
328
329func (t *branchQueryMatchTree) prepare(doc uint32) {
330 t.firstDone = true
331 t.docID = doc
332}
333
334// nextDoc
335
336func (t *docMatchTree) nextDoc() uint32 {
337 var start uint32
338 if t.firstDone {
339 start = t.docID + 1
340 }
341 for i := start; i < t.numDocs; i++ {
342 if t.predicate(i) {
343 return i
344 }
345 }
346 return maxUInt32
347}
348
349func (t *bruteForceMatchTree) nextDoc() uint32 {
350 if !t.firstDone {
351 return 0
352 }
353 return t.docID + 1
354}
355
356func (t *andMatchTree) nextDoc() uint32 {
357 var max uint32
358 for _, c := range t.children {
359 m := c.nextDoc()
360 if m > max {
361 max = m
362 }
363 }
364 return max
365}
366
367func (t *orMatchTree) nextDoc() uint32 {
368 min := uint32(maxUInt32)
369 for _, c := range t.children {
370 m := c.nextDoc()
371 if m < min {
372 min = m
373 }
374 }
375 return min
376}
377
378func (t *notMatchTree) nextDoc() uint32 {
379 return 0
380}
381
382func (t *fileNameMatchTree) nextDoc() uint32 {
383 return t.child.nextDoc()
384}
385
386func (t *branchQueryMatchTree) nextDoc() uint32 {
387 var start uint32
388 if t.firstDone {
389 start = t.docID + 1
390 }
391
392 for i := start; i < uint32(len(t.fileMasks)); i++ {
393 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
394 return i
395 }
396 }
397 return maxUInt32
398}
399
400// all String methods
401
402func (t *bruteForceMatchTree) String() string {
403 return "all"
404}
405
406func (t *docMatchTree) String() string {
407 return fmt.Sprintf("doc(%s)", t.reason)
408}
409
410func (t *andMatchTree) String() string {
411 return fmt.Sprintf("and%v", t.children)
412}
413
414func (t *regexpMatchTree) String() string {
415 f := ""
416 if t.fileName {
417 f = "f"
418 }
419 return fmt.Sprintf("%sre(%s)", f, t.regexp)
420}
421
422func (t *orMatchTree) String() string {
423 return fmt.Sprintf("or%v", t.children)
424}
425
426func (t *notMatchTree) String() string {
427 return fmt.Sprintf("not(%v)", t.child)
428}
429
430func (t *noVisitMatchTree) String() string {
431 return fmt.Sprintf("novisit(%v)", t.matchTree)
432}
433
434func (t *fileNameMatchTree) String() string {
435 return fmt.Sprintf("f(%v)", t.child)
436}
437
438func (t *substrMatchTree) String() string {
439 f := ""
440 if t.fileName {
441 f = "f"
442 }
443
444 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
445}
446
447func (t *branchQueryMatchTree) String() string {
448 return fmt.Sprintf("branch(%x)", t.masks)
449}
450
451func (t *symbolSubstrMatchTree) String() string {
452 return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
453}
454
455func (t *symbolRegexpMatchTree) String() string {
456 return fmt.Sprintf("symbol(%v)", t.matchTree)
457}
458
459// visitMatches visits all atoms in matchTree. Note: This visits
460// noVisitMatchTree. For collecting matches use visitMatches.
461func visitMatchTree(t matchTree, f func(matchTree)) {
462 switch s := t.(type) {
463 case *andMatchTree:
464 for _, ch := range s.children {
465 visitMatchTree(ch, f)
466 }
467 case *orMatchTree:
468 for _, ch := range s.children {
469 visitMatchTree(ch, f)
470 }
471 case *andLineMatchTree:
472 visitMatchTree(&s.andMatchTree, f)
473 case *noVisitMatchTree:
474 visitMatchTree(s.matchTree, f)
475 case *notMatchTree:
476 visitMatchTree(s.child, f)
477 case *fileNameMatchTree:
478 visitMatchTree(s.child, f)
479 case *symbolSubstrMatchTree:
480 visitMatchTree(s.substrMatchTree, f)
481 case *symbolRegexpMatchTree:
482 visitMatchTree(s.matchTree, f)
483 default:
484 f(t)
485 }
486}
487
488// visitMatches visits all atoms which can contribute matches. Note: This
489// skips noVisitMatchTree.
490func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) {
491 switch s := t.(type) {
492 case *andMatchTree:
493 for _, ch := range s.children {
494 if known[ch] {
495 visitMatches(ch, known, f)
496 }
497 }
498 case *andLineMatchTree:
499 visitMatches(&s.andMatchTree, known, f)
500 case *orMatchTree:
501 for _, ch := range s.children {
502 if known[ch] {
503 visitMatches(ch, known, f)
504 }
505 }
506 case *symbolSubstrMatchTree:
507 visitMatches(s.substrMatchTree, known, f)
508 case *notMatchTree:
509 case *noVisitMatchTree:
510 // don't collect into negative trees.
511 case *fileNameMatchTree:
512 // We will just gather the filename if we do not visit this tree.
513 default:
514 f(s)
515 }
516}
517
518// all matches() methods.
519
520func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
521 return t.predicate(cp.idx), true
522}
523
524func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
525 return true, true
526}
527
528// andLineMatchTree is a performance optimization of andMatchTree. For content
529// searches we don't want to run the regex engine if there is no line that
530// contains matches from all terms.
531func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
532 matches, sure := t.andMatchTree.matches(cp, cost, known)
533 if !(sure && matches) {
534 return matches, sure
535 }
536
537 // find child with fewest candidates
538 min := maxUInt32
539 fewestChildren := 0
540 for ix, child := range t.children {
541 v, ok := child.(*substrMatchTree)
542 // make sure we are running a content search and that all candidates are a
543 // substrMatchTree
544 if !ok || v.fileName {
545 return matches, sure
546 }
547 if len(v.current) < min {
548 min = len(v.current)
549 fewestChildren = ix
550 }
551 }
552
553 type lineRange struct {
554 start int
555 end int
556 }
557 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
558 prev := -1
559 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
560 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
561 if line == prev {
562 continue
563 }
564 prev = line
565 lines = append(lines, lineRange{byteStart, byteEnd})
566 }
567
568 // children keeps track of the children's candidates we have already seen.
569 children := make([][]*candidateMatch, 0, len(t.children)-1)
570 for j, child := range t.children {
571 if j == fewestChildren {
572 continue
573 }
574 children = append(children, child.(*substrMatchTree).current)
575 }
576
577nextLine:
578 for i := 0; i < len(lines); i++ {
579 hits := 1
580 nextChild:
581 for j := range children {
582 nextCandidate:
583 for len(children[j]) > 0 {
584 candidate := children[j][0]
585 bo := int(cp.findOffset(false, candidate.runeOffset))
586 if bo < lines[i].start {
587 children[j] = children[j][1:]
588 continue nextCandidate
589 }
590 if bo <= lines[i].end {
591 hits++
592 continue nextChild
593 }
594 // move the `lines` iterator forward until bo <= line.end
595 for i < len(lines) && bo > lines[i].end {
596 i++
597 }
598 i--
599 continue nextLine
600 }
601 }
602 // return early once we found any line that contains matches from all children
603 if hits == len(t.children) {
604 return matches, true
605 }
606 }
607 return false, true
608}
609
610func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
611 sure := true
612
613 for _, ch := range t.children {
614 v, ok := evalMatchTree(cp, cost, known, ch)
615 if ok && !v {
616 return false, true
617 }
618 if !ok {
619 sure = false
620 }
621 }
622
623 return true, sure
624}
625
626func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
627 matches := false
628 sure := true
629 for _, ch := range t.children {
630 v, ok := evalMatchTree(cp, cost, known, ch)
631 if ok {
632 // we could short-circuit, but we want to use
633 // the other possibilities as a ranking
634 // signal.
635 matches = matches || v
636 } else {
637 sure = false
638 }
639 }
640 return matches, sure
641}
642
643func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
644 return t.fileMasks[t.docID]&t.masks[t.repos[t.docID]] != 0, true
645}
646
647func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
648 if t.reEvaluated {
649 return len(t.found) > 0, true
650 }
651
652 if cost < costRegexp {
653 return false, false
654 }
655
656 cp.stats.RegexpsConsidered++
657 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
658 found := t.found[:0]
659 for _, idx := range idxs {
660 cm := &candidateMatch{
661 byteOffset: uint32(idx[0]),
662 byteMatchSz: uint32(idx[1] - idx[0]),
663 fileName: t.fileName,
664 }
665
666 found = append(found, cm)
667 }
668 t.found = found
669 t.reEvaluated = true
670
671 return len(t.found) > 0, true
672}
673
674// breakMatchesOnNewlines returns matches resulting from breaking each element
675// of cms on newlines within text.
676func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
677 var lineCMs []*candidateMatch
678 for _, cm := range cms {
679 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
680 }
681 return lineCMs
682}
683
684// breakOnNewlines returns matches resulting from breaking cm on newlines
685// within text.
686func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
687 var cms []*candidateMatch
688 addMe := &candidateMatch{}
689 *addMe = *cm
690 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
691 if text[i] == '\n' {
692 addMe.byteMatchSz = i - addMe.byteOffset
693 if addMe.byteMatchSz != 0 {
694 cms = append(cms, addMe)
695 }
696
697 addMe = &candidateMatch{}
698 *addMe = *cm
699 addMe.byteOffset = i + 1
700 }
701 }
702 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
703 if addMe.byteMatchSz != 0 {
704 cms = append(cms, addMe)
705 }
706 return cms
707}
708
709func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) {
710 if v, ok := known[mt]; ok {
711 return v, true
712 }
713
714 v, ok := mt.matches(cp, cost, known)
715 if ok {
716 known[mt] = v
717 }
718
719 return v, ok
720}
721
722func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
723 v, ok := evalMatchTree(cp, cost, known, t.child)
724 return !v, ok
725}
726
727func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
728 return evalMatchTree(cp, cost, known, t.child)
729}
730
731func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
732 if t.contEvaluated {
733 return len(t.current) > 0, true
734 }
735
736 if len(t.current) == 0 {
737 return false, true
738 }
739
740 if t.fileName && cost < costMemory {
741 return false, false
742 }
743
744 if !t.fileName && cost < costContent {
745 return false, false
746 }
747
748 pruned := t.current[:0]
749 for _, m := range t.current {
750 if m.byteOffset == 0 && m.runeOffset > 0 {
751 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
752 }
753 if m.matchContent(cp.data(m.fileName)) {
754 pruned = append(pruned, m)
755 }
756 }
757 t.current = pruned
758 t.contEvaluated = true
759
760 return len(t.current) > 0, true
761}
762
763func (d *indexData) newMatchTree(q query.Q) (matchTree, error) {
764 if q == nil {
765 return nil, fmt.Errorf("got nil (sub)query")
766 }
767 switch s := q.(type) {
768 case *query.Regexp:
769 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
770 // superset of the regexp. If the returned matchTree is equivalent to the
771 // original regexp, it returns true. An equivalent matchTree has the same
772 // behaviour as the original regexp and can be used instead.
773 //
774 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
775 if err != nil {
776 return nil, err
777 }
778 // if the query can be used in place of the regexp
779 // return the subtree
780 if isEq {
781 return subMT, nil
782 }
783
784 prefix := ""
785 if !s.CaseSensitive {
786 prefix = "(?i)"
787 }
788
789 tr := ®expMatchTree{
790 regexp: regexp.MustCompile(prefix + s.Regexp.String()),
791 fileName: s.FileName,
792 }
793
794 return &andMatchTree{
795 children: []matchTree{
796 tr, &noVisitMatchTree{subMT},
797 },
798 }, nil
799 case *query.And:
800 var r []matchTree
801 for _, ch := range s.Children {
802 ct, err := d.newMatchTree(ch)
803 if err != nil {
804 return nil, err
805 }
806 r = append(r, ct)
807 }
808 return &andMatchTree{r}, nil
809 case *query.Or:
810 var r []matchTree
811 for _, ch := range s.Children {
812 ct, err := d.newMatchTree(ch)
813 if err != nil {
814 return nil, err
815 }
816 r = append(r, ct)
817 }
818 return &orMatchTree{r}, nil
819 case *query.Not:
820 ct, err := d.newMatchTree(s.Child)
821 return ¬MatchTree{
822 child: ct,
823 }, err
824
825 case *query.Type:
826 if s.Type != query.TypeFileName {
827 break
828 }
829
830 ct, err := d.newMatchTree(s.Child)
831 if err != nil {
832 return nil, err
833 }
834
835 return &fileNameMatchTree{
836 child: ct,
837 }, nil
838
839 case *query.Substring:
840 return d.newSubstringMatchTree(s)
841
842 case *query.Branch:
843 masks := make([]uint64, 0, len(d.repoMetaData))
844 if s.Pattern == "HEAD" {
845 for i := 0; i < len(d.repoMetaData); i++ {
846 masks = append(masks, 1)
847 }
848 } else {
849 for _, branchIDs := range d.branchIDs {
850 mask := uint64(0)
851 for nm, m := range branchIDs {
852 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
853 mask |= uint64(m)
854 }
855 }
856 masks = append(masks, mask)
857 }
858
859 }
860 return &branchQueryMatchTree{
861 masks: masks,
862 fileMasks: d.fileBranchMasks,
863 repos: d.repos,
864 }, nil
865 case *query.Const:
866 if s.Value {
867 return &bruteForceMatchTree{}, nil
868 } else {
869 return &noMatchTree{"const"}, nil
870 }
871 case *query.Language:
872 code, ok := d.metaData.LanguageMap[s.Language]
873 if !ok {
874 return &noMatchTree{"lang"}, nil
875 }
876 return &docMatchTree{
877 reason: "language",
878 numDocs: d.numDocs(),
879 predicate: func(docID uint32) bool {
880 return d.getLanguage(docID) == code
881 },
882 }, nil
883
884 case *query.Symbol:
885 subMT, err := d.newMatchTree(s.Expr)
886 if err != nil {
887 return nil, err
888 }
889
890 if substr, ok := subMT.(*substrMatchTree); ok {
891 // We have a feature flag for lazy decoding. If runeDocSectionsRaw is
892 // non-nil it means we need to lazily decode on request.
893 sections := d.runeDocSections
894 if sections == nil && d.runeDocSectionsRaw != nil {
895 sections = unmarshalDocSections(d.runeDocSectionsRaw, nil)
896 }
897
898 return &symbolSubstrMatchTree{
899 substrMatchTree: substr,
900 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)),
901 fileEndRunes: d.fileEndRunes,
902 fileEndSymbol: d.fileEndSymbol,
903 sections: sections,
904 }, nil
905 }
906
907 var regexp *regexp.Regexp
908 visitMatchTree(subMT, func(mt matchTree) {
909 if t, ok := mt.(*regexpMatchTree); ok {
910 regexp = t.regexp
911 }
912 })
913 if regexp == nil {
914 return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
915 }
916
917 return &symbolRegexpMatchTree{
918 regexp: regexp,
919 all: regexp.String() == "(?i)(?-s:.)*",
920 matchTree: subMT,
921 }, nil
922
923 case *query.FileNameSet:
924 return &docMatchTree{
925 reason: "FileNameSet",
926 numDocs: d.numDocs(),
927 predicate: func(docID uint32) bool {
928 fileName := d.fileName(docID)
929 _, ok := s.Set[string(fileName)]
930 return ok
931 },
932 }, nil
933
934 case *query.BranchesRepos:
935 reposBranchesWant := make([]uint64, len(d.repoMetaData))
936 for repoIdx := range d.repoMetaData {
937 var mask uint64
938 for _, br := range s.List {
939 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
940 mask |= uint64(d.branchIDs[repoIdx][br.Branch])
941 }
942 }
943 reposBranchesWant[repoIdx] = mask
944 }
945 return &docMatchTree{
946 reason: "BranchesRepos",
947 numDocs: d.numDocs(),
948 predicate: func(docID uint32) bool {
949 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
950 },
951 }, nil
952
953 case *query.RepoSet:
954 reposWant := make([]bool, len(d.repoMetaData))
955 for repoIdx, r := range d.repoMetaData {
956 if _, ok := s.Set[r.Name]; ok {
957 reposWant[repoIdx] = true
958 }
959 }
960 return &docMatchTree{
961 reason: "RepoSet",
962 numDocs: d.numDocs(),
963 predicate: func(docID uint32) bool {
964 return reposWant[d.repos[docID]]
965 },
966 }, nil
967
968 case *query.Repo:
969 reposWant := make([]bool, len(d.repoMetaData))
970 for repoIdx, r := range d.repoMetaData {
971 if s.Regexp.MatchString(r.Name) {
972 reposWant[repoIdx] = true
973 }
974 }
975 return &docMatchTree{
976 reason: "Repo",
977 numDocs: d.numDocs(),
978 predicate: func(docID uint32) bool {
979 return reposWant[d.repos[docID]]
980 },
981 }, nil
982
983 case *query.RepoRegexp:
984 reposWant := make([]bool, len(d.repoMetaData))
985 for repoIdx, r := range d.repoMetaData {
986 if s.Regexp.MatchString(r.Name) {
987 reposWant[repoIdx] = true
988 }
989 }
990 return &docMatchTree{
991 reason: "RepoRegexp",
992 numDocs: d.numDocs(),
993 predicate: func(docID uint32) bool {
994 return reposWant[d.repos[docID]]
995 },
996 }, nil
997
998 case query.RawConfig:
999 return &docMatchTree{
1000 reason: s.String(),
1001 numDocs: d.numDocs(),
1002 predicate: func(docID uint32) bool {
1003 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1004 },
1005 }, nil
1006 }
1007 log.Panicf("type %T", q)
1008 return nil, nil
1009}
1010
1011func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1012 st := &substrMatchTree{
1013 query: s,
1014 caseSensitive: s.CaseSensitive,
1015 fileName: s.FileName,
1016 }
1017
1018 if utf8.RuneCountInString(s.Pattern) < ngramSize {
1019 prefix := ""
1020 if !s.CaseSensitive {
1021 prefix = "(?i)"
1022 }
1023 t := ®expMatchTree{
1024 regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)),
1025 fileName: s.FileName,
1026 }
1027 return t, nil
1028 }
1029
1030 result, err := d.iterateNgrams(s)
1031 if err != nil {
1032 return nil, err
1033 }
1034 st.matchIterator = result
1035 return st, nil
1036}
1037
1038// pruneMatchTree removes impossible branches from the matchTree, as indicated
1039// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1040func pruneMatchTree(mt matchTree) (matchTree, error) {
1041 var err error
1042 switch mt := mt.(type) {
1043 // leaf nodes that we test with the filter:
1044 case *substrMatchTree:
1045 if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1046 if _, ok := res.matchIterator.(*noMatchTree); ok {
1047 return nil, nil
1048 }
1049 }
1050 // recursive tree structures:
1051 case *andMatchTree:
1052 // Any branch of an and becoming impossible means the entire clause
1053 // is impossible. Otherwise, just handle rewrites.
1054 for i, child := range mt.children {
1055 newChild, err := pruneMatchTree(child)
1056 if err != nil {
1057 return nil, err
1058 }
1059 if newChild == nil {
1060 return nil, nil
1061 }
1062 mt.children[i] = newChild
1063 }
1064 case *orMatchTree:
1065 // *All* branches of an OR becoming impossible means the entire clause
1066 // is impossible. Otherwise, drop impossible subclauses and handle
1067 // rewrites, including simplifying to a singular resulting child branch.
1068 n := 0
1069 for _, child := range mt.children {
1070 newChild, err := pruneMatchTree(child)
1071 if err != nil {
1072 return nil, err
1073 }
1074 if newChild != nil {
1075 mt.children[n] = newChild
1076 n++
1077 }
1078 }
1079 mt.children = mt.children[:n]
1080 if len(mt.children) == 1 {
1081 return mt.children[0], nil
1082 } else if len(mt.children) == 0 {
1083 return nil, nil
1084 }
1085 case *noVisitMatchTree:
1086 mt.matchTree, err = pruneMatchTree(mt.matchTree)
1087 if err != nil {
1088 return nil, err
1089 }
1090 if mt.matchTree == nil {
1091 return nil, nil
1092 }
1093 case *fileNameMatchTree:
1094 mt.child, err = pruneMatchTree(mt.child)
1095 case *andLineMatchTree:
1096 child, err := pruneMatchTree(&mt.andMatchTree)
1097 if err != nil {
1098 return nil, err
1099 }
1100 if child == nil {
1101 return nil, nil
1102 }
1103 if c, ok := child.(*andMatchTree); ok {
1104 mt.andMatchTree = *c
1105 } else {
1106 // the and was simplified to a single clause,
1107 // so the linematch portion is irrelevant.
1108 return mt, nil
1109 }
1110 case *notMatchTree:
1111 mt.child, err = pruneMatchTree(mt.child)
1112 if err != nil {
1113 return nil, err
1114 }
1115 if mt.child == nil {
1116 // not false => true
1117 return &bruteForceMatchTree{}, nil
1118 }
1119 // unhandled:
1120 case *docMatchTree:
1121 case *bruteForceMatchTree:
1122 case *regexpMatchTree:
1123 }
1124 return mt, err
1125}