fork of https://github.com/sourcegraph/zoekt
1// Copyright 2018 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "regexp/syntax"
22 "strings"
23 "unicode/utf8"
24
25 "github.com/grafana/regexp"
26
27 "github.com/sourcegraph/zoekt/query"
28)
29
30// A docIterator iterates over documents in order.
31type docIterator interface {
32 // provide the next document where we can may find something
33 // interesting.
34 nextDoc() uint32
35
36 // clears any per-document state of the docIterator, and
37 // prepares for evaluating the given doc. The argument is
38 // strictly increasing over time.
39 prepare(nextDoc uint32)
40}
41
42const (
43 costConst = 0
44 costMemory = 1
45 costContent = 2
46 costRegexp = 3
47)
48
49const (
50 costMin = costConst
51 costMax = costRegexp
52)
53
54// An expression tree coupled with matches. The matchtree has two
55// functions:
56//
57// * it implements boolean combinations (and, or, not)
58//
59// * it implements shortcuts, where we skip documents (for example: if
60// there are no trigram matches, we can be sure there are no substring
61// matches). The matchtree iterates over the documents as they are
62// ordered in the shard.
63//
64// The general process for a given (shard, query) is:
65//
66// - construct matchTree for the query
67//
68// - find all different leaf matchTrees (substring, regexp, etc.)
69//
70// in a loop:
71//
72// - find next doc to process using nextDoc
73//
74// - evaluate atoms (leaf expressions that match text)
75//
76// - evaluate the tree using matches(), storing the result in map.
77//
78// - if the complete tree returns (matches() == true) for the document,
79// collect all text matches by looking at leaf matchTrees
80type matchTree interface {
81 docIterator
82
83 // returns whether this matches, and if we are sure.
84 matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool)
85}
86
87// docMatchTree iterates over documents for which predicate(docID) returns true.
88type docMatchTree struct {
89 // the number of documents in a shard.
90 numDocs uint32
91
92 predicate func(docID uint32) bool
93
94 // provides additional information about the reason why the docMatchTree was
95 // created.
96 reason string
97
98 // mutable
99 firstDone bool
100 docID uint32
101}
102
103type bruteForceMatchTree struct {
104 // mutable
105 firstDone bool
106 docID uint32
107}
108
109type andLineMatchTree struct {
110 andMatchTree
111}
112
113type andMatchTree struct {
114 children []matchTree
115}
116
117type orMatchTree struct {
118 children []matchTree
119}
120
121type notMatchTree struct {
122 child matchTree
123}
124
125// Returns only the filename of child matches.
126type fileNameMatchTree struct {
127 child matchTree
128}
129
130// Don't visit this subtree for collecting matches.
131type noVisitMatchTree struct {
132 matchTree
133}
134
135type regexpMatchTree struct {
136 regexp *regexp.Regexp
137
138 fileName bool
139
140 // mutable
141 reEvaluated bool
142 found []*candidateMatch
143
144 // nextDoc, prepare.
145 bruteForceMatchTree
146}
147
148// \bLITERAL\b
149type wordMatchTree struct {
150 word string
151
152 fileName bool
153
154 // mutable
155 evaluated bool
156 found []*candidateMatch
157
158 // nextDoc, prepare.
159 bruteForceMatchTree
160}
161
162type substrMatchTree struct {
163 matchIterator
164
165 query *query.Substring
166 caseSensitive bool
167 fileName bool
168
169 // mutable
170 current []*candidateMatch
171 contEvaluated bool
172}
173
174type branchQueryMatchTree struct {
175 fileMasks []uint64
176 masks []uint64
177 repos []uint16
178
179 // mutable
180 firstDone bool
181 docID uint32
182}
183
184func (t *branchQueryMatchTree) branchMask() uint64 {
185 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
186}
187
188type symbolRegexpMatchTree struct {
189 matchTree
190 regexp *regexp.Regexp
191 all bool // skips regex match if .*
192
193 reEvaluated bool
194 found []*candidateMatch
195}
196
197func (t *symbolRegexpMatchTree) prepare(doc uint32) {
198 t.reEvaluated = false
199}
200
201func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
202 if t.reEvaluated {
203 return len(t.found) > 0, true
204 }
205
206 if cost < costRegexp {
207 return false, false
208 }
209
210 sections := cp.docSections()
211 content := cp.data(false)
212
213 found := t.found[:0]
214 for i, sec := range sections {
215 var idx []int
216 if t.all {
217 idx = []int{0, int(sec.End - sec.Start)}
218 } else {
219 idx = t.regexp.FindIndex(content[sec.Start:sec.End])
220 if idx == nil {
221 continue
222 }
223 }
224
225 cm := &candidateMatch{
226 byteOffset: sec.Start + uint32(idx[0]),
227 byteMatchSz: uint32(idx[1] - idx[0]),
228 symbol: true,
229 symbolIdx: uint32(i),
230 }
231 found = append(found, cm)
232 }
233 t.found = found
234 t.reEvaluated = true
235
236 return len(t.found) > 0, true
237}
238
239type symbolSubstrMatchTree struct {
240 *substrMatchTree
241
242 patternSize uint32
243 fileEndRunes []uint32
244 fileEndSymbol []uint32
245
246 doc uint32
247 sections []DocumentSection
248
249 secID uint32
250}
251
252func (t *symbolSubstrMatchTree) prepare(doc uint32) {
253 t.substrMatchTree.prepare(doc)
254 t.doc = doc
255
256 var fileStart uint32
257 if doc > 0 {
258 fileStart = t.fileEndRunes[doc-1]
259 }
260
261 var sections []DocumentSection
262 if len(t.sections) > 0 {
263 most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
264 if most == uint32(len(t.sections)) {
265 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
266 } else {
267 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
268 t.secID++
269 }
270
271 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
272 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
273 symbolEnd++
274 }
275
276 sections = t.sections[t.secID:symbolEnd]
277 }
278 }
279
280 secIdx := 0
281 trimmed := t.current[:0]
282 for len(sections) > secIdx && len(t.current) > 0 {
283 start := fileStart + t.current[0].runeOffset
284 end := start + t.patternSize
285 if start >= sections[secIdx].End {
286 secIdx++
287 continue
288 }
289
290 if start < sections[secIdx].Start {
291 t.current = t.current[1:]
292 continue
293 }
294
295 if end <= sections[secIdx].End {
296 t.current[0].symbol = true
297 t.current[0].symbolIdx = uint32(secIdx)
298 trimmed = append(trimmed, t.current[0])
299 }
300
301 t.current = t.current[1:]
302 }
303 t.current = trimmed
304}
305
306// all prepare methods
307
308func (t *bruteForceMatchTree) prepare(doc uint32) {
309 t.docID = doc
310 t.firstDone = true
311}
312
313func (t *docMatchTree) prepare(doc uint32) {
314 t.docID = doc
315 t.firstDone = true
316}
317
318func (t *andMatchTree) prepare(doc uint32) {
319 for _, c := range t.children {
320 c.prepare(doc)
321 }
322}
323
324func (t *regexpMatchTree) prepare(doc uint32) {
325 t.found = t.found[:0]
326 t.reEvaluated = false
327 t.bruteForceMatchTree.prepare(doc)
328}
329
330func (t *wordMatchTree) prepare(doc uint32) {
331 t.found = t.found[:0]
332 t.evaluated = false
333 t.bruteForceMatchTree.prepare(doc)
334}
335
336func (t *orMatchTree) prepare(doc uint32) {
337 for _, c := range t.children {
338 c.prepare(doc)
339 }
340}
341
342func (t *notMatchTree) prepare(doc uint32) {
343 t.child.prepare(doc)
344}
345
346func (t *fileNameMatchTree) prepare(doc uint32) {
347 t.child.prepare(doc)
348}
349
350func (t *substrMatchTree) prepare(nextDoc uint32) {
351 t.matchIterator.prepare(nextDoc)
352 t.current = t.matchIterator.candidates()
353 t.contEvaluated = false
354}
355
356func (t *branchQueryMatchTree) prepare(doc uint32) {
357 t.firstDone = true
358 t.docID = doc
359}
360
361// nextDoc
362
363func (t *docMatchTree) nextDoc() uint32 {
364 var start uint32
365 if t.firstDone {
366 start = t.docID + 1
367 }
368 for i := start; i < t.numDocs; i++ {
369 if t.predicate(i) {
370 return i
371 }
372 }
373 return maxUInt32
374}
375
376func (t *bruteForceMatchTree) nextDoc() uint32 {
377 if !t.firstDone {
378 return 0
379 }
380 return t.docID + 1
381}
382
383func (t *andMatchTree) nextDoc() uint32 {
384 var max uint32
385 for _, c := range t.children {
386 m := c.nextDoc()
387 if m > max {
388 max = m
389 }
390 }
391 return max
392}
393
394func (t *orMatchTree) nextDoc() uint32 {
395 min := uint32(maxUInt32)
396 for _, c := range t.children {
397 m := c.nextDoc()
398 if m < min {
399 min = m
400 }
401 }
402 return min
403}
404
405func (t *notMatchTree) nextDoc() uint32 {
406 return 0
407}
408
409func (t *fileNameMatchTree) nextDoc() uint32 {
410 return t.child.nextDoc()
411}
412
413func (t *branchQueryMatchTree) nextDoc() uint32 {
414 var start uint32
415 if t.firstDone {
416 start = t.docID + 1
417 }
418
419 for i := start; i < uint32(len(t.fileMasks)); i++ {
420 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
421 return i
422 }
423 }
424 return maxUInt32
425}
426
427// all String methods
428
429func (t *bruteForceMatchTree) String() string {
430 return "all"
431}
432
433func (t *docMatchTree) String() string {
434 return fmt.Sprintf("doc(%s)", t.reason)
435}
436
437func (t *andMatchTree) String() string {
438 return fmt.Sprintf("and%v", t.children)
439}
440
441func (t *regexpMatchTree) String() string {
442 f := ""
443 if t.fileName {
444 f = "f"
445 }
446 return fmt.Sprintf("%sre(%s)", f, t.regexp)
447}
448
449func (t *wordMatchTree) String() string {
450 f := ""
451 if t.fileName {
452 f = "f"
453 }
454 return fmt.Sprintf("%sword(%s)", f, t.word)
455}
456
457func (t *orMatchTree) String() string {
458 return fmt.Sprintf("or%v", t.children)
459}
460
461func (t *notMatchTree) String() string {
462 return fmt.Sprintf("not(%v)", t.child)
463}
464
465func (t *noVisitMatchTree) String() string {
466 return fmt.Sprintf("novisit(%v)", t.matchTree)
467}
468
469func (t *fileNameMatchTree) String() string {
470 return fmt.Sprintf("f(%v)", t.child)
471}
472
473func (t *substrMatchTree) String() string {
474 f := ""
475 if t.fileName {
476 f = "f"
477 }
478
479 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
480}
481
482func (t *branchQueryMatchTree) String() string {
483 return fmt.Sprintf("branch(%x)", t.masks)
484}
485
486func (t *symbolSubstrMatchTree) String() string {
487 return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
488}
489
490func (t *symbolRegexpMatchTree) String() string {
491 return fmt.Sprintf("symbol(%v)", t.matchTree)
492}
493
494// visitMatches visits all atoms in matchTree. Note: This visits
495// noVisitMatchTree. For collecting matches use visitMatches.
496func visitMatchTree(t matchTree, f func(matchTree)) {
497 switch s := t.(type) {
498 case *andMatchTree:
499 for _, ch := range s.children {
500 visitMatchTree(ch, f)
501 }
502 case *orMatchTree:
503 for _, ch := range s.children {
504 visitMatchTree(ch, f)
505 }
506 case *andLineMatchTree:
507 visitMatchTree(&s.andMatchTree, f)
508 case *noVisitMatchTree:
509 visitMatchTree(s.matchTree, f)
510 case *notMatchTree:
511 visitMatchTree(s.child, f)
512 case *fileNameMatchTree:
513 visitMatchTree(s.child, f)
514 case *symbolSubstrMatchTree:
515 visitMatchTree(s.substrMatchTree, f)
516 case *symbolRegexpMatchTree:
517 visitMatchTree(s.matchTree, f)
518 default:
519 f(t)
520 }
521}
522
523// updateMatchTreeStats calls updateStats on all atoms in mt which have that
524// function defined.
525func updateMatchTreeStats(mt matchTree, stats *Stats) {
526 visitMatchTree(mt, func(mt matchTree) {
527 if atom, ok := mt.(interface{ updateStats(*Stats) }); ok {
528 atom.updateStats(stats)
529 }
530 })
531}
532
533// visitMatches visits all atoms which can contribute matches. Note: This
534// skips noVisitMatchTree.
535func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) {
536 switch s := t.(type) {
537 case *andMatchTree:
538 for _, ch := range s.children {
539 if known[ch] {
540 visitMatches(ch, known, f)
541 }
542 }
543 case *andLineMatchTree:
544 visitMatches(&s.andMatchTree, known, f)
545 case *orMatchTree:
546 for _, ch := range s.children {
547 if known[ch] {
548 visitMatches(ch, known, f)
549 }
550 }
551 case *symbolSubstrMatchTree:
552 visitMatches(s.substrMatchTree, known, f)
553 case *notMatchTree:
554 case *noVisitMatchTree:
555 // don't collect into negative trees.
556 case *fileNameMatchTree:
557 // We will just gather the filename if we do not visit this tree.
558 default:
559 f(s)
560 }
561}
562
563// all matches() methods.
564
565func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
566 return t.predicate(cp.idx), true
567}
568
569func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
570 return true, true
571}
572
573// andLineMatchTree is a performance optimization of andMatchTree. For content
574// searches we don't want to run the regex engine if there is no line that
575// contains matches from all terms.
576func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
577 matches, sure := t.andMatchTree.matches(cp, cost, known)
578 if !(sure && matches) {
579 return matches, sure
580 }
581
582 // find child with fewest candidates
583 min := maxUInt32
584 fewestChildren := 0
585 for ix, child := range t.children {
586 v, ok := child.(*substrMatchTree)
587 // make sure we are running a content search and that all candidates are a
588 // substrMatchTree
589 if !ok || v.fileName {
590 return matches, sure
591 }
592 if len(v.current) < min {
593 min = len(v.current)
594 fewestChildren = ix
595 }
596 }
597
598 type lineRange struct {
599 start int
600 end int
601 }
602 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
603 prev := -1
604 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
605 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
606 if line == prev {
607 continue
608 }
609 prev = line
610 lines = append(lines, lineRange{byteStart, byteEnd})
611 }
612
613 // children keeps track of the children's candidates we have already seen.
614 children := make([][]*candidateMatch, 0, len(t.children)-1)
615 for j, child := range t.children {
616 if j == fewestChildren {
617 continue
618 }
619 children = append(children, child.(*substrMatchTree).current)
620 }
621
622nextLine:
623 for i := 0; i < len(lines); i++ {
624 hits := 1
625 nextChild:
626 for j := range children {
627 nextCandidate:
628 for len(children[j]) > 0 {
629 candidate := children[j][0]
630 bo := int(cp.findOffset(false, candidate.runeOffset))
631 if bo < lines[i].start {
632 children[j] = children[j][1:]
633 continue nextCandidate
634 }
635 if bo <= lines[i].end {
636 hits++
637 continue nextChild
638 }
639 // move the `lines` iterator forward until bo <= line.end
640 for i < len(lines) && bo > lines[i].end {
641 i++
642 }
643 i--
644 continue nextLine
645 }
646 }
647 // return early once we found any line that contains matches from all children
648 if hits == len(t.children) {
649 return matches, true
650 }
651 }
652 return false, true
653}
654
655func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
656 sure := true
657
658 for _, ch := range t.children {
659 v, ok := evalMatchTree(cp, cost, known, ch)
660 if ok && !v {
661 return false, true
662 }
663 if !ok {
664 sure = false
665 }
666 }
667
668 return true, sure
669}
670
671func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
672 matches := false
673 sure := true
674 for _, ch := range t.children {
675 v, ok := evalMatchTree(cp, cost, known, ch)
676 if ok {
677 // we could short-circuit, but we want to use
678 // the other possibilities as a ranking
679 // signal.
680 matches = matches || v
681 } else {
682 sure = false
683 }
684 }
685 return matches, sure
686}
687
688func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
689 return t.branchMask() != 0, true
690}
691
692func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
693 if t.reEvaluated {
694 return len(t.found) > 0, true
695 }
696
697 if cost < costRegexp {
698 return false, false
699 }
700
701 cp.stats.RegexpsConsidered++
702 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
703 found := t.found[:0]
704 for _, idx := range idxs {
705 cm := &candidateMatch{
706 byteOffset: uint32(idx[0]),
707 byteMatchSz: uint32(idx[1] - idx[0]),
708 fileName: t.fileName,
709 }
710
711 found = append(found, cm)
712 }
713 t.found = found
714 t.reEvaluated = true
715
716 return len(t.found) > 0, true
717}
718
719func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
720 if t.evaluated {
721 return len(t.found) > 0, true
722 }
723
724 if cost < costRegexp {
725 return false, false
726 }
727
728 data := cp.data(t.fileName)
729 offset := 0
730 found := t.found[:0]
731 for {
732 idx := bytes.Index(data[offset:], []byte(t.word))
733 if idx < 0 {
734 break
735 }
736
737 relStartOffset := offset + idx
738 relEndOffset := relStartOffset + len(t.word)
739
740 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
741 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
742 if startBoundary && endBoundary {
743 found = append(found, &candidateMatch{
744 byteOffset: uint32(offset + idx),
745 byteMatchSz: uint32(len(t.word)),
746 fileName: t.fileName,
747 })
748 }
749 offset += idx + len(t.word)
750 }
751
752 t.found = found
753 t.evaluated = true
754
755 return len(t.found) > 0, true
756}
757
758// breakMatchesOnNewlines returns matches resulting from breaking each element
759// of cms on newlines within text.
760func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
761 var lineCMs []*candidateMatch
762 for _, cm := range cms {
763 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
764 }
765 return lineCMs
766}
767
768// breakOnNewlines returns matches resulting from breaking cm on newlines
769// within text.
770func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
771 var cms []*candidateMatch
772 addMe := &candidateMatch{}
773 *addMe = *cm
774 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
775 if text[i] == '\n' {
776 addMe.byteMatchSz = i - addMe.byteOffset
777 if addMe.byteMatchSz != 0 {
778 cms = append(cms, addMe)
779 }
780
781 addMe = &candidateMatch{}
782 *addMe = *cm
783 addMe.byteOffset = i + 1
784 }
785 }
786 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
787 if addMe.byteMatchSz != 0 {
788 cms = append(cms, addMe)
789 }
790 return cms
791}
792
793func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) {
794 if v, ok := known[mt]; ok {
795 return v, true
796 }
797
798 v, ok := mt.matches(cp, cost, known)
799 if ok {
800 known[mt] = v
801 }
802
803 return v, ok
804}
805
806func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
807 v, ok := evalMatchTree(cp, cost, known, t.child)
808 return !v, ok
809}
810
811func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
812 return evalMatchTree(cp, cost, known, t.child)
813}
814
815func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
816 if t.contEvaluated {
817 return len(t.current) > 0, true
818 }
819
820 if len(t.current) == 0 {
821 return false, true
822 }
823
824 if t.fileName && cost < costMemory {
825 return false, false
826 }
827
828 if !t.fileName && cost < costContent {
829 return false, false
830 }
831
832 pruned := t.current[:0]
833 for _, m := range t.current {
834 if m.byteOffset == 0 && m.runeOffset > 0 {
835 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
836 }
837 if m.matchContent(cp.data(m.fileName)) {
838 pruned = append(pruned, m)
839 }
840 }
841 t.current = pruned
842 t.contEvaluated = true
843
844 return len(t.current) > 0, true
845}
846
847type matchTreeOpt struct {
848 // DisableWordMatchOptimization is used to disable the use of wordMatchTree.
849 // This was added since we do not support wordMatchTree with symbol search.
850 DisableWordMatchOptimization bool
851}
852
853func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
854 if q == nil {
855 return nil, fmt.Errorf("got nil (sub)query")
856 }
857 switch s := q.(type) {
858 case *query.Regexp:
859 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
860 // superset of the regexp. If the returned matchTree is equivalent to the
861 // original regexp, it returns true. An equivalent matchTree has the same
862 // behaviour as the original regexp and can be used instead.
863 //
864 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
865 if err != nil {
866 return nil, err
867 }
868 // if the query can be used in place of the regexp
869 // return the subtree
870 if isEq {
871 return subMT, nil
872 }
873
874 var tr matchTree
875 if wmt, ok := regexpToWordMatchTree(s, opt); ok {
876 // A common search we get is "\bLITERAL\b". Avoid the regex engine and
877 // provide something faster.
878 tr = wmt
879 } else {
880 prefix := ""
881 if !s.CaseSensitive {
882 prefix = "(?i)"
883 }
884
885 tr = ®expMatchTree{
886 regexp: regexp.MustCompile(prefix + s.Regexp.String()),
887 fileName: s.FileName,
888 }
889 }
890
891 return &andMatchTree{
892 children: []matchTree{
893 tr, &noVisitMatchTree{subMT},
894 },
895 }, nil
896 case *query.And:
897 var r []matchTree
898 for _, ch := range s.Children {
899 ct, err := d.newMatchTree(ch, opt)
900 if err != nil {
901 return nil, err
902 }
903 r = append(r, ct)
904 }
905 return &andMatchTree{r}, nil
906 case *query.Or:
907 var r []matchTree
908 for _, ch := range s.Children {
909 ct, err := d.newMatchTree(ch, opt)
910 if err != nil {
911 return nil, err
912 }
913 r = append(r, ct)
914 }
915 return &orMatchTree{r}, nil
916 case *query.Not:
917 ct, err := d.newMatchTree(s.Child, opt)
918 return ¬MatchTree{
919 child: ct,
920 }, err
921
922 case *query.Type:
923 if s.Type != query.TypeFileName {
924 break
925 }
926
927 ct, err := d.newMatchTree(s.Child, opt)
928 if err != nil {
929 return nil, err
930 }
931
932 return &fileNameMatchTree{
933 child: ct,
934 }, nil
935
936 case *query.Substring:
937 return d.newSubstringMatchTree(s)
938
939 case *query.Branch:
940 masks := make([]uint64, 0, len(d.repoMetaData))
941 if s.Pattern == "HEAD" {
942 for i := 0; i < len(d.repoMetaData); i++ {
943 masks = append(masks, 1)
944 }
945 } else {
946 for _, branchIDs := range d.branchIDs {
947 mask := uint64(0)
948 for nm, m := range branchIDs {
949 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
950 mask |= uint64(m)
951 }
952 }
953 masks = append(masks, mask)
954 }
955
956 }
957 return &branchQueryMatchTree{
958 masks: masks,
959 fileMasks: d.fileBranchMasks,
960 repos: d.repos,
961 }, nil
962 case *query.Const:
963 if s.Value {
964 return &bruteForceMatchTree{}, nil
965 } else {
966 return &noMatchTree{Why: "const"}, nil
967 }
968 case *query.Language:
969 code, ok := d.metaData.LanguageMap[s.Language]
970 if !ok {
971 return &noMatchTree{Why: "lang"}, nil
972 }
973 return &docMatchTree{
974 reason: "language",
975 numDocs: d.numDocs(),
976 predicate: func(docID uint32) bool {
977 return d.getLanguage(docID) == code
978 },
979 }, nil
980
981 case *query.Symbol:
982 // Disable WordMatchTree since we don't support it in symbols yet.
983 optCopy := opt
984 optCopy.DisableWordMatchOptimization = true
985
986 subMT, err := d.newMatchTree(s.Expr, optCopy)
987 if err != nil {
988 return nil, err
989 }
990
991 if substr, ok := subMT.(*substrMatchTree); ok {
992 return &symbolSubstrMatchTree{
993 substrMatchTree: substr,
994 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)),
995 fileEndRunes: d.fileEndRunes,
996 fileEndSymbol: d.fileEndSymbol,
997 sections: d.runeDocSections,
998 }, nil
999 }
1000
1001 var regexp *regexp.Regexp
1002 visitMatchTree(subMT, func(mt matchTree) {
1003 if t, ok := mt.(*regexpMatchTree); ok {
1004 regexp = t.regexp
1005 }
1006 })
1007 if regexp == nil {
1008 return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1009 }
1010
1011 return &symbolRegexpMatchTree{
1012 regexp: regexp,
1013 all: regexp.String() == "(?i)(?-s:.)*",
1014 matchTree: subMT,
1015 }, nil
1016
1017 case *query.FileNameSet:
1018 return &docMatchTree{
1019 reason: "FileNameSet",
1020 numDocs: d.numDocs(),
1021 predicate: func(docID uint32) bool {
1022 fileName := d.fileName(docID)
1023 _, ok := s.Set[string(fileName)]
1024 return ok
1025 },
1026 }, nil
1027
1028 case *query.BranchesRepos:
1029 reposBranchesWant := make([]uint64, len(d.repoMetaData))
1030 for repoIdx := range d.repoMetaData {
1031 var mask uint64
1032 for _, br := range s.List {
1033 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1034 mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1035 }
1036 }
1037 reposBranchesWant[repoIdx] = mask
1038 }
1039 return &docMatchTree{
1040 reason: "BranchesRepos",
1041 numDocs: d.numDocs(),
1042 predicate: func(docID uint32) bool {
1043 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1044 },
1045 }, nil
1046
1047 case *query.RepoSet:
1048 reposWant := make([]bool, len(d.repoMetaData))
1049 for repoIdx, r := range d.repoMetaData {
1050 if _, ok := s.Set[r.Name]; ok {
1051 reposWant[repoIdx] = true
1052 }
1053 }
1054 return &docMatchTree{
1055 reason: "RepoSet",
1056 numDocs: d.numDocs(),
1057 predicate: func(docID uint32) bool {
1058 return reposWant[d.repos[docID]]
1059 },
1060 }, nil
1061
1062 case *query.RepoIDs:
1063 reposWant := make([]bool, len(d.repoMetaData))
1064 for repoIdx, r := range d.repoMetaData {
1065 if s.Repos.Contains(r.ID) {
1066 reposWant[repoIdx] = true
1067 }
1068 }
1069 return &docMatchTree{
1070 reason: "RepoIDs",
1071 numDocs: d.numDocs(),
1072 predicate: func(docID uint32) bool {
1073 return reposWant[d.repos[docID]]
1074 },
1075 }, nil
1076
1077 case *query.Repo:
1078 reposWant := make([]bool, len(d.repoMetaData))
1079 for repoIdx, r := range d.repoMetaData {
1080 if s.Regexp.MatchString(r.Name) {
1081 reposWant[repoIdx] = true
1082 }
1083 }
1084 return &docMatchTree{
1085 reason: "Repo",
1086 numDocs: d.numDocs(),
1087 predicate: func(docID uint32) bool {
1088 return reposWant[d.repos[docID]]
1089 },
1090 }, nil
1091
1092 case *query.RepoRegexp:
1093 reposWant := make([]bool, len(d.repoMetaData))
1094 for repoIdx, r := range d.repoMetaData {
1095 if s.Regexp.MatchString(r.Name) {
1096 reposWant[repoIdx] = true
1097 }
1098 }
1099 return &docMatchTree{
1100 reason: "RepoRegexp",
1101 numDocs: d.numDocs(),
1102 predicate: func(docID uint32) bool {
1103 return reposWant[d.repos[docID]]
1104 },
1105 }, nil
1106
1107 case query.RawConfig:
1108 return &docMatchTree{
1109 reason: s.String(),
1110 numDocs: d.numDocs(),
1111 predicate: func(docID uint32) bool {
1112 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1113 },
1114 }, nil
1115 }
1116 log.Panicf("type %T", q)
1117 return nil, nil
1118}
1119
1120func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1121 st := &substrMatchTree{
1122 query: s,
1123 caseSensitive: s.CaseSensitive,
1124 fileName: s.FileName,
1125 }
1126
1127 if utf8.RuneCountInString(s.Pattern) < ngramSize {
1128 prefix := ""
1129 if !s.CaseSensitive {
1130 prefix = "(?i)"
1131 }
1132 t := ®expMatchTree{
1133 regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)),
1134 fileName: s.FileName,
1135 }
1136 return t, nil
1137 }
1138
1139 result, err := d.iterateNgrams(s)
1140 if err != nil {
1141 return nil, err
1142 }
1143 st.matchIterator = result
1144 return st, nil
1145}
1146
1147func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1148 if opt.DisableWordMatchOptimization {
1149 return nil, false
1150 }
1151 // Needs to be case sensitive
1152 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1153 return nil, false
1154 }
1155 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1156 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1157 return nil, false
1158 }
1159 sub := q.Regexp.Sub
1160 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1161 return nil, false
1162 }
1163
1164 return &wordMatchTree{
1165 word: string(sub[1].Rune),
1166 fileName: q.FileName,
1167 }, true
1168}
1169
1170// pruneMatchTree removes impossible branches from the matchTree, as indicated
1171// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1172func pruneMatchTree(mt matchTree) (matchTree, error) {
1173 var err error
1174 switch mt := mt.(type) {
1175 // leaf nodes that we test with the filter:
1176 case *substrMatchTree:
1177 if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1178 if _, ok := res.matchIterator.(*noMatchTree); ok {
1179 return nil, nil
1180 }
1181 }
1182 // recursive tree structures:
1183 case *andMatchTree:
1184 // Any branch of an and becoming impossible means the entire clause
1185 // is impossible. Otherwise, just handle rewrites.
1186 for i, child := range mt.children {
1187 newChild, err := pruneMatchTree(child)
1188 if err != nil {
1189 return nil, err
1190 }
1191 if newChild == nil {
1192 return nil, nil
1193 }
1194 mt.children[i] = newChild
1195 }
1196 case *orMatchTree:
1197 // *All* branches of an OR becoming impossible means the entire clause
1198 // is impossible. Otherwise, drop impossible subclauses and handle
1199 // rewrites, including simplifying to a singular resulting child branch.
1200 n := 0
1201 for _, child := range mt.children {
1202 newChild, err := pruneMatchTree(child)
1203 if err != nil {
1204 return nil, err
1205 }
1206 if newChild != nil {
1207 mt.children[n] = newChild
1208 n++
1209 }
1210 }
1211 mt.children = mt.children[:n]
1212 if len(mt.children) == 1 {
1213 return mt.children[0], nil
1214 } else if len(mt.children) == 0 {
1215 return nil, nil
1216 }
1217 case *noVisitMatchTree:
1218 mt.matchTree, err = pruneMatchTree(mt.matchTree)
1219 if err != nil {
1220 return nil, err
1221 }
1222 if mt.matchTree == nil {
1223 return nil, nil
1224 }
1225 case *fileNameMatchTree:
1226 mt.child, err = pruneMatchTree(mt.child)
1227 case *andLineMatchTree:
1228 child, err := pruneMatchTree(&mt.andMatchTree)
1229 if err != nil {
1230 return nil, err
1231 }
1232 if child == nil {
1233 return nil, nil
1234 }
1235 if c, ok := child.(*andMatchTree); ok {
1236 mt.andMatchTree = *c
1237 } else {
1238 // the and was simplified to a single clause,
1239 // so the linematch portion is irrelevant.
1240 return mt, nil
1241 }
1242 case *notMatchTree:
1243 mt.child, err = pruneMatchTree(mt.child)
1244 if err != nil {
1245 return nil, err
1246 }
1247 if mt.child == nil {
1248 // not false => true
1249 return &bruteForceMatchTree{}, nil
1250 }
1251 // unhandled:
1252 case *docMatchTree:
1253 case *bruteForceMatchTree:
1254 case *regexpMatchTree:
1255 case *wordMatchTree:
1256 }
1257 return mt, err
1258}