fork of https://github.com/sourcegraph/zoekt
1// Copyright 2018 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "regexp/syntax"
22 "strings"
23 "unicode/utf8"
24
25 "github.com/grafana/regexp"
26
27 "github.com/sourcegraph/zoekt/query"
28)
29
30// A docIterator iterates over documents in order.
31type docIterator interface {
32 // provide the next document where we can may find something
33 // interesting.
34 nextDoc() uint32
35
36 // clears any per-document state of the docIterator, and
37 // prepares for evaluating the given doc. The argument is
38 // strictly increasing over time.
39 prepare(nextDoc uint32)
40}
41
42const (
43 costConst = 0
44 costMemory = 1
45 costContent = 2
46 costRegexp = 3
47)
48
49const (
50 costMin = costConst
51 costMax = costRegexp
52)
53
54// An expression tree coupled with matches. The matchtree has two
55// functions:
56//
57// * it implements boolean combinations (and, or, not)
58//
59// * it implements shortcuts, where we skip documents (for example: if
60// there are no trigram matches, we can be sure there are no substring
61// matches). The matchtree iterates over the documents as they are
62// ordered in the shard.
63//
64// The general process for a given (shard, query) is:
65//
66// - construct matchTree for the query
67//
68// - find all different leaf matchTrees (substring, regexp, etc.)
69//
70// in a loop:
71//
72// - find next doc to process using nextDoc
73//
74// - evaluate atoms (leaf expressions that match text)
75//
76// - evaluate the tree using matches(), storing the result in map.
77//
78// - if the complete tree returns (matches() == true) for the document,
79// collect all text matches by looking at leaf matchTrees
80type matchTree interface {
81 docIterator
82
83 // returns whether this matches, and if we are sure.
84 matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool)
85}
86
87// docMatchTree iterates over documents for which predicate(docID) returns true.
88type docMatchTree struct {
89 // the number of documents in a shard.
90 numDocs uint32
91
92 predicate func(docID uint32) bool
93
94 // provides additional information about the reason why the docMatchTree was
95 // created.
96 reason string
97
98 // mutable
99 firstDone bool
100 docID uint32
101}
102
103type bruteForceMatchTree struct {
104 // mutable
105 firstDone bool
106 docID uint32
107}
108
109type andLineMatchTree struct {
110 andMatchTree
111}
112
113type andMatchTree struct {
114 children []matchTree
115}
116
117type orMatchTree struct {
118 children []matchTree
119}
120
121type notMatchTree struct {
122 child matchTree
123}
124
125// Returns only the filename of child matches.
126type fileNameMatchTree struct {
127 child matchTree
128}
129
130// Don't visit this subtree for collecting matches.
131type noVisitMatchTree struct {
132 matchTree
133}
134
135type regexpMatchTree struct {
136 regexp *regexp.Regexp
137
138 fileName bool
139
140 // mutable
141 reEvaluated bool
142 found []*candidateMatch
143
144 // nextDoc, prepare.
145 bruteForceMatchTree
146}
147
148// \bLITERAL\b
149type wordMatchTree struct {
150 word string
151
152 fileName bool
153
154 // mutable
155 evaluated bool
156 found []*candidateMatch
157
158 // nextDoc, prepare.
159 bruteForceMatchTree
160}
161
162type substrMatchTree struct {
163 matchIterator
164
165 query *query.Substring
166 caseSensitive bool
167 fileName bool
168
169 // mutable
170 current []*candidateMatch
171 contEvaluated bool
172}
173
174type branchQueryMatchTree struct {
175 fileMasks []uint64
176 masks []uint64
177 repos []uint16
178
179 // mutable
180 firstDone bool
181 docID uint32
182}
183
184func (t *branchQueryMatchTree) branchMask() uint64 {
185 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
186}
187
188type symbolRegexpMatchTree struct {
189 matchTree
190 regexp *regexp.Regexp
191 all bool // skips regex match if .*
192
193 reEvaluated bool
194 found []*candidateMatch
195}
196
197func (t *symbolRegexpMatchTree) prepare(doc uint32) {
198 t.reEvaluated = false
199 t.found = t.found[:0]
200 t.matchTree.prepare(doc)
201}
202
203func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
204 if t.reEvaluated {
205 return len(t.found) > 0, true
206 }
207
208 if cost < costRegexp {
209 return false, false
210 }
211
212 sections := cp.docSections()
213 content := cp.data(false)
214
215 found := t.found[:0]
216 for i, sec := range sections {
217 var idx []int
218 if t.all {
219 idx = []int{0, int(sec.End - sec.Start)}
220 } else {
221 idx = t.regexp.FindIndex(content[sec.Start:sec.End])
222 if idx == nil {
223 continue
224 }
225 }
226
227 cm := &candidateMatch{
228 byteOffset: sec.Start + uint32(idx[0]),
229 byteMatchSz: uint32(idx[1] - idx[0]),
230 symbol: true,
231 symbolIdx: uint32(i),
232 }
233 found = append(found, cm)
234 }
235 t.found = found
236 t.reEvaluated = true
237
238 return len(t.found) > 0, true
239}
240
241type symbolSubstrMatchTree struct {
242 *substrMatchTree
243
244 patternSize uint32
245 fileEndRunes []uint32
246 fileEndSymbol []uint32
247
248 doc uint32
249 sections []DocumentSection
250
251 secID uint32
252}
253
254func (t *symbolSubstrMatchTree) prepare(doc uint32) {
255 t.substrMatchTree.prepare(doc)
256 t.doc = doc
257
258 var fileStart uint32
259 if doc > 0 {
260 fileStart = t.fileEndRunes[doc-1]
261 }
262
263 var sections []DocumentSection
264 if len(t.sections) > 0 {
265 most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
266 if most == uint32(len(t.sections)) {
267 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
268 } else {
269 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
270 t.secID++
271 }
272
273 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
274 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
275 symbolEnd++
276 }
277
278 sections = t.sections[t.secID:symbolEnd]
279 }
280 }
281
282 secIdx := 0
283 trimmed := t.current[:0]
284 for len(sections) > secIdx && len(t.current) > 0 {
285 start := fileStart + t.current[0].runeOffset
286 end := start + t.patternSize
287 if start >= sections[secIdx].End {
288 secIdx++
289 continue
290 }
291
292 if start < sections[secIdx].Start {
293 t.current = t.current[1:]
294 continue
295 }
296
297 if end <= sections[secIdx].End {
298 t.current[0].symbol = true
299 t.current[0].symbolIdx = uint32(secIdx)
300 trimmed = append(trimmed, t.current[0])
301 }
302
303 t.current = t.current[1:]
304 }
305 t.current = trimmed
306}
307
308// all prepare methods
309
310func (t *bruteForceMatchTree) prepare(doc uint32) {
311 t.docID = doc
312 t.firstDone = true
313}
314
315func (t *docMatchTree) prepare(doc uint32) {
316 t.docID = doc
317 t.firstDone = true
318}
319
320func (t *andMatchTree) prepare(doc uint32) {
321 for _, c := range t.children {
322 c.prepare(doc)
323 }
324}
325
326func (t *regexpMatchTree) prepare(doc uint32) {
327 t.found = t.found[:0]
328 t.reEvaluated = false
329 t.bruteForceMatchTree.prepare(doc)
330}
331
332func (t *wordMatchTree) prepare(doc uint32) {
333 t.found = t.found[:0]
334 t.evaluated = false
335 t.bruteForceMatchTree.prepare(doc)
336}
337
338func (t *orMatchTree) prepare(doc uint32) {
339 for _, c := range t.children {
340 c.prepare(doc)
341 }
342}
343
344func (t *notMatchTree) prepare(doc uint32) {
345 t.child.prepare(doc)
346}
347
348func (t *fileNameMatchTree) prepare(doc uint32) {
349 t.child.prepare(doc)
350}
351
352func (t *substrMatchTree) prepare(nextDoc uint32) {
353 t.matchIterator.prepare(nextDoc)
354 t.current = t.matchIterator.candidates()
355 t.contEvaluated = false
356}
357
358func (t *branchQueryMatchTree) prepare(doc uint32) {
359 t.firstDone = true
360 t.docID = doc
361}
362
363// nextDoc
364
365func (t *docMatchTree) nextDoc() uint32 {
366 var start uint32
367 if t.firstDone {
368 start = t.docID + 1
369 }
370 for i := start; i < t.numDocs; i++ {
371 if t.predicate(i) {
372 return i
373 }
374 }
375 return maxUInt32
376}
377
378func (t *bruteForceMatchTree) nextDoc() uint32 {
379 if !t.firstDone {
380 return 0
381 }
382 return t.docID + 1
383}
384
385func (t *andMatchTree) nextDoc() uint32 {
386 var max uint32
387 for _, c := range t.children {
388 m := c.nextDoc()
389 if m > max {
390 max = m
391 }
392 }
393 return max
394}
395
396func (t *orMatchTree) nextDoc() uint32 {
397 min := uint32(maxUInt32)
398 for _, c := range t.children {
399 m := c.nextDoc()
400 if m < min {
401 min = m
402 }
403 }
404 return min
405}
406
407func (t *notMatchTree) nextDoc() uint32 {
408 return 0
409}
410
411func (t *fileNameMatchTree) nextDoc() uint32 {
412 return t.child.nextDoc()
413}
414
415func (t *branchQueryMatchTree) nextDoc() uint32 {
416 var start uint32
417 if t.firstDone {
418 start = t.docID + 1
419 }
420
421 for i := start; i < uint32(len(t.fileMasks)); i++ {
422 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
423 return i
424 }
425 }
426 return maxUInt32
427}
428
429// all String methods
430
431func (t *bruteForceMatchTree) String() string {
432 return "all"
433}
434
435func (t *docMatchTree) String() string {
436 return fmt.Sprintf("doc(%s)", t.reason)
437}
438
439func (t *andMatchTree) String() string {
440 return fmt.Sprintf("and%v", t.children)
441}
442
443func (t *regexpMatchTree) String() string {
444 f := ""
445 if t.fileName {
446 f = "f"
447 }
448 return fmt.Sprintf("%sre(%s)", f, t.regexp)
449}
450
451func (t *wordMatchTree) String() string {
452 f := ""
453 if t.fileName {
454 f = "f"
455 }
456 return fmt.Sprintf("%sword(%s)", f, t.word)
457}
458
459func (t *orMatchTree) String() string {
460 return fmt.Sprintf("or%v", t.children)
461}
462
463func (t *notMatchTree) String() string {
464 return fmt.Sprintf("not(%v)", t.child)
465}
466
467func (t *noVisitMatchTree) String() string {
468 return fmt.Sprintf("novisit(%v)", t.matchTree)
469}
470
471func (t *fileNameMatchTree) String() string {
472 return fmt.Sprintf("f(%v)", t.child)
473}
474
475func (t *substrMatchTree) String() string {
476 f := ""
477 if t.fileName {
478 f = "f"
479 }
480
481 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
482}
483
484func (t *branchQueryMatchTree) String() string {
485 return fmt.Sprintf("branch(%x)", t.masks)
486}
487
488func (t *symbolSubstrMatchTree) String() string {
489 return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
490}
491
492func (t *symbolRegexpMatchTree) String() string {
493 return fmt.Sprintf("symbol(%v)", t.matchTree)
494}
495
496// visitMatches visits all atoms in matchTree. Note: This visits
497// noVisitMatchTree. For collecting matches use visitMatches.
498func visitMatchTree(t matchTree, f func(matchTree)) {
499 switch s := t.(type) {
500 case *andMatchTree:
501 for _, ch := range s.children {
502 visitMatchTree(ch, f)
503 }
504 case *orMatchTree:
505 for _, ch := range s.children {
506 visitMatchTree(ch, f)
507 }
508 case *andLineMatchTree:
509 visitMatchTree(&s.andMatchTree, f)
510 case *noVisitMatchTree:
511 visitMatchTree(s.matchTree, f)
512 case *notMatchTree:
513 visitMatchTree(s.child, f)
514 case *fileNameMatchTree:
515 visitMatchTree(s.child, f)
516 case *symbolSubstrMatchTree:
517 visitMatchTree(s.substrMatchTree, f)
518 case *symbolRegexpMatchTree:
519 visitMatchTree(s.matchTree, f)
520 default:
521 f(t)
522 }
523}
524
525// updateMatchTreeStats calls updateStats on all atoms in mt which have that
526// function defined.
527func updateMatchTreeStats(mt matchTree, stats *Stats) {
528 visitMatchTree(mt, func(mt matchTree) {
529 if atom, ok := mt.(interface{ updateStats(*Stats) }); ok {
530 atom.updateStats(stats)
531 }
532 })
533}
534
535// visitMatches visits all atoms which can contribute matches. Note: This
536// skips noVisitMatchTree.
537func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) {
538 switch s := t.(type) {
539 case *andMatchTree:
540 for _, ch := range s.children {
541 if known[ch] {
542 visitMatches(ch, known, f)
543 }
544 }
545 case *andLineMatchTree:
546 visitMatches(&s.andMatchTree, known, f)
547 case *orMatchTree:
548 for _, ch := range s.children {
549 if known[ch] {
550 visitMatches(ch, known, f)
551 }
552 }
553 case *symbolSubstrMatchTree:
554 visitMatches(s.substrMatchTree, known, f)
555 case *notMatchTree:
556 case *noVisitMatchTree:
557 // don't collect into negative trees.
558 case *fileNameMatchTree:
559 // We will just gather the filename if we do not visit this tree.
560 default:
561 f(s)
562 }
563}
564
565// all matches() methods.
566
567func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
568 return t.predicate(cp.idx), true
569}
570
571func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
572 return true, true
573}
574
575// andLineMatchTree is a performance optimization of andMatchTree. For content
576// searches we don't want to run the regex engine if there is no line that
577// contains matches from all terms.
578func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
579 matches, sure := t.andMatchTree.matches(cp, cost, known)
580 if !(sure && matches) {
581 return matches, sure
582 }
583
584 // find child with fewest candidates
585 min := maxUInt32
586 fewestChildren := 0
587 for ix, child := range t.children {
588 v, ok := child.(*substrMatchTree)
589 // make sure we are running a content search and that all candidates are a
590 // substrMatchTree
591 if !ok || v.fileName {
592 return matches, sure
593 }
594 if len(v.current) < min {
595 min = len(v.current)
596 fewestChildren = ix
597 }
598 }
599
600 type lineRange struct {
601 start int
602 end int
603 }
604 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
605 prev := -1
606 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
607 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
608 if line == prev {
609 continue
610 }
611 prev = line
612 lines = append(lines, lineRange{byteStart, byteEnd})
613 }
614
615 // children keeps track of the children's candidates we have already seen.
616 children := make([][]*candidateMatch, 0, len(t.children)-1)
617 for j, child := range t.children {
618 if j == fewestChildren {
619 continue
620 }
621 children = append(children, child.(*substrMatchTree).current)
622 }
623
624nextLine:
625 for i := 0; i < len(lines); i++ {
626 hits := 1
627 nextChild:
628 for j := range children {
629 nextCandidate:
630 for len(children[j]) > 0 {
631 candidate := children[j][0]
632 bo := int(cp.findOffset(false, candidate.runeOffset))
633 if bo < lines[i].start {
634 children[j] = children[j][1:]
635 continue nextCandidate
636 }
637 if bo <= lines[i].end {
638 hits++
639 continue nextChild
640 }
641 // move the `lines` iterator forward until bo <= line.end
642 for i < len(lines) && bo > lines[i].end {
643 i++
644 }
645 i--
646 continue nextLine
647 }
648 }
649 // return early once we found any line that contains matches from all children
650 if hits == len(t.children) {
651 return matches, true
652 }
653 }
654 return false, true
655}
656
657func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
658 sure := true
659
660 for _, ch := range t.children {
661 v, ok := evalMatchTree(cp, cost, known, ch)
662 if ok && !v {
663 return false, true
664 }
665 if !ok {
666 sure = false
667 }
668 }
669
670 return true, sure
671}
672
673func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
674 matches := false
675 sure := true
676 for _, ch := range t.children {
677 v, ok := evalMatchTree(cp, cost, known, ch)
678 if ok {
679 // we could short-circuit, but we want to use
680 // the other possibilities as a ranking
681 // signal.
682 matches = matches || v
683 } else {
684 sure = false
685 }
686 }
687 return matches, sure
688}
689
690func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
691 return t.branchMask() != 0, true
692}
693
694func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
695 if t.reEvaluated {
696 return len(t.found) > 0, true
697 }
698
699 if cost < costRegexp {
700 return false, false
701 }
702
703 cp.stats.RegexpsConsidered++
704 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
705 found := t.found[:0]
706 for _, idx := range idxs {
707 cm := &candidateMatch{
708 byteOffset: uint32(idx[0]),
709 byteMatchSz: uint32(idx[1] - idx[0]),
710 fileName: t.fileName,
711 }
712
713 found = append(found, cm)
714 }
715 t.found = found
716 t.reEvaluated = true
717
718 return len(t.found) > 0, true
719}
720
721func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
722 if t.evaluated {
723 return len(t.found) > 0, true
724 }
725
726 if cost < costRegexp {
727 return false, false
728 }
729
730 data := cp.data(t.fileName)
731 offset := 0
732 found := t.found[:0]
733 for {
734 idx := bytes.Index(data[offset:], []byte(t.word))
735 if idx < 0 {
736 break
737 }
738
739 relStartOffset := offset + idx
740 relEndOffset := relStartOffset + len(t.word)
741
742 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
743 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
744 if startBoundary && endBoundary {
745 found = append(found, &candidateMatch{
746 byteOffset: uint32(offset + idx),
747 byteMatchSz: uint32(len(t.word)),
748 fileName: t.fileName,
749 })
750 }
751 offset += idx + len(t.word)
752 }
753
754 t.found = found
755 t.evaluated = true
756
757 return len(t.found) > 0, true
758}
759
760// breakMatchesOnNewlines returns matches resulting from breaking each element
761// of cms on newlines within text.
762func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
763 var lineCMs []*candidateMatch
764 for _, cm := range cms {
765 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
766 }
767 return lineCMs
768}
769
770// breakOnNewlines returns matches resulting from breaking cm on newlines
771// within text.
772func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
773 var cms []*candidateMatch
774 addMe := &candidateMatch{}
775 *addMe = *cm
776 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
777 if text[i] == '\n' {
778 addMe.byteMatchSz = i - addMe.byteOffset
779 if addMe.byteMatchSz != 0 {
780 cms = append(cms, addMe)
781 }
782
783 addMe = &candidateMatch{}
784 *addMe = *cm
785 addMe.byteOffset = i + 1
786 }
787 }
788 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
789 if addMe.byteMatchSz != 0 {
790 cms = append(cms, addMe)
791 }
792 return cms
793}
794
795func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) {
796 if v, ok := known[mt]; ok {
797 return v, true
798 }
799
800 v, ok := mt.matches(cp, cost, known)
801 if ok {
802 known[mt] = v
803 }
804
805 return v, ok
806}
807
808func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
809 v, ok := evalMatchTree(cp, cost, known, t.child)
810 return !v, ok
811}
812
813func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
814 return evalMatchTree(cp, cost, known, t.child)
815}
816
817func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
818 if t.contEvaluated {
819 return len(t.current) > 0, true
820 }
821
822 if len(t.current) == 0 {
823 return false, true
824 }
825
826 if t.fileName && cost < costMemory {
827 return false, false
828 }
829
830 if !t.fileName && cost < costContent {
831 return false, false
832 }
833
834 pruned := t.current[:0]
835 for _, m := range t.current {
836 if m.byteOffset == 0 && m.runeOffset > 0 {
837 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
838 }
839 if m.matchContent(cp.data(m.fileName)) {
840 pruned = append(pruned, m)
841 }
842 }
843 t.current = pruned
844 t.contEvaluated = true
845
846 return len(t.current) > 0, true
847}
848
849type matchTreeOpt struct {
850 // DisableWordMatchOptimization is used to disable the use of wordMatchTree.
851 // This was added since we do not support wordMatchTree with symbol search.
852 DisableWordMatchOptimization bool
853}
854
855func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
856 if q == nil {
857 return nil, fmt.Errorf("got nil (sub)query")
858 }
859 switch s := q.(type) {
860 case *query.Regexp:
861 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
862 // superset of the regexp. If the returned matchTree is equivalent to the
863 // original regexp, it returns true. An equivalent matchTree has the same
864 // behaviour as the original regexp and can be used instead.
865 //
866 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
867 if err != nil {
868 return nil, err
869 }
870 // if the query can be used in place of the regexp
871 // return the subtree
872 if isEq {
873 return subMT, nil
874 }
875
876 var tr matchTree
877 if wmt, ok := regexpToWordMatchTree(s, opt); ok {
878 // A common search we get is "\bLITERAL\b". Avoid the regex engine and
879 // provide something faster.
880 tr = wmt
881 } else {
882 prefix := ""
883 if !s.CaseSensitive {
884 prefix = "(?i)"
885 }
886
887 tr = ®expMatchTree{
888 regexp: regexp.MustCompile(prefix + s.Regexp.String()),
889 fileName: s.FileName,
890 }
891 }
892
893 return &andMatchTree{
894 children: []matchTree{
895 tr, &noVisitMatchTree{subMT},
896 },
897 }, nil
898 case *query.And:
899 var r []matchTree
900 for _, ch := range s.Children {
901 ct, err := d.newMatchTree(ch, opt)
902 if err != nil {
903 return nil, err
904 }
905 r = append(r, ct)
906 }
907 return &andMatchTree{r}, nil
908 case *query.Or:
909 var r []matchTree
910 for _, ch := range s.Children {
911 ct, err := d.newMatchTree(ch, opt)
912 if err != nil {
913 return nil, err
914 }
915 r = append(r, ct)
916 }
917 return &orMatchTree{r}, nil
918 case *query.Not:
919 ct, err := d.newMatchTree(s.Child, opt)
920 return ¬MatchTree{
921 child: ct,
922 }, err
923
924 case *query.Type:
925 if s.Type != query.TypeFileName {
926 break
927 }
928
929 ct, err := d.newMatchTree(s.Child, opt)
930 if err != nil {
931 return nil, err
932 }
933
934 return &fileNameMatchTree{
935 child: ct,
936 }, nil
937
938 case *query.Substring:
939 return d.newSubstringMatchTree(s)
940
941 case *query.Branch:
942 masks := make([]uint64, 0, len(d.repoMetaData))
943 if s.Pattern == "HEAD" {
944 for i := 0; i < len(d.repoMetaData); i++ {
945 masks = append(masks, 1)
946 }
947 } else {
948 for _, branchIDs := range d.branchIDs {
949 mask := uint64(0)
950 for nm, m := range branchIDs {
951 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
952 mask |= uint64(m)
953 }
954 }
955 masks = append(masks, mask)
956 }
957
958 }
959 return &branchQueryMatchTree{
960 masks: masks,
961 fileMasks: d.fileBranchMasks,
962 repos: d.repos,
963 }, nil
964 case *query.Const:
965 if s.Value {
966 return &bruteForceMatchTree{}, nil
967 } else {
968 return &noMatchTree{Why: "const"}, nil
969 }
970 case *query.Language:
971 code, ok := d.metaData.LanguageMap[s.Language]
972 if !ok {
973 return &noMatchTree{Why: "lang"}, nil
974 }
975 return &docMatchTree{
976 reason: "language",
977 numDocs: d.numDocs(),
978 predicate: func(docID uint32) bool {
979 return d.getLanguage(docID) == code
980 },
981 }, nil
982
983 case *query.Symbol:
984 // Disable WordMatchTree since we don't support it in symbols yet.
985 optCopy := opt
986 optCopy.DisableWordMatchOptimization = true
987
988 subMT, err := d.newMatchTree(s.Expr, optCopy)
989 if err != nil {
990 return nil, err
991 }
992
993 if substr, ok := subMT.(*substrMatchTree); ok {
994 return &symbolSubstrMatchTree{
995 substrMatchTree: substr,
996 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)),
997 fileEndRunes: d.fileEndRunes,
998 fileEndSymbol: d.fileEndSymbol,
999 sections: d.runeDocSections,
1000 }, nil
1001 }
1002
1003 var regexp *regexp.Regexp
1004 visitMatchTree(subMT, func(mt matchTree) {
1005 if t, ok := mt.(*regexpMatchTree); ok {
1006 regexp = t.regexp
1007 }
1008 })
1009 if regexp == nil {
1010 return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1011 }
1012
1013 return &symbolRegexpMatchTree{
1014 regexp: regexp,
1015 all: regexp.String() == "(?i)(?-s:.)*",
1016 matchTree: subMT,
1017 }, nil
1018
1019 case *query.FileNameSet:
1020 return &docMatchTree{
1021 reason: "FileNameSet",
1022 numDocs: d.numDocs(),
1023 predicate: func(docID uint32) bool {
1024 fileName := d.fileName(docID)
1025 _, ok := s.Set[string(fileName)]
1026 return ok
1027 },
1028 }, nil
1029
1030 case *query.BranchesRepos:
1031 reposBranchesWant := make([]uint64, len(d.repoMetaData))
1032 for repoIdx := range d.repoMetaData {
1033 var mask uint64
1034 for _, br := range s.List {
1035 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1036 mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1037 }
1038 }
1039 reposBranchesWant[repoIdx] = mask
1040 }
1041 return &docMatchTree{
1042 reason: "BranchesRepos",
1043 numDocs: d.numDocs(),
1044 predicate: func(docID uint32) bool {
1045 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1046 },
1047 }, nil
1048
1049 case *query.RepoSet:
1050 reposWant := make([]bool, len(d.repoMetaData))
1051 for repoIdx, r := range d.repoMetaData {
1052 if _, ok := s.Set[r.Name]; ok {
1053 reposWant[repoIdx] = true
1054 }
1055 }
1056 return &docMatchTree{
1057 reason: "RepoSet",
1058 numDocs: d.numDocs(),
1059 predicate: func(docID uint32) bool {
1060 return reposWant[d.repos[docID]]
1061 },
1062 }, nil
1063
1064 case *query.RepoIDs:
1065 reposWant := make([]bool, len(d.repoMetaData))
1066 for repoIdx, r := range d.repoMetaData {
1067 if s.Repos.Contains(r.ID) {
1068 reposWant[repoIdx] = true
1069 }
1070 }
1071 return &docMatchTree{
1072 reason: "RepoIDs",
1073 numDocs: d.numDocs(),
1074 predicate: func(docID uint32) bool {
1075 return reposWant[d.repos[docID]]
1076 },
1077 }, nil
1078
1079 case *query.Repo:
1080 reposWant := make([]bool, len(d.repoMetaData))
1081 for repoIdx, r := range d.repoMetaData {
1082 if s.Regexp.MatchString(r.Name) {
1083 reposWant[repoIdx] = true
1084 }
1085 }
1086 return &docMatchTree{
1087 reason: "Repo",
1088 numDocs: d.numDocs(),
1089 predicate: func(docID uint32) bool {
1090 return reposWant[d.repos[docID]]
1091 },
1092 }, nil
1093
1094 case *query.RepoRegexp:
1095 reposWant := make([]bool, len(d.repoMetaData))
1096 for repoIdx, r := range d.repoMetaData {
1097 if s.Regexp.MatchString(r.Name) {
1098 reposWant[repoIdx] = true
1099 }
1100 }
1101 return &docMatchTree{
1102 reason: "RepoRegexp",
1103 numDocs: d.numDocs(),
1104 predicate: func(docID uint32) bool {
1105 return reposWant[d.repos[docID]]
1106 },
1107 }, nil
1108
1109 case query.RawConfig:
1110 return &docMatchTree{
1111 reason: s.String(),
1112 numDocs: d.numDocs(),
1113 predicate: func(docID uint32) bool {
1114 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1115 },
1116 }, nil
1117 }
1118 log.Panicf("type %T", q)
1119 return nil, nil
1120}
1121
1122func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1123 st := &substrMatchTree{
1124 query: s,
1125 caseSensitive: s.CaseSensitive,
1126 fileName: s.FileName,
1127 }
1128
1129 if utf8.RuneCountInString(s.Pattern) < ngramSize {
1130 prefix := ""
1131 if !s.CaseSensitive {
1132 prefix = "(?i)"
1133 }
1134 t := ®expMatchTree{
1135 regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)),
1136 fileName: s.FileName,
1137 }
1138 return t, nil
1139 }
1140
1141 result, err := d.iterateNgrams(s)
1142 if err != nil {
1143 return nil, err
1144 }
1145 st.matchIterator = result
1146 return st, nil
1147}
1148
1149func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1150 if opt.DisableWordMatchOptimization {
1151 return nil, false
1152 }
1153 // Needs to be case sensitive
1154 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1155 return nil, false
1156 }
1157 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1158 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1159 return nil, false
1160 }
1161 sub := q.Regexp.Sub
1162 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1163 return nil, false
1164 }
1165
1166 return &wordMatchTree{
1167 word: string(sub[1].Rune),
1168 fileName: q.FileName,
1169 }, true
1170}
1171
1172// pruneMatchTree removes impossible branches from the matchTree, as indicated
1173// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1174func pruneMatchTree(mt matchTree) (matchTree, error) {
1175 var err error
1176 switch mt := mt.(type) {
1177 // leaf nodes that we test with the filter:
1178 case *substrMatchTree:
1179 if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1180 if _, ok := res.matchIterator.(*noMatchTree); ok {
1181 return nil, nil
1182 }
1183 }
1184 // recursive tree structures:
1185 case *andMatchTree:
1186 // Any branch of an and becoming impossible means the entire clause
1187 // is impossible. Otherwise, just handle rewrites.
1188 for i, child := range mt.children {
1189 newChild, err := pruneMatchTree(child)
1190 if err != nil {
1191 return nil, err
1192 }
1193 if newChild == nil {
1194 return nil, nil
1195 }
1196 mt.children[i] = newChild
1197 }
1198 case *orMatchTree:
1199 // *All* branches of an OR becoming impossible means the entire clause
1200 // is impossible. Otherwise, drop impossible subclauses and handle
1201 // rewrites, including simplifying to a singular resulting child branch.
1202 n := 0
1203 for _, child := range mt.children {
1204 newChild, err := pruneMatchTree(child)
1205 if err != nil {
1206 return nil, err
1207 }
1208 if newChild != nil {
1209 mt.children[n] = newChild
1210 n++
1211 }
1212 }
1213 mt.children = mt.children[:n]
1214 if len(mt.children) == 1 {
1215 return mt.children[0], nil
1216 } else if len(mt.children) == 0 {
1217 return nil, nil
1218 }
1219 case *noVisitMatchTree:
1220 mt.matchTree, err = pruneMatchTree(mt.matchTree)
1221 if err != nil {
1222 return nil, err
1223 }
1224 if mt.matchTree == nil {
1225 return nil, nil
1226 }
1227 case *fileNameMatchTree:
1228 mt.child, err = pruneMatchTree(mt.child)
1229 case *andLineMatchTree:
1230 child, err := pruneMatchTree(&mt.andMatchTree)
1231 if err != nil {
1232 return nil, err
1233 }
1234 if child == nil {
1235 return nil, nil
1236 }
1237 if c, ok := child.(*andMatchTree); ok {
1238 mt.andMatchTree = *c
1239 } else {
1240 // the and was simplified to a single clause,
1241 // so the linematch portion is irrelevant.
1242 return mt, nil
1243 }
1244 case *notMatchTree:
1245 mt.child, err = pruneMatchTree(mt.child)
1246 if err != nil {
1247 return nil, err
1248 }
1249 if mt.child == nil {
1250 // not false => true
1251 return &bruteForceMatchTree{}, nil
1252 }
1253 // unhandled:
1254 case *docMatchTree:
1255 case *bruteForceMatchTree:
1256 case *regexpMatchTree:
1257 case *wordMatchTree:
1258 }
1259 return mt, err
1260}