fork of https://github.com/sourcegraph/zoekt
1// Copyright 2018 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "regexp/syntax"
22 "strings"
23 "unicode/utf8"
24
25 "github.com/grafana/regexp"
26
27 "github.com/sourcegraph/zoekt/query"
28)
29
30// A docIterator iterates over documents in order.
31type docIterator interface {
32 // provide the next document where we can may find something
33 // interesting.
34 nextDoc() uint32
35
36 // clears any per-document state of the docIterator, and
37 // prepares for evaluating the given doc. The argument is
38 // strictly increasing over time.
39 prepare(nextDoc uint32)
40}
41
42const (
43 costConst = 0
44 costMemory = 1
45 costContent = 2
46 costRegexp = 3
47)
48
49const (
50 costMin = costConst
51 costMax = costRegexp
52)
53
54// An expression tree coupled with matches. The matchtree has two
55// functions:
56//
57// * it implements boolean combinations (and, or, not)
58//
59// * it implements shortcuts, where we skip documents (for example: if
60// there are no trigram matches, we can be sure there are no substring
61// matches). The matchtree iterates over the documents as they are
62// ordered in the shard.
63//
64// The general process for a given (shard, query) is:
65//
66// - construct matchTree for the query
67//
68// - find all different leaf matchTrees (substring, regexp, etc.)
69//
70// in a loop:
71//
72// - find next doc to process using nextDoc
73//
74// - evaluate atoms (leaf expressions that match text)
75//
76// - evaluate the tree using matches(), storing the result in map.
77//
78// - if the complete tree returns (matches() == true) for the document,
79// collect all text matches by looking at leaf matchTrees
80type matchTree interface {
81 docIterator
82
83 // returns whether this matches, and if we are sure.
84 matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool)
85}
86
87// docMatchTree iterates over documents for which predicate(docID) returns true.
88type docMatchTree struct {
89 // the number of documents in a shard.
90 numDocs uint32
91
92 predicate func(docID uint32) bool
93
94 // provides additional information about the reason why the docMatchTree was
95 // created.
96 reason string
97
98 // mutable
99 firstDone bool
100 docID uint32
101}
102
103type bruteForceMatchTree struct {
104 // mutable
105 firstDone bool
106 docID uint32
107}
108
109type andLineMatchTree struct {
110 andMatchTree
111}
112
113type andMatchTree struct {
114 children []matchTree
115}
116
117type orMatchTree struct {
118 children []matchTree
119}
120
121type notMatchTree struct {
122 child matchTree
123}
124
125// Returns only the filename of child matches.
126type fileNameMatchTree struct {
127 child matchTree
128}
129
130// Don't visit this subtree for collecting matches.
131type noVisitMatchTree struct {
132 matchTree
133}
134
135type regexpMatchTree struct {
136 regexp *regexp.Regexp
137
138 fileName bool
139
140 // mutable
141 reEvaluated bool
142 found []*candidateMatch
143
144 // nextDoc, prepare.
145 bruteForceMatchTree
146}
147
148// \bLITERAL\b
149type wordMatchTree struct {
150 word string
151
152 fileName bool
153
154 // mutable
155 evaluated bool
156 found []*candidateMatch
157
158 // nextDoc, prepare.
159 bruteForceMatchTree
160}
161
162type substrMatchTree struct {
163 matchIterator
164
165 query *query.Substring
166 caseSensitive bool
167 fileName bool
168
169 // mutable
170 current []*candidateMatch
171 contEvaluated bool
172}
173
174type branchQueryMatchTree struct {
175 fileMasks []uint64
176 masks []uint64
177 repos []uint16
178
179 // mutable
180 firstDone bool
181 docID uint32
182}
183
184func (t *branchQueryMatchTree) branchMask() uint64 {
185 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
186}
187
188type symbolRegexpMatchTree struct {
189 matchTree
190 regexp *regexp.Regexp
191 all bool // skips regex match if .*
192
193 reEvaluated bool
194 found []*candidateMatch
195}
196
197func (t *symbolRegexpMatchTree) prepare(doc uint32) {
198 t.reEvaluated = false
199}
200
201func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
202 if t.reEvaluated {
203 return len(t.found) > 0, true
204 }
205
206 if cost < costRegexp {
207 return false, false
208 }
209
210 sections := cp.docSections()
211 content := cp.data(false)
212
213 found := t.found[:0]
214 for i, sec := range sections {
215 var idx []int
216 if t.all {
217 idx = []int{0, int(sec.End - sec.Start)}
218 } else {
219 idx = t.regexp.FindIndex(content[sec.Start:sec.End])
220 if idx == nil {
221 continue
222 }
223 }
224
225 cm := &candidateMatch{
226 byteOffset: sec.Start + uint32(idx[0]),
227 byteMatchSz: uint32(idx[1] - idx[0]),
228 symbol: true,
229 symbolIdx: uint32(i),
230 }
231 found = append(found, cm)
232 }
233 t.found = found
234 t.reEvaluated = true
235
236 return len(t.found) > 0, true
237}
238
239type symbolSubstrMatchTree struct {
240 *substrMatchTree
241
242 patternSize uint32
243 fileEndRunes []uint32
244 fileEndSymbol []uint32
245
246 doc uint32
247 sections []DocumentSection
248
249 secID uint32
250}
251
252func (t *symbolSubstrMatchTree) prepare(doc uint32) {
253 t.substrMatchTree.prepare(doc)
254 t.doc = doc
255
256 var fileStart uint32
257 if doc > 0 {
258 fileStart = t.fileEndRunes[doc-1]
259 }
260
261 var sections []DocumentSection
262 if len(t.sections) > 0 {
263 most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
264 if most == uint32(len(t.sections)) {
265 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
266 } else {
267 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
268 t.secID++
269 }
270
271 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
272 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
273 symbolEnd++
274 }
275
276 sections = t.sections[t.secID:symbolEnd]
277 }
278 }
279
280 secIdx := 0
281 trimmed := t.current[:0]
282 for len(sections) > secIdx && len(t.current) > 0 {
283 start := fileStart + t.current[0].runeOffset
284 end := start + t.patternSize
285 if start >= sections[secIdx].End {
286 secIdx++
287 continue
288 }
289
290 if start < sections[secIdx].Start {
291 t.current = t.current[1:]
292 continue
293 }
294
295 if end <= sections[secIdx].End {
296 t.current[0].symbol = true
297 t.current[0].symbolIdx = uint32(secIdx)
298 trimmed = append(trimmed, t.current[0])
299 }
300
301 t.current = t.current[1:]
302 }
303 t.current = trimmed
304}
305
306// all prepare methods
307
308func (t *bruteForceMatchTree) prepare(doc uint32) {
309 t.docID = doc
310 t.firstDone = true
311}
312
313func (t *docMatchTree) prepare(doc uint32) {
314 t.docID = doc
315 t.firstDone = true
316}
317
318func (t *andMatchTree) prepare(doc uint32) {
319 for _, c := range t.children {
320 c.prepare(doc)
321 }
322}
323
324func (t *regexpMatchTree) prepare(doc uint32) {
325 t.found = t.found[:0]
326 t.reEvaluated = false
327 t.bruteForceMatchTree.prepare(doc)
328}
329
330func (t *wordMatchTree) prepare(doc uint32) {
331 t.found = t.found[:0]
332 t.evaluated = false
333 t.bruteForceMatchTree.prepare(doc)
334}
335
336func (t *orMatchTree) prepare(doc uint32) {
337 for _, c := range t.children {
338 c.prepare(doc)
339 }
340}
341
342func (t *notMatchTree) prepare(doc uint32) {
343 t.child.prepare(doc)
344}
345
346func (t *fileNameMatchTree) prepare(doc uint32) {
347 t.child.prepare(doc)
348}
349
350func (t *substrMatchTree) prepare(nextDoc uint32) {
351 t.matchIterator.prepare(nextDoc)
352 t.current = t.matchIterator.candidates()
353 t.contEvaluated = false
354}
355
356func (t *branchQueryMatchTree) prepare(doc uint32) {
357 t.firstDone = true
358 t.docID = doc
359}
360
361// nextDoc
362
363func (t *docMatchTree) nextDoc() uint32 {
364 var start uint32
365 if t.firstDone {
366 start = t.docID + 1
367 }
368 for i := start; i < t.numDocs; i++ {
369 if t.predicate(i) {
370 return i
371 }
372 }
373 return maxUInt32
374}
375
376func (t *bruteForceMatchTree) nextDoc() uint32 {
377 if !t.firstDone {
378 return 0
379 }
380 return t.docID + 1
381}
382
383func (t *andMatchTree) nextDoc() uint32 {
384 var max uint32
385 for _, c := range t.children {
386 m := c.nextDoc()
387 if m > max {
388 max = m
389 }
390 }
391 return max
392}
393
394func (t *orMatchTree) nextDoc() uint32 {
395 min := uint32(maxUInt32)
396 for _, c := range t.children {
397 m := c.nextDoc()
398 if m < min {
399 min = m
400 }
401 }
402 return min
403}
404
405func (t *notMatchTree) nextDoc() uint32 {
406 return 0
407}
408
409func (t *fileNameMatchTree) nextDoc() uint32 {
410 return t.child.nextDoc()
411}
412
413func (t *branchQueryMatchTree) nextDoc() uint32 {
414 var start uint32
415 if t.firstDone {
416 start = t.docID + 1
417 }
418
419 for i := start; i < uint32(len(t.fileMasks)); i++ {
420 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
421 return i
422 }
423 }
424 return maxUInt32
425}
426
427// all String methods
428
429func (t *bruteForceMatchTree) String() string {
430 return "all"
431}
432
433func (t *docMatchTree) String() string {
434 return fmt.Sprintf("doc(%s)", t.reason)
435}
436
437func (t *andMatchTree) String() string {
438 return fmt.Sprintf("and%v", t.children)
439}
440
441func (t *regexpMatchTree) String() string {
442 f := ""
443 if t.fileName {
444 f = "f"
445 }
446 return fmt.Sprintf("%sre(%s)", f, t.regexp)
447}
448
449func (t *wordMatchTree) String() string {
450 f := ""
451 if t.fileName {
452 f = "f"
453 }
454 return fmt.Sprintf("%sword(%s)", f, t.word)
455}
456
457func (t *orMatchTree) String() string {
458 return fmt.Sprintf("or%v", t.children)
459}
460
461func (t *notMatchTree) String() string {
462 return fmt.Sprintf("not(%v)", t.child)
463}
464
465func (t *noVisitMatchTree) String() string {
466 return fmt.Sprintf("novisit(%v)", t.matchTree)
467}
468
469func (t *fileNameMatchTree) String() string {
470 return fmt.Sprintf("f(%v)", t.child)
471}
472
473func (t *substrMatchTree) String() string {
474 f := ""
475 if t.fileName {
476 f = "f"
477 }
478
479 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
480}
481
482func (t *branchQueryMatchTree) String() string {
483 return fmt.Sprintf("branch(%x)", t.masks)
484}
485
486func (t *symbolSubstrMatchTree) String() string {
487 return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
488}
489
490func (t *symbolRegexpMatchTree) String() string {
491 return fmt.Sprintf("symbol(%v)", t.matchTree)
492}
493
494// visitMatches visits all atoms in matchTree. Note: This visits
495// noVisitMatchTree. For collecting matches use visitMatches.
496func visitMatchTree(t matchTree, f func(matchTree)) {
497 switch s := t.(type) {
498 case *andMatchTree:
499 for _, ch := range s.children {
500 visitMatchTree(ch, f)
501 }
502 case *orMatchTree:
503 for _, ch := range s.children {
504 visitMatchTree(ch, f)
505 }
506 case *andLineMatchTree:
507 visitMatchTree(&s.andMatchTree, f)
508 case *noVisitMatchTree:
509 visitMatchTree(s.matchTree, f)
510 case *notMatchTree:
511 visitMatchTree(s.child, f)
512 case *fileNameMatchTree:
513 visitMatchTree(s.child, f)
514 case *symbolSubstrMatchTree:
515 visitMatchTree(s.substrMatchTree, f)
516 case *symbolRegexpMatchTree:
517 visitMatchTree(s.matchTree, f)
518 default:
519 f(t)
520 }
521}
522
523// visitMatches visits all atoms which can contribute matches. Note: This
524// skips noVisitMatchTree.
525func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) {
526 switch s := t.(type) {
527 case *andMatchTree:
528 for _, ch := range s.children {
529 if known[ch] {
530 visitMatches(ch, known, f)
531 }
532 }
533 case *andLineMatchTree:
534 visitMatches(&s.andMatchTree, known, f)
535 case *orMatchTree:
536 for _, ch := range s.children {
537 if known[ch] {
538 visitMatches(ch, known, f)
539 }
540 }
541 case *symbolSubstrMatchTree:
542 visitMatches(s.substrMatchTree, known, f)
543 case *notMatchTree:
544 case *noVisitMatchTree:
545 // don't collect into negative trees.
546 case *fileNameMatchTree:
547 // We will just gather the filename if we do not visit this tree.
548 default:
549 f(s)
550 }
551}
552
553// all matches() methods.
554
555func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
556 return t.predicate(cp.idx), true
557}
558
559func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
560 return true, true
561}
562
563// andLineMatchTree is a performance optimization of andMatchTree. For content
564// searches we don't want to run the regex engine if there is no line that
565// contains matches from all terms.
566func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
567 matches, sure := t.andMatchTree.matches(cp, cost, known)
568 if !(sure && matches) {
569 return matches, sure
570 }
571
572 // find child with fewest candidates
573 min := maxUInt32
574 fewestChildren := 0
575 for ix, child := range t.children {
576 v, ok := child.(*substrMatchTree)
577 // make sure we are running a content search and that all candidates are a
578 // substrMatchTree
579 if !ok || v.fileName {
580 return matches, sure
581 }
582 if len(v.current) < min {
583 min = len(v.current)
584 fewestChildren = ix
585 }
586 }
587
588 type lineRange struct {
589 start int
590 end int
591 }
592 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
593 prev := -1
594 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
595 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
596 if line == prev {
597 continue
598 }
599 prev = line
600 lines = append(lines, lineRange{byteStart, byteEnd})
601 }
602
603 // children keeps track of the children's candidates we have already seen.
604 children := make([][]*candidateMatch, 0, len(t.children)-1)
605 for j, child := range t.children {
606 if j == fewestChildren {
607 continue
608 }
609 children = append(children, child.(*substrMatchTree).current)
610 }
611
612nextLine:
613 for i := 0; i < len(lines); i++ {
614 hits := 1
615 nextChild:
616 for j := range children {
617 nextCandidate:
618 for len(children[j]) > 0 {
619 candidate := children[j][0]
620 bo := int(cp.findOffset(false, candidate.runeOffset))
621 if bo < lines[i].start {
622 children[j] = children[j][1:]
623 continue nextCandidate
624 }
625 if bo <= lines[i].end {
626 hits++
627 continue nextChild
628 }
629 // move the `lines` iterator forward until bo <= line.end
630 for i < len(lines) && bo > lines[i].end {
631 i++
632 }
633 i--
634 continue nextLine
635 }
636 }
637 // return early once we found any line that contains matches from all children
638 if hits == len(t.children) {
639 return matches, true
640 }
641 }
642 return false, true
643}
644
645func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
646 sure := true
647
648 for _, ch := range t.children {
649 v, ok := evalMatchTree(cp, cost, known, ch)
650 if ok && !v {
651 return false, true
652 }
653 if !ok {
654 sure = false
655 }
656 }
657
658 return true, sure
659}
660
661func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
662 matches := false
663 sure := true
664 for _, ch := range t.children {
665 v, ok := evalMatchTree(cp, cost, known, ch)
666 if ok {
667 // we could short-circuit, but we want to use
668 // the other possibilities as a ranking
669 // signal.
670 matches = matches || v
671 } else {
672 sure = false
673 }
674 }
675 return matches, sure
676}
677
678func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
679 return t.branchMask() != 0, true
680}
681
682func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
683 if t.reEvaluated {
684 return len(t.found) > 0, true
685 }
686
687 if cost < costRegexp {
688 return false, false
689 }
690
691 cp.stats.RegexpsConsidered++
692 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
693 found := t.found[:0]
694 for _, idx := range idxs {
695 cm := &candidateMatch{
696 byteOffset: uint32(idx[0]),
697 byteMatchSz: uint32(idx[1] - idx[0]),
698 fileName: t.fileName,
699 }
700
701 found = append(found, cm)
702 }
703 t.found = found
704 t.reEvaluated = true
705
706 return len(t.found) > 0, true
707}
708
709func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
710 if t.evaluated {
711 return len(t.found) > 0, true
712 }
713
714 if cost < costRegexp {
715 return false, false
716 }
717
718 data := cp.data(t.fileName)
719 offset := 0
720 found := t.found[:0]
721 for {
722 idx := bytes.Index(data[offset:], []byte(t.word))
723 if idx < 0 {
724 break
725 }
726
727 relStartOffset := offset + idx
728 relEndOffset := relStartOffset + len(t.word)
729
730 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
731 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
732 if startBoundary && endBoundary {
733 found = append(found, &candidateMatch{
734 byteOffset: uint32(offset + idx),
735 byteMatchSz: uint32(len(t.word)),
736 fileName: t.fileName,
737 })
738 }
739 offset += idx + len(t.word)
740 }
741
742 t.found = found
743 t.evaluated = true
744
745 return len(t.found) > 0, true
746}
747
748// breakMatchesOnNewlines returns matches resulting from breaking each element
749// of cms on newlines within text.
750func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
751 var lineCMs []*candidateMatch
752 for _, cm := range cms {
753 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
754 }
755 return lineCMs
756}
757
758// breakOnNewlines returns matches resulting from breaking cm on newlines
759// within text.
760func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
761 var cms []*candidateMatch
762 addMe := &candidateMatch{}
763 *addMe = *cm
764 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
765 if text[i] == '\n' {
766 addMe.byteMatchSz = i - addMe.byteOffset
767 if addMe.byteMatchSz != 0 {
768 cms = append(cms, addMe)
769 }
770
771 addMe = &candidateMatch{}
772 *addMe = *cm
773 addMe.byteOffset = i + 1
774 }
775 }
776 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
777 if addMe.byteMatchSz != 0 {
778 cms = append(cms, addMe)
779 }
780 return cms
781}
782
783func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) {
784 if v, ok := known[mt]; ok {
785 return v, true
786 }
787
788 v, ok := mt.matches(cp, cost, known)
789 if ok {
790 known[mt] = v
791 }
792
793 return v, ok
794}
795
796func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
797 v, ok := evalMatchTree(cp, cost, known, t.child)
798 return !v, ok
799}
800
801func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
802 return evalMatchTree(cp, cost, known, t.child)
803}
804
805func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
806 if t.contEvaluated {
807 return len(t.current) > 0, true
808 }
809
810 if len(t.current) == 0 {
811 return false, true
812 }
813
814 if t.fileName && cost < costMemory {
815 return false, false
816 }
817
818 if !t.fileName && cost < costContent {
819 return false, false
820 }
821
822 pruned := t.current[:0]
823 for _, m := range t.current {
824 if m.byteOffset == 0 && m.runeOffset > 0 {
825 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
826 }
827 if m.matchContent(cp.data(m.fileName)) {
828 pruned = append(pruned, m)
829 }
830 }
831 t.current = pruned
832 t.contEvaluated = true
833
834 return len(t.current) > 0, true
835}
836
837type matchTreeOpt struct {
838 // DisableWordMatchOptimization is used to disable the use of wordMatchTree.
839 // This was added since we do not support wordMatchTree with symbol search.
840 DisableWordMatchOptimization bool
841}
842
843func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
844 if q == nil {
845 return nil, fmt.Errorf("got nil (sub)query")
846 }
847 switch s := q.(type) {
848 case *query.Regexp:
849 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
850 // superset of the regexp. If the returned matchTree is equivalent to the
851 // original regexp, it returns true. An equivalent matchTree has the same
852 // behaviour as the original regexp and can be used instead.
853 //
854 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
855 if err != nil {
856 return nil, err
857 }
858 // if the query can be used in place of the regexp
859 // return the subtree
860 if isEq {
861 return subMT, nil
862 }
863
864 var tr matchTree
865 if wmt, ok := regexpToWordMatchTree(s, opt); ok {
866 // A common search we get is "\bLITERAL\b". Avoid the regex engine and
867 // provide something faster.
868 tr = wmt
869 } else {
870 prefix := ""
871 if !s.CaseSensitive {
872 prefix = "(?i)"
873 }
874
875 tr = ®expMatchTree{
876 regexp: regexp.MustCompile(prefix + s.Regexp.String()),
877 fileName: s.FileName,
878 }
879 }
880
881 return &andMatchTree{
882 children: []matchTree{
883 tr, &noVisitMatchTree{subMT},
884 },
885 }, nil
886 case *query.And:
887 var r []matchTree
888 for _, ch := range s.Children {
889 ct, err := d.newMatchTree(ch, opt)
890 if err != nil {
891 return nil, err
892 }
893 r = append(r, ct)
894 }
895 return &andMatchTree{r}, nil
896 case *query.Or:
897 var r []matchTree
898 for _, ch := range s.Children {
899 ct, err := d.newMatchTree(ch, opt)
900 if err != nil {
901 return nil, err
902 }
903 r = append(r, ct)
904 }
905 return &orMatchTree{r}, nil
906 case *query.Not:
907 ct, err := d.newMatchTree(s.Child, opt)
908 return ¬MatchTree{
909 child: ct,
910 }, err
911
912 case *query.Type:
913 if s.Type != query.TypeFileName {
914 break
915 }
916
917 ct, err := d.newMatchTree(s.Child, opt)
918 if err != nil {
919 return nil, err
920 }
921
922 return &fileNameMatchTree{
923 child: ct,
924 }, nil
925
926 case *query.Substring:
927 return d.newSubstringMatchTree(s)
928
929 case *query.Branch:
930 masks := make([]uint64, 0, len(d.repoMetaData))
931 if s.Pattern == "HEAD" {
932 for i := 0; i < len(d.repoMetaData); i++ {
933 masks = append(masks, 1)
934 }
935 } else {
936 for _, branchIDs := range d.branchIDs {
937 mask := uint64(0)
938 for nm, m := range branchIDs {
939 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
940 mask |= uint64(m)
941 }
942 }
943 masks = append(masks, mask)
944 }
945
946 }
947 return &branchQueryMatchTree{
948 masks: masks,
949 fileMasks: d.fileBranchMasks,
950 repos: d.repos,
951 }, nil
952 case *query.Const:
953 if s.Value {
954 return &bruteForceMatchTree{}, nil
955 } else {
956 return &noMatchTree{"const"}, nil
957 }
958 case *query.Language:
959 code, ok := d.metaData.LanguageMap[s.Language]
960 if !ok {
961 return &noMatchTree{"lang"}, nil
962 }
963 return &docMatchTree{
964 reason: "language",
965 numDocs: d.numDocs(),
966 predicate: func(docID uint32) bool {
967 return d.getLanguage(docID) == code
968 },
969 }, nil
970
971 case *query.Symbol:
972 // Disable WordMatchTree since we don't support it in symbols yet.
973 optCopy := opt
974 optCopy.DisableWordMatchOptimization = true
975
976 subMT, err := d.newMatchTree(s.Expr, optCopy)
977 if err != nil {
978 return nil, err
979 }
980
981 if substr, ok := subMT.(*substrMatchTree); ok {
982 // We have a feature flag for lazy decoding. If runeDocSectionsRaw is
983 // non-nil it means we need to lazily decode on request.
984 sections := d.runeDocSections
985 if sections == nil && d.runeDocSectionsRaw != nil {
986 sections = unmarshalDocSections(d.runeDocSectionsRaw, nil)
987 }
988
989 return &symbolSubstrMatchTree{
990 substrMatchTree: substr,
991 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)),
992 fileEndRunes: d.fileEndRunes,
993 fileEndSymbol: d.fileEndSymbol,
994 sections: sections,
995 }, nil
996 }
997
998 var regexp *regexp.Regexp
999 visitMatchTree(subMT, func(mt matchTree) {
1000 if t, ok := mt.(*regexpMatchTree); ok {
1001 regexp = t.regexp
1002 }
1003 })
1004 if regexp == nil {
1005 return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1006 }
1007
1008 return &symbolRegexpMatchTree{
1009 regexp: regexp,
1010 all: regexp.String() == "(?i)(?-s:.)*",
1011 matchTree: subMT,
1012 }, nil
1013
1014 case *query.FileNameSet:
1015 return &docMatchTree{
1016 reason: "FileNameSet",
1017 numDocs: d.numDocs(),
1018 predicate: func(docID uint32) bool {
1019 fileName := d.fileName(docID)
1020 _, ok := s.Set[string(fileName)]
1021 return ok
1022 },
1023 }, nil
1024
1025 case *query.BranchesRepos:
1026 reposBranchesWant := make([]uint64, len(d.repoMetaData))
1027 for repoIdx := range d.repoMetaData {
1028 var mask uint64
1029 for _, br := range s.List {
1030 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1031 mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1032 }
1033 }
1034 reposBranchesWant[repoIdx] = mask
1035 }
1036 return &docMatchTree{
1037 reason: "BranchesRepos",
1038 numDocs: d.numDocs(),
1039 predicate: func(docID uint32) bool {
1040 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1041 },
1042 }, nil
1043
1044 case *query.RepoSet:
1045 reposWant := make([]bool, len(d.repoMetaData))
1046 for repoIdx, r := range d.repoMetaData {
1047 if _, ok := s.Set[r.Name]; ok {
1048 reposWant[repoIdx] = true
1049 }
1050 }
1051 return &docMatchTree{
1052 reason: "RepoSet",
1053 numDocs: d.numDocs(),
1054 predicate: func(docID uint32) bool {
1055 return reposWant[d.repos[docID]]
1056 },
1057 }, nil
1058
1059 case *query.RepoIDs:
1060 reposWant := make([]bool, len(d.repoMetaData))
1061 for repoIdx, r := range d.repoMetaData {
1062 if s.Repos.Contains(r.ID) {
1063 reposWant[repoIdx] = true
1064 }
1065 }
1066 return &docMatchTree{
1067 reason: "RepoIDs",
1068 numDocs: d.numDocs(),
1069 predicate: func(docID uint32) bool {
1070 return reposWant[d.repos[docID]]
1071 },
1072 }, nil
1073
1074 case *query.Repo:
1075 reposWant := make([]bool, len(d.repoMetaData))
1076 for repoIdx, r := range d.repoMetaData {
1077 if s.Regexp.MatchString(r.Name) {
1078 reposWant[repoIdx] = true
1079 }
1080 }
1081 return &docMatchTree{
1082 reason: "Repo",
1083 numDocs: d.numDocs(),
1084 predicate: func(docID uint32) bool {
1085 return reposWant[d.repos[docID]]
1086 },
1087 }, nil
1088
1089 case *query.RepoRegexp:
1090 reposWant := make([]bool, len(d.repoMetaData))
1091 for repoIdx, r := range d.repoMetaData {
1092 if s.Regexp.MatchString(r.Name) {
1093 reposWant[repoIdx] = true
1094 }
1095 }
1096 return &docMatchTree{
1097 reason: "RepoRegexp",
1098 numDocs: d.numDocs(),
1099 predicate: func(docID uint32) bool {
1100 return reposWant[d.repos[docID]]
1101 },
1102 }, nil
1103
1104 case query.RawConfig:
1105 return &docMatchTree{
1106 reason: s.String(),
1107 numDocs: d.numDocs(),
1108 predicate: func(docID uint32) bool {
1109 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1110 },
1111 }, nil
1112 }
1113 log.Panicf("type %T", q)
1114 return nil, nil
1115}
1116
1117func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1118 st := &substrMatchTree{
1119 query: s,
1120 caseSensitive: s.CaseSensitive,
1121 fileName: s.FileName,
1122 }
1123
1124 if utf8.RuneCountInString(s.Pattern) < ngramSize {
1125 prefix := ""
1126 if !s.CaseSensitive {
1127 prefix = "(?i)"
1128 }
1129 t := ®expMatchTree{
1130 regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)),
1131 fileName: s.FileName,
1132 }
1133 return t, nil
1134 }
1135
1136 result, err := d.iterateNgrams(s)
1137 if err != nil {
1138 return nil, err
1139 }
1140 st.matchIterator = result
1141 return st, nil
1142}
1143
1144func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1145 if opt.DisableWordMatchOptimization {
1146 return nil, false
1147 }
1148 // Needs to be case sensitive
1149 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1150 return nil, false
1151 }
1152 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1153 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1154 return nil, false
1155 }
1156 sub := q.Regexp.Sub
1157 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1158 return nil, false
1159 }
1160
1161 return &wordMatchTree{
1162 word: string(sub[1].Rune),
1163 fileName: q.FileName,
1164 }, true
1165}
1166
1167// pruneMatchTree removes impossible branches from the matchTree, as indicated
1168// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1169func pruneMatchTree(mt matchTree) (matchTree, error) {
1170 var err error
1171 switch mt := mt.(type) {
1172 // leaf nodes that we test with the filter:
1173 case *substrMatchTree:
1174 if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1175 if _, ok := res.matchIterator.(*noMatchTree); ok {
1176 return nil, nil
1177 }
1178 }
1179 // recursive tree structures:
1180 case *andMatchTree:
1181 // Any branch of an and becoming impossible means the entire clause
1182 // is impossible. Otherwise, just handle rewrites.
1183 for i, child := range mt.children {
1184 newChild, err := pruneMatchTree(child)
1185 if err != nil {
1186 return nil, err
1187 }
1188 if newChild == nil {
1189 return nil, nil
1190 }
1191 mt.children[i] = newChild
1192 }
1193 case *orMatchTree:
1194 // *All* branches of an OR becoming impossible means the entire clause
1195 // is impossible. Otherwise, drop impossible subclauses and handle
1196 // rewrites, including simplifying to a singular resulting child branch.
1197 n := 0
1198 for _, child := range mt.children {
1199 newChild, err := pruneMatchTree(child)
1200 if err != nil {
1201 return nil, err
1202 }
1203 if newChild != nil {
1204 mt.children[n] = newChild
1205 n++
1206 }
1207 }
1208 mt.children = mt.children[:n]
1209 if len(mt.children) == 1 {
1210 return mt.children[0], nil
1211 } else if len(mt.children) == 0 {
1212 return nil, nil
1213 }
1214 case *noVisitMatchTree:
1215 mt.matchTree, err = pruneMatchTree(mt.matchTree)
1216 if err != nil {
1217 return nil, err
1218 }
1219 if mt.matchTree == nil {
1220 return nil, nil
1221 }
1222 case *fileNameMatchTree:
1223 mt.child, err = pruneMatchTree(mt.child)
1224 case *andLineMatchTree:
1225 child, err := pruneMatchTree(&mt.andMatchTree)
1226 if err != nil {
1227 return nil, err
1228 }
1229 if child == nil {
1230 return nil, nil
1231 }
1232 if c, ok := child.(*andMatchTree); ok {
1233 mt.andMatchTree = *c
1234 } else {
1235 // the and was simplified to a single clause,
1236 // so the linematch portion is irrelevant.
1237 return mt, nil
1238 }
1239 case *notMatchTree:
1240 mt.child, err = pruneMatchTree(mt.child)
1241 if err != nil {
1242 return nil, err
1243 }
1244 if mt.child == nil {
1245 // not false => true
1246 return &bruteForceMatchTree{}, nil
1247 }
1248 // unhandled:
1249 case *docMatchTree:
1250 case *bruteForceMatchTree:
1251 case *regexpMatchTree:
1252 case *wordMatchTree:
1253 }
1254 return mt, err
1255}