fork of https://github.com/sourcegraph/zoekt
1// Copyright 2018 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "regexp/syntax"
22 "strings"
23 "unicode/utf8"
24
25 "github.com/grafana/regexp"
26
27 "github.com/sourcegraph/zoekt/query"
28)
29
30// A docIterator iterates over documents in order.
31type docIterator interface {
32 // provide the next document where we can may find something
33 // interesting.
34 nextDoc() uint32
35
36 // clears any per-document state of the docIterator, and
37 // prepares for evaluating the given doc. The argument is
38 // strictly increasing over time.
39 prepare(nextDoc uint32)
40}
41
42const (
43 costConst = 0
44 costMemory = 1
45 costContent = 2
46 costRegexp = 3
47)
48
49const (
50 costMin = costConst
51 costMax = costRegexp
52)
53
54// An expression tree coupled with matches. The matchtree has two
55// functions:
56//
57// * it implements boolean combinations (and, or, not)
58//
59// * it implements shortcuts, where we skip documents (for example: if
60// there are no trigram matches, we can be sure there are no substring
61// matches). The matchtree iterates over the documents as they are
62// ordered in the shard.
63//
64// The general process for a given (shard, query) is:
65//
66// - construct matchTree for the query
67//
68// - find all different leaf matchTrees (substring, regexp, etc.)
69//
70// in a loop:
71//
72// - find next doc to process using nextDoc
73//
74// - evaluate atoms (leaf expressions that match text)
75//
76// - evaluate the tree using matches(), storing the result in map.
77//
78// - if the complete tree returns (matches() == true) for the document,
79// collect all text matches by looking at leaf matchTrees
80type matchTree interface {
81 docIterator
82
83 // returns whether this matches, and if we are sure.
84 matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool)
85}
86
87// docMatchTree iterates over documents for which predicate(docID) returns true.
88type docMatchTree struct {
89 // the number of documents in a shard.
90 numDocs uint32
91
92 predicate func(docID uint32) bool
93
94 // provides additional information about the reason why the docMatchTree was
95 // created.
96 reason string
97
98 // mutable
99 firstDone bool
100 docID uint32
101}
102
103type bruteForceMatchTree struct {
104 // mutable
105 firstDone bool
106 docID uint32
107}
108
109type andLineMatchTree struct {
110 andMatchTree
111}
112
113type andMatchTree struct {
114 children []matchTree
115}
116
117type orMatchTree struct {
118 children []matchTree
119}
120
121type notMatchTree struct {
122 child matchTree
123}
124
125// Returns only the filename of child matches.
126type fileNameMatchTree struct {
127 child matchTree
128}
129
130// Don't visit this subtree for collecting matches.
131type noVisitMatchTree struct {
132 matchTree
133}
134
135type regexpMatchTree struct {
136 regexp *regexp.Regexp
137
138 fileName bool
139
140 // mutable
141 reEvaluated bool
142 found []*candidateMatch
143
144 // nextDoc, prepare.
145 bruteForceMatchTree
146}
147
148// \bLITERAL\b
149type wordMatchTree struct {
150 word string
151
152 fileName bool
153
154 // mutable
155 evaluated bool
156 found []*candidateMatch
157
158 // nextDoc, prepare.
159 bruteForceMatchTree
160}
161
162type substrMatchTree struct {
163 matchIterator
164
165 query *query.Substring
166 caseSensitive bool
167 fileName bool
168
169 // mutable
170 current []*candidateMatch
171 contEvaluated bool
172}
173
174type branchQueryMatchTree struct {
175 fileMasks []uint64
176 masks []uint64
177 repos []uint16
178
179 // mutable
180 firstDone bool
181 docID uint32
182}
183
184type symbolRegexpMatchTree struct {
185 matchTree
186 regexp *regexp.Regexp
187 all bool // skips regex match if .*
188
189 reEvaluated bool
190 found []*candidateMatch
191}
192
193func (t *symbolRegexpMatchTree) prepare(doc uint32) {
194 t.reEvaluated = false
195}
196
197func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
198 if t.reEvaluated {
199 return len(t.found) > 0, true
200 }
201
202 if cost < costRegexp {
203 return false, false
204 }
205
206 sections := cp.docSections()
207 content := cp.data(false)
208
209 found := t.found[:0]
210 for i, sec := range sections {
211 var idx []int
212 if t.all {
213 idx = []int{0, int(sec.End - sec.Start)}
214 } else {
215 idx = t.regexp.FindIndex(content[sec.Start:sec.End])
216 if idx == nil {
217 continue
218 }
219 }
220
221 cm := &candidateMatch{
222 byteOffset: sec.Start + uint32(idx[0]),
223 byteMatchSz: uint32(idx[1] - idx[0]),
224 symbol: true,
225 symbolIdx: uint32(i),
226 }
227 found = append(found, cm)
228 }
229 t.found = found
230 t.reEvaluated = true
231
232 return len(t.found) > 0, true
233}
234
235type symbolSubstrMatchTree struct {
236 *substrMatchTree
237
238 patternSize uint32
239 fileEndRunes []uint32
240 fileEndSymbol []uint32
241
242 doc uint32
243 sections []DocumentSection
244
245 secID uint32
246}
247
248func (t *symbolSubstrMatchTree) prepare(doc uint32) {
249 t.substrMatchTree.prepare(doc)
250 t.doc = doc
251
252 var fileStart uint32
253 if doc > 0 {
254 fileStart = t.fileEndRunes[doc-1]
255 }
256
257 var sections []DocumentSection
258 if len(t.sections) > 0 {
259 most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
260 if most == uint32(len(t.sections)) {
261 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
262 } else {
263 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
264 t.secID++
265 }
266
267 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
268 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
269 symbolEnd++
270 }
271
272 sections = t.sections[t.secID:symbolEnd]
273 }
274 }
275
276 secIdx := 0
277 trimmed := t.current[:0]
278 for len(sections) > secIdx && len(t.current) > 0 {
279 start := fileStart + t.current[0].runeOffset
280 end := start + t.patternSize
281 if start >= sections[secIdx].End {
282 secIdx++
283 continue
284 }
285
286 if start < sections[secIdx].Start {
287 t.current = t.current[1:]
288 continue
289 }
290
291 if end <= sections[secIdx].End {
292 t.current[0].symbol = true
293 t.current[0].symbolIdx = uint32(secIdx)
294 trimmed = append(trimmed, t.current[0])
295 }
296
297 t.current = t.current[1:]
298 }
299 t.current = trimmed
300}
301
302// all prepare methods
303
304func (t *bruteForceMatchTree) prepare(doc uint32) {
305 t.docID = doc
306 t.firstDone = true
307}
308
309func (t *docMatchTree) prepare(doc uint32) {
310 t.docID = doc
311 t.firstDone = true
312}
313
314func (t *andMatchTree) prepare(doc uint32) {
315 for _, c := range t.children {
316 c.prepare(doc)
317 }
318}
319
320func (t *regexpMatchTree) prepare(doc uint32) {
321 t.found = t.found[:0]
322 t.reEvaluated = false
323 t.bruteForceMatchTree.prepare(doc)
324}
325
326func (t *wordMatchTree) prepare(doc uint32) {
327 t.found = t.found[:0]
328 t.evaluated = false
329 t.bruteForceMatchTree.prepare(doc)
330}
331
332func (t *orMatchTree) prepare(doc uint32) {
333 for _, c := range t.children {
334 c.prepare(doc)
335 }
336}
337
338func (t *notMatchTree) prepare(doc uint32) {
339 t.child.prepare(doc)
340}
341
342func (t *fileNameMatchTree) prepare(doc uint32) {
343 t.child.prepare(doc)
344}
345
346func (t *substrMatchTree) prepare(nextDoc uint32) {
347 t.matchIterator.prepare(nextDoc)
348 t.current = t.matchIterator.candidates()
349 t.contEvaluated = false
350}
351
352func (t *branchQueryMatchTree) prepare(doc uint32) {
353 t.firstDone = true
354 t.docID = doc
355}
356
357// nextDoc
358
359func (t *docMatchTree) nextDoc() uint32 {
360 var start uint32
361 if t.firstDone {
362 start = t.docID + 1
363 }
364 for i := start; i < t.numDocs; i++ {
365 if t.predicate(i) {
366 return i
367 }
368 }
369 return maxUInt32
370}
371
372func (t *bruteForceMatchTree) nextDoc() uint32 {
373 if !t.firstDone {
374 return 0
375 }
376 return t.docID + 1
377}
378
379func (t *andMatchTree) nextDoc() uint32 {
380 var max uint32
381 for _, c := range t.children {
382 m := c.nextDoc()
383 if m > max {
384 max = m
385 }
386 }
387 return max
388}
389
390func (t *orMatchTree) nextDoc() uint32 {
391 min := uint32(maxUInt32)
392 for _, c := range t.children {
393 m := c.nextDoc()
394 if m < min {
395 min = m
396 }
397 }
398 return min
399}
400
401func (t *notMatchTree) nextDoc() uint32 {
402 return 0
403}
404
405func (t *fileNameMatchTree) nextDoc() uint32 {
406 return t.child.nextDoc()
407}
408
409func (t *branchQueryMatchTree) nextDoc() uint32 {
410 var start uint32
411 if t.firstDone {
412 start = t.docID + 1
413 }
414
415 for i := start; i < uint32(len(t.fileMasks)); i++ {
416 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
417 return i
418 }
419 }
420 return maxUInt32
421}
422
423// all String methods
424
425func (t *bruteForceMatchTree) String() string {
426 return "all"
427}
428
429func (t *docMatchTree) String() string {
430 return fmt.Sprintf("doc(%s)", t.reason)
431}
432
433func (t *andMatchTree) String() string {
434 return fmt.Sprintf("and%v", t.children)
435}
436
437func (t *regexpMatchTree) String() string {
438 f := ""
439 if t.fileName {
440 f = "f"
441 }
442 return fmt.Sprintf("%sre(%s)", f, t.regexp)
443}
444
445func (t *wordMatchTree) String() string {
446 f := ""
447 if t.fileName {
448 f = "f"
449 }
450 return fmt.Sprintf("%sword(%s)", f, t.word)
451}
452
453func (t *orMatchTree) String() string {
454 return fmt.Sprintf("or%v", t.children)
455}
456
457func (t *notMatchTree) String() string {
458 return fmt.Sprintf("not(%v)", t.child)
459}
460
461func (t *noVisitMatchTree) String() string {
462 return fmt.Sprintf("novisit(%v)", t.matchTree)
463}
464
465func (t *fileNameMatchTree) String() string {
466 return fmt.Sprintf("f(%v)", t.child)
467}
468
469func (t *substrMatchTree) String() string {
470 f := ""
471 if t.fileName {
472 f = "f"
473 }
474
475 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
476}
477
478func (t *branchQueryMatchTree) String() string {
479 return fmt.Sprintf("branch(%x)", t.masks)
480}
481
482func (t *symbolSubstrMatchTree) String() string {
483 return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
484}
485
486func (t *symbolRegexpMatchTree) String() string {
487 return fmt.Sprintf("symbol(%v)", t.matchTree)
488}
489
490// visitMatches visits all atoms in matchTree. Note: This visits
491// noVisitMatchTree. For collecting matches use visitMatches.
492func visitMatchTree(t matchTree, f func(matchTree)) {
493 switch s := t.(type) {
494 case *andMatchTree:
495 for _, ch := range s.children {
496 visitMatchTree(ch, f)
497 }
498 case *orMatchTree:
499 for _, ch := range s.children {
500 visitMatchTree(ch, f)
501 }
502 case *andLineMatchTree:
503 visitMatchTree(&s.andMatchTree, f)
504 case *noVisitMatchTree:
505 visitMatchTree(s.matchTree, f)
506 case *notMatchTree:
507 visitMatchTree(s.child, f)
508 case *fileNameMatchTree:
509 visitMatchTree(s.child, f)
510 case *symbolSubstrMatchTree:
511 visitMatchTree(s.substrMatchTree, f)
512 case *symbolRegexpMatchTree:
513 visitMatchTree(s.matchTree, f)
514 default:
515 f(t)
516 }
517}
518
519// visitMatches visits all atoms which can contribute matches. Note: This
520// skips noVisitMatchTree.
521func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) {
522 switch s := t.(type) {
523 case *andMatchTree:
524 for _, ch := range s.children {
525 if known[ch] {
526 visitMatches(ch, known, f)
527 }
528 }
529 case *andLineMatchTree:
530 visitMatches(&s.andMatchTree, known, f)
531 case *orMatchTree:
532 for _, ch := range s.children {
533 if known[ch] {
534 visitMatches(ch, known, f)
535 }
536 }
537 case *symbolSubstrMatchTree:
538 visitMatches(s.substrMatchTree, known, f)
539 case *notMatchTree:
540 case *noVisitMatchTree:
541 // don't collect into negative trees.
542 case *fileNameMatchTree:
543 // We will just gather the filename if we do not visit this tree.
544 default:
545 f(s)
546 }
547}
548
549// all matches() methods.
550
551func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
552 return t.predicate(cp.idx), true
553}
554
555func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
556 return true, true
557}
558
559// andLineMatchTree is a performance optimization of andMatchTree. For content
560// searches we don't want to run the regex engine if there is no line that
561// contains matches from all terms.
562func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
563 matches, sure := t.andMatchTree.matches(cp, cost, known)
564 if !(sure && matches) {
565 return matches, sure
566 }
567
568 // find child with fewest candidates
569 min := maxUInt32
570 fewestChildren := 0
571 for ix, child := range t.children {
572 v, ok := child.(*substrMatchTree)
573 // make sure we are running a content search and that all candidates are a
574 // substrMatchTree
575 if !ok || v.fileName {
576 return matches, sure
577 }
578 if len(v.current) < min {
579 min = len(v.current)
580 fewestChildren = ix
581 }
582 }
583
584 type lineRange struct {
585 start int
586 end int
587 }
588 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
589 prev := -1
590 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
591 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
592 if line == prev {
593 continue
594 }
595 prev = line
596 lines = append(lines, lineRange{byteStart, byteEnd})
597 }
598
599 // children keeps track of the children's candidates we have already seen.
600 children := make([][]*candidateMatch, 0, len(t.children)-1)
601 for j, child := range t.children {
602 if j == fewestChildren {
603 continue
604 }
605 children = append(children, child.(*substrMatchTree).current)
606 }
607
608nextLine:
609 for i := 0; i < len(lines); i++ {
610 hits := 1
611 nextChild:
612 for j := range children {
613 nextCandidate:
614 for len(children[j]) > 0 {
615 candidate := children[j][0]
616 bo := int(cp.findOffset(false, candidate.runeOffset))
617 if bo < lines[i].start {
618 children[j] = children[j][1:]
619 continue nextCandidate
620 }
621 if bo <= lines[i].end {
622 hits++
623 continue nextChild
624 }
625 // move the `lines` iterator forward until bo <= line.end
626 for i < len(lines) && bo > lines[i].end {
627 i++
628 }
629 i--
630 continue nextLine
631 }
632 }
633 // return early once we found any line that contains matches from all children
634 if hits == len(t.children) {
635 return matches, true
636 }
637 }
638 return false, true
639}
640
641func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
642 sure := true
643
644 for _, ch := range t.children {
645 v, ok := evalMatchTree(cp, cost, known, ch)
646 if ok && !v {
647 return false, true
648 }
649 if !ok {
650 sure = false
651 }
652 }
653
654 return true, sure
655}
656
657func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
658 matches := false
659 sure := true
660 for _, ch := range t.children {
661 v, ok := evalMatchTree(cp, cost, known, ch)
662 if ok {
663 // we could short-circuit, but we want to use
664 // the other possibilities as a ranking
665 // signal.
666 matches = matches || v
667 } else {
668 sure = false
669 }
670 }
671 return matches, sure
672}
673
674func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
675 return t.fileMasks[t.docID]&t.masks[t.repos[t.docID]] != 0, true
676}
677
678func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
679 if t.reEvaluated {
680 return len(t.found) > 0, true
681 }
682
683 if cost < costRegexp {
684 return false, false
685 }
686
687 cp.stats.RegexpsConsidered++
688 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
689 found := t.found[:0]
690 for _, idx := range idxs {
691 cm := &candidateMatch{
692 byteOffset: uint32(idx[0]),
693 byteMatchSz: uint32(idx[1] - idx[0]),
694 fileName: t.fileName,
695 }
696
697 found = append(found, cm)
698 }
699 t.found = found
700 t.reEvaluated = true
701
702 return len(t.found) > 0, true
703}
704
705func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
706 if t.evaluated {
707 return len(t.found) > 0, true
708 }
709
710 if cost < costRegexp {
711 return false, false
712 }
713
714 data := cp.data(t.fileName)
715 offset := 0
716 found := t.found[:0]
717 for {
718 idx := bytes.Index(data[offset:], []byte(t.word))
719 if idx < 0 {
720 break
721 }
722
723 relStartOffset := offset + idx
724 relEndOffset := relStartOffset + len(t.word)
725
726 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
727 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
728 if startBoundary && endBoundary {
729 found = append(found, &candidateMatch{
730 byteOffset: uint32(offset + idx),
731 byteMatchSz: uint32(len(t.word)),
732 fileName: t.fileName,
733 })
734 }
735 offset += idx + len(t.word)
736 }
737
738 t.found = found
739 t.evaluated = true
740
741 return len(t.found) > 0, true
742}
743
744// breakMatchesOnNewlines returns matches resulting from breaking each element
745// of cms on newlines within text.
746func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
747 var lineCMs []*candidateMatch
748 for _, cm := range cms {
749 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
750 }
751 return lineCMs
752}
753
754// breakOnNewlines returns matches resulting from breaking cm on newlines
755// within text.
756func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
757 var cms []*candidateMatch
758 addMe := &candidateMatch{}
759 *addMe = *cm
760 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
761 if text[i] == '\n' {
762 addMe.byteMatchSz = i - addMe.byteOffset
763 if addMe.byteMatchSz != 0 {
764 cms = append(cms, addMe)
765 }
766
767 addMe = &candidateMatch{}
768 *addMe = *cm
769 addMe.byteOffset = i + 1
770 }
771 }
772 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
773 if addMe.byteMatchSz != 0 {
774 cms = append(cms, addMe)
775 }
776 return cms
777}
778
779func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) {
780 if v, ok := known[mt]; ok {
781 return v, true
782 }
783
784 v, ok := mt.matches(cp, cost, known)
785 if ok {
786 known[mt] = v
787 }
788
789 return v, ok
790}
791
792func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
793 v, ok := evalMatchTree(cp, cost, known, t.child)
794 return !v, ok
795}
796
797func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
798 return evalMatchTree(cp, cost, known, t.child)
799}
800
801func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
802 if t.contEvaluated {
803 return len(t.current) > 0, true
804 }
805
806 if len(t.current) == 0 {
807 return false, true
808 }
809
810 if t.fileName && cost < costMemory {
811 return false, false
812 }
813
814 if !t.fileName && cost < costContent {
815 return false, false
816 }
817
818 pruned := t.current[:0]
819 for _, m := range t.current {
820 if m.byteOffset == 0 && m.runeOffset > 0 {
821 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
822 }
823 if m.matchContent(cp.data(m.fileName)) {
824 pruned = append(pruned, m)
825 }
826 }
827 t.current = pruned
828 t.contEvaluated = true
829
830 return len(t.current) > 0, true
831}
832
833func (d *indexData) newMatchTree(q query.Q) (matchTree, error) {
834 if q == nil {
835 return nil, fmt.Errorf("got nil (sub)query")
836 }
837 switch s := q.(type) {
838 case *query.Regexp:
839 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
840 // superset of the regexp. If the returned matchTree is equivalent to the
841 // original regexp, it returns true. An equivalent matchTree has the same
842 // behaviour as the original regexp and can be used instead.
843 //
844 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
845 if err != nil {
846 return nil, err
847 }
848 // if the query can be used in place of the regexp
849 // return the subtree
850 if isEq {
851 return subMT, nil
852 }
853
854 var tr matchTree
855 if wmt, ok := regexpToWordMatchTree(s); ok {
856 // A common search we get is "\bLITERAL\b". Avoid the regex engine and
857 // provide something faster.
858 tr = wmt
859 } else {
860 prefix := ""
861 if !s.CaseSensitive {
862 prefix = "(?i)"
863 }
864
865 tr = ®expMatchTree{
866 regexp: regexp.MustCompile(prefix + s.Regexp.String()),
867 fileName: s.FileName,
868 }
869 }
870
871 return &andMatchTree{
872 children: []matchTree{
873 tr, &noVisitMatchTree{subMT},
874 },
875 }, nil
876 case *query.And:
877 var r []matchTree
878 for _, ch := range s.Children {
879 ct, err := d.newMatchTree(ch)
880 if err != nil {
881 return nil, err
882 }
883 r = append(r, ct)
884 }
885 return &andMatchTree{r}, nil
886 case *query.Or:
887 var r []matchTree
888 for _, ch := range s.Children {
889 ct, err := d.newMatchTree(ch)
890 if err != nil {
891 return nil, err
892 }
893 r = append(r, ct)
894 }
895 return &orMatchTree{r}, nil
896 case *query.Not:
897 ct, err := d.newMatchTree(s.Child)
898 return ¬MatchTree{
899 child: ct,
900 }, err
901
902 case *query.Type:
903 if s.Type != query.TypeFileName {
904 break
905 }
906
907 ct, err := d.newMatchTree(s.Child)
908 if err != nil {
909 return nil, err
910 }
911
912 return &fileNameMatchTree{
913 child: ct,
914 }, nil
915
916 case *query.Substring:
917 return d.newSubstringMatchTree(s)
918
919 case *query.Branch:
920 masks := make([]uint64, 0, len(d.repoMetaData))
921 if s.Pattern == "HEAD" {
922 for i := 0; i < len(d.repoMetaData); i++ {
923 masks = append(masks, 1)
924 }
925 } else {
926 for _, branchIDs := range d.branchIDs {
927 mask := uint64(0)
928 for nm, m := range branchIDs {
929 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
930 mask |= uint64(m)
931 }
932 }
933 masks = append(masks, mask)
934 }
935
936 }
937 return &branchQueryMatchTree{
938 masks: masks,
939 fileMasks: d.fileBranchMasks,
940 repos: d.repos,
941 }, nil
942 case *query.Const:
943 if s.Value {
944 return &bruteForceMatchTree{}, nil
945 } else {
946 return &noMatchTree{"const"}, nil
947 }
948 case *query.Language:
949 code, ok := d.metaData.LanguageMap[s.Language]
950 if !ok {
951 return &noMatchTree{"lang"}, nil
952 }
953 return &docMatchTree{
954 reason: "language",
955 numDocs: d.numDocs(),
956 predicate: func(docID uint32) bool {
957 return d.getLanguage(docID) == code
958 },
959 }, nil
960
961 case *query.Symbol:
962 subMT, err := d.newMatchTree(s.Expr)
963 if err != nil {
964 return nil, err
965 }
966
967 if substr, ok := subMT.(*substrMatchTree); ok {
968 // We have a feature flag for lazy decoding. If runeDocSectionsRaw is
969 // non-nil it means we need to lazily decode on request.
970 sections := d.runeDocSections
971 if sections == nil && d.runeDocSectionsRaw != nil {
972 sections = unmarshalDocSections(d.runeDocSectionsRaw, nil)
973 }
974
975 return &symbolSubstrMatchTree{
976 substrMatchTree: substr,
977 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)),
978 fileEndRunes: d.fileEndRunes,
979 fileEndSymbol: d.fileEndSymbol,
980 sections: sections,
981 }, nil
982 }
983
984 var regexp *regexp.Regexp
985 visitMatchTree(subMT, func(mt matchTree) {
986 if t, ok := mt.(*regexpMatchTree); ok {
987 regexp = t.regexp
988 }
989 })
990 if regexp == nil {
991 return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
992 }
993
994 return &symbolRegexpMatchTree{
995 regexp: regexp,
996 all: regexp.String() == "(?i)(?-s:.)*",
997 matchTree: subMT,
998 }, nil
999
1000 case *query.FileNameSet:
1001 return &docMatchTree{
1002 reason: "FileNameSet",
1003 numDocs: d.numDocs(),
1004 predicate: func(docID uint32) bool {
1005 fileName := d.fileName(docID)
1006 _, ok := s.Set[string(fileName)]
1007 return ok
1008 },
1009 }, nil
1010
1011 case *query.BranchesRepos:
1012 reposBranchesWant := make([]uint64, len(d.repoMetaData))
1013 for repoIdx := range d.repoMetaData {
1014 var mask uint64
1015 for _, br := range s.List {
1016 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1017 mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1018 }
1019 }
1020 reposBranchesWant[repoIdx] = mask
1021 }
1022 return &docMatchTree{
1023 reason: "BranchesRepos",
1024 numDocs: d.numDocs(),
1025 predicate: func(docID uint32) bool {
1026 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1027 },
1028 }, nil
1029
1030 case *query.RepoSet:
1031 reposWant := make([]bool, len(d.repoMetaData))
1032 for repoIdx, r := range d.repoMetaData {
1033 if _, ok := s.Set[r.Name]; ok {
1034 reposWant[repoIdx] = true
1035 }
1036 }
1037 return &docMatchTree{
1038 reason: "RepoSet",
1039 numDocs: d.numDocs(),
1040 predicate: func(docID uint32) bool {
1041 return reposWant[d.repos[docID]]
1042 },
1043 }, nil
1044
1045 case *query.RepoIDs:
1046 reposWant := make([]bool, len(d.repoMetaData))
1047 for repoIdx, r := range d.repoMetaData {
1048 if s.Repos.Contains(r.ID) {
1049 reposWant[repoIdx] = true
1050 }
1051 }
1052 return &docMatchTree{
1053 reason: "RepoIDs",
1054 numDocs: d.numDocs(),
1055 predicate: func(docID uint32) bool {
1056 return reposWant[d.repos[docID]]
1057 },
1058 }, nil
1059
1060 case *query.Repo:
1061 reposWant := make([]bool, len(d.repoMetaData))
1062 for repoIdx, r := range d.repoMetaData {
1063 if s.Regexp.MatchString(r.Name) {
1064 reposWant[repoIdx] = true
1065 }
1066 }
1067 return &docMatchTree{
1068 reason: "Repo",
1069 numDocs: d.numDocs(),
1070 predicate: func(docID uint32) bool {
1071 return reposWant[d.repos[docID]]
1072 },
1073 }, nil
1074
1075 case *query.RepoRegexp:
1076 reposWant := make([]bool, len(d.repoMetaData))
1077 for repoIdx, r := range d.repoMetaData {
1078 if s.Regexp.MatchString(r.Name) {
1079 reposWant[repoIdx] = true
1080 }
1081 }
1082 return &docMatchTree{
1083 reason: "RepoRegexp",
1084 numDocs: d.numDocs(),
1085 predicate: func(docID uint32) bool {
1086 return reposWant[d.repos[docID]]
1087 },
1088 }, nil
1089
1090 case query.RawConfig:
1091 return &docMatchTree{
1092 reason: s.String(),
1093 numDocs: d.numDocs(),
1094 predicate: func(docID uint32) bool {
1095 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1096 },
1097 }, nil
1098 }
1099 log.Panicf("type %T", q)
1100 return nil, nil
1101}
1102
1103func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1104 st := &substrMatchTree{
1105 query: s,
1106 caseSensitive: s.CaseSensitive,
1107 fileName: s.FileName,
1108 }
1109
1110 if utf8.RuneCountInString(s.Pattern) < ngramSize {
1111 prefix := ""
1112 if !s.CaseSensitive {
1113 prefix = "(?i)"
1114 }
1115 t := ®expMatchTree{
1116 regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)),
1117 fileName: s.FileName,
1118 }
1119 return t, nil
1120 }
1121
1122 result, err := d.iterateNgrams(s)
1123 if err != nil {
1124 return nil, err
1125 }
1126 st.matchIterator = result
1127 return st, nil
1128}
1129
1130func regexpToWordMatchTree(q *query.Regexp) (_ *wordMatchTree, ok bool) {
1131 // Needs to be case sensitive
1132 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1133 return nil, false
1134 }
1135 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1136 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1137 return nil, false
1138 }
1139 sub := q.Regexp.Sub
1140 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1141 return nil, false
1142 }
1143
1144 return &wordMatchTree{
1145 word: string(sub[1].Rune),
1146 fileName: q.FileName,
1147 }, true
1148}
1149
1150// pruneMatchTree removes impossible branches from the matchTree, as indicated
1151// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1152func pruneMatchTree(mt matchTree) (matchTree, error) {
1153 var err error
1154 switch mt := mt.(type) {
1155 // leaf nodes that we test with the filter:
1156 case *substrMatchTree:
1157 if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1158 if _, ok := res.matchIterator.(*noMatchTree); ok {
1159 return nil, nil
1160 }
1161 }
1162 // recursive tree structures:
1163 case *andMatchTree:
1164 // Any branch of an and becoming impossible means the entire clause
1165 // is impossible. Otherwise, just handle rewrites.
1166 for i, child := range mt.children {
1167 newChild, err := pruneMatchTree(child)
1168 if err != nil {
1169 return nil, err
1170 }
1171 if newChild == nil {
1172 return nil, nil
1173 }
1174 mt.children[i] = newChild
1175 }
1176 case *orMatchTree:
1177 // *All* branches of an OR becoming impossible means the entire clause
1178 // is impossible. Otherwise, drop impossible subclauses and handle
1179 // rewrites, including simplifying to a singular resulting child branch.
1180 n := 0
1181 for _, child := range mt.children {
1182 newChild, err := pruneMatchTree(child)
1183 if err != nil {
1184 return nil, err
1185 }
1186 if newChild != nil {
1187 mt.children[n] = newChild
1188 n++
1189 }
1190 }
1191 mt.children = mt.children[:n]
1192 if len(mt.children) == 1 {
1193 return mt.children[0], nil
1194 } else if len(mt.children) == 0 {
1195 return nil, nil
1196 }
1197 case *noVisitMatchTree:
1198 mt.matchTree, err = pruneMatchTree(mt.matchTree)
1199 if err != nil {
1200 return nil, err
1201 }
1202 if mt.matchTree == nil {
1203 return nil, nil
1204 }
1205 case *fileNameMatchTree:
1206 mt.child, err = pruneMatchTree(mt.child)
1207 case *andLineMatchTree:
1208 child, err := pruneMatchTree(&mt.andMatchTree)
1209 if err != nil {
1210 return nil, err
1211 }
1212 if child == nil {
1213 return nil, nil
1214 }
1215 if c, ok := child.(*andMatchTree); ok {
1216 mt.andMatchTree = *c
1217 } else {
1218 // the and was simplified to a single clause,
1219 // so the linematch portion is irrelevant.
1220 return mt, nil
1221 }
1222 case *notMatchTree:
1223 mt.child, err = pruneMatchTree(mt.child)
1224 if err != nil {
1225 return nil, err
1226 }
1227 if mt.child == nil {
1228 // not false => true
1229 return &bruteForceMatchTree{}, nil
1230 }
1231 // unhandled:
1232 case *docMatchTree:
1233 case *bruteForceMatchTree:
1234 case *regexpMatchTree:
1235 case *wordMatchTree:
1236 }
1237 return mt, err
1238}