fork of https://github.com/sourcegraph/zoekt
1// Copyright 2018 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "regexp/syntax"
22 "strings"
23 "unicode/utf8"
24
25 "github.com/grafana/regexp"
26
27 "github.com/sourcegraph/zoekt/query"
28)
29
30// A docIterator iterates over documents in order.
31type docIterator interface {
32 // provide the next document where we may find something interesting.
33 //
34 // This is like a "peek" and shouldn't mutate state. prepare is what should
35 // change state.
36 nextDoc() uint32
37
38 // clears any per-document state of the docIterator, and
39 // prepares for evaluating the given doc. The argument is
40 // strictly increasing over time.
41 prepare(nextDoc uint32)
42}
43
44// costs are passed in increasing order to matchTree.matches until they do not
45// return matchesRequiresHigherCost.
46const (
47 costConst = 0
48 costMemory = 1
49 costContent = 2
50 costRegexp = 3
51)
52
53const (
54 costMin = costConst
55 costMax = costRegexp
56)
57
58// matchesState is an enum for the state of a matchTree after a call to
59// matchTree.matches.
60type matchesState uint8
61
62const (
63 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done
64 // a search yet since the cost value is not high enough.
65 matchesRequiresHigherCost matchesState = iota
66
67 // matchesFound is returned when matchTree.matches has done a search and
68 // found one or more matches.
69 matchesFound
70
71 // matchesNone is returned when matchTree.matches has done a search and
72 // found nothing.
73 matchesNone
74)
75
76// matchesStatePred is a helper which returns matchesFound if b is true
77// otherwise returns matchesNone.
78func matchesStatePred(b bool) matchesState {
79 if b {
80 return matchesFound
81 }
82 return matchesNone
83}
84
85// matchesStateForSlice is a helper which returns matchesFound if v is
86// non-empty otherwise returns matchesNone.
87func matchesStateForSlice[T any](v []T) matchesState {
88 return matchesStatePred(len(v) > 0)
89}
90
91// An expression tree coupled with matches. The matchtree has two
92// functions:
93//
94// * it implements boolean combinations (and, or, not)
95//
96// * it implements shortcuts, where we skip documents (for example: if
97// there are no trigram matches, we can be sure there are no substring
98// matches). The matchtree iterates over the documents as they are
99// ordered in the shard.
100//
101// The general process for a given (shard, query) is:
102//
103// - construct matchTree for the query
104//
105// - find all different leaf matchTrees (substring, regexp, etc.)
106//
107// in a loop:
108//
109// - find next doc to process using nextDoc
110//
111// - evaluate atoms (leaf expressions that match text)
112//
113// - evaluate the tree using matches(), storing the result in map.
114//
115// - if the complete tree returns (matches() != matchesRequiresHigherCost)
116// for the document, collect all text matches by looking at leaf
117// matchTrees.
118type matchTree interface {
119 docIterator
120
121 // matches if cost is high enough. See documentation for matchesState's
122 // values.
123 //
124 // Note: Do not call this directly, rather use evalMatchTree which uses
125 // known to cache responses once the state transitions away from
126 // matchesRequiresHigherCost.
127 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
128}
129
130// docMatchTree iterates over documents for which predicate(docID) returns true.
131type docMatchTree struct {
132 // the number of documents in a shard.
133 numDocs uint32
134
135 predicate func(docID uint32) bool
136
137 // provides additional information about the reason why the docMatchTree was
138 // created.
139 reason string
140
141 // mutable
142 firstDone bool
143 docID uint32
144}
145
146type bruteForceMatchTree struct {
147 // mutable
148 firstDone bool
149 docID uint32
150}
151
152type andLineMatchTree struct {
153 andMatchTree
154}
155
156type andMatchTree struct {
157 children []matchTree
158}
159
160type orMatchTree struct {
161 children []matchTree
162}
163
164type notMatchTree struct {
165 child matchTree
166}
167
168// Returns only the filename of child matches.
169type fileNameMatchTree struct {
170 child matchTree
171}
172
173// Don't visit this subtree for collecting matches.
174type noVisitMatchTree struct {
175 matchTree
176}
177
178type regexpMatchTree struct {
179 regexp *regexp.Regexp
180
181 fileName bool
182
183 // mutable
184 reEvaluated bool
185 found []*candidateMatch
186
187 // nextDoc, prepare.
188 bruteForceMatchTree
189}
190
191// \bLITERAL\b
192type wordMatchTree struct {
193 word string
194
195 fileName bool
196
197 // mutable
198 evaluated bool
199 found []*candidateMatch
200
201 // nextDoc, prepare.
202 bruteForceMatchTree
203}
204
205type substrMatchTree struct {
206 matchIterator
207
208 query *query.Substring
209 caseSensitive bool
210 fileName bool
211
212 // mutable
213 current []*candidateMatch
214 contEvaluated bool
215}
216
217type branchQueryMatchTree struct {
218 fileMasks []uint64
219 masks []uint64
220 repos []uint16
221
222 // mutable
223 firstDone bool
224 docID uint32
225}
226
227func (t *branchQueryMatchTree) branchMask() uint64 {
228 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
229}
230
231type symbolRegexpMatchTree struct {
232 matchTree
233 regexp *regexp.Regexp
234 all bool // skips regex match if .*
235
236 reEvaluated bool
237 found []*candidateMatch
238}
239
240func (t *symbolRegexpMatchTree) prepare(doc uint32) {
241 t.reEvaluated = false
242 t.found = t.found[:0]
243 t.matchTree.prepare(doc)
244}
245
246func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
247 if t.reEvaluated {
248 return matchesStateForSlice(t.found)
249 }
250
251 if cost < costRegexp {
252 return matchesRequiresHigherCost
253 }
254
255 sections := cp.docSections()
256 content := cp.data(false)
257
258 found := t.found[:0]
259 for i, sec := range sections {
260 var idx []int
261 if t.all {
262 idx = []int{0, int(sec.End - sec.Start)}
263 } else {
264 idx = t.regexp.FindIndex(content[sec.Start:sec.End])
265 if idx == nil {
266 continue
267 }
268 }
269
270 cm := &candidateMatch{
271 byteOffset: sec.Start + uint32(idx[0]),
272 byteMatchSz: uint32(idx[1] - idx[0]),
273 symbol: true,
274 symbolIdx: uint32(i),
275 }
276 found = append(found, cm)
277 }
278 t.found = found
279 t.reEvaluated = true
280
281 return matchesStateForSlice(t.found)
282}
283
284type symbolSubstrMatchTree struct {
285 *substrMatchTree
286
287 patternSize uint32
288 fileEndRunes []uint32
289 fileEndSymbol []uint32
290
291 doc uint32
292 sections []DocumentSection
293
294 secID uint32
295}
296
297func (t *symbolSubstrMatchTree) prepare(doc uint32) {
298 t.substrMatchTree.prepare(doc)
299 t.doc = doc
300
301 var fileStart uint32
302 if doc > 0 {
303 fileStart = t.fileEndRunes[doc-1]
304 }
305
306 var sections []DocumentSection
307 if len(t.sections) > 0 {
308 most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
309 if most == uint32(len(t.sections)) {
310 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
311 } else {
312 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
313 t.secID++
314 }
315
316 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
317 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
318 symbolEnd++
319 }
320
321 sections = t.sections[t.secID:symbolEnd]
322 }
323 }
324
325 secIdx := 0
326 trimmed := t.current[:0]
327 for len(sections) > secIdx && len(t.current) > 0 {
328 start := fileStart + t.current[0].runeOffset
329 end := start + t.patternSize
330 if start >= sections[secIdx].End {
331 secIdx++
332 continue
333 }
334
335 if start < sections[secIdx].Start {
336 t.current = t.current[1:]
337 continue
338 }
339
340 if end <= sections[secIdx].End {
341 t.current[0].symbol = true
342 t.current[0].symbolIdx = uint32(secIdx)
343 trimmed = append(trimmed, t.current[0])
344 }
345
346 t.current = t.current[1:]
347 }
348 t.current = trimmed
349}
350
351// all prepare methods
352
353func (t *bruteForceMatchTree) prepare(doc uint32) {
354 t.docID = doc
355 t.firstDone = true
356}
357
358func (t *docMatchTree) prepare(doc uint32) {
359 t.docID = doc
360 t.firstDone = true
361}
362
363func (t *andMatchTree) prepare(doc uint32) {
364 for _, c := range t.children {
365 c.prepare(doc)
366 }
367}
368
369func (t *regexpMatchTree) prepare(doc uint32) {
370 t.found = t.found[:0]
371 t.reEvaluated = false
372 t.bruteForceMatchTree.prepare(doc)
373}
374
375func (t *wordMatchTree) prepare(doc uint32) {
376 t.found = t.found[:0]
377 t.evaluated = false
378 t.bruteForceMatchTree.prepare(doc)
379}
380
381func (t *orMatchTree) prepare(doc uint32) {
382 for _, c := range t.children {
383 c.prepare(doc)
384 }
385}
386
387func (t *notMatchTree) prepare(doc uint32) {
388 t.child.prepare(doc)
389}
390
391func (t *fileNameMatchTree) prepare(doc uint32) {
392 t.child.prepare(doc)
393}
394
395func (t *substrMatchTree) prepare(nextDoc uint32) {
396 t.matchIterator.prepare(nextDoc)
397 t.current = t.matchIterator.candidates()
398 t.contEvaluated = false
399}
400
401func (t *branchQueryMatchTree) prepare(doc uint32) {
402 t.firstDone = true
403 t.docID = doc
404}
405
406// nextDoc
407
408func (t *docMatchTree) nextDoc() uint32 {
409 var start uint32
410 if t.firstDone {
411 start = t.docID + 1
412 }
413 for i := start; i < t.numDocs; i++ {
414 if t.predicate(i) {
415 return i
416 }
417 }
418 return maxUInt32
419}
420
421func (t *bruteForceMatchTree) nextDoc() uint32 {
422 if !t.firstDone {
423 return 0
424 }
425 return t.docID + 1
426}
427
428func (t *andMatchTree) nextDoc() uint32 {
429 var max uint32
430 for _, c := range t.children {
431 m := c.nextDoc()
432 if m > max {
433 max = m
434 }
435 }
436 return max
437}
438
439func (t *orMatchTree) nextDoc() uint32 {
440 min := uint32(maxUInt32)
441 for _, c := range t.children {
442 m := c.nextDoc()
443 if m < min {
444 min = m
445 }
446 }
447 return min
448}
449
450func (t *notMatchTree) nextDoc() uint32 {
451 return 0
452}
453
454func (t *fileNameMatchTree) nextDoc() uint32 {
455 return t.child.nextDoc()
456}
457
458func (t *branchQueryMatchTree) nextDoc() uint32 {
459 var start uint32
460 if t.firstDone {
461 start = t.docID + 1
462 }
463
464 for i := start; i < uint32(len(t.fileMasks)); i++ {
465 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
466 return i
467 }
468 }
469 return maxUInt32
470}
471
472// all String methods
473
474func (t *bruteForceMatchTree) String() string {
475 return "all"
476}
477
478func (t *docMatchTree) String() string {
479 return fmt.Sprintf("doc(%s)", t.reason)
480}
481
482func (t *andMatchTree) String() string {
483 return fmt.Sprintf("and%v", t.children)
484}
485
486func (t *regexpMatchTree) String() string {
487 f := ""
488 if t.fileName {
489 f = "f"
490 }
491 return fmt.Sprintf("%sre(%s)", f, t.regexp)
492}
493
494func (t *wordMatchTree) String() string {
495 f := ""
496 if t.fileName {
497 f = "f"
498 }
499 return fmt.Sprintf("%sword(%s)", f, t.word)
500}
501
502func (t *orMatchTree) String() string {
503 return fmt.Sprintf("or%v", t.children)
504}
505
506func (t *notMatchTree) String() string {
507 return fmt.Sprintf("not(%v)", t.child)
508}
509
510func (t *noVisitMatchTree) String() string {
511 return fmt.Sprintf("novisit(%v)", t.matchTree)
512}
513
514func (t *fileNameMatchTree) String() string {
515 return fmt.Sprintf("f(%v)", t.child)
516}
517
518func (t *substrMatchTree) String() string {
519 f := ""
520 if t.fileName {
521 f = "f"
522 }
523
524 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
525}
526
527func (t *branchQueryMatchTree) String() string {
528 return fmt.Sprintf("branch(%x)", t.masks)
529}
530
531func (t *symbolSubstrMatchTree) String() string {
532 return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
533}
534
535func (t *symbolRegexpMatchTree) String() string {
536 return fmt.Sprintf("symbol(%v)", t.matchTree)
537}
538
539// visitMatches visits all atoms in matchTree. Note: This visits
540// noVisitMatchTree. For collecting matches use visitMatches.
541func visitMatchTree(t matchTree, f func(matchTree)) {
542 switch s := t.(type) {
543 case *andMatchTree:
544 for _, ch := range s.children {
545 visitMatchTree(ch, f)
546 }
547 case *orMatchTree:
548 for _, ch := range s.children {
549 visitMatchTree(ch, f)
550 }
551 case *andLineMatchTree:
552 visitMatchTree(&s.andMatchTree, f)
553 case *noVisitMatchTree:
554 visitMatchTree(s.matchTree, f)
555 case *notMatchTree:
556 visitMatchTree(s.child, f)
557 case *fileNameMatchTree:
558 visitMatchTree(s.child, f)
559 case *symbolSubstrMatchTree:
560 visitMatchTree(s.substrMatchTree, f)
561 case *symbolRegexpMatchTree:
562 visitMatchTree(s.matchTree, f)
563 default:
564 f(t)
565 }
566}
567
568// updateMatchTreeStats calls updateStats on all atoms in mt which have that
569// function defined.
570func updateMatchTreeStats(mt matchTree, stats *Stats) {
571 visitMatchTree(mt, func(mt matchTree) {
572 if atom, ok := mt.(interface{ updateStats(*Stats) }); ok {
573 atom.updateStats(stats)
574 }
575 })
576}
577
578// visitMatches visits all atoms which can contribute matches. Note: This
579// skips noVisitMatchTree.
580func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) {
581 switch s := t.(type) {
582 case *andMatchTree:
583 for _, ch := range s.children {
584 if known[ch] {
585 visitMatches(ch, known, f)
586 }
587 }
588 case *andLineMatchTree:
589 visitMatches(&s.andMatchTree, known, f)
590 case *orMatchTree:
591 for _, ch := range s.children {
592 if known[ch] {
593 visitMatches(ch, known, f)
594 }
595 }
596 case *symbolSubstrMatchTree:
597 visitMatches(s.substrMatchTree, known, f)
598 case *notMatchTree:
599 case *noVisitMatchTree:
600 // don't collect into negative trees.
601 case *fileNameMatchTree:
602 // We will just gather the filename if we do not visit this tree.
603 default:
604 f(s)
605 }
606}
607
608// all matches() methods.
609
610func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
611 return matchesStatePred(t.predicate(cp.idx))
612}
613
614func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
615 return matchesFound
616}
617
618// andLineMatchTree is a performance optimization of andMatchTree. For content
619// searches we don't want to run the regex engine if there is no line that
620// contains matches from all terms.
621func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
622 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
623 return state
624 }
625
626 // Invariant: all children have matches. If any line contains all of them we
627 // can return MatchesFound.
628
629 // find child with fewest candidates
630 min := maxUInt32
631 fewestChildren := 0
632 for ix, child := range t.children {
633 v, ok := child.(*substrMatchTree)
634 // make sure we are running a content search and that all candidates are a
635 // substrMatchTree
636 if !ok || v.fileName {
637 return matchesFound
638 }
639 if len(v.current) < min {
640 min = len(v.current)
641 fewestChildren = ix
642 }
643 }
644
645 type lineRange struct {
646 start int
647 end int
648 }
649 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
650 prev := -1
651 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
652 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
653 if line == prev {
654 continue
655 }
656 prev = line
657 lines = append(lines, lineRange{byteStart, byteEnd})
658 }
659
660 // children keeps track of the children's candidates we have already seen.
661 children := make([][]*candidateMatch, 0, len(t.children)-1)
662 for j, child := range t.children {
663 if j == fewestChildren {
664 continue
665 }
666 children = append(children, child.(*substrMatchTree).current)
667 }
668
669nextLine:
670 for i := 0; i < len(lines); i++ {
671 hits := 1
672 nextChild:
673 for j := range children {
674 nextCandidate:
675 for len(children[j]) > 0 {
676 candidate := children[j][0]
677 bo := int(cp.findOffset(false, candidate.runeOffset))
678 if bo < lines[i].start {
679 children[j] = children[j][1:]
680 continue nextCandidate
681 }
682 if bo <= lines[i].end {
683 hits++
684 continue nextChild
685 }
686 // move the `lines` iterator forward until bo <= line.end
687 for i < len(lines) && bo > lines[i].end {
688 i++
689 }
690 i--
691 continue nextLine
692 }
693 }
694 // return early once we found any line that contains matches from all children
695 if hits == len(t.children) {
696 return matchesFound
697 }
698 }
699 return matchesNone
700}
701
702func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
703 // We have found matches unless a child needs to do more work or it hasn't
704 // found matches.
705 state := matchesFound
706
707 for _, ch := range t.children {
708 switch evalMatchTree(cp, cost, known, ch) {
709 case matchesRequiresHigherCost:
710 // keep evaluating other children incase we come across matchesNone
711 state = matchesRequiresHigherCost
712 case matchesFound:
713 // will return this if every child has this value
714 case matchesNone:
715 return matchesNone
716 }
717 }
718
719 return state
720}
721
722func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
723 // we could short-circuit, but we want to use the other possibilities as a
724 // ranking signal. So we always return the most conservative state.
725 state := matchesNone
726 for _, ch := range t.children {
727 switch evalMatchTree(cp, cost, known, ch) {
728 case matchesRequiresHigherCost:
729 state = matchesRequiresHigherCost
730 case matchesFound:
731 if state != matchesRequiresHigherCost {
732 state = matchesFound
733 }
734 case matchesNone:
735 // noop
736 }
737 }
738 return state
739}
740
741func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
742 return matchesStatePred(t.branchMask() != 0)
743}
744
745func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
746 if t.reEvaluated {
747 return matchesStateForSlice(t.found)
748 }
749
750 if cost < costRegexp {
751 return matchesRequiresHigherCost
752 }
753
754 cp.stats.RegexpsConsidered++
755 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
756 found := t.found[:0]
757 for _, idx := range idxs {
758 cm := &candidateMatch{
759 byteOffset: uint32(idx[0]),
760 byteMatchSz: uint32(idx[1] - idx[0]),
761 fileName: t.fileName,
762 }
763
764 found = append(found, cm)
765 }
766 t.found = found
767 t.reEvaluated = true
768
769 return matchesStateForSlice(t.found)
770}
771
772func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
773 if t.evaluated {
774 return matchesStateForSlice(t.found)
775 }
776
777 if cost < costRegexp {
778 return matchesRequiresHigherCost
779 }
780
781 data := cp.data(t.fileName)
782 offset := 0
783 found := t.found[:0]
784 for {
785 idx := bytes.Index(data[offset:], []byte(t.word))
786 if idx < 0 {
787 break
788 }
789
790 relStartOffset := offset + idx
791 relEndOffset := relStartOffset + len(t.word)
792
793 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
794 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
795 if startBoundary && endBoundary {
796 found = append(found, &candidateMatch{
797 byteOffset: uint32(offset + idx),
798 byteMatchSz: uint32(len(t.word)),
799 fileName: t.fileName,
800 })
801 }
802 offset += idx + len(t.word)
803 }
804
805 t.found = found
806 t.evaluated = true
807
808 return matchesStateForSlice(t.found)
809}
810
811// breakMatchesOnNewlines returns matches resulting from breaking each element
812// of cms on newlines within text.
813func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
814 var lineCMs []*candidateMatch
815 for _, cm := range cms {
816 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
817 }
818 return lineCMs
819}
820
821// breakOnNewlines returns matches resulting from breaking cm on newlines
822// within text.
823func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
824 var cms []*candidateMatch
825 addMe := &candidateMatch{}
826 *addMe = *cm
827 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
828 if text[i] == '\n' {
829 addMe.byteMatchSz = i - addMe.byteOffset
830 if addMe.byteMatchSz != 0 {
831 cms = append(cms, addMe)
832 }
833
834 addMe = &candidateMatch{}
835 *addMe = *cm
836 addMe.byteOffset = i + 1
837 }
838 }
839 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
840 if addMe.byteMatchSz != 0 {
841 cms = append(cms, addMe)
842 }
843 return cms
844}
845
846// evalMatchTree should be called instead of directly calling
847// matchTree.matches. It cache known values for future evaluation at higher
848// costs.
849func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
850 if v, ok := known[mt]; ok {
851 return matchesStatePred(v)
852 }
853
854 ms := mt.matches(cp, cost, known)
855 if ms != matchesRequiresHigherCost {
856 known[mt] = ms == matchesFound
857 }
858
859 return ms
860}
861
862func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
863 switch evalMatchTree(cp, cost, known, t.child) {
864 case matchesRequiresHigherCost:
865 return matchesRequiresHigherCost
866 case matchesFound:
867 return matchesNone
868 case matchesNone:
869 return matchesFound
870 default:
871 panic("unreachable")
872 }
873}
874
875func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
876 return evalMatchTree(cp, cost, known, t.child)
877}
878
879func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
880 if t.contEvaluated {
881 return matchesStateForSlice(t.current)
882 }
883
884 if len(t.current) == 0 {
885 return matchesNone
886 }
887
888 if t.fileName && cost < costMemory {
889 return matchesRequiresHigherCost
890 }
891
892 if !t.fileName && cost < costContent {
893 return matchesRequiresHigherCost
894 }
895
896 pruned := t.current[:0]
897 for _, m := range t.current {
898 if m.byteOffset == 0 && m.runeOffset > 0 {
899 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
900 }
901 if m.matchContent(cp.data(m.fileName)) {
902 pruned = append(pruned, m)
903 }
904 }
905 t.current = pruned
906 t.contEvaluated = true
907
908 return matchesStateForSlice(t.current)
909}
910
911type matchTreeOpt struct {
912 // DisableWordMatchOptimization is used to disable the use of wordMatchTree.
913 // This was added since we do not support wordMatchTree with symbol search.
914 DisableWordMatchOptimization bool
915}
916
917func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
918 if q == nil {
919 return nil, fmt.Errorf("got nil (sub)query")
920 }
921 switch s := q.(type) {
922 case *query.Regexp:
923 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
924 // superset of the regexp. If the returned matchTree is equivalent to the
925 // original regexp, it returns true. An equivalent matchTree has the same
926 // behaviour as the original regexp and can be used instead.
927 //
928 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
929 if err != nil {
930 return nil, err
931 }
932 // if the query can be used in place of the regexp
933 // return the subtree
934 if isEq {
935 return subMT, nil
936 }
937
938 var tr matchTree
939 if wmt, ok := regexpToWordMatchTree(s, opt); ok {
940 // A common search we get is "\bLITERAL\b". Avoid the regex engine and
941 // provide something faster.
942 tr = wmt
943 } else {
944 prefix := ""
945 if !s.CaseSensitive {
946 prefix = "(?i)"
947 }
948
949 tr = ®expMatchTree{
950 regexp: regexp.MustCompile(prefix + s.Regexp.String()),
951 fileName: s.FileName,
952 }
953 }
954
955 return &andMatchTree{
956 children: []matchTree{
957 tr, &noVisitMatchTree{subMT},
958 },
959 }, nil
960 case *query.And:
961 var r []matchTree
962 for _, ch := range s.Children {
963 ct, err := d.newMatchTree(ch, opt)
964 if err != nil {
965 return nil, err
966 }
967 r = append(r, ct)
968 }
969 return &andMatchTree{r}, nil
970 case *query.Or:
971 var r []matchTree
972 for _, ch := range s.Children {
973 ct, err := d.newMatchTree(ch, opt)
974 if err != nil {
975 return nil, err
976 }
977 r = append(r, ct)
978 }
979 return &orMatchTree{r}, nil
980 case *query.Not:
981 ct, err := d.newMatchTree(s.Child, opt)
982 return ¬MatchTree{
983 child: ct,
984 }, err
985
986 case *query.Type:
987 if s.Type != query.TypeFileName {
988 break
989 }
990
991 ct, err := d.newMatchTree(s.Child, opt)
992 if err != nil {
993 return nil, err
994 }
995
996 return &fileNameMatchTree{
997 child: ct,
998 }, nil
999
1000 case *query.Substring:
1001 return d.newSubstringMatchTree(s)
1002
1003 case *query.Branch:
1004 masks := make([]uint64, 0, len(d.repoMetaData))
1005 if s.Pattern == "HEAD" {
1006 for i := 0; i < len(d.repoMetaData); i++ {
1007 masks = append(masks, 1)
1008 }
1009 } else {
1010 for _, branchIDs := range d.branchIDs {
1011 mask := uint64(0)
1012 for nm, m := range branchIDs {
1013 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1014 mask |= uint64(m)
1015 }
1016 }
1017 masks = append(masks, mask)
1018 }
1019
1020 }
1021 return &branchQueryMatchTree{
1022 masks: masks,
1023 fileMasks: d.fileBranchMasks,
1024 repos: d.repos,
1025 }, nil
1026 case *query.Const:
1027 if s.Value {
1028 return &bruteForceMatchTree{}, nil
1029 } else {
1030 return &noMatchTree{Why: "const"}, nil
1031 }
1032 case *query.Language:
1033 code, ok := d.metaData.LanguageMap[s.Language]
1034 if !ok {
1035 return &noMatchTree{Why: "lang"}, nil
1036 }
1037 return &docMatchTree{
1038 reason: "language",
1039 numDocs: d.numDocs(),
1040 predicate: func(docID uint32) bool {
1041 return d.getLanguage(docID) == code
1042 },
1043 }, nil
1044
1045 case *query.Symbol:
1046 // Disable WordMatchTree since we don't support it in symbols yet.
1047 optCopy := opt
1048 optCopy.DisableWordMatchOptimization = true
1049
1050 subMT, err := d.newMatchTree(s.Expr, optCopy)
1051 if err != nil {
1052 return nil, err
1053 }
1054
1055 if substr, ok := subMT.(*substrMatchTree); ok {
1056 return &symbolSubstrMatchTree{
1057 substrMatchTree: substr,
1058 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)),
1059 fileEndRunes: d.fileEndRunes,
1060 fileEndSymbol: d.fileEndSymbol,
1061 sections: d.runeDocSections,
1062 }, nil
1063 }
1064
1065 var regexp *regexp.Regexp
1066 visitMatchTree(subMT, func(mt matchTree) {
1067 if t, ok := mt.(*regexpMatchTree); ok {
1068 regexp = t.regexp
1069 }
1070 })
1071 if regexp == nil {
1072 return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1073 }
1074
1075 return &symbolRegexpMatchTree{
1076 regexp: regexp,
1077 all: regexp.String() == "(?i)(?-s:.)*",
1078 matchTree: subMT,
1079 }, nil
1080
1081 case *query.FileNameSet:
1082 return &docMatchTree{
1083 reason: "FileNameSet",
1084 numDocs: d.numDocs(),
1085 predicate: func(docID uint32) bool {
1086 fileName := d.fileName(docID)
1087 _, ok := s.Set[string(fileName)]
1088 return ok
1089 },
1090 }, nil
1091
1092 case *query.BranchesRepos:
1093 reposBranchesWant := make([]uint64, len(d.repoMetaData))
1094 for repoIdx := range d.repoMetaData {
1095 var mask uint64
1096 for _, br := range s.List {
1097 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1098 mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1099 }
1100 }
1101 reposBranchesWant[repoIdx] = mask
1102 }
1103 return &docMatchTree{
1104 reason: "BranchesRepos",
1105 numDocs: d.numDocs(),
1106 predicate: func(docID uint32) bool {
1107 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1108 },
1109 }, nil
1110
1111 case *query.RepoSet:
1112 reposWant := make([]bool, len(d.repoMetaData))
1113 for repoIdx, r := range d.repoMetaData {
1114 if _, ok := s.Set[r.Name]; ok {
1115 reposWant[repoIdx] = true
1116 }
1117 }
1118 return &docMatchTree{
1119 reason: "RepoSet",
1120 numDocs: d.numDocs(),
1121 predicate: func(docID uint32) bool {
1122 return reposWant[d.repos[docID]]
1123 },
1124 }, nil
1125
1126 case *query.RepoIDs:
1127 reposWant := make([]bool, len(d.repoMetaData))
1128 for repoIdx, r := range d.repoMetaData {
1129 if s.Repos.Contains(r.ID) {
1130 reposWant[repoIdx] = true
1131 }
1132 }
1133 return &docMatchTree{
1134 reason: "RepoIDs",
1135 numDocs: d.numDocs(),
1136 predicate: func(docID uint32) bool {
1137 return reposWant[d.repos[docID]]
1138 },
1139 }, nil
1140
1141 case *query.Repo:
1142 reposWant := make([]bool, len(d.repoMetaData))
1143 for repoIdx, r := range d.repoMetaData {
1144 if s.Regexp.MatchString(r.Name) {
1145 reposWant[repoIdx] = true
1146 }
1147 }
1148 return &docMatchTree{
1149 reason: "Repo",
1150 numDocs: d.numDocs(),
1151 predicate: func(docID uint32) bool {
1152 return reposWant[d.repos[docID]]
1153 },
1154 }, nil
1155
1156 case *query.RepoRegexp:
1157 reposWant := make([]bool, len(d.repoMetaData))
1158 for repoIdx, r := range d.repoMetaData {
1159 if s.Regexp.MatchString(r.Name) {
1160 reposWant[repoIdx] = true
1161 }
1162 }
1163 return &docMatchTree{
1164 reason: "RepoRegexp",
1165 numDocs: d.numDocs(),
1166 predicate: func(docID uint32) bool {
1167 return reposWant[d.repos[docID]]
1168 },
1169 }, nil
1170
1171 case query.RawConfig:
1172 return &docMatchTree{
1173 reason: s.String(),
1174 numDocs: d.numDocs(),
1175 predicate: func(docID uint32) bool {
1176 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1177 },
1178 }, nil
1179 }
1180 log.Panicf("type %T", q)
1181 return nil, nil
1182}
1183
1184func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1185 st := &substrMatchTree{
1186 query: s,
1187 caseSensitive: s.CaseSensitive,
1188 fileName: s.FileName,
1189 }
1190
1191 if utf8.RuneCountInString(s.Pattern) < ngramSize {
1192 prefix := ""
1193 if !s.CaseSensitive {
1194 prefix = "(?i)"
1195 }
1196 t := ®expMatchTree{
1197 regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)),
1198 fileName: s.FileName,
1199 }
1200 return t, nil
1201 }
1202
1203 result, err := d.iterateNgrams(s)
1204 if err != nil {
1205 return nil, err
1206 }
1207 st.matchIterator = result
1208 return st, nil
1209}
1210
1211func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1212 if opt.DisableWordMatchOptimization {
1213 return nil, false
1214 }
1215 // Needs to be case sensitive
1216 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1217 return nil, false
1218 }
1219 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1220 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1221 return nil, false
1222 }
1223 sub := q.Regexp.Sub
1224 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1225 return nil, false
1226 }
1227
1228 return &wordMatchTree{
1229 word: string(sub[1].Rune),
1230 fileName: q.FileName,
1231 }, true
1232}
1233
1234// pruneMatchTree removes impossible branches from the matchTree, as indicated
1235// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1236func pruneMatchTree(mt matchTree) (matchTree, error) {
1237 var err error
1238 switch mt := mt.(type) {
1239 // leaf nodes that we test with the filter:
1240 case *substrMatchTree:
1241 if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1242 if _, ok := res.matchIterator.(*noMatchTree); ok {
1243 return nil, nil
1244 }
1245 }
1246 // recursive tree structures:
1247 case *andMatchTree:
1248 // Any branch of an and becoming impossible means the entire clause
1249 // is impossible. Otherwise, just handle rewrites.
1250 for i, child := range mt.children {
1251 newChild, err := pruneMatchTree(child)
1252 if err != nil {
1253 return nil, err
1254 }
1255 if newChild == nil {
1256 return nil, nil
1257 }
1258 mt.children[i] = newChild
1259 }
1260 case *orMatchTree:
1261 // *All* branches of an OR becoming impossible means the entire clause
1262 // is impossible. Otherwise, drop impossible subclauses and handle
1263 // rewrites, including simplifying to a singular resulting child branch.
1264 n := 0
1265 for _, child := range mt.children {
1266 newChild, err := pruneMatchTree(child)
1267 if err != nil {
1268 return nil, err
1269 }
1270 if newChild != nil {
1271 mt.children[n] = newChild
1272 n++
1273 }
1274 }
1275 mt.children = mt.children[:n]
1276 if len(mt.children) == 1 {
1277 return mt.children[0], nil
1278 } else if len(mt.children) == 0 {
1279 return nil, nil
1280 }
1281 case *noVisitMatchTree:
1282 mt.matchTree, err = pruneMatchTree(mt.matchTree)
1283 if err != nil {
1284 return nil, err
1285 }
1286 if mt.matchTree == nil {
1287 return nil, nil
1288 }
1289 case *fileNameMatchTree:
1290 mt.child, err = pruneMatchTree(mt.child)
1291 case *andLineMatchTree:
1292 child, err := pruneMatchTree(&mt.andMatchTree)
1293 if err != nil {
1294 return nil, err
1295 }
1296 if child == nil {
1297 return nil, nil
1298 }
1299 if c, ok := child.(*andMatchTree); ok {
1300 mt.andMatchTree = *c
1301 } else {
1302 // the and was simplified to a single clause,
1303 // so the linematch portion is irrelevant.
1304 return mt, nil
1305 }
1306 case *notMatchTree:
1307 mt.child, err = pruneMatchTree(mt.child)
1308 if err != nil {
1309 return nil, err
1310 }
1311 if mt.child == nil {
1312 // not false => true
1313 return &bruteForceMatchTree{}, nil
1314 }
1315 // unhandled:
1316 case *docMatchTree:
1317 case *bruteForceMatchTree:
1318 case *regexpMatchTree:
1319 case *wordMatchTree:
1320 }
1321 return mt, err
1322}