fork of https://github.com/sourcegraph/zoekt
1// Copyright 2018 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "regexp/syntax"
22 "strings"
23 "unicode/utf8"
24
25 "github.com/grafana/regexp"
26
27 "github.com/sourcegraph/zoekt/query"
28)
29
30// A docIterator iterates over documents in order.
31type docIterator interface {
32 // provide the next document where we may find something interesting.
33 //
34 // This is like a "peek" and shouldn't mutate state. prepare is what should
35 // change state.
36 nextDoc() uint32
37
38 // clears any per-document state of the docIterator, and
39 // prepares for evaluating the given doc. The argument is
40 // strictly increasing over time.
41 prepare(nextDoc uint32)
42}
43
44// costs are passed in increasing order to matchTree.matches until they do not
45// return matchesRequiresHigherCost.
46const (
47 costConst = 0
48 costMemory = 1
49 costContent = 2
50 costRegexp = 3
51)
52
53const (
54 costMin = costConst
55 costMax = costRegexp
56)
57
58// matchesState is an enum for the state of a matchTree after a call to
59// matchTree.matches.
60type matchesState uint8
61
62const (
63 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done
64 // a search yet since the cost value is not high enough.
65 matchesRequiresHigherCost matchesState = iota
66
67 // matchesFound is returned when matchTree.matches has done a search and
68 // found one or more matches.
69 matchesFound
70
71 // matchesNone is returned when matchTree.matches has done a search and
72 // found nothing.
73 matchesNone
74)
75
76// matchesStatePred is a helper which returns matchesFound if b is true
77// otherwise returns matchesNone.
78func matchesStatePred(b bool) matchesState {
79 if b {
80 return matchesFound
81 }
82 return matchesNone
83}
84
85// matchesStateForSlice is a helper which returns matchesFound if v is
86// non-empty otherwise returns matchesNone.
87func matchesStateForSlice[T any](v []T) matchesState {
88 return matchesStatePred(len(v) > 0)
89}
90
91// An expression tree coupled with matches. The matchtree has two
92// functions:
93//
94// * it implements boolean combinations (and, or, not)
95//
96// * it implements shortcuts, where we skip documents (for example: if
97// there are no trigram matches, we can be sure there are no substring
98// matches). The matchtree iterates over the documents as they are
99// ordered in the shard.
100//
101// The general process for a given (shard, query) is:
102//
103// - construct matchTree for the query
104//
105// - find all different leaf matchTrees (substring, regexp, etc.)
106//
107// in a loop:
108//
109// - find next doc to process using nextDoc
110//
111// - evaluate atoms (leaf expressions that match text)
112//
113// - evaluate the tree using matches(), storing the result in map.
114//
115// - if the complete tree returns (matches() != matchesRequiresHigherCost)
116// for the document, collect all text matches by looking at leaf
117// matchTrees.
118type matchTree interface {
119 docIterator
120
121 // matches if cost is high enough. See documentation for matchesState's
122 // values.
123 //
124 // Note: Do not call this directly, rather use evalMatchTree which uses
125 // known to cache responses once the state transitions away from
126 // matchesRequiresHigherCost.
127 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
128}
129
130// docMatchTree iterates over documents for which predicate(docID) returns true.
131type docMatchTree struct {
132 // the number of documents in a shard.
133 numDocs uint32
134
135 predicate func(docID uint32) bool
136
137 // provides additional information about the reason why the docMatchTree was
138 // created.
139 reason string
140
141 // mutable
142 firstDone bool
143 docID uint32
144}
145
146type bruteForceMatchTree struct {
147 // mutable
148 firstDone bool
149 docID uint32
150}
151
152type andLineMatchTree struct {
153 andMatchTree
154}
155
156type andMatchTree struct {
157 children []matchTree
158}
159
160type orMatchTree struct {
161 children []matchTree
162}
163
164type notMatchTree struct {
165 child matchTree
166}
167
168// Returns only the filename of child matches.
169type fileNameMatchTree struct {
170 child matchTree
171}
172
173type boostMatchTree struct {
174 child matchTree
175 boost float64
176}
177
178// Don't visit this subtree for collecting matches.
179type noVisitMatchTree struct {
180 matchTree
181}
182
183type regexpMatchTree struct {
184 regexp *regexp.Regexp
185
186 fileName bool
187
188 // mutable
189 reEvaluated bool
190 found []*candidateMatch
191
192 // nextDoc, prepare.
193 bruteForceMatchTree
194}
195
196// \bLITERAL\b
197type wordMatchTree struct {
198 word string
199
200 fileName bool
201
202 // mutable
203 evaluated bool
204 found []*candidateMatch
205
206 // nextDoc, prepare.
207 bruteForceMatchTree
208}
209
210type substrMatchTree struct {
211 matchIterator
212
213 query *query.Substring
214 caseSensitive bool
215 fileName bool
216
217 // mutable
218 current []*candidateMatch
219 contEvaluated bool
220}
221
222type branchQueryMatchTree struct {
223 fileMasks []uint64
224 masks []uint64
225 repos []uint16
226
227 // mutable
228 firstDone bool
229 docID uint32
230}
231
232func (t *branchQueryMatchTree) branchMask() uint64 {
233 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
234}
235
236type symbolRegexpMatchTree struct {
237 matchTree
238 regexp *regexp.Regexp
239 all bool // skips regex match if .*
240
241 reEvaluated bool
242 found []*candidateMatch
243}
244
245func (t *symbolRegexpMatchTree) prepare(doc uint32) {
246 t.reEvaluated = false
247 t.found = t.found[:0]
248 t.matchTree.prepare(doc)
249}
250
251func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
252 if t.reEvaluated {
253 return matchesStateForSlice(t.found)
254 }
255
256 if cost < costRegexp {
257 return matchesRequiresHigherCost
258 }
259
260 sections := cp.docSections()
261 content := cp.data(false)
262
263 found := t.found[:0]
264 for i, sec := range sections {
265 var idx []int
266 if t.all {
267 idx = []int{0, int(sec.End - sec.Start)}
268 } else {
269 idx = t.regexp.FindIndex(content[sec.Start:sec.End])
270 if idx == nil {
271 continue
272 }
273 }
274
275 cm := &candidateMatch{
276 byteOffset: sec.Start + uint32(idx[0]),
277 byteMatchSz: uint32(idx[1] - idx[0]),
278 symbol: true,
279 symbolIdx: uint32(i),
280 }
281 found = append(found, cm)
282 }
283 t.found = found
284 t.reEvaluated = true
285
286 return matchesStateForSlice(t.found)
287}
288
289type symbolSubstrMatchTree struct {
290 *substrMatchTree
291
292 patternSize uint32
293 fileEndRunes []uint32
294 fileEndSymbol []uint32
295
296 doc uint32
297 sections []DocumentSection
298
299 secID uint32
300}
301
302func (t *symbolSubstrMatchTree) prepare(doc uint32) {
303 t.substrMatchTree.prepare(doc)
304 t.doc = doc
305
306 var fileStart uint32
307 if doc > 0 {
308 fileStart = t.fileEndRunes[doc-1]
309 }
310
311 var sections []DocumentSection
312 if len(t.sections) > 0 {
313 most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
314 if most == uint32(len(t.sections)) {
315 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
316 } else {
317 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
318 t.secID++
319 }
320
321 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
322 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
323 symbolEnd++
324 }
325
326 sections = t.sections[t.secID:symbolEnd]
327 }
328 }
329
330 secIdx := 0
331 trimmed := t.current[:0]
332 for len(sections) > secIdx && len(t.current) > 0 {
333 start := fileStart + t.current[0].runeOffset
334 end := start + t.patternSize
335 if start >= sections[secIdx].End {
336 secIdx++
337 continue
338 }
339
340 if start < sections[secIdx].Start {
341 t.current = t.current[1:]
342 continue
343 }
344
345 if end <= sections[secIdx].End {
346 t.current[0].symbol = true
347 t.current[0].symbolIdx = uint32(secIdx)
348 trimmed = append(trimmed, t.current[0])
349 }
350
351 t.current = t.current[1:]
352 }
353 t.current = trimmed
354}
355
356// all prepare methods
357
358func (t *bruteForceMatchTree) prepare(doc uint32) {
359 t.docID = doc
360 t.firstDone = true
361}
362
363func (t *docMatchTree) prepare(doc uint32) {
364 t.docID = doc
365 t.firstDone = true
366}
367
368func (t *andMatchTree) prepare(doc uint32) {
369 for _, c := range t.children {
370 c.prepare(doc)
371 }
372}
373
374func (t *regexpMatchTree) prepare(doc uint32) {
375 t.found = t.found[:0]
376 t.reEvaluated = false
377 t.bruteForceMatchTree.prepare(doc)
378}
379
380func (t *wordMatchTree) prepare(doc uint32) {
381 t.found = t.found[:0]
382 t.evaluated = false
383 t.bruteForceMatchTree.prepare(doc)
384}
385
386func (t *orMatchTree) prepare(doc uint32) {
387 for _, c := range t.children {
388 c.prepare(doc)
389 }
390}
391
392func (t *notMatchTree) prepare(doc uint32) {
393 t.child.prepare(doc)
394}
395
396func (t *fileNameMatchTree) prepare(doc uint32) {
397 t.child.prepare(doc)
398}
399
400func (t *boostMatchTree) prepare(doc uint32) {
401 t.child.prepare(doc)
402}
403
404func (t *substrMatchTree) prepare(nextDoc uint32) {
405 t.matchIterator.prepare(nextDoc)
406 t.current = t.matchIterator.candidates()
407 t.contEvaluated = false
408}
409
410func (t *branchQueryMatchTree) prepare(doc uint32) {
411 t.firstDone = true
412 t.docID = doc
413}
414
415// nextDoc
416
417func (t *docMatchTree) nextDoc() uint32 {
418 var start uint32
419 if t.firstDone {
420 start = t.docID + 1
421 }
422 for i := start; i < t.numDocs; i++ {
423 if t.predicate(i) {
424 return i
425 }
426 }
427 return maxUInt32
428}
429
430func (t *bruteForceMatchTree) nextDoc() uint32 {
431 if !t.firstDone {
432 return 0
433 }
434 return t.docID + 1
435}
436
437func (t *andMatchTree) nextDoc() uint32 {
438 var max uint32
439 for _, c := range t.children {
440 m := c.nextDoc()
441 if m > max {
442 max = m
443 }
444 }
445 return max
446}
447
448func (t *orMatchTree) nextDoc() uint32 {
449 min := uint32(maxUInt32)
450 for _, c := range t.children {
451 m := c.nextDoc()
452 if m < min {
453 min = m
454 }
455 }
456 return min
457}
458
459func (t *notMatchTree) nextDoc() uint32 {
460 return 0
461}
462
463func (t *fileNameMatchTree) nextDoc() uint32 {
464 return t.child.nextDoc()
465}
466
467func (t *boostMatchTree) nextDoc() uint32 {
468 return t.child.nextDoc()
469}
470
471func (t *branchQueryMatchTree) nextDoc() uint32 {
472 var start uint32
473 if t.firstDone {
474 start = t.docID + 1
475 }
476
477 for i := start; i < uint32(len(t.fileMasks)); i++ {
478 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
479 return i
480 }
481 }
482 return maxUInt32
483}
484
485// all String methods
486
487func (t *bruteForceMatchTree) String() string {
488 return "all"
489}
490
491func (t *docMatchTree) String() string {
492 return fmt.Sprintf("doc(%s)", t.reason)
493}
494
495func (t *andMatchTree) String() string {
496 return fmt.Sprintf("and%v", t.children)
497}
498
499func (t *regexpMatchTree) String() string {
500 f := ""
501 if t.fileName {
502 f = "f"
503 }
504 return fmt.Sprintf("%sre(%s)", f, t.regexp)
505}
506
507func (t *wordMatchTree) String() string {
508 f := ""
509 if t.fileName {
510 f = "f"
511 }
512 return fmt.Sprintf("%sword(%s)", f, t.word)
513}
514
515func (t *orMatchTree) String() string {
516 return fmt.Sprintf("or%v", t.children)
517}
518
519func (t *notMatchTree) String() string {
520 return fmt.Sprintf("not(%v)", t.child)
521}
522
523func (t *noVisitMatchTree) String() string {
524 return fmt.Sprintf("novisit(%v)", t.matchTree)
525}
526
527func (t *fileNameMatchTree) String() string {
528 return fmt.Sprintf("f(%v)", t.child)
529}
530
531func (t *boostMatchTree) String() string {
532 return fmt.Sprintf("boost(%f, %v)", t.boost, t.child)
533}
534
535func (t *substrMatchTree) String() string {
536 f := ""
537 if t.fileName {
538 f = "f"
539 }
540
541 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
542}
543
544func (t *branchQueryMatchTree) String() string {
545 return fmt.Sprintf("branch(%x)", t.masks)
546}
547
548func (t *symbolSubstrMatchTree) String() string {
549 return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
550}
551
552func (t *symbolRegexpMatchTree) String() string {
553 return fmt.Sprintf("symbol(%v)", t.matchTree)
554}
555
556// visitMatches visits all atoms in matchTree. Note: This visits
557// noVisitMatchTree. For collecting matches use visitMatches.
558func visitMatchTree(t matchTree, f func(matchTree)) {
559 switch s := t.(type) {
560 case *andMatchTree:
561 for _, ch := range s.children {
562 visitMatchTree(ch, f)
563 }
564 case *orMatchTree:
565 for _, ch := range s.children {
566 visitMatchTree(ch, f)
567 }
568 case *andLineMatchTree:
569 visitMatchTree(&s.andMatchTree, f)
570 case *noVisitMatchTree:
571 visitMatchTree(s.matchTree, f)
572 case *notMatchTree:
573 visitMatchTree(s.child, f)
574 case *fileNameMatchTree:
575 visitMatchTree(s.child, f)
576 case *boostMatchTree:
577 visitMatchTree(s.child, f)
578 case *symbolSubstrMatchTree:
579 visitMatchTree(s.substrMatchTree, f)
580 case *symbolRegexpMatchTree:
581 visitMatchTree(s.matchTree, f)
582 default:
583 f(t)
584 }
585}
586
587// updateMatchTreeStats calls updateStats on all atoms in mt which have that
588// function defined.
589func updateMatchTreeStats(mt matchTree, stats *Stats) {
590 visitMatchTree(mt, func(mt matchTree) {
591 if atom, ok := mt.(interface{ updateStats(*Stats) }); ok {
592 atom.updateStats(stats)
593 }
594 })
595}
596
597func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) {
598 visitMatches(t, known, 1, func(mt matchTree, _ float64) {
599 f(mt)
600 })
601}
602
603// visitMatches visits all atoms which can contribute matches. Note: This
604// skips noVisitMatchTree.
605func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) {
606 switch s := t.(type) {
607 case *andMatchTree:
608 for _, ch := range s.children {
609 if known[ch] {
610 visitMatches(ch, known, weight, f)
611 }
612 }
613 case *andLineMatchTree:
614 visitMatches(&s.andMatchTree, known, weight, f)
615 case *orMatchTree:
616 for _, ch := range s.children {
617 if known[ch] {
618 visitMatches(ch, known, weight, f)
619 }
620 }
621 case *boostMatchTree:
622 visitMatches(s.child, known, weight*s.boost, f)
623 case *symbolSubstrMatchTree:
624 visitMatches(s.substrMatchTree, known, weight, f)
625 case *notMatchTree:
626 case *noVisitMatchTree:
627 // don't collect into negative trees.
628 case *fileNameMatchTree:
629 // We will just gather the filename if we do not visit this tree.
630 default:
631 f(s, weight)
632 }
633}
634
635// all matches() methods.
636
637func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
638 return matchesStatePred(t.predicate(cp.idx))
639}
640
641func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
642 return matchesFound
643}
644
645// andLineMatchTree is a performance optimization of andMatchTree. For content
646// searches we don't want to run the regex engine if there is no line that
647// contains matches from all terms.
648func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
649 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
650 return state
651 }
652
653 // Invariant: all children have matches. If any line contains all of them we
654 // can return MatchesFound.
655
656 // find child with fewest candidates
657 min := maxUInt32
658 fewestChildren := 0
659 for ix, child := range t.children {
660 v, ok := child.(*substrMatchTree)
661 // make sure we are running a content search and that all candidates are a
662 // substrMatchTree
663 if !ok || v.fileName {
664 return matchesFound
665 }
666 if len(v.current) < min {
667 min = len(v.current)
668 fewestChildren = ix
669 }
670 }
671
672 type lineRange struct {
673 start int
674 end int
675 }
676 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
677 prev := -1
678 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
679 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
680 if line == prev {
681 continue
682 }
683 prev = line
684 lines = append(lines, lineRange{byteStart, byteEnd})
685 }
686
687 // children keeps track of the children's candidates we have already seen.
688 children := make([][]*candidateMatch, 0, len(t.children)-1)
689 for j, child := range t.children {
690 if j == fewestChildren {
691 continue
692 }
693 children = append(children, child.(*substrMatchTree).current)
694 }
695
696nextLine:
697 for i := 0; i < len(lines); i++ {
698 hits := 1
699 nextChild:
700 for j := range children {
701 nextCandidate:
702 for len(children[j]) > 0 {
703 candidate := children[j][0]
704 bo := int(cp.findOffset(false, candidate.runeOffset))
705 if bo < lines[i].start {
706 children[j] = children[j][1:]
707 continue nextCandidate
708 }
709 if bo <= lines[i].end {
710 hits++
711 continue nextChild
712 }
713 // move the `lines` iterator forward until bo <= line.end
714 for i < len(lines) && bo > lines[i].end {
715 i++
716 }
717 i--
718 continue nextLine
719 }
720 }
721 // return early once we found any line that contains matches from all children
722 if hits == len(t.children) {
723 return matchesFound
724 }
725 }
726 return matchesNone
727}
728
729func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
730 // We have found matches unless a child needs to do more work or it hasn't
731 // found matches.
732 state := matchesFound
733
734 for _, ch := range t.children {
735 switch evalMatchTree(cp, cost, known, ch) {
736 case matchesRequiresHigherCost:
737 // keep evaluating other children incase we come across matchesNone
738 state = matchesRequiresHigherCost
739 case matchesFound:
740 // will return this if every child has this value
741 case matchesNone:
742 return matchesNone
743 }
744 }
745
746 return state
747}
748
749func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
750 // we could short-circuit, but we want to use the other possibilities as a
751 // ranking signal. So we always return the most conservative state.
752 state := matchesNone
753 for _, ch := range t.children {
754 switch evalMatchTree(cp, cost, known, ch) {
755 case matchesRequiresHigherCost:
756 state = matchesRequiresHigherCost
757 case matchesFound:
758 if state != matchesRequiresHigherCost {
759 state = matchesFound
760 }
761 case matchesNone:
762 // noop
763 }
764 }
765 return state
766}
767
768func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
769 return matchesStatePred(t.branchMask() != 0)
770}
771
772func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
773 if t.reEvaluated {
774 return matchesStateForSlice(t.found)
775 }
776
777 if cost < costRegexp {
778 return matchesRequiresHigherCost
779 }
780
781 cp.stats.RegexpsConsidered++
782 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
783 found := t.found[:0]
784 for _, idx := range idxs {
785 cm := &candidateMatch{
786 byteOffset: uint32(idx[0]),
787 byteMatchSz: uint32(idx[1] - idx[0]),
788 fileName: t.fileName,
789 }
790
791 found = append(found, cm)
792 }
793 t.found = found
794 t.reEvaluated = true
795
796 return matchesStateForSlice(t.found)
797}
798
799func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
800 if t.evaluated {
801 return matchesStateForSlice(t.found)
802 }
803
804 if cost < costRegexp {
805 return matchesRequiresHigherCost
806 }
807
808 data := cp.data(t.fileName)
809 offset := 0
810 found := t.found[:0]
811 for {
812 idx := bytes.Index(data[offset:], []byte(t.word))
813 if idx < 0 {
814 break
815 }
816
817 relStartOffset := offset + idx
818 relEndOffset := relStartOffset + len(t.word)
819
820 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
821 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
822 if startBoundary && endBoundary {
823 found = append(found, &candidateMatch{
824 byteOffset: uint32(offset + idx),
825 byteMatchSz: uint32(len(t.word)),
826 fileName: t.fileName,
827 })
828 }
829 offset += idx + len(t.word)
830 }
831
832 t.found = found
833 t.evaluated = true
834
835 return matchesStateForSlice(t.found)
836}
837
838// breakMatchesOnNewlines returns matches resulting from breaking each element
839// of cms on newlines within text.
840func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
841 var lineCMs []*candidateMatch
842 for _, cm := range cms {
843 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
844 }
845 return lineCMs
846}
847
848// breakOnNewlines returns matches resulting from breaking cm on newlines
849// within text.
850func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
851 var cms []*candidateMatch
852 addMe := &candidateMatch{}
853 *addMe = *cm
854 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
855 if text[i] == '\n' {
856 addMe.byteMatchSz = i - addMe.byteOffset
857 if addMe.byteMatchSz != 0 {
858 cms = append(cms, addMe)
859 }
860
861 addMe = &candidateMatch{}
862 *addMe = *cm
863 addMe.byteOffset = i + 1
864 }
865 }
866 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
867 if addMe.byteMatchSz != 0 {
868 cms = append(cms, addMe)
869 }
870 return cms
871}
872
873// evalMatchTree should be called instead of directly calling
874// matchTree.matches. It cache known values for future evaluation at higher
875// costs.
876func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
877 if v, ok := known[mt]; ok {
878 return matchesStatePred(v)
879 }
880
881 ms := mt.matches(cp, cost, known)
882 if ms != matchesRequiresHigherCost {
883 known[mt] = ms == matchesFound
884 }
885
886 return ms
887}
888
889func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
890 switch evalMatchTree(cp, cost, known, t.child) {
891 case matchesRequiresHigherCost:
892 return matchesRequiresHigherCost
893 case matchesFound:
894 return matchesNone
895 case matchesNone:
896 return matchesFound
897 default:
898 panic("unreachable")
899 }
900}
901
902func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
903 return evalMatchTree(cp, cost, known, t.child)
904}
905
906func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
907 return evalMatchTree(cp, cost, known, t.child)
908}
909
910func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
911 if t.contEvaluated {
912 return matchesStateForSlice(t.current)
913 }
914
915 if len(t.current) == 0 {
916 return matchesNone
917 }
918
919 if t.fileName && cost < costMemory {
920 return matchesRequiresHigherCost
921 }
922
923 if !t.fileName && cost < costContent {
924 return matchesRequiresHigherCost
925 }
926
927 pruned := t.current[:0]
928 for _, m := range t.current {
929 if m.byteOffset == 0 && m.runeOffset > 0 {
930 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
931 }
932 if m.matchContent(cp.data(m.fileName)) {
933 pruned = append(pruned, m)
934 }
935 }
936 t.current = pruned
937 t.contEvaluated = true
938
939 return matchesStateForSlice(t.current)
940}
941
942type matchTreeOpt struct {
943 // DisableWordMatchOptimization is used to disable the use of wordMatchTree.
944 // This was added since we do not support wordMatchTree with symbol search.
945 DisableWordMatchOptimization bool
946}
947
948func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
949 if q == nil {
950 return nil, fmt.Errorf("got nil (sub)query")
951 }
952 switch s := q.(type) {
953 case *query.Regexp:
954 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
955 // superset of the regexp. If the returned matchTree is equivalent to the
956 // original regexp, it returns true. An equivalent matchTree has the same
957 // behaviour as the original regexp and can be used instead.
958 //
959 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
960 if err != nil {
961 return nil, err
962 }
963 // if the query can be used in place of the regexp
964 // return the subtree
965 if isEq {
966 return subMT, nil
967 }
968
969 var tr matchTree
970 if wmt, ok := regexpToWordMatchTree(s, opt); ok {
971 // A common search we get is "\bLITERAL\b". Avoid the regex engine and
972 // provide something faster.
973 tr = wmt
974 } else {
975 prefix := ""
976 if !s.CaseSensitive {
977 prefix = "(?i)"
978 }
979
980 tr = ®expMatchTree{
981 regexp: regexp.MustCompile(prefix + s.Regexp.String()),
982 fileName: s.FileName,
983 }
984 }
985
986 return &andMatchTree{
987 children: []matchTree{
988 tr, &noVisitMatchTree{subMT},
989 },
990 }, nil
991 case *query.And:
992 var r []matchTree
993 for _, ch := range s.Children {
994 ct, err := d.newMatchTree(ch, opt)
995 if err != nil {
996 return nil, err
997 }
998 r = append(r, ct)
999 }
1000 return &andMatchTree{r}, nil
1001 case *query.Or:
1002 var r []matchTree
1003 for _, ch := range s.Children {
1004 ct, err := d.newMatchTree(ch, opt)
1005 if err != nil {
1006 return nil, err
1007 }
1008 r = append(r, ct)
1009 }
1010 return &orMatchTree{r}, nil
1011 case *query.Not:
1012 ct, err := d.newMatchTree(s.Child, opt)
1013 return ¬MatchTree{
1014 child: ct,
1015 }, err
1016
1017 case *query.Type:
1018 if s.Type != query.TypeFileName {
1019 break
1020 }
1021
1022 ct, err := d.newMatchTree(s.Child, opt)
1023 if err != nil {
1024 return nil, err
1025 }
1026
1027 return &fileNameMatchTree{
1028 child: ct,
1029 }, nil
1030
1031 case *query.Boost:
1032 ct, err := d.newMatchTree(s.Child, opt)
1033 if err != nil {
1034 return nil, err
1035 }
1036
1037 return &boostMatchTree{
1038 child: ct,
1039 boost: s.Boost,
1040 }, nil
1041
1042 case *query.Substring:
1043 return d.newSubstringMatchTree(s)
1044
1045 case *query.Branch:
1046 masks := make([]uint64, 0, len(d.repoMetaData))
1047 if s.Pattern == "HEAD" {
1048 for i := 0; i < len(d.repoMetaData); i++ {
1049 masks = append(masks, 1)
1050 }
1051 } else {
1052 for _, branchIDs := range d.branchIDs {
1053 mask := uint64(0)
1054 for nm, m := range branchIDs {
1055 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1056 mask |= uint64(m)
1057 }
1058 }
1059 masks = append(masks, mask)
1060 }
1061 }
1062 return &branchQueryMatchTree{
1063 masks: masks,
1064 fileMasks: d.fileBranchMasks,
1065 repos: d.repos,
1066 }, nil
1067 case *query.Const:
1068 if s.Value {
1069 return &bruteForceMatchTree{}, nil
1070 } else {
1071 return &noMatchTree{Why: "const"}, nil
1072 }
1073 case *query.Language:
1074 code, ok := d.metaData.LanguageMap[s.Language]
1075 if !ok {
1076 return &noMatchTree{Why: "lang"}, nil
1077 }
1078 return &docMatchTree{
1079 reason: "language",
1080 numDocs: d.numDocs(),
1081 predicate: func(docID uint32) bool {
1082 return d.getLanguage(docID) == code
1083 },
1084 }, nil
1085
1086 case *query.Symbol:
1087 // Disable WordMatchTree since we don't support it in symbols yet.
1088 optCopy := opt
1089 optCopy.DisableWordMatchOptimization = true
1090
1091 subMT, err := d.newMatchTree(s.Expr, optCopy)
1092 if err != nil {
1093 return nil, err
1094 }
1095
1096 if substr, ok := subMT.(*substrMatchTree); ok {
1097 return &symbolSubstrMatchTree{
1098 substrMatchTree: substr,
1099 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)),
1100 fileEndRunes: d.fileEndRunes,
1101 fileEndSymbol: d.fileEndSymbol,
1102 sections: d.runeDocSections,
1103 }, nil
1104 }
1105
1106 var regexp *regexp.Regexp
1107 visitMatchTree(subMT, func(mt matchTree) {
1108 if t, ok := mt.(*regexpMatchTree); ok {
1109 regexp = t.regexp
1110 }
1111 })
1112 if regexp == nil {
1113 return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1114 }
1115
1116 return &symbolRegexpMatchTree{
1117 regexp: regexp,
1118 all: regexp.String() == "(?i)(?-s:.)*",
1119 matchTree: subMT,
1120 }, nil
1121
1122 case *query.FileNameSet:
1123 return &docMatchTree{
1124 reason: "FileNameSet",
1125 numDocs: d.numDocs(),
1126 predicate: func(docID uint32) bool {
1127 fileName := d.fileName(docID)
1128 _, ok := s.Set[string(fileName)]
1129 return ok
1130 },
1131 }, nil
1132
1133 case *query.BranchesRepos:
1134 reposBranchesWant := make([]uint64, len(d.repoMetaData))
1135 for repoIdx := range d.repoMetaData {
1136 var mask uint64
1137 for _, br := range s.List {
1138 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1139 mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1140 }
1141 }
1142 reposBranchesWant[repoIdx] = mask
1143 }
1144 return &docMatchTree{
1145 reason: "BranchesRepos",
1146 numDocs: d.numDocs(),
1147 predicate: func(docID uint32) bool {
1148 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1149 },
1150 }, nil
1151
1152 case *query.RepoSet:
1153 reposWant := make([]bool, len(d.repoMetaData))
1154 for repoIdx, r := range d.repoMetaData {
1155 if _, ok := s.Set[r.Name]; ok {
1156 reposWant[repoIdx] = true
1157 }
1158 }
1159 return &docMatchTree{
1160 reason: "RepoSet",
1161 numDocs: d.numDocs(),
1162 predicate: func(docID uint32) bool {
1163 return reposWant[d.repos[docID]]
1164 },
1165 }, nil
1166
1167 case *query.RepoIDs:
1168 reposWant := make([]bool, len(d.repoMetaData))
1169 for repoIdx, r := range d.repoMetaData {
1170 if s.Repos.Contains(r.ID) {
1171 reposWant[repoIdx] = true
1172 }
1173 }
1174 return &docMatchTree{
1175 reason: "RepoIDs",
1176 numDocs: d.numDocs(),
1177 predicate: func(docID uint32) bool {
1178 return reposWant[d.repos[docID]]
1179 },
1180 }, nil
1181
1182 case *query.Repo:
1183 reposWant := make([]bool, len(d.repoMetaData))
1184 for repoIdx, r := range d.repoMetaData {
1185 if s.Regexp.MatchString(r.Name) {
1186 reposWant[repoIdx] = true
1187 }
1188 }
1189 return &docMatchTree{
1190 reason: "Repo",
1191 numDocs: d.numDocs(),
1192 predicate: func(docID uint32) bool {
1193 return reposWant[d.repos[docID]]
1194 },
1195 }, nil
1196
1197 case *query.RepoRegexp:
1198 reposWant := make([]bool, len(d.repoMetaData))
1199 for repoIdx, r := range d.repoMetaData {
1200 if s.Regexp.MatchString(r.Name) {
1201 reposWant[repoIdx] = true
1202 }
1203 }
1204 return &docMatchTree{
1205 reason: "RepoRegexp",
1206 numDocs: d.numDocs(),
1207 predicate: func(docID uint32) bool {
1208 return reposWant[d.repos[docID]]
1209 },
1210 }, nil
1211
1212 case query.RawConfig:
1213 return &docMatchTree{
1214 reason: s.String(),
1215 numDocs: d.numDocs(),
1216 predicate: func(docID uint32) bool {
1217 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1218 },
1219 }, nil
1220 }
1221 log.Panicf("type %T", q)
1222 return nil, nil
1223}
1224
1225func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1226 st := &substrMatchTree{
1227 query: s,
1228 caseSensitive: s.CaseSensitive,
1229 fileName: s.FileName,
1230 }
1231
1232 if utf8.RuneCountInString(s.Pattern) < ngramSize {
1233 prefix := ""
1234 if !s.CaseSensitive {
1235 prefix = "(?i)"
1236 }
1237 t := ®expMatchTree{
1238 regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)),
1239 fileName: s.FileName,
1240 }
1241 return t, nil
1242 }
1243
1244 result, err := d.iterateNgrams(s)
1245 if err != nil {
1246 return nil, err
1247 }
1248 st.matchIterator = result
1249 return st, nil
1250}
1251
1252func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1253 if opt.DisableWordMatchOptimization {
1254 return nil, false
1255 }
1256 // Needs to be case sensitive
1257 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1258 return nil, false
1259 }
1260 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1261 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1262 return nil, false
1263 }
1264 sub := q.Regexp.Sub
1265 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1266 return nil, false
1267 }
1268
1269 return &wordMatchTree{
1270 word: string(sub[1].Rune),
1271 fileName: q.FileName,
1272 }, true
1273}
1274
1275// pruneMatchTree removes impossible branches from the matchTree, as indicated
1276// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1277func pruneMatchTree(mt matchTree) (matchTree, error) {
1278 var err error
1279 switch mt := mt.(type) {
1280 // leaf nodes that we test with the filter:
1281 case *substrMatchTree:
1282 if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1283 if _, ok := res.matchIterator.(*noMatchTree); ok {
1284 return nil, nil
1285 }
1286 }
1287 // recursive tree structures:
1288 case *andMatchTree:
1289 // Any branch of an and becoming impossible means the entire clause
1290 // is impossible. Otherwise, just handle rewrites.
1291 for i, child := range mt.children {
1292 newChild, err := pruneMatchTree(child)
1293 if err != nil {
1294 return nil, err
1295 }
1296 if newChild == nil {
1297 return nil, nil
1298 }
1299 mt.children[i] = newChild
1300 }
1301 case *orMatchTree:
1302 // *All* branches of an OR becoming impossible means the entire clause
1303 // is impossible. Otherwise, drop impossible subclauses and handle
1304 // rewrites, including simplifying to a singular resulting child branch.
1305 n := 0
1306 for _, child := range mt.children {
1307 newChild, err := pruneMatchTree(child)
1308 if err != nil {
1309 return nil, err
1310 }
1311 if newChild != nil {
1312 mt.children[n] = newChild
1313 n++
1314 }
1315 }
1316 mt.children = mt.children[:n]
1317 if len(mt.children) == 1 {
1318 return mt.children[0], nil
1319 } else if len(mt.children) == 0 {
1320 return nil, nil
1321 }
1322 case *noVisitMatchTree:
1323 mt.matchTree, err = pruneMatchTree(mt.matchTree)
1324 if err != nil {
1325 return nil, err
1326 }
1327 if mt.matchTree == nil {
1328 return nil, nil
1329 }
1330 case *fileNameMatchTree:
1331 mt.child, err = pruneMatchTree(mt.child)
1332 if err != nil {
1333 return nil, err
1334 }
1335 if mt.child == nil {
1336 return nil, nil
1337 }
1338 case *boostMatchTree:
1339 mt.child, err = pruneMatchTree(mt.child)
1340 if err != nil {
1341 return nil, err
1342 }
1343 if mt.child == nil {
1344 return nil, nil
1345 }
1346 case *andLineMatchTree:
1347 child, err := pruneMatchTree(&mt.andMatchTree)
1348 if err != nil {
1349 return nil, err
1350 }
1351 if child == nil {
1352 return nil, nil
1353 }
1354 if c, ok := child.(*andMatchTree); ok {
1355 mt.andMatchTree = *c
1356 } else {
1357 // the and was simplified to a single clause,
1358 // so the linematch portion is irrelevant.
1359 return mt, nil
1360 }
1361 case *notMatchTree:
1362 mt.child, err = pruneMatchTree(mt.child)
1363 if err != nil {
1364 return nil, err
1365 }
1366 if mt.child == nil {
1367 // not false => true
1368 return &bruteForceMatchTree{}, nil
1369 }
1370 // unhandled:
1371 case *docMatchTree:
1372 case *bruteForceMatchTree:
1373 case *regexpMatchTree:
1374 case *wordMatchTree:
1375 }
1376 return mt, err
1377}