fork of https://github.com/sourcegraph/zoekt
1// Copyright 2018 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "regexp/syntax"
22 "strings"
23 "unicode/utf8"
24
25 "github.com/grafana/regexp"
26
27 "github.com/sourcegraph/zoekt/query"
28)
29
30// A docIterator iterates over documents in order.
31type docIterator interface {
32 // provide the next document where we may find something interesting.
33 //
34 // This is like a "peek" and shouldn't mutate state. prepare is what should
35 // change state.
36 nextDoc() uint32
37
38 // clears any per-document state of the docIterator, and
39 // prepares for evaluating the given doc. The argument is
40 // strictly increasing over time.
41 prepare(nextDoc uint32)
42}
43
44// costs are passed in increasing order to matchTree.matches until they do not
45// return matchesRequiresHigherCost.
46const (
47 costConst = 0
48 costMemory = 1
49 costContent = 2
50 costRegexp = 3
51)
52
53const (
54 costMin = costConst
55 costMax = costRegexp
56)
57
58// matchesState is an enum for the state of a matchTree after a call to
59// matchTree.matches.
60type matchesState uint8
61
62const (
63 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done
64 // a search yet since the cost value is not high enough.
65 matchesRequiresHigherCost matchesState = iota
66
67 // matchesFound is returned when matchTree.matches has done a search and
68 // found one or more matches.
69 matchesFound
70
71 // matchesNone is returned when matchTree.matches has done a search and
72 // found nothing.
73 matchesNone
74)
75
76// matchesStatePred is a helper which returns matchesFound if b is true
77// otherwise returns matchesNone.
78func matchesStatePred(b bool) matchesState {
79 if b {
80 return matchesFound
81 }
82 return matchesNone
83}
84
85// matchesStateForSlice is a helper which returns matchesFound if v is
86// non-empty otherwise returns matchesNone.
87func matchesStateForSlice[T any](v []T) matchesState {
88 return matchesStatePred(len(v) > 0)
89}
90
91// An expression tree coupled with matches. The matchtree has two
92// functions:
93//
94// * it implements boolean combinations (and, or, not)
95//
96// * it implements shortcuts, where we skip documents (for example: if
97// there are no trigram matches, we can be sure there are no substring
98// matches). The matchtree iterates over the documents as they are
99// ordered in the shard.
100//
101// The general process for a given (shard, query) is:
102//
103// - construct matchTree for the query
104//
105// - find all different leaf matchTrees (substring, regexp, etc.)
106//
107// in a loop:
108//
109// - find next doc to process using nextDoc
110//
111// - evaluate atoms (leaf expressions that match text)
112//
113// - evaluate the tree using matches(), storing the result in map.
114//
115// - if the complete tree returns (matches() != matchesRequiresHigherCost)
116// for the document, collect all text matches by looking at leaf
117// matchTrees.
118type matchTree interface {
119 docIterator
120
121 // matches if cost is high enough. See documentation for matchesState's
122 // values.
123 //
124 // Note: Do not call this directly, rather use evalMatchTree which uses
125 // known to cache responses once the state transitions away from
126 // matchesRequiresHigherCost.
127 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
128}
129
130// docMatchTree iterates over documents for which predicate(docID) returns true.
131type docMatchTree struct {
132 // the number of documents in a shard.
133 numDocs uint32
134
135 predicate func(docID uint32) bool
136
137 // provides additional information about the reason why the docMatchTree was
138 // created.
139 reason string
140
141 // mutable
142 firstDone bool
143 docID uint32
144}
145
146type bruteForceMatchTree struct {
147 // mutable
148 firstDone bool
149 docID uint32
150}
151
152type andLineMatchTree struct {
153 andMatchTree
154}
155
156type andMatchTree struct {
157 children []matchTree
158}
159
160type orMatchTree struct {
161 children []matchTree
162}
163
164type notMatchTree struct {
165 child matchTree
166}
167
168// Returns only the filename of child matches.
169type fileNameMatchTree struct {
170 child matchTree
171}
172
173type boostMatchTree struct {
174 child matchTree
175 boost float64
176}
177
178// Don't visit this subtree for collecting matches.
179type noVisitMatchTree struct {
180 matchTree
181}
182
183type regexpMatchTree struct {
184 regexp *regexp.Regexp
185
186 fileName bool
187
188 // mutable
189 reEvaluated bool
190 found []*candidateMatch
191
192 // nextDoc, prepare.
193 bruteForceMatchTree
194}
195
196// \bLITERAL\b
197type wordMatchTree struct {
198 word string
199
200 fileName bool
201
202 // mutable
203 evaluated bool
204 found []*candidateMatch
205
206 // nextDoc, prepare.
207 bruteForceMatchTree
208}
209
210type substrMatchTree struct {
211 matchIterator
212
213 query *query.Substring
214 caseSensitive bool
215 fileName bool
216
217 // mutable
218 current []*candidateMatch
219 contEvaluated bool
220}
221
222type branchQueryMatchTree struct {
223 fileMasks []uint64
224 masks []uint64
225 repos []uint16
226
227 // mutable
228 firstDone bool
229 docID uint32
230}
231
232func (t *branchQueryMatchTree) branchMask() uint64 {
233 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
234}
235
236type symbolRegexpMatchTree struct {
237 matchTree
238 regexp *regexp.Regexp
239 all bool // skips regex match if .*
240
241 reEvaluated bool
242 found []*candidateMatch
243}
244
245func (t *symbolRegexpMatchTree) prepare(doc uint32) {
246 t.reEvaluated = false
247 t.found = t.found[:0]
248 t.matchTree.prepare(doc)
249}
250
251func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
252 if t.reEvaluated {
253 return matchesStateForSlice(t.found)
254 }
255
256 if cost < costRegexp {
257 return matchesRequiresHigherCost
258 }
259
260 sections := cp.docSections()
261 content := cp.data(false)
262
263 found := t.found[:0]
264 for i, sec := range sections {
265 var idx []int
266 if t.all {
267 idx = []int{0, int(sec.End - sec.Start)}
268 } else {
269 idx = t.regexp.FindIndex(content[sec.Start:sec.End])
270 if idx == nil {
271 continue
272 }
273 }
274
275 cm := &candidateMatch{
276 byteOffset: sec.Start + uint32(idx[0]),
277 byteMatchSz: uint32(idx[1] - idx[0]),
278 symbol: true,
279 symbolIdx: uint32(i),
280 }
281 found = append(found, cm)
282 }
283 t.found = found
284 t.reEvaluated = true
285
286 return matchesStateForSlice(t.found)
287}
288
289type symbolSubstrMatchTree struct {
290 *substrMatchTree
291
292 patternSize uint32
293 fileEndRunes []uint32
294 fileEndSymbol []uint32
295
296 doc uint32
297 sections []DocumentSection
298
299 secID uint32
300}
301
302func (t *symbolSubstrMatchTree) prepare(doc uint32) {
303 t.substrMatchTree.prepare(doc)
304 t.doc = doc
305
306 var fileStart uint32
307 if doc > 0 {
308 fileStart = t.fileEndRunes[doc-1]
309 }
310
311 var sections []DocumentSection
312 if len(t.sections) > 0 {
313 most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
314 if most == uint32(len(t.sections)) {
315 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
316 } else {
317 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
318 t.secID++
319 }
320
321 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
322 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
323 symbolEnd++
324 }
325
326 sections = t.sections[t.secID:symbolEnd]
327 }
328 }
329
330 secIdx := 0
331 trimmed := t.current[:0]
332 for len(sections) > secIdx && len(t.current) > 0 {
333 start := fileStart + t.current[0].runeOffset
334 end := start + t.patternSize
335 if start >= sections[secIdx].End {
336 secIdx++
337 continue
338 }
339
340 if start < sections[secIdx].Start {
341 t.current = t.current[1:]
342 continue
343 }
344
345 if end <= sections[secIdx].End {
346 t.current[0].symbol = true
347 t.current[0].symbolIdx = uint32(secIdx)
348 trimmed = append(trimmed, t.current[0])
349 }
350
351 t.current = t.current[1:]
352 }
353 t.current = trimmed
354}
355
356// all prepare methods
357
358func (t *bruteForceMatchTree) prepare(doc uint32) {
359 t.docID = doc
360 t.firstDone = true
361}
362
363func (t *docMatchTree) prepare(doc uint32) {
364 t.docID = doc
365 t.firstDone = true
366}
367
368func (t *andMatchTree) prepare(doc uint32) {
369 for _, c := range t.children {
370 c.prepare(doc)
371 }
372}
373
374func (t *regexpMatchTree) prepare(doc uint32) {
375 t.found = t.found[:0]
376 t.reEvaluated = false
377 t.bruteForceMatchTree.prepare(doc)
378}
379
380func (t *wordMatchTree) prepare(doc uint32) {
381 t.found = t.found[:0]
382 t.evaluated = false
383 t.bruteForceMatchTree.prepare(doc)
384}
385
386func (t *orMatchTree) prepare(doc uint32) {
387 for _, c := range t.children {
388 c.prepare(doc)
389 }
390}
391
392func (t *notMatchTree) prepare(doc uint32) {
393 t.child.prepare(doc)
394}
395
396func (t *fileNameMatchTree) prepare(doc uint32) {
397 t.child.prepare(doc)
398}
399
400func (t *boostMatchTree) prepare(doc uint32) {
401 t.child.prepare(doc)
402}
403
404func (t *substrMatchTree) prepare(nextDoc uint32) {
405 t.matchIterator.prepare(nextDoc)
406 t.current = t.matchIterator.candidates()
407 t.contEvaluated = false
408}
409
410func (t *branchQueryMatchTree) prepare(doc uint32) {
411 t.firstDone = true
412 t.docID = doc
413}
414
415// nextDoc
416
417func (t *docMatchTree) nextDoc() uint32 {
418 var start uint32
419 if t.firstDone {
420 start = t.docID + 1
421 }
422 for i := start; i < t.numDocs; i++ {
423 if t.predicate(i) {
424 return i
425 }
426 }
427 return maxUInt32
428}
429
430func (t *bruteForceMatchTree) nextDoc() uint32 {
431 if !t.firstDone {
432 return 0
433 }
434 return t.docID + 1
435}
436
437func (t *andMatchTree) nextDoc() uint32 {
438 var max uint32
439 for _, c := range t.children {
440 m := c.nextDoc()
441 if m > max {
442 max = m
443 }
444 }
445 return max
446}
447
448func (t *orMatchTree) nextDoc() uint32 {
449 min := uint32(maxUInt32)
450 for _, c := range t.children {
451 m := c.nextDoc()
452 if m < min {
453 min = m
454 }
455 }
456 return min
457}
458
459func (t *notMatchTree) nextDoc() uint32 {
460 return 0
461}
462
463func (t *fileNameMatchTree) nextDoc() uint32 {
464 return t.child.nextDoc()
465}
466
467func (t *boostMatchTree) nextDoc() uint32 {
468 return t.child.nextDoc()
469}
470
471func (t *branchQueryMatchTree) nextDoc() uint32 {
472 var start uint32
473 if t.firstDone {
474 start = t.docID + 1
475 }
476
477 for i := start; i < uint32(len(t.fileMasks)); i++ {
478 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
479 return i
480 }
481 }
482 return maxUInt32
483}
484
485// all String methods
486
487func (t *bruteForceMatchTree) String() string {
488 return "all"
489}
490
491func (t *docMatchTree) String() string {
492 return fmt.Sprintf("doc(%s)", t.reason)
493}
494
495func (t *andMatchTree) String() string {
496 return fmt.Sprintf("and%v", t.children)
497}
498
499func (t *regexpMatchTree) String() string {
500 f := ""
501 if t.fileName {
502 f = "f"
503 }
504 return fmt.Sprintf("%sre(%s)", f, t.regexp)
505}
506
507func (t *wordMatchTree) String() string {
508 f := ""
509 if t.fileName {
510 f = "f"
511 }
512 return fmt.Sprintf("%sword(%s)", f, t.word)
513}
514
515func (t *orMatchTree) String() string {
516 return fmt.Sprintf("or%v", t.children)
517}
518
519func (t *notMatchTree) String() string {
520 return fmt.Sprintf("not(%v)", t.child)
521}
522
523func (t *noVisitMatchTree) String() string {
524 return fmt.Sprintf("novisit(%v)", t.matchTree)
525}
526
527func (t *fileNameMatchTree) String() string {
528 return fmt.Sprintf("f(%v)", t.child)
529}
530
531func (t *boostMatchTree) String() string {
532 return fmt.Sprintf("boost(%f, %v)", t.boost, t.child)
533}
534
535func (t *substrMatchTree) String() string {
536 f := ""
537 if t.fileName {
538 f = "f"
539 }
540
541 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
542}
543
544func (t *branchQueryMatchTree) String() string {
545 return fmt.Sprintf("branch(%x)", t.masks)
546}
547
548func (t *symbolSubstrMatchTree) String() string {
549 return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
550}
551
552func (t *symbolRegexpMatchTree) String() string {
553 return fmt.Sprintf("symbol(%v)", t.matchTree)
554}
555
556// visitMatches visits all atoms in matchTree. Note: This visits
557// noVisitMatchTree. For collecting matches use visitMatches.
558func visitMatchTree(t matchTree, f func(matchTree)) {
559 switch s := t.(type) {
560 case *andMatchTree:
561 for _, ch := range s.children {
562 visitMatchTree(ch, f)
563 }
564 case *orMatchTree:
565 for _, ch := range s.children {
566 visitMatchTree(ch, f)
567 }
568 case *andLineMatchTree:
569 visitMatchTree(&s.andMatchTree, f)
570 case *noVisitMatchTree:
571 visitMatchTree(s.matchTree, f)
572 case *notMatchTree:
573 visitMatchTree(s.child, f)
574 case *fileNameMatchTree:
575 visitMatchTree(s.child, f)
576 case *boostMatchTree:
577 visitMatchTree(s.child, f)
578 case *symbolSubstrMatchTree:
579 visitMatchTree(s.substrMatchTree, f)
580 case *symbolRegexpMatchTree:
581 visitMatchTree(s.matchTree, f)
582 default:
583 f(t)
584 }
585}
586
587// updateMatchTreeStats calls updateStats on all atoms in mt which have that
588// function defined.
589func updateMatchTreeStats(mt matchTree, stats *Stats) {
590 visitMatchTree(mt, func(mt matchTree) {
591 if atom, ok := mt.(interface{ updateStats(*Stats) }); ok {
592 atom.updateStats(stats)
593 }
594 })
595}
596
597func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) {
598 visitMatches(t, known, 1, func(mt matchTree, _ float64) {
599 f(mt)
600 })
601}
602
603// visitMatches visits all atoms which can contribute matches. Note: This
604// skips noVisitMatchTree.
605func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) {
606 switch s := t.(type) {
607 case *andMatchTree:
608 for _, ch := range s.children {
609 if known[ch] {
610 visitMatches(ch, known, weight, f)
611 }
612 }
613 case *andLineMatchTree:
614 visitMatches(&s.andMatchTree, known, weight, f)
615 case *orMatchTree:
616 for _, ch := range s.children {
617 if known[ch] {
618 visitMatches(ch, known, weight, f)
619 }
620 }
621 case *boostMatchTree:
622 visitMatches(s.child, known, weight*s.boost, f)
623 case *symbolSubstrMatchTree:
624 visitMatches(s.substrMatchTree, known, weight, f)
625 case *notMatchTree:
626 case *noVisitMatchTree:
627 // don't collect into negative trees.
628 case *fileNameMatchTree:
629 // We will just gather the filename if we do not visit this tree.
630 default:
631 f(s, weight)
632 }
633}
634
635// all matches() methods.
636
637func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
638 return matchesStatePred(t.predicate(cp.idx))
639}
640
641func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
642 return matchesFound
643}
644
645// andLineMatchTree is a performance optimization of andMatchTree. For content
646// searches we don't want to run the regex engine if there is no line that
647// contains matches from all terms.
648func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
649 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
650 return state
651 }
652
653 // Invariant: all children have matches. If any line contains all of them we
654 // can return MatchesFound.
655
656 // find child with fewest candidates
657 min := maxUInt32
658 fewestChildren := 0
659 for ix, child := range t.children {
660 v, ok := child.(*substrMatchTree)
661 // make sure we are running a content search and that all candidates are a
662 // substrMatchTree
663 if !ok || v.fileName {
664 return matchesFound
665 }
666 if len(v.current) < min {
667 min = len(v.current)
668 fewestChildren = ix
669 }
670 }
671
672 type lineRange struct {
673 start int
674 end int
675 }
676 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
677 prev := -1
678 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
679 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
680 if line == prev {
681 continue
682 }
683 prev = line
684 lines = append(lines, lineRange{byteStart, byteEnd})
685 }
686
687 // children keeps track of the children's candidates we have already seen.
688 children := make([][]*candidateMatch, 0, len(t.children)-1)
689 for j, child := range t.children {
690 if j == fewestChildren {
691 continue
692 }
693 children = append(children, child.(*substrMatchTree).current)
694 }
695
696nextLine:
697 for i := 0; i < len(lines); i++ {
698 hits := 1
699 nextChild:
700 for j := range children {
701 nextCandidate:
702 for len(children[j]) > 0 {
703 candidate := children[j][0]
704 bo := int(cp.findOffset(false, candidate.runeOffset))
705 if bo < lines[i].start {
706 children[j] = children[j][1:]
707 continue nextCandidate
708 }
709 if bo <= lines[i].end {
710 hits++
711 continue nextChild
712 }
713 // move the `lines` iterator forward until bo <= line.end
714 for i < len(lines) && bo > lines[i].end {
715 i++
716 }
717 i--
718 continue nextLine
719 }
720 }
721 // return early once we found any line that contains matches from all children
722 if hits == len(t.children) {
723 return matchesFound
724 }
725 }
726 return matchesNone
727}
728
729func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
730 // We have found matches unless a child needs to do more work or it hasn't
731 // found matches.
732 state := matchesFound
733
734 for _, ch := range t.children {
735 switch evalMatchTree(cp, cost, known, ch) {
736 case matchesRequiresHigherCost:
737 // keep evaluating other children incase we come across matchesNone
738 state = matchesRequiresHigherCost
739 case matchesFound:
740 // will return this if every child has this value
741 case matchesNone:
742 return matchesNone
743 }
744 }
745
746 return state
747}
748
749func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
750 // we could short-circuit, but we want to use the other possibilities as a
751 // ranking signal. So we always return the most conservative state.
752 state := matchesNone
753 for _, ch := range t.children {
754 switch evalMatchTree(cp, cost, known, ch) {
755 case matchesRequiresHigherCost:
756 state = matchesRequiresHigherCost
757 case matchesFound:
758 if state != matchesRequiresHigherCost {
759 state = matchesFound
760 }
761 case matchesNone:
762 // noop
763 }
764 }
765 return state
766}
767
768func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
769 return matchesStatePred(t.branchMask() != 0)
770}
771
772func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
773 if t.reEvaluated {
774 return matchesStateForSlice(t.found)
775 }
776
777 if cost < costRegexp {
778 return matchesRequiresHigherCost
779 }
780
781 cp.stats.RegexpsConsidered++
782 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
783 found := t.found[:0]
784 for _, idx := range idxs {
785 cm := &candidateMatch{
786 byteOffset: uint32(idx[0]),
787 byteMatchSz: uint32(idx[1] - idx[0]),
788 fileName: t.fileName,
789 }
790
791 found = append(found, cm)
792 }
793 t.found = found
794 t.reEvaluated = true
795
796 return matchesStateForSlice(t.found)
797}
798
799func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
800 if t.evaluated {
801 return matchesStateForSlice(t.found)
802 }
803
804 if cost < costRegexp {
805 return matchesRequiresHigherCost
806 }
807
808 data := cp.data(t.fileName)
809 offset := 0
810 found := t.found[:0]
811 for {
812 idx := bytes.Index(data[offset:], []byte(t.word))
813 if idx < 0 {
814 break
815 }
816
817 relStartOffset := offset + idx
818 relEndOffset := relStartOffset + len(t.word)
819
820 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
821 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
822 if startBoundary && endBoundary {
823 found = append(found, &candidateMatch{
824 byteOffset: uint32(offset + idx),
825 byteMatchSz: uint32(len(t.word)),
826 fileName: t.fileName,
827 })
828 }
829 offset += idx + len(t.word)
830 }
831
832 t.found = found
833 t.evaluated = true
834
835 return matchesStateForSlice(t.found)
836}
837
838// breakMatchesOnNewlines returns matches resulting from breaking each element
839// of cms on newlines within text.
840func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
841 var lineCMs []*candidateMatch
842 for _, cm := range cms {
843 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
844 }
845 return lineCMs
846}
847
848// breakOnNewlines returns matches resulting from breaking cm on newlines
849// within text.
850func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
851 var cms []*candidateMatch
852 addMe := &candidateMatch{}
853 *addMe = *cm
854 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
855 if text[i] == '\n' {
856 addMe.byteMatchSz = i - addMe.byteOffset
857 if addMe.byteMatchSz != 0 {
858 cms = append(cms, addMe)
859 }
860
861 addMe = &candidateMatch{}
862 *addMe = *cm
863 addMe.byteOffset = i + 1
864 }
865 }
866 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
867 if addMe.byteMatchSz != 0 {
868 cms = append(cms, addMe)
869 }
870 return cms
871}
872
873// evalMatchTree should be called instead of directly calling
874// matchTree.matches. It cache known values for future evaluation at higher
875// costs.
876func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
877 if v, ok := known[mt]; ok {
878 return matchesStatePred(v)
879 }
880
881 ms := mt.matches(cp, cost, known)
882 if ms != matchesRequiresHigherCost {
883 known[mt] = ms == matchesFound
884 }
885
886 return ms
887}
888
889func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
890 switch evalMatchTree(cp, cost, known, t.child) {
891 case matchesRequiresHigherCost:
892 return matchesRequiresHigherCost
893 case matchesFound:
894 return matchesNone
895 case matchesNone:
896 return matchesFound
897 default:
898 panic("unreachable")
899 }
900}
901
902func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
903 return evalMatchTree(cp, cost, known, t.child)
904}
905
906func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
907 return evalMatchTree(cp, cost, known, t.child)
908}
909
910func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
911 if t.contEvaluated {
912 return matchesStateForSlice(t.current)
913 }
914
915 if len(t.current) == 0 {
916 return matchesNone
917 }
918
919 if t.fileName && cost < costMemory {
920 return matchesRequiresHigherCost
921 }
922
923 if !t.fileName && cost < costContent {
924 return matchesRequiresHigherCost
925 }
926
927 pruned := t.current[:0]
928 for _, m := range t.current {
929 if m.byteOffset == 0 && m.runeOffset > 0 {
930 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
931 }
932 if m.matchContent(cp.data(m.fileName)) {
933 pruned = append(pruned, m)
934 }
935 }
936 t.current = pruned
937 t.contEvaluated = true
938
939 return matchesStateForSlice(t.current)
940}
941
942type matchTreeOpt struct {
943 // DisableWordMatchOptimization is used to disable the use of wordMatchTree.
944 // This was added since we do not support wordMatchTree with symbol search.
945 DisableWordMatchOptimization bool
946}
947
948func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
949 if q == nil {
950 return nil, fmt.Errorf("got nil (sub)query")
951 }
952 switch s := q.(type) {
953 case *query.Regexp:
954 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
955 // superset of the regexp. If the returned matchTree is equivalent to the
956 // original regexp, it returns true. An equivalent matchTree has the same
957 // behaviour as the original regexp and can be used instead.
958 //
959 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
960 if err != nil {
961 return nil, err
962 }
963 // if the query can be used in place of the regexp
964 // return the subtree
965 if isEq {
966 return subMT, nil
967 }
968
969 var tr matchTree
970 if wmt, ok := regexpToWordMatchTree(s, opt); ok {
971 // A common search we get is "\bLITERAL\b". Avoid the regex engine and
972 // provide something faster.
973 tr = wmt
974 } else {
975 prefix := ""
976 if !s.CaseSensitive {
977 prefix = "(?i)"
978 }
979
980 tr = ®expMatchTree{
981 regexp: regexp.MustCompile(prefix + s.Regexp.String()),
982 fileName: s.FileName,
983 }
984 }
985
986 return &andMatchTree{
987 children: []matchTree{
988 tr, &noVisitMatchTree{subMT},
989 },
990 }, nil
991 case *query.And:
992 var r []matchTree
993 for _, ch := range s.Children {
994 ct, err := d.newMatchTree(ch, opt)
995 if err != nil {
996 return nil, err
997 }
998 r = append(r, ct)
999 }
1000 return &andMatchTree{r}, nil
1001 case *query.Or:
1002 var r []matchTree
1003 for _, ch := range s.Children {
1004 ct, err := d.newMatchTree(ch, opt)
1005 if err != nil {
1006 return nil, err
1007 }
1008 r = append(r, ct)
1009 }
1010 return &orMatchTree{r}, nil
1011 case *query.Not:
1012 ct, err := d.newMatchTree(s.Child, opt)
1013 return ¬MatchTree{
1014 child: ct,
1015 }, err
1016
1017 case *query.Type:
1018 if s.Type != query.TypeFileName {
1019 break
1020 }
1021
1022 ct, err := d.newMatchTree(s.Child, opt)
1023 if err != nil {
1024 return nil, err
1025 }
1026
1027 return &fileNameMatchTree{
1028 child: ct,
1029 }, nil
1030
1031 case *query.Boost:
1032 ct, err := d.newMatchTree(s.Child, opt)
1033 if err != nil {
1034 return nil, err
1035 }
1036
1037 return &boostMatchTree{
1038 child: ct,
1039 boost: s.Boost,
1040 }, nil
1041
1042 case *query.Substring:
1043 return d.newSubstringMatchTree(s)
1044
1045 case *query.Branch:
1046 masks := make([]uint64, 0, len(d.repoMetaData))
1047 if s.Pattern == "HEAD" {
1048 for i := 0; i < len(d.repoMetaData); i++ {
1049 masks = append(masks, 1)
1050 }
1051 } else {
1052 for _, branchIDs := range d.branchIDs {
1053 mask := uint64(0)
1054 for nm, m := range branchIDs {
1055 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1056 mask |= uint64(m)
1057 }
1058 }
1059 masks = append(masks, mask)
1060 }
1061
1062 }
1063 return &branchQueryMatchTree{
1064 masks: masks,
1065 fileMasks: d.fileBranchMasks,
1066 repos: d.repos,
1067 }, nil
1068 case *query.Const:
1069 if s.Value {
1070 return &bruteForceMatchTree{}, nil
1071 } else {
1072 return &noMatchTree{Why: "const"}, nil
1073 }
1074 case *query.Language:
1075 code, ok := d.metaData.LanguageMap[s.Language]
1076 if !ok {
1077 return &noMatchTree{Why: "lang"}, nil
1078 }
1079 return &docMatchTree{
1080 reason: "language",
1081 numDocs: d.numDocs(),
1082 predicate: func(docID uint32) bool {
1083 return d.getLanguage(docID) == code
1084 },
1085 }, nil
1086
1087 case *query.Symbol:
1088 // Disable WordMatchTree since we don't support it in symbols yet.
1089 optCopy := opt
1090 optCopy.DisableWordMatchOptimization = true
1091
1092 subMT, err := d.newMatchTree(s.Expr, optCopy)
1093 if err != nil {
1094 return nil, err
1095 }
1096
1097 if substr, ok := subMT.(*substrMatchTree); ok {
1098 return &symbolSubstrMatchTree{
1099 substrMatchTree: substr,
1100 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)),
1101 fileEndRunes: d.fileEndRunes,
1102 fileEndSymbol: d.fileEndSymbol,
1103 sections: d.runeDocSections,
1104 }, nil
1105 }
1106
1107 var regexp *regexp.Regexp
1108 visitMatchTree(subMT, func(mt matchTree) {
1109 if t, ok := mt.(*regexpMatchTree); ok {
1110 regexp = t.regexp
1111 }
1112 })
1113 if regexp == nil {
1114 return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1115 }
1116
1117 return &symbolRegexpMatchTree{
1118 regexp: regexp,
1119 all: regexp.String() == "(?i)(?-s:.)*",
1120 matchTree: subMT,
1121 }, nil
1122
1123 case *query.FileNameSet:
1124 return &docMatchTree{
1125 reason: "FileNameSet",
1126 numDocs: d.numDocs(),
1127 predicate: func(docID uint32) bool {
1128 fileName := d.fileName(docID)
1129 _, ok := s.Set[string(fileName)]
1130 return ok
1131 },
1132 }, nil
1133
1134 case *query.BranchesRepos:
1135 reposBranchesWant := make([]uint64, len(d.repoMetaData))
1136 for repoIdx := range d.repoMetaData {
1137 var mask uint64
1138 for _, br := range s.List {
1139 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1140 mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1141 }
1142 }
1143 reposBranchesWant[repoIdx] = mask
1144 }
1145 return &docMatchTree{
1146 reason: "BranchesRepos",
1147 numDocs: d.numDocs(),
1148 predicate: func(docID uint32) bool {
1149 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1150 },
1151 }, nil
1152
1153 case *query.RepoSet:
1154 reposWant := make([]bool, len(d.repoMetaData))
1155 for repoIdx, r := range d.repoMetaData {
1156 if _, ok := s.Set[r.Name]; ok {
1157 reposWant[repoIdx] = true
1158 }
1159 }
1160 return &docMatchTree{
1161 reason: "RepoSet",
1162 numDocs: d.numDocs(),
1163 predicate: func(docID uint32) bool {
1164 return reposWant[d.repos[docID]]
1165 },
1166 }, nil
1167
1168 case *query.RepoIDs:
1169 reposWant := make([]bool, len(d.repoMetaData))
1170 for repoIdx, r := range d.repoMetaData {
1171 if s.Repos.Contains(r.ID) {
1172 reposWant[repoIdx] = true
1173 }
1174 }
1175 return &docMatchTree{
1176 reason: "RepoIDs",
1177 numDocs: d.numDocs(),
1178 predicate: func(docID uint32) bool {
1179 return reposWant[d.repos[docID]]
1180 },
1181 }, nil
1182
1183 case *query.Repo:
1184 reposWant := make([]bool, len(d.repoMetaData))
1185 for repoIdx, r := range d.repoMetaData {
1186 if s.Regexp.MatchString(r.Name) {
1187 reposWant[repoIdx] = true
1188 }
1189 }
1190 return &docMatchTree{
1191 reason: "Repo",
1192 numDocs: d.numDocs(),
1193 predicate: func(docID uint32) bool {
1194 return reposWant[d.repos[docID]]
1195 },
1196 }, nil
1197
1198 case *query.RepoRegexp:
1199 reposWant := make([]bool, len(d.repoMetaData))
1200 for repoIdx, r := range d.repoMetaData {
1201 if s.Regexp.MatchString(r.Name) {
1202 reposWant[repoIdx] = true
1203 }
1204 }
1205 return &docMatchTree{
1206 reason: "RepoRegexp",
1207 numDocs: d.numDocs(),
1208 predicate: func(docID uint32) bool {
1209 return reposWant[d.repos[docID]]
1210 },
1211 }, nil
1212
1213 case query.RawConfig:
1214 return &docMatchTree{
1215 reason: s.String(),
1216 numDocs: d.numDocs(),
1217 predicate: func(docID uint32) bool {
1218 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1219 },
1220 }, nil
1221 }
1222 log.Panicf("type %T", q)
1223 return nil, nil
1224}
1225
1226func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1227 st := &substrMatchTree{
1228 query: s,
1229 caseSensitive: s.CaseSensitive,
1230 fileName: s.FileName,
1231 }
1232
1233 if utf8.RuneCountInString(s.Pattern) < ngramSize {
1234 prefix := ""
1235 if !s.CaseSensitive {
1236 prefix = "(?i)"
1237 }
1238 t := ®expMatchTree{
1239 regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)),
1240 fileName: s.FileName,
1241 }
1242 return t, nil
1243 }
1244
1245 result, err := d.iterateNgrams(s)
1246 if err != nil {
1247 return nil, err
1248 }
1249 st.matchIterator = result
1250 return st, nil
1251}
1252
1253func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1254 if opt.DisableWordMatchOptimization {
1255 return nil, false
1256 }
1257 // Needs to be case sensitive
1258 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1259 return nil, false
1260 }
1261 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1262 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1263 return nil, false
1264 }
1265 sub := q.Regexp.Sub
1266 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1267 return nil, false
1268 }
1269
1270 return &wordMatchTree{
1271 word: string(sub[1].Rune),
1272 fileName: q.FileName,
1273 }, true
1274}
1275
1276// pruneMatchTree removes impossible branches from the matchTree, as indicated
1277// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1278func pruneMatchTree(mt matchTree) (matchTree, error) {
1279 var err error
1280 switch mt := mt.(type) {
1281 // leaf nodes that we test with the filter:
1282 case *substrMatchTree:
1283 if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1284 if _, ok := res.matchIterator.(*noMatchTree); ok {
1285 return nil, nil
1286 }
1287 }
1288 // recursive tree structures:
1289 case *andMatchTree:
1290 // Any branch of an and becoming impossible means the entire clause
1291 // is impossible. Otherwise, just handle rewrites.
1292 for i, child := range mt.children {
1293 newChild, err := pruneMatchTree(child)
1294 if err != nil {
1295 return nil, err
1296 }
1297 if newChild == nil {
1298 return nil, nil
1299 }
1300 mt.children[i] = newChild
1301 }
1302 case *orMatchTree:
1303 // *All* branches of an OR becoming impossible means the entire clause
1304 // is impossible. Otherwise, drop impossible subclauses and handle
1305 // rewrites, including simplifying to a singular resulting child branch.
1306 n := 0
1307 for _, child := range mt.children {
1308 newChild, err := pruneMatchTree(child)
1309 if err != nil {
1310 return nil, err
1311 }
1312 if newChild != nil {
1313 mt.children[n] = newChild
1314 n++
1315 }
1316 }
1317 mt.children = mt.children[:n]
1318 if len(mt.children) == 1 {
1319 return mt.children[0], nil
1320 } else if len(mt.children) == 0 {
1321 return nil, nil
1322 }
1323 case *noVisitMatchTree:
1324 mt.matchTree, err = pruneMatchTree(mt.matchTree)
1325 if err != nil {
1326 return nil, err
1327 }
1328 if mt.matchTree == nil {
1329 return nil, nil
1330 }
1331 case *fileNameMatchTree:
1332 mt.child, err = pruneMatchTree(mt.child)
1333 case *boostMatchTree:
1334 mt.child, err = pruneMatchTree(mt.child)
1335 if err != nil {
1336 return nil, err
1337 }
1338 if mt.child == nil {
1339 return nil, nil
1340 }
1341 case *andLineMatchTree:
1342 child, err := pruneMatchTree(&mt.andMatchTree)
1343 if err != nil {
1344 return nil, err
1345 }
1346 if child == nil {
1347 return nil, nil
1348 }
1349 if c, ok := child.(*andMatchTree); ok {
1350 mt.andMatchTree = *c
1351 } else {
1352 // the and was simplified to a single clause,
1353 // so the linematch portion is irrelevant.
1354 return mt, nil
1355 }
1356 case *notMatchTree:
1357 mt.child, err = pruneMatchTree(mt.child)
1358 if err != nil {
1359 return nil, err
1360 }
1361 if mt.child == nil {
1362 // not false => true
1363 return &bruteForceMatchTree{}, nil
1364 }
1365 // unhandled:
1366 case *docMatchTree:
1367 case *bruteForceMatchTree:
1368 case *regexpMatchTree:
1369 case *wordMatchTree:
1370 }
1371 return mt, err
1372}