fork of https://github.com/sourcegraph/zoekt
1// Copyright 2018 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "regexp/syntax"
22 "strings"
23 "unicode/utf8"
24
25 "github.com/grafana/regexp"
26
27 "github.com/sourcegraph/zoekt/query"
28)
29
30// A docIterator iterates over documents in order.
31type docIterator interface {
32 // provide the next document where we may find something interesting.
33 //
34 // This is like a "peek" and shouldn't mutate state. prepare is what should
35 // change state.
36 nextDoc() uint32
37
38 // clears any per-document state of the docIterator, and
39 // prepares for evaluating the given doc. The argument is
40 // strictly increasing over time.
41 prepare(nextDoc uint32)
42}
43
44// costs are passed in increasing order to matchTree.matches until they do not
45// return matchesRequiresHigherCost.
46const (
47 costConst = 0
48 costMemory = 1
49 costContent = 2
50 costRegexp = 3
51)
52
53const (
54 costMin = costConst
55 costMax = costRegexp
56)
57
58// matchesState is an enum for the state of a matchTree after a call to
59// matchTree.matches.
60type matchesState uint8
61
62const (
63 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done
64 // a search yet since the cost value is not high enough.
65 matchesRequiresHigherCost matchesState = iota
66
67 // matchesFound is returned when matchTree.matches has done a search and
68 // found one or more matches.
69 matchesFound
70
71 // matchesNone is returned when matchTree.matches has done a search and
72 // found nothing.
73 matchesNone
74)
75
76// matchesStatePred is a helper which returns matchesFound if b is true
77// otherwise returns matchesNone.
78func matchesStatePred(b bool) matchesState {
79 if b {
80 return matchesFound
81 }
82 return matchesNone
83}
84
85// matchesStateForSlice is a helper which returns matchesFound if v is
86// non-empty otherwise returns matchesNone.
87func matchesStateForSlice[T any](v []T) matchesState {
88 return matchesStatePred(len(v) > 0)
89}
90
91// An expression tree coupled with matches. The matchtree has two
92// functions:
93//
94// * it implements boolean combinations (and, or, not)
95//
96// * it implements shortcuts, where we skip documents (for example: if
97// there are no trigram matches, we can be sure there are no substring
98// matches). The matchtree iterates over the documents as they are
99// ordered in the shard.
100//
101// The general process for a given (shard, query) is:
102//
103// - construct matchTree for the query
104//
105// - find all different leaf matchTrees (substring, regexp, etc.)
106//
107// in a loop:
108//
109// - find next doc to process using nextDoc
110//
111// - evaluate atoms (leaf expressions that match text)
112//
113// - evaluate the tree using matches(), storing the result in map.
114//
115// - if the complete tree returns (matches() != matchesRequiresHigherCost)
116// for the document, collect all text matches by looking at leaf
117// matchTrees.
118type matchTree interface {
119 docIterator
120
121 // matches if cost is high enough. See documentation for matchesState's
122 // values.
123 //
124 // Note: Do not call this directly, rather use evalMatchTree which uses
125 // known to cache responses once the state transitions away from
126 // matchesRequiresHigherCost.
127 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
128}
129
130// docMatchTree iterates over documents for which predicate(docID) returns true.
131type docMatchTree struct {
132 // the number of documents in a shard.
133 numDocs uint32
134
135 predicate func(docID uint32) bool
136
137 // provides additional information about the reason why the docMatchTree was
138 // created.
139 reason string
140
141 // mutable
142 firstDone bool
143 docID uint32
144}
145
146type bruteForceMatchTree struct {
147 // mutable
148 firstDone bool
149 docID uint32
150}
151
152type andLineMatchTree struct {
153 andMatchTree
154}
155
156type andMatchTree struct {
157 children []matchTree
158}
159
160type orMatchTree struct {
161 children []matchTree
162}
163
164type notMatchTree struct {
165 child matchTree
166}
167
168// Returns only the filename of child matches.
169type fileNameMatchTree struct {
170 child matchTree
171}
172
173type boostMatchTree struct {
174 child matchTree
175 boost float64
176}
177
178// Don't visit this subtree for collecting matches.
179type noVisitMatchTree struct {
180 matchTree
181}
182
183type regexpMatchTree struct {
184 regexp *regexp.Regexp
185
186 // origRegexp is the original parsed regexp from the query structure. It
187 // does not include mutations such as case sensitivity.
188 origRegexp *syntax.Regexp
189
190 fileName bool
191
192 // mutable
193 reEvaluated bool
194 found []*candidateMatch
195
196 // nextDoc, prepare.
197 bruteForceMatchTree
198}
199
200func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree {
201 prefix := ""
202 if !s.CaseSensitive {
203 prefix = "(?i)"
204 }
205
206 return ®expMatchTree{
207 regexp: regexp.MustCompile(prefix + s.Regexp.String()),
208 origRegexp: s.Regexp,
209 fileName: s.FileName,
210 }
211}
212
213// \bLITERAL\b
214type wordMatchTree struct {
215 word string
216
217 fileName bool
218
219 // mutable
220 evaluated bool
221 found []*candidateMatch
222
223 // nextDoc, prepare.
224 bruteForceMatchTree
225}
226
227type substrMatchTree struct {
228 matchIterator
229
230 query *query.Substring
231 caseSensitive bool
232 fileName bool
233
234 // mutable
235 current []*candidateMatch
236 contEvaluated bool
237}
238
239type branchQueryMatchTree struct {
240 fileMasks []uint64
241 masks []uint64
242 repos []uint16
243
244 // mutable
245 firstDone bool
246 docID uint32
247}
248
249func (t *branchQueryMatchTree) branchMask() uint64 {
250 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
251}
252
253type symbolRegexpMatchTree struct {
254 matchTree
255 regexp *regexp.Regexp
256 all bool // skips regex match if .*
257
258 reEvaluated bool
259 found []*candidateMatch
260}
261
262func (t *symbolRegexpMatchTree) prepare(doc uint32) {
263 t.reEvaluated = false
264 t.found = t.found[:0]
265 t.matchTree.prepare(doc)
266}
267
268func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
269 if t.reEvaluated {
270 return matchesStateForSlice(t.found)
271 }
272
273 if cost < costRegexp {
274 return matchesRequiresHigherCost
275 }
276
277 sections := cp.docSections()
278 content := cp.data(false)
279
280 found := t.found[:0]
281 for i, sec := range sections {
282 var idx []int
283 if t.all {
284 idx = []int{0, int(sec.End - sec.Start)}
285 } else {
286 idx = t.regexp.FindIndex(content[sec.Start:sec.End])
287 if idx == nil {
288 continue
289 }
290 }
291
292 cm := &candidateMatch{
293 byteOffset: sec.Start + uint32(idx[0]),
294 byteMatchSz: uint32(idx[1] - idx[0]),
295 symbol: true,
296 symbolIdx: uint32(i),
297 }
298 found = append(found, cm)
299 }
300 t.found = found
301 t.reEvaluated = true
302
303 return matchesStateForSlice(t.found)
304}
305
306type symbolSubstrMatchTree struct {
307 *substrMatchTree
308
309 patternSize uint32
310 fileEndRunes []uint32
311 fileEndSymbol []uint32
312
313 doc uint32
314 sections []DocumentSection
315
316 secID uint32
317}
318
319func (t *symbolSubstrMatchTree) prepare(doc uint32) {
320 t.substrMatchTree.prepare(doc)
321 t.doc = doc
322
323 var fileStart uint32
324 if doc > 0 {
325 fileStart = t.fileEndRunes[doc-1]
326 }
327
328 var sections []DocumentSection
329 if len(t.sections) > 0 {
330 most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
331 if most == uint32(len(t.sections)) {
332 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
333 } else {
334 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
335 t.secID++
336 }
337
338 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
339 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
340 symbolEnd++
341 }
342
343 sections = t.sections[t.secID:symbolEnd]
344 }
345 }
346
347 secIdx := 0
348 trimmed := t.current[:0]
349 for len(sections) > secIdx && len(t.current) > 0 {
350 start := fileStart + t.current[0].runeOffset
351 end := start + t.patternSize
352 if start >= sections[secIdx].End {
353 secIdx++
354 continue
355 }
356
357 if start < sections[secIdx].Start {
358 t.current = t.current[1:]
359 continue
360 }
361
362 if end <= sections[secIdx].End {
363 t.current[0].symbol = true
364 t.current[0].symbolIdx = uint32(secIdx)
365 trimmed = append(trimmed, t.current[0])
366 }
367
368 t.current = t.current[1:]
369 }
370 t.current = trimmed
371}
372
373// all prepare methods
374
375func (t *bruteForceMatchTree) prepare(doc uint32) {
376 t.docID = doc
377 t.firstDone = true
378}
379
380func (t *docMatchTree) prepare(doc uint32) {
381 t.docID = doc
382 t.firstDone = true
383}
384
385func (t *andMatchTree) prepare(doc uint32) {
386 for _, c := range t.children {
387 c.prepare(doc)
388 }
389}
390
391func (t *regexpMatchTree) prepare(doc uint32) {
392 t.found = t.found[:0]
393 t.reEvaluated = false
394 t.bruteForceMatchTree.prepare(doc)
395}
396
397func (t *wordMatchTree) prepare(doc uint32) {
398 t.found = t.found[:0]
399 t.evaluated = false
400 t.bruteForceMatchTree.prepare(doc)
401}
402
403func (t *orMatchTree) prepare(doc uint32) {
404 for _, c := range t.children {
405 c.prepare(doc)
406 }
407}
408
409func (t *notMatchTree) prepare(doc uint32) {
410 t.child.prepare(doc)
411}
412
413func (t *fileNameMatchTree) prepare(doc uint32) {
414 t.child.prepare(doc)
415}
416
417func (t *boostMatchTree) prepare(doc uint32) {
418 t.child.prepare(doc)
419}
420
421func (t *substrMatchTree) prepare(nextDoc uint32) {
422 t.matchIterator.prepare(nextDoc)
423 t.current = t.matchIterator.candidates()
424 t.contEvaluated = false
425}
426
427func (t *branchQueryMatchTree) prepare(doc uint32) {
428 t.firstDone = true
429 t.docID = doc
430}
431
432// nextDoc
433
434func (t *docMatchTree) nextDoc() uint32 {
435 var start uint32
436 if t.firstDone {
437 start = t.docID + 1
438 }
439 for i := start; i < t.numDocs; i++ {
440 if t.predicate(i) {
441 return i
442 }
443 }
444 return maxUInt32
445}
446
447func (t *bruteForceMatchTree) nextDoc() uint32 {
448 if !t.firstDone {
449 return 0
450 }
451 return t.docID + 1
452}
453
454func (t *andMatchTree) nextDoc() uint32 {
455 var max uint32
456 for _, c := range t.children {
457 m := c.nextDoc()
458 if m > max {
459 max = m
460 }
461 }
462 return max
463}
464
465func (t *orMatchTree) nextDoc() uint32 {
466 min := uint32(maxUInt32)
467 for _, c := range t.children {
468 m := c.nextDoc()
469 if m < min {
470 min = m
471 }
472 }
473 return min
474}
475
476func (t *notMatchTree) nextDoc() uint32 {
477 return 0
478}
479
480func (t *fileNameMatchTree) nextDoc() uint32 {
481 return t.child.nextDoc()
482}
483
484func (t *boostMatchTree) nextDoc() uint32 {
485 return t.child.nextDoc()
486}
487
488func (t *branchQueryMatchTree) nextDoc() uint32 {
489 var start uint32
490 if t.firstDone {
491 start = t.docID + 1
492 }
493
494 for i := start; i < uint32(len(t.fileMasks)); i++ {
495 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
496 return i
497 }
498 }
499 return maxUInt32
500}
501
502// all String methods
503
504func (t *bruteForceMatchTree) String() string {
505 return "all"
506}
507
508func (t *docMatchTree) String() string {
509 return fmt.Sprintf("doc(%s)", t.reason)
510}
511
512func (t *andMatchTree) String() string {
513 return fmt.Sprintf("and%v", t.children)
514}
515
516func (t *regexpMatchTree) String() string {
517 f := ""
518 if t.fileName {
519 f = "f"
520 }
521 return fmt.Sprintf("%sre(%s)", f, t.regexp)
522}
523
524func (t *wordMatchTree) String() string {
525 f := ""
526 if t.fileName {
527 f = "f"
528 }
529 return fmt.Sprintf("%sword(%s)", f, t.word)
530}
531
532func (t *orMatchTree) String() string {
533 return fmt.Sprintf("or%v", t.children)
534}
535
536func (t *notMatchTree) String() string {
537 return fmt.Sprintf("not(%v)", t.child)
538}
539
540func (t *noVisitMatchTree) String() string {
541 return fmt.Sprintf("novisit(%v)", t.matchTree)
542}
543
544func (t *fileNameMatchTree) String() string {
545 return fmt.Sprintf("f(%v)", t.child)
546}
547
548func (t *boostMatchTree) String() string {
549 return fmt.Sprintf("boost(%f, %v)", t.boost, t.child)
550}
551
552func (t *substrMatchTree) String() string {
553 f := ""
554 if t.fileName {
555 f = "f"
556 }
557
558 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
559}
560
561func (t *branchQueryMatchTree) String() string {
562 return fmt.Sprintf("branch(%x)", t.masks)
563}
564
565func (t *symbolSubstrMatchTree) String() string {
566 return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
567}
568
569func (t *symbolRegexpMatchTree) String() string {
570 return fmt.Sprintf("symbol(%v)", t.matchTree)
571}
572
573// visitMatches visits all atoms in matchTree. Note: This visits
574// noVisitMatchTree. For collecting matches use visitMatches.
575func visitMatchTree(t matchTree, f func(matchTree)) {
576 switch s := t.(type) {
577 case *andMatchTree:
578 for _, ch := range s.children {
579 visitMatchTree(ch, f)
580 }
581 case *orMatchTree:
582 for _, ch := range s.children {
583 visitMatchTree(ch, f)
584 }
585 case *andLineMatchTree:
586 visitMatchTree(&s.andMatchTree, f)
587 case *noVisitMatchTree:
588 visitMatchTree(s.matchTree, f)
589 case *notMatchTree:
590 visitMatchTree(s.child, f)
591 case *fileNameMatchTree:
592 visitMatchTree(s.child, f)
593 case *boostMatchTree:
594 visitMatchTree(s.child, f)
595 case *symbolSubstrMatchTree:
596 visitMatchTree(s.substrMatchTree, f)
597 case *symbolRegexpMatchTree:
598 visitMatchTree(s.matchTree, f)
599 default:
600 f(t)
601 }
602}
603
604// updateMatchTreeStats calls updateStats on all atoms in mt which have that
605// function defined.
606func updateMatchTreeStats(mt matchTree, stats *Stats) {
607 visitMatchTree(mt, func(mt matchTree) {
608 if atom, ok := mt.(interface{ updateStats(*Stats) }); ok {
609 atom.updateStats(stats)
610 }
611 })
612}
613
614func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) {
615 visitMatches(t, known, 1, func(mt matchTree, _ float64) {
616 f(mt)
617 })
618}
619
620// visitMatches visits all atoms which can contribute matches. Note: This
621// skips noVisitMatchTree.
622func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) {
623 switch s := t.(type) {
624 case *andMatchTree:
625 for _, ch := range s.children {
626 if known[ch] {
627 visitMatches(ch, known, weight, f)
628 }
629 }
630 case *andLineMatchTree:
631 visitMatches(&s.andMatchTree, known, weight, f)
632 case *orMatchTree:
633 for _, ch := range s.children {
634 if known[ch] {
635 visitMatches(ch, known, weight, f)
636 }
637 }
638 case *boostMatchTree:
639 visitMatches(s.child, known, weight*s.boost, f)
640 case *symbolSubstrMatchTree:
641 visitMatches(s.substrMatchTree, known, weight, f)
642 case *notMatchTree:
643 case *noVisitMatchTree:
644 // don't collect into negative trees.
645 case *fileNameMatchTree:
646 // We will just gather the filename if we do not visit this tree.
647 default:
648 f(s, weight)
649 }
650}
651
652// all matches() methods.
653
654func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
655 return matchesStatePred(t.predicate(cp.idx))
656}
657
658func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
659 return matchesFound
660}
661
662// andLineMatchTree is a performance optimization of andMatchTree. For content
663// searches we don't want to run the regex engine if there is no line that
664// contains matches from all terms.
665func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
666 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
667 return state
668 }
669
670 // Invariant: all children have matches. If any line contains all of them we
671 // can return MatchesFound.
672
673 // find child with fewest candidates
674 min := maxUInt32
675 fewestChildren := 0
676 for ix, child := range t.children {
677 v, ok := child.(*substrMatchTree)
678 // make sure we are running a content search and that all candidates are a
679 // substrMatchTree
680 if !ok || v.fileName {
681 return matchesFound
682 }
683 if len(v.current) < min {
684 min = len(v.current)
685 fewestChildren = ix
686 }
687 }
688
689 type lineRange struct {
690 start int
691 end int
692 }
693 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
694 prev := -1
695 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
696 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
697 if line == prev {
698 continue
699 }
700 prev = line
701 lines = append(lines, lineRange{byteStart, byteEnd})
702 }
703
704 // children keeps track of the children's candidates we have already seen.
705 children := make([][]*candidateMatch, 0, len(t.children)-1)
706 for j, child := range t.children {
707 if j == fewestChildren {
708 continue
709 }
710 children = append(children, child.(*substrMatchTree).current)
711 }
712
713nextLine:
714 for i := 0; i < len(lines); i++ {
715 hits := 1
716 nextChild:
717 for j := range children {
718 nextCandidate:
719 for len(children[j]) > 0 {
720 candidate := children[j][0]
721 bo := int(cp.findOffset(false, candidate.runeOffset))
722 if bo < lines[i].start {
723 children[j] = children[j][1:]
724 continue nextCandidate
725 }
726 if bo <= lines[i].end {
727 hits++
728 continue nextChild
729 }
730 // move the `lines` iterator forward until bo <= line.end
731 for i < len(lines) && bo > lines[i].end {
732 i++
733 }
734 i--
735 continue nextLine
736 }
737 }
738 // return early once we found any line that contains matches from all children
739 if hits == len(t.children) {
740 return matchesFound
741 }
742 }
743 return matchesNone
744}
745
746func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
747 // We have found matches unless a child needs to do more work or it hasn't
748 // found matches.
749 state := matchesFound
750
751 for _, ch := range t.children {
752 switch evalMatchTree(cp, cost, known, ch) {
753 case matchesRequiresHigherCost:
754 // keep evaluating other children incase we come across matchesNone
755 state = matchesRequiresHigherCost
756 case matchesFound:
757 // will return this if every child has this value
758 case matchesNone:
759 return matchesNone
760 }
761 }
762
763 return state
764}
765
766func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
767 // we could short-circuit, but we want to use the other possibilities as a
768 // ranking signal. So we always return the most conservative state.
769 state := matchesNone
770 for _, ch := range t.children {
771 switch evalMatchTree(cp, cost, known, ch) {
772 case matchesRequiresHigherCost:
773 state = matchesRequiresHigherCost
774 case matchesFound:
775 if state != matchesRequiresHigherCost {
776 state = matchesFound
777 }
778 case matchesNone:
779 // noop
780 }
781 }
782 return state
783}
784
785func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
786 return matchesStatePred(t.branchMask() != 0)
787}
788
789func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
790 if t.reEvaluated {
791 return matchesStateForSlice(t.found)
792 }
793
794 if cost < costRegexp {
795 return matchesRequiresHigherCost
796 }
797
798 cp.stats.RegexpsConsidered++
799 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
800 found := t.found[:0]
801 for _, idx := range idxs {
802 cm := &candidateMatch{
803 byteOffset: uint32(idx[0]),
804 byteMatchSz: uint32(idx[1] - idx[0]),
805 fileName: t.fileName,
806 }
807
808 found = append(found, cm)
809 }
810 t.found = found
811 t.reEvaluated = true
812
813 return matchesStateForSlice(t.found)
814}
815
816func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
817 if t.evaluated {
818 return matchesStateForSlice(t.found)
819 }
820
821 if cost < costRegexp {
822 return matchesRequiresHigherCost
823 }
824
825 data := cp.data(t.fileName)
826 offset := 0
827 found := t.found[:0]
828 for {
829 idx := bytes.Index(data[offset:], []byte(t.word))
830 if idx < 0 {
831 break
832 }
833
834 relStartOffset := offset + idx
835 relEndOffset := relStartOffset + len(t.word)
836
837 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
838 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
839 if startBoundary && endBoundary {
840 found = append(found, &candidateMatch{
841 byteOffset: uint32(offset + idx),
842 byteMatchSz: uint32(len(t.word)),
843 fileName: t.fileName,
844 })
845 }
846 offset += idx + len(t.word)
847 }
848
849 t.found = found
850 t.evaluated = true
851
852 return matchesStateForSlice(t.found)
853}
854
855// breakMatchesOnNewlines returns matches resulting from breaking each element
856// of cms on newlines within text.
857func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
858 var lineCMs []*candidateMatch
859 for _, cm := range cms {
860 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
861 }
862 return lineCMs
863}
864
865// breakOnNewlines returns matches resulting from breaking cm on newlines
866// within text.
867func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
868 var cms []*candidateMatch
869 addMe := &candidateMatch{}
870 *addMe = *cm
871 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
872 if text[i] == '\n' {
873 addMe.byteMatchSz = i - addMe.byteOffset
874 if addMe.byteMatchSz != 0 {
875 cms = append(cms, addMe)
876 }
877
878 addMe = &candidateMatch{}
879 *addMe = *cm
880 addMe.byteOffset = i + 1
881 }
882 }
883 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
884 if addMe.byteMatchSz != 0 {
885 cms = append(cms, addMe)
886 }
887 return cms
888}
889
890// evalMatchTree should be called instead of directly calling
891// matchTree.matches. It cache known values for future evaluation at higher
892// costs.
893func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
894 if v, ok := known[mt]; ok {
895 return matchesStatePred(v)
896 }
897
898 ms := mt.matches(cp, cost, known)
899 if ms != matchesRequiresHigherCost {
900 known[mt] = ms == matchesFound
901 }
902
903 return ms
904}
905
906func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
907 switch evalMatchTree(cp, cost, known, t.child) {
908 case matchesRequiresHigherCost:
909 return matchesRequiresHigherCost
910 case matchesFound:
911 return matchesNone
912 case matchesNone:
913 return matchesFound
914 default:
915 panic("unreachable")
916 }
917}
918
919func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
920 return evalMatchTree(cp, cost, known, t.child)
921}
922
923func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
924 return evalMatchTree(cp, cost, known, t.child)
925}
926
927func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
928 if t.contEvaluated {
929 return matchesStateForSlice(t.current)
930 }
931
932 if len(t.current) == 0 {
933 return matchesNone
934 }
935
936 if t.fileName && cost < costMemory {
937 return matchesRequiresHigherCost
938 }
939
940 if !t.fileName && cost < costContent {
941 return matchesRequiresHigherCost
942 }
943
944 pruned := t.current[:0]
945 for _, m := range t.current {
946 if m.byteOffset == 0 && m.runeOffset > 0 {
947 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
948 }
949 if m.matchContent(cp.data(m.fileName)) {
950 pruned = append(pruned, m)
951 }
952 }
953 t.current = pruned
954 t.contEvaluated = true
955
956 return matchesStateForSlice(t.current)
957}
958
959type matchTreeOpt struct {
960 // DisableWordMatchOptimization is used to disable the use of wordMatchTree.
961 // This was added since we do not support wordMatchTree with symbol search.
962 DisableWordMatchOptimization bool
963}
964
965func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
966 if q == nil {
967 return nil, fmt.Errorf("got nil (sub)query")
968 }
969 switch s := q.(type) {
970 case *query.Regexp:
971 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
972 // superset of the regexp. If the returned matchTree is equivalent to the
973 // original regexp, it returns true. An equivalent matchTree has the same
974 // behaviour as the original regexp and can be used instead.
975 //
976 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
977 if err != nil {
978 return nil, err
979 }
980 // if the query can be used in place of the regexp
981 // return the subtree
982 if isEq {
983 return subMT, nil
984 }
985
986 var tr matchTree
987 if wmt, ok := regexpToWordMatchTree(s, opt); ok {
988 // A common search we get is "\bLITERAL\b". Avoid the regex engine and
989 // provide something faster.
990 tr = wmt
991 } else {
992 tr = newRegexpMatchTree(s)
993 }
994
995 return &andMatchTree{
996 children: []matchTree{
997 tr, &noVisitMatchTree{subMT},
998 },
999 }, nil
1000 case *query.And:
1001 var r []matchTree
1002 for _, ch := range s.Children {
1003 ct, err := d.newMatchTree(ch, opt)
1004 if err != nil {
1005 return nil, err
1006 }
1007 r = append(r, ct)
1008 }
1009 return &andMatchTree{r}, nil
1010 case *query.Or:
1011 var r []matchTree
1012 for _, ch := range s.Children {
1013 ct, err := d.newMatchTree(ch, opt)
1014 if err != nil {
1015 return nil, err
1016 }
1017 r = append(r, ct)
1018 }
1019 return &orMatchTree{r}, nil
1020 case *query.Not:
1021 ct, err := d.newMatchTree(s.Child, opt)
1022 return ¬MatchTree{
1023 child: ct,
1024 }, err
1025
1026 case *query.Type:
1027 if s.Type != query.TypeFileName {
1028 break
1029 }
1030
1031 ct, err := d.newMatchTree(s.Child, opt)
1032 if err != nil {
1033 return nil, err
1034 }
1035
1036 return &fileNameMatchTree{
1037 child: ct,
1038 }, nil
1039
1040 case *query.Boost:
1041 ct, err := d.newMatchTree(s.Child, opt)
1042 if err != nil {
1043 return nil, err
1044 }
1045
1046 return &boostMatchTree{
1047 child: ct,
1048 boost: s.Boost,
1049 }, nil
1050
1051 case *query.Substring:
1052 return d.newSubstringMatchTree(s)
1053
1054 case *query.Branch:
1055 masks := make([]uint64, 0, len(d.repoMetaData))
1056 if s.Pattern == "HEAD" {
1057 for i := 0; i < len(d.repoMetaData); i++ {
1058 masks = append(masks, 1)
1059 }
1060 } else {
1061 for _, branchIDs := range d.branchIDs {
1062 mask := uint64(0)
1063 for nm, m := range branchIDs {
1064 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1065 mask |= uint64(m)
1066 }
1067 }
1068 masks = append(masks, mask)
1069 }
1070 }
1071 return &branchQueryMatchTree{
1072 masks: masks,
1073 fileMasks: d.fileBranchMasks,
1074 repos: d.repos,
1075 }, nil
1076 case *query.Const:
1077 if s.Value {
1078 return &bruteForceMatchTree{}, nil
1079 } else {
1080 return &noMatchTree{Why: "const"}, nil
1081 }
1082 case *query.Language:
1083 code, ok := d.metaData.LanguageMap[s.Language]
1084 if !ok {
1085 return &noMatchTree{Why: "lang"}, nil
1086 }
1087 return &docMatchTree{
1088 reason: "language",
1089 numDocs: d.numDocs(),
1090 predicate: func(docID uint32) bool {
1091 return d.getLanguage(docID) == code
1092 },
1093 }, nil
1094
1095 case *query.Symbol:
1096 // Disable WordMatchTree since we don't support it in symbols yet.
1097 optCopy := opt
1098 optCopy.DisableWordMatchOptimization = true
1099
1100 subMT, err := d.newMatchTree(s.Expr, optCopy)
1101 if err != nil {
1102 return nil, err
1103 }
1104
1105 if substr, ok := subMT.(*substrMatchTree); ok {
1106 return &symbolSubstrMatchTree{
1107 substrMatchTree: substr,
1108 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)),
1109 fileEndRunes: d.fileEndRunes,
1110 fileEndSymbol: d.fileEndSymbol,
1111 sections: d.runeDocSections,
1112 }, nil
1113 }
1114
1115 var regexpMT *regexpMatchTree
1116 visitMatchTree(subMT, func(mt matchTree) {
1117 if t, ok := mt.(*regexpMatchTree); ok {
1118 regexpMT = t
1119 }
1120 })
1121 if regexpMT == nil {
1122 return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1123 }
1124
1125 return &symbolRegexpMatchTree{
1126 regexp: regexpMT.regexp,
1127 all: isRegexpAll(regexpMT.origRegexp),
1128 matchTree: subMT,
1129 }, nil
1130
1131 case *query.FileNameSet:
1132 return &docMatchTree{
1133 reason: "FileNameSet",
1134 numDocs: d.numDocs(),
1135 predicate: func(docID uint32) bool {
1136 fileName := d.fileName(docID)
1137 _, ok := s.Set[string(fileName)]
1138 return ok
1139 },
1140 }, nil
1141
1142 case *query.BranchesRepos:
1143 reposBranchesWant := make([]uint64, len(d.repoMetaData))
1144 for repoIdx := range d.repoMetaData {
1145 var mask uint64
1146 for _, br := range s.List {
1147 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1148 mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1149 }
1150 }
1151 reposBranchesWant[repoIdx] = mask
1152 }
1153 return &docMatchTree{
1154 reason: "BranchesRepos",
1155 numDocs: d.numDocs(),
1156 predicate: func(docID uint32) bool {
1157 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1158 },
1159 }, nil
1160
1161 case *query.RepoSet:
1162 reposWant := make([]bool, len(d.repoMetaData))
1163 for repoIdx, r := range d.repoMetaData {
1164 if _, ok := s.Set[r.Name]; ok {
1165 reposWant[repoIdx] = true
1166 }
1167 }
1168 return &docMatchTree{
1169 reason: "RepoSet",
1170 numDocs: d.numDocs(),
1171 predicate: func(docID uint32) bool {
1172 return reposWant[d.repos[docID]]
1173 },
1174 }, nil
1175
1176 case *query.RepoIDs:
1177 reposWant := make([]bool, len(d.repoMetaData))
1178 for repoIdx, r := range d.repoMetaData {
1179 if s.Repos.Contains(r.ID) {
1180 reposWant[repoIdx] = true
1181 }
1182 }
1183 return &docMatchTree{
1184 reason: "RepoIDs",
1185 numDocs: d.numDocs(),
1186 predicate: func(docID uint32) bool {
1187 return reposWant[d.repos[docID]]
1188 },
1189 }, nil
1190
1191 case *query.Repo:
1192 reposWant := make([]bool, len(d.repoMetaData))
1193 for repoIdx, r := range d.repoMetaData {
1194 if s.Regexp.MatchString(r.Name) {
1195 reposWant[repoIdx] = true
1196 }
1197 }
1198 return &docMatchTree{
1199 reason: "Repo",
1200 numDocs: d.numDocs(),
1201 predicate: func(docID uint32) bool {
1202 return reposWant[d.repos[docID]]
1203 },
1204 }, nil
1205
1206 case *query.RepoRegexp:
1207 reposWant := make([]bool, len(d.repoMetaData))
1208 for repoIdx, r := range d.repoMetaData {
1209 if s.Regexp.MatchString(r.Name) {
1210 reposWant[repoIdx] = true
1211 }
1212 }
1213 return &docMatchTree{
1214 reason: "RepoRegexp",
1215 numDocs: d.numDocs(),
1216 predicate: func(docID uint32) bool {
1217 return reposWant[d.repos[docID]]
1218 },
1219 }, nil
1220
1221 case query.RawConfig:
1222 return &docMatchTree{
1223 reason: s.String(),
1224 numDocs: d.numDocs(),
1225 predicate: func(docID uint32) bool {
1226 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1227 },
1228 }, nil
1229 }
1230 log.Panicf("type %T", q)
1231 return nil, nil
1232}
1233
1234func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1235 st := &substrMatchTree{
1236 query: s,
1237 caseSensitive: s.CaseSensitive,
1238 fileName: s.FileName,
1239 }
1240
1241 if utf8.RuneCountInString(s.Pattern) < ngramSize {
1242 return newRegexpMatchTree(&query.Regexp{
1243 Regexp: &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)},
1244 FileName: s.FileName,
1245 Content: s.Content,
1246 CaseSensitive: s.CaseSensitive,
1247 }), nil
1248 }
1249
1250 result, err := d.iterateNgrams(s)
1251 if err != nil {
1252 return nil, err
1253 }
1254 st.matchIterator = result
1255 return st, nil
1256}
1257
1258func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1259 if opt.DisableWordMatchOptimization {
1260 return nil, false
1261 }
1262 // Needs to be case sensitive
1263 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1264 return nil, false
1265 }
1266 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1267 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1268 return nil, false
1269 }
1270 sub := q.Regexp.Sub
1271 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1272 return nil, false
1273 }
1274
1275 return &wordMatchTree{
1276 word: string(sub[1].Rune),
1277 fileName: q.FileName,
1278 }, true
1279}
1280
1281// pruneMatchTree removes impossible branches from the matchTree, as indicated
1282// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1283func pruneMatchTree(mt matchTree) (matchTree, error) {
1284 var err error
1285 switch mt := mt.(type) {
1286 // leaf nodes that we test with the filter:
1287 case *substrMatchTree:
1288 if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1289 if _, ok := res.matchIterator.(*noMatchTree); ok {
1290 return nil, nil
1291 }
1292 }
1293 // recursive tree structures:
1294 case *andMatchTree:
1295 // Any branch of an and becoming impossible means the entire clause
1296 // is impossible. Otherwise, just handle rewrites.
1297 for i, child := range mt.children {
1298 newChild, err := pruneMatchTree(child)
1299 if err != nil {
1300 return nil, err
1301 }
1302 if newChild == nil {
1303 return nil, nil
1304 }
1305 mt.children[i] = newChild
1306 }
1307 case *orMatchTree:
1308 // *All* branches of an OR becoming impossible means the entire clause
1309 // is impossible. Otherwise, drop impossible subclauses and handle
1310 // rewrites, including simplifying to a singular resulting child branch.
1311 n := 0
1312 for _, child := range mt.children {
1313 newChild, err := pruneMatchTree(child)
1314 if err != nil {
1315 return nil, err
1316 }
1317 if newChild != nil {
1318 mt.children[n] = newChild
1319 n++
1320 }
1321 }
1322 mt.children = mt.children[:n]
1323 if len(mt.children) == 1 {
1324 return mt.children[0], nil
1325 } else if len(mt.children) == 0 {
1326 return nil, nil
1327 }
1328 case *noVisitMatchTree:
1329 mt.matchTree, err = pruneMatchTree(mt.matchTree)
1330 if err != nil {
1331 return nil, err
1332 }
1333 if mt.matchTree == nil {
1334 return nil, nil
1335 }
1336 case *fileNameMatchTree:
1337 mt.child, err = pruneMatchTree(mt.child)
1338 if err != nil {
1339 return nil, err
1340 }
1341 if mt.child == nil {
1342 return nil, nil
1343 }
1344 case *boostMatchTree:
1345 mt.child, err = pruneMatchTree(mt.child)
1346 if err != nil {
1347 return nil, err
1348 }
1349 if mt.child == nil {
1350 return nil, nil
1351 }
1352 case *andLineMatchTree:
1353 child, err := pruneMatchTree(&mt.andMatchTree)
1354 if err != nil {
1355 return nil, err
1356 }
1357 if child == nil {
1358 return nil, nil
1359 }
1360 if c, ok := child.(*andMatchTree); ok {
1361 mt.andMatchTree = *c
1362 } else {
1363 // the and was simplified to a single clause,
1364 // so the linematch portion is irrelevant.
1365 return mt, nil
1366 }
1367 case *notMatchTree:
1368 mt.child, err = pruneMatchTree(mt.child)
1369 if err != nil {
1370 return nil, err
1371 }
1372 if mt.child == nil {
1373 // not false => true
1374 return &bruteForceMatchTree{}, nil
1375 }
1376 // unhandled:
1377 case *docMatchTree:
1378 case *bruteForceMatchTree:
1379 case *regexpMatchTree:
1380 case *wordMatchTree:
1381 }
1382 return mt, err
1383}
1384
1385// isRegexpAll returns true if the query matches all possible lines.
1386//
1387// Note: it is possible for a funky regex to actually match all but this
1388// returns false. This returns true for normal looking regexes like ".*" or
1389// "(?-s:.*)".
1390func isRegexpAll(r *syntax.Regexp) bool {
1391 // Note: we don't care about any flags since we are looking for .* and we
1392 // don't mind if . matches all or everything but newline.
1393
1394 // for loop is for visiting children of OpCapture/OpConcat until we hit .*
1395 for {
1396 // Our main target: .*
1397 if r.Op == syntax.OpStar && len(r.Sub) == 1 { // *
1398 // (?s:.) or (?-s:.)
1399 return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL
1400 }
1401
1402 // Strip away expressions being wrapped in paranthesis
1403 if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 {
1404 r = r.Sub[0]
1405 continue
1406 }
1407
1408 return false
1409 }
1410}