fork of https://github.com/sourcegraph/zoekt
1// Copyright 2018 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "regexp/syntax"
22 "strings"
23 "unicode/utf8"
24
25 "github.com/grafana/regexp"
26
27 "github.com/sourcegraph/zoekt/internal/syntaxutil"
28 "github.com/sourcegraph/zoekt/query"
29)
30
31// A docIterator iterates over documents in order.
32type docIterator interface {
33 // provide the next document where we may find something interesting.
34 //
35 // This is like a "peek" and shouldn't mutate state. prepare is what should
36 // change state.
37 nextDoc() uint32
38
39 // clears any per-document state of the docIterator, and
40 // prepares for evaluating the given doc. The argument is
41 // strictly increasing over time.
42 prepare(nextDoc uint32)
43}
44
45// costs are passed in increasing order to matchTree.matches until they do not
46// return matchesRequiresHigherCost.
47const (
48 costConst = 0
49 costMemory = 1
50 costContent = 2
51 costRegexp = 3
52)
53
54const (
55 costMin = costConst
56 costMax = costRegexp
57)
58
59// matchesState is an enum for the state of a matchTree after a call to
60// matchTree.matches.
61type matchesState uint8
62
63const (
64 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done
65 // a search yet since the cost value is not high enough.
66 matchesRequiresHigherCost matchesState = iota
67
68 // matchesFound is returned when matchTree.matches has done a search and
69 // found one or more matches.
70 matchesFound
71
72 // matchesNone is returned when matchTree.matches has done a search and
73 // found nothing.
74 matchesNone
75)
76
77// matchesStatePred is a helper which returns matchesFound if b is true
78// otherwise returns matchesNone.
79func matchesStatePred(b bool) matchesState {
80 if b {
81 return matchesFound
82 }
83 return matchesNone
84}
85
86// matchesStateForSlice is a helper which returns matchesFound if v is
87// non-empty otherwise returns matchesNone.
88func matchesStateForSlice[T any](v []T) matchesState {
89 return matchesStatePred(len(v) > 0)
90}
91
92// An expression tree coupled with matches. The matchtree has two
93// functions:
94//
95// * it implements boolean combinations (and, or, not)
96//
97// * it implements shortcuts, where we skip documents (for example: if
98// there are no trigram matches, we can be sure there are no substring
99// matches). The matchtree iterates over the documents as they are
100// ordered in the shard.
101//
102// The general process for a given (shard, query) is:
103//
104// - construct matchTree for the query
105//
106// - find all different leaf matchTrees (substring, regexp, etc.)
107//
108// in a loop:
109//
110// - find next doc to process using nextDoc
111//
112// - evaluate atoms (leaf expressions that match text)
113//
114// - evaluate the tree using matches(), storing the result in map.
115//
116// - if the complete tree returns (matches() != matchesRequiresHigherCost)
117// for the document, collect all text matches by looking at leaf
118// matchTrees.
119type matchTree interface {
120 docIterator
121
122 // matches if cost is high enough. See documentation for matchesState's
123 // values.
124 //
125 // Note: Do not call this directly, rather use evalMatchTree which uses
126 // known to cache responses once the state transitions away from
127 // matchesRequiresHigherCost.
128 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
129}
130
131// docMatchTree iterates over documents for which predicate(docID) returns true.
132type docMatchTree struct {
133 // the number of documents in a shard.
134 numDocs uint32
135
136 predicate func(docID uint32) bool
137
138 // provides additional information about the reason why the docMatchTree was
139 // created.
140 reason string
141
142 // mutable
143 firstDone bool
144 docID uint32
145}
146
147type bruteForceMatchTree struct {
148 // mutable
149 firstDone bool
150 docID uint32
151}
152
153type andLineMatchTree struct {
154 andMatchTree
155}
156
157type andMatchTree struct {
158 children []matchTree
159}
160
161type orMatchTree struct {
162 children []matchTree
163}
164
165type notMatchTree struct {
166 child matchTree
167}
168
169// Returns only the filename of child matches.
170type fileNameMatchTree struct {
171 child matchTree
172}
173
174type boostMatchTree struct {
175 child matchTree
176 boost float64
177}
178
179// Don't visit this subtree for collecting matches.
180type noVisitMatchTree struct {
181 matchTree
182}
183
184type regexpMatchTree struct {
185 regexp *regexp.Regexp
186
187 // origRegexp is the original parsed regexp from the query structure. It
188 // does not include mutations such as case sensitivity.
189 origRegexp *syntax.Regexp
190
191 fileName bool
192
193 // mutable
194 reEvaluated bool
195 found []*candidateMatch
196
197 // nextDoc, prepare.
198 bruteForceMatchTree
199}
200
201func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree {
202 prefix := ""
203 if !s.CaseSensitive {
204 prefix = "(?i)"
205 }
206
207 return ®expMatchTree{
208 regexp: regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)),
209 origRegexp: s.Regexp,
210 fileName: s.FileName,
211 }
212}
213
214// \bLITERAL\b
215type wordMatchTree struct {
216 word string
217
218 fileName bool
219
220 // mutable
221 evaluated bool
222 found []*candidateMatch
223
224 // nextDoc, prepare.
225 bruteForceMatchTree
226}
227
228type substrMatchTree struct {
229 matchIterator
230
231 query *query.Substring
232 caseSensitive bool
233 fileName bool
234
235 // mutable
236 current []*candidateMatch
237 contEvaluated bool
238}
239
240type branchQueryMatchTree struct {
241 fileMasks []uint64
242 masks []uint64
243 repos []uint16
244
245 // mutable
246 firstDone bool
247 docID uint32
248}
249
250func (t *branchQueryMatchTree) branchMask() uint64 {
251 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
252}
253
254type symbolRegexpMatchTree struct {
255 matchTree
256 regexp *regexp.Regexp
257 all bool // skips regex match if .*
258
259 reEvaluated bool
260 found []*candidateMatch
261}
262
263func (t *symbolRegexpMatchTree) prepare(doc uint32) {
264 t.reEvaluated = false
265 t.found = t.found[:0]
266 t.matchTree.prepare(doc)
267}
268
269func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
270 if t.reEvaluated {
271 return matchesStateForSlice(t.found)
272 }
273
274 if cost < costRegexp {
275 return matchesRequiresHigherCost
276 }
277
278 sections := cp.docSections()
279 content := cp.data(false)
280
281 found := t.found[:0]
282 for i, sec := range sections {
283 var idx []int
284 if t.all {
285 idx = []int{0, int(sec.End - sec.Start)}
286 } else {
287 idx = t.regexp.FindIndex(content[sec.Start:sec.End])
288 if idx == nil {
289 continue
290 }
291 }
292
293 cm := &candidateMatch{
294 byteOffset: sec.Start + uint32(idx[0]),
295 byteMatchSz: uint32(idx[1] - idx[0]),
296 symbol: true,
297 symbolIdx: uint32(i),
298 }
299 found = append(found, cm)
300 }
301 t.found = found
302 t.reEvaluated = true
303
304 return matchesStateForSlice(t.found)
305}
306
307type symbolSubstrMatchTree struct {
308 *substrMatchTree
309
310 patternSize uint32
311 fileEndRunes []uint32
312 fileEndSymbol []uint32
313
314 doc uint32
315 sections []DocumentSection
316
317 secID uint32
318}
319
320func (t *symbolSubstrMatchTree) prepare(doc uint32) {
321 t.substrMatchTree.prepare(doc)
322 t.doc = doc
323
324 var fileStart uint32
325 if doc > 0 {
326 fileStart = t.fileEndRunes[doc-1]
327 }
328
329 var sections []DocumentSection
330 if len(t.sections) > 0 {
331 most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
332 if most == uint32(len(t.sections)) {
333 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
334 } else {
335 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
336 t.secID++
337 }
338
339 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
340 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
341 symbolEnd++
342 }
343
344 sections = t.sections[t.secID:symbolEnd]
345 }
346 }
347
348 secIdx := 0
349 trimmed := t.current[:0]
350 for len(sections) > secIdx && len(t.current) > 0 {
351 start := fileStart + t.current[0].runeOffset
352 end := start + t.patternSize
353 if start >= sections[secIdx].End {
354 secIdx++
355 continue
356 }
357
358 if start < sections[secIdx].Start {
359 t.current = t.current[1:]
360 continue
361 }
362
363 if end <= sections[secIdx].End {
364 t.current[0].symbol = true
365 t.current[0].symbolIdx = uint32(secIdx)
366 trimmed = append(trimmed, t.current[0])
367 }
368
369 t.current = t.current[1:]
370 }
371 t.current = trimmed
372}
373
374// all prepare methods
375
376func (t *bruteForceMatchTree) prepare(doc uint32) {
377 t.docID = doc
378 t.firstDone = true
379}
380
381func (t *docMatchTree) prepare(doc uint32) {
382 t.docID = doc
383 t.firstDone = true
384}
385
386func (t *andMatchTree) prepare(doc uint32) {
387 for _, c := range t.children {
388 c.prepare(doc)
389 }
390}
391
392func (t *regexpMatchTree) prepare(doc uint32) {
393 t.found = t.found[:0]
394 t.reEvaluated = false
395 t.bruteForceMatchTree.prepare(doc)
396}
397
398func (t *wordMatchTree) prepare(doc uint32) {
399 t.found = t.found[:0]
400 t.evaluated = false
401 t.bruteForceMatchTree.prepare(doc)
402}
403
404func (t *orMatchTree) prepare(doc uint32) {
405 for _, c := range t.children {
406 c.prepare(doc)
407 }
408}
409
410func (t *notMatchTree) prepare(doc uint32) {
411 t.child.prepare(doc)
412}
413
414func (t *fileNameMatchTree) prepare(doc uint32) {
415 t.child.prepare(doc)
416}
417
418func (t *boostMatchTree) prepare(doc uint32) {
419 t.child.prepare(doc)
420}
421
422func (t *substrMatchTree) prepare(nextDoc uint32) {
423 t.matchIterator.prepare(nextDoc)
424 t.current = t.matchIterator.candidates()
425 t.contEvaluated = false
426}
427
428func (t *branchQueryMatchTree) prepare(doc uint32) {
429 t.firstDone = true
430 t.docID = doc
431}
432
433// nextDoc
434
435func (t *docMatchTree) nextDoc() uint32 {
436 var start uint32
437 if t.firstDone {
438 start = t.docID + 1
439 }
440 for i := start; i < t.numDocs; i++ {
441 if t.predicate(i) {
442 return i
443 }
444 }
445 return maxUInt32
446}
447
448func (t *bruteForceMatchTree) nextDoc() uint32 {
449 if !t.firstDone {
450 return 0
451 }
452 return t.docID + 1
453}
454
455func (t *andMatchTree) nextDoc() uint32 {
456 var max uint32
457 for _, c := range t.children {
458 m := c.nextDoc()
459 if m > max {
460 max = m
461 }
462 }
463 return max
464}
465
466func (t *orMatchTree) nextDoc() uint32 {
467 min := uint32(maxUInt32)
468 for _, c := range t.children {
469 m := c.nextDoc()
470 if m < min {
471 min = m
472 }
473 }
474 return min
475}
476
477func (t *notMatchTree) nextDoc() uint32 {
478 return 0
479}
480
481func (t *fileNameMatchTree) nextDoc() uint32 {
482 return t.child.nextDoc()
483}
484
485func (t *boostMatchTree) nextDoc() uint32 {
486 return t.child.nextDoc()
487}
488
489func (t *branchQueryMatchTree) nextDoc() uint32 {
490 var start uint32
491 if t.firstDone {
492 start = t.docID + 1
493 }
494
495 for i := start; i < uint32(len(t.fileMasks)); i++ {
496 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
497 return i
498 }
499 }
500 return maxUInt32
501}
502
503// all String methods
504
505func (t *bruteForceMatchTree) String() string {
506 return "all"
507}
508
509func (t *docMatchTree) String() string {
510 return fmt.Sprintf("doc(%s)", t.reason)
511}
512
513func (t *andMatchTree) String() string {
514 return fmt.Sprintf("and%v", t.children)
515}
516
517func (t *regexpMatchTree) String() string {
518 f := ""
519 if t.fileName {
520 f = "f"
521 }
522 return fmt.Sprintf("%sre(%s)", f, t.regexp)
523}
524
525func (t *wordMatchTree) String() string {
526 f := ""
527 if t.fileName {
528 f = "f"
529 }
530 return fmt.Sprintf("%sword(%s)", f, t.word)
531}
532
533func (t *orMatchTree) String() string {
534 return fmt.Sprintf("or%v", t.children)
535}
536
537func (t *notMatchTree) String() string {
538 return fmt.Sprintf("not(%v)", t.child)
539}
540
541func (t *noVisitMatchTree) String() string {
542 return fmt.Sprintf("novisit(%v)", t.matchTree)
543}
544
545func (t *fileNameMatchTree) String() string {
546 return fmt.Sprintf("f(%v)", t.child)
547}
548
549func (t *boostMatchTree) String() string {
550 return fmt.Sprintf("boost(%f, %v)", t.boost, t.child)
551}
552
553func (t *substrMatchTree) String() string {
554 f := ""
555 if t.fileName {
556 f = "f"
557 }
558
559 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
560}
561
562func (t *branchQueryMatchTree) String() string {
563 return fmt.Sprintf("branch(%x)", t.masks)
564}
565
566func (t *symbolSubstrMatchTree) String() string {
567 return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
568}
569
570func (t *symbolRegexpMatchTree) String() string {
571 return fmt.Sprintf("symbol(%v)", t.matchTree)
572}
573
574// visitMatches visits all atoms in matchTree. Note: This visits
575// noVisitMatchTree. For collecting matches use visitMatches.
576func visitMatchTree(t matchTree, f func(matchTree)) {
577 switch s := t.(type) {
578 case *andMatchTree:
579 for _, ch := range s.children {
580 visitMatchTree(ch, f)
581 }
582 case *orMatchTree:
583 for _, ch := range s.children {
584 visitMatchTree(ch, f)
585 }
586 case *andLineMatchTree:
587 visitMatchTree(&s.andMatchTree, f)
588 case *noVisitMatchTree:
589 visitMatchTree(s.matchTree, f)
590 case *notMatchTree:
591 visitMatchTree(s.child, f)
592 case *fileNameMatchTree:
593 visitMatchTree(s.child, f)
594 case *boostMatchTree:
595 visitMatchTree(s.child, f)
596 case *symbolSubstrMatchTree:
597 visitMatchTree(s.substrMatchTree, f)
598 case *symbolRegexpMatchTree:
599 visitMatchTree(s.matchTree, f)
600 default:
601 f(t)
602 }
603}
604
605// updateMatchTreeStats calls updateStats on all atoms in mt which have that
606// function defined.
607func updateMatchTreeStats(mt matchTree, stats *Stats) {
608 visitMatchTree(mt, func(mt matchTree) {
609 if atom, ok := mt.(interface{ updateStats(*Stats) }); ok {
610 atom.updateStats(stats)
611 }
612 })
613}
614
615func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) {
616 visitMatches(t, known, 1, func(mt matchTree, _ float64) {
617 f(mt)
618 })
619}
620
621// visitMatches visits all atoms which can contribute matches. Note: This
622// skips noVisitMatchTree.
623func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) {
624 switch s := t.(type) {
625 case *andMatchTree:
626 for _, ch := range s.children {
627 if known[ch] {
628 visitMatches(ch, known, weight, f)
629 }
630 }
631 case *andLineMatchTree:
632 visitMatches(&s.andMatchTree, known, weight, f)
633 case *orMatchTree:
634 for _, ch := range s.children {
635 if known[ch] {
636 visitMatches(ch, known, weight, f)
637 }
638 }
639 case *boostMatchTree:
640 visitMatches(s.child, known, weight*s.boost, f)
641 case *symbolSubstrMatchTree:
642 visitMatches(s.substrMatchTree, known, weight, f)
643 case *notMatchTree:
644 case *noVisitMatchTree:
645 // don't collect into negative trees.
646 case *fileNameMatchTree:
647 // We will just gather the filename if we do not visit this tree.
648 default:
649 f(s, weight)
650 }
651}
652
653// all matches() methods.
654
655func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
656 return matchesStatePred(t.predicate(cp.idx))
657}
658
659func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
660 return matchesFound
661}
662
663// andLineMatchTree is a performance optimization of andMatchTree. For content
664// searches we don't want to run the regex engine if there is no line that
665// contains matches from all terms.
666func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
667 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
668 return state
669 }
670
671 // Invariant: all children have matches. If any line contains all of them we
672 // can return MatchesFound.
673
674 // find child with fewest candidates
675 min := maxUInt32
676 fewestChildren := 0
677 for ix, child := range t.children {
678 v, ok := child.(*substrMatchTree)
679 // make sure we are running a content search and that all candidates are a
680 // substrMatchTree
681 if !ok || v.fileName {
682 return matchesFound
683 }
684 if len(v.current) < min {
685 min = len(v.current)
686 fewestChildren = ix
687 }
688 }
689
690 type lineRange struct {
691 start int
692 end int
693 }
694 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
695 prev := -1
696 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
697 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
698 if line == prev {
699 continue
700 }
701 prev = line
702 lines = append(lines, lineRange{byteStart, byteEnd})
703 }
704
705 // children keeps track of the children's candidates we have already seen.
706 children := make([][]*candidateMatch, 0, len(t.children)-1)
707 for j, child := range t.children {
708 if j == fewestChildren {
709 continue
710 }
711 children = append(children, child.(*substrMatchTree).current)
712 }
713
714nextLine:
715 for i := 0; i < len(lines); i++ {
716 hits := 1
717 nextChild:
718 for j := range children {
719 nextCandidate:
720 for len(children[j]) > 0 {
721 candidate := children[j][0]
722 bo := int(cp.findOffset(false, candidate.runeOffset))
723 if bo < lines[i].start {
724 children[j] = children[j][1:]
725 continue nextCandidate
726 }
727 if bo <= lines[i].end {
728 hits++
729 continue nextChild
730 }
731 // move the `lines` iterator forward until bo <= line.end
732 for i < len(lines) && bo > lines[i].end {
733 i++
734 }
735 i--
736 continue nextLine
737 }
738 }
739 // return early once we found any line that contains matches from all children
740 if hits == len(t.children) {
741 return matchesFound
742 }
743 }
744 return matchesNone
745}
746
747func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
748 // We have found matches unless a child needs to do more work or it hasn't
749 // found matches.
750 state := matchesFound
751
752 for _, ch := range t.children {
753 switch evalMatchTree(cp, cost, known, ch) {
754 case matchesRequiresHigherCost:
755 // keep evaluating other children incase we come across matchesNone
756 state = matchesRequiresHigherCost
757 case matchesFound:
758 // will return this if every child has this value
759 case matchesNone:
760 return matchesNone
761 }
762 }
763
764 return state
765}
766
767func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
768 // we could short-circuit, but we want to use the other possibilities as a
769 // ranking signal. So we always return the most conservative state.
770 state := matchesNone
771 for _, ch := range t.children {
772 switch evalMatchTree(cp, cost, known, ch) {
773 case matchesRequiresHigherCost:
774 state = matchesRequiresHigherCost
775 case matchesFound:
776 if state != matchesRequiresHigherCost {
777 state = matchesFound
778 }
779 case matchesNone:
780 // noop
781 }
782 }
783 return state
784}
785
786func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
787 return matchesStatePred(t.branchMask() != 0)
788}
789
790func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
791 if t.reEvaluated {
792 return matchesStateForSlice(t.found)
793 }
794
795 if cost < costRegexp {
796 return matchesRequiresHigherCost
797 }
798
799 cp.stats.RegexpsConsidered++
800 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
801 found := t.found[:0]
802 for _, idx := range idxs {
803 cm := &candidateMatch{
804 byteOffset: uint32(idx[0]),
805 byteMatchSz: uint32(idx[1] - idx[0]),
806 fileName: t.fileName,
807 }
808
809 found = append(found, cm)
810 }
811 t.found = found
812 t.reEvaluated = true
813
814 return matchesStateForSlice(t.found)
815}
816
817func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
818 if t.evaluated {
819 return matchesStateForSlice(t.found)
820 }
821
822 if cost < costRegexp {
823 return matchesRequiresHigherCost
824 }
825
826 data := cp.data(t.fileName)
827 offset := 0
828 found := t.found[:0]
829 for {
830 idx := bytes.Index(data[offset:], []byte(t.word))
831 if idx < 0 {
832 break
833 }
834
835 relStartOffset := offset + idx
836 relEndOffset := relStartOffset + len(t.word)
837
838 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
839 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
840 if startBoundary && endBoundary {
841 found = append(found, &candidateMatch{
842 byteOffset: uint32(offset + idx),
843 byteMatchSz: uint32(len(t.word)),
844 fileName: t.fileName,
845 })
846 }
847 offset += idx + len(t.word)
848 }
849
850 t.found = found
851 t.evaluated = true
852
853 return matchesStateForSlice(t.found)
854}
855
856// breakMatchesOnNewlines returns matches resulting from breaking each element
857// of cms on newlines within text.
858func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
859 var lineCMs []*candidateMatch
860 for _, cm := range cms {
861 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
862 }
863 return lineCMs
864}
865
866// breakOnNewlines returns matches resulting from breaking cm on newlines
867// within text.
868func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
869 var cms []*candidateMatch
870 addMe := &candidateMatch{}
871 *addMe = *cm
872 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
873 if text[i] == '\n' {
874 addMe.byteMatchSz = i - addMe.byteOffset
875 if addMe.byteMatchSz != 0 {
876 cms = append(cms, addMe)
877 }
878
879 addMe = &candidateMatch{}
880 *addMe = *cm
881 addMe.byteOffset = i + 1
882 }
883 }
884 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
885 if addMe.byteMatchSz != 0 {
886 cms = append(cms, addMe)
887 }
888 return cms
889}
890
891// evalMatchTree should be called instead of directly calling
892// matchTree.matches. It cache known values for future evaluation at higher
893// costs.
894func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
895 if v, ok := known[mt]; ok {
896 return matchesStatePred(v)
897 }
898
899 ms := mt.matches(cp, cost, known)
900 if ms != matchesRequiresHigherCost {
901 known[mt] = ms == matchesFound
902 }
903
904 return ms
905}
906
907func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
908 switch evalMatchTree(cp, cost, known, t.child) {
909 case matchesRequiresHigherCost:
910 return matchesRequiresHigherCost
911 case matchesFound:
912 return matchesNone
913 case matchesNone:
914 return matchesFound
915 default:
916 panic("unreachable")
917 }
918}
919
920func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
921 return evalMatchTree(cp, cost, known, t.child)
922}
923
924func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
925 return evalMatchTree(cp, cost, known, t.child)
926}
927
928func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
929 if t.contEvaluated {
930 return matchesStateForSlice(t.current)
931 }
932
933 if len(t.current) == 0 {
934 return matchesNone
935 }
936
937 if t.fileName && cost < costMemory {
938 return matchesRequiresHigherCost
939 }
940
941 if !t.fileName && cost < costContent {
942 return matchesRequiresHigherCost
943 }
944
945 pruned := t.current[:0]
946 for _, m := range t.current {
947 if m.byteOffset == 0 && m.runeOffset > 0 {
948 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
949 }
950 if m.matchContent(cp.data(m.fileName)) {
951 pruned = append(pruned, m)
952 }
953 }
954 t.current = pruned
955 t.contEvaluated = true
956
957 return matchesStateForSlice(t.current)
958}
959
960type matchTreeOpt struct {
961 // DisableWordMatchOptimization is used to disable the use of wordMatchTree.
962 // This was added since we do not support wordMatchTree with symbol search.
963 DisableWordMatchOptimization bool
964}
965
966func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
967 if q == nil {
968 return nil, fmt.Errorf("got nil (sub)query")
969 }
970 switch s := q.(type) {
971 case *query.Regexp:
972 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
973 // superset of the regexp. If the returned matchTree is equivalent to the
974 // original regexp, it returns true. An equivalent matchTree has the same
975 // behaviour as the original regexp and can be used instead.
976 //
977 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
978 if err != nil {
979 return nil, err
980 }
981 // if the query can be used in place of the regexp
982 // return the subtree
983 if isEq {
984 return subMT, nil
985 }
986
987 var tr matchTree
988 if wmt, ok := regexpToWordMatchTree(s, opt); ok {
989 // A common search we get is "\bLITERAL\b". Avoid the regex engine and
990 // provide something faster.
991 tr = wmt
992 } else {
993 tr = newRegexpMatchTree(s)
994 }
995
996 return &andMatchTree{
997 children: []matchTree{
998 tr, &noVisitMatchTree{subMT},
999 },
1000 }, nil
1001 case *query.And:
1002 var r []matchTree
1003 for _, ch := range s.Children {
1004 ct, err := d.newMatchTree(ch, opt)
1005 if err != nil {
1006 return nil, err
1007 }
1008 r = append(r, ct)
1009 }
1010 return &andMatchTree{r}, nil
1011 case *query.Or:
1012 var r []matchTree
1013 for _, ch := range s.Children {
1014 ct, err := d.newMatchTree(ch, opt)
1015 if err != nil {
1016 return nil, err
1017 }
1018 r = append(r, ct)
1019 }
1020 return &orMatchTree{r}, nil
1021 case *query.Not:
1022 ct, err := d.newMatchTree(s.Child, opt)
1023 return ¬MatchTree{
1024 child: ct,
1025 }, err
1026
1027 case *query.Type:
1028 if s.Type != query.TypeFileName {
1029 break
1030 }
1031
1032 ct, err := d.newMatchTree(s.Child, opt)
1033 if err != nil {
1034 return nil, err
1035 }
1036
1037 return &fileNameMatchTree{
1038 child: ct,
1039 }, nil
1040
1041 case *query.Boost:
1042 ct, err := d.newMatchTree(s.Child, opt)
1043 if err != nil {
1044 return nil, err
1045 }
1046
1047 return &boostMatchTree{
1048 child: ct,
1049 boost: s.Boost,
1050 }, nil
1051
1052 case *query.Substring:
1053 return d.newSubstringMatchTree(s)
1054
1055 case *query.Branch:
1056 masks := make([]uint64, 0, len(d.repoMetaData))
1057 if s.Pattern == "HEAD" {
1058 for i := 0; i < len(d.repoMetaData); i++ {
1059 masks = append(masks, 1)
1060 }
1061 } else {
1062 for _, branchIDs := range d.branchIDs {
1063 mask := uint64(0)
1064 for nm, m := range branchIDs {
1065 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1066 mask |= uint64(m)
1067 }
1068 }
1069 masks = append(masks, mask)
1070 }
1071 }
1072 return &branchQueryMatchTree{
1073 masks: masks,
1074 fileMasks: d.fileBranchMasks,
1075 repos: d.repos,
1076 }, nil
1077 case *query.Const:
1078 if s.Value {
1079 return &bruteForceMatchTree{}, nil
1080 } else {
1081 return &noMatchTree{Why: "const"}, nil
1082 }
1083 case *query.Language:
1084 code, ok := d.metaData.LanguageMap[s.Language]
1085 if !ok {
1086 return &noMatchTree{Why: "lang"}, nil
1087 }
1088 return &docMatchTree{
1089 reason: "language",
1090 numDocs: d.numDocs(),
1091 predicate: func(docID uint32) bool {
1092 return d.getLanguage(docID) == code
1093 },
1094 }, nil
1095
1096 case *query.Symbol:
1097 // Disable WordMatchTree since we don't support it in symbols yet.
1098 optCopy := opt
1099 optCopy.DisableWordMatchOptimization = true
1100
1101 subMT, err := d.newMatchTree(s.Expr, optCopy)
1102 if err != nil {
1103 return nil, err
1104 }
1105
1106 if substr, ok := subMT.(*substrMatchTree); ok {
1107 return &symbolSubstrMatchTree{
1108 substrMatchTree: substr,
1109 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)),
1110 fileEndRunes: d.fileEndRunes,
1111 fileEndSymbol: d.fileEndSymbol,
1112 sections: d.runeDocSections,
1113 }, nil
1114 }
1115
1116 var regexpMT *regexpMatchTree
1117 visitMatchTree(subMT, func(mt matchTree) {
1118 if t, ok := mt.(*regexpMatchTree); ok {
1119 regexpMT = t
1120 }
1121 })
1122 if regexpMT == nil {
1123 return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1124 }
1125
1126 return &symbolRegexpMatchTree{
1127 regexp: regexpMT.regexp,
1128 all: isRegexpAll(regexpMT.origRegexp),
1129 matchTree: subMT,
1130 }, nil
1131
1132 case *query.FileNameSet:
1133 return &docMatchTree{
1134 reason: "FileNameSet",
1135 numDocs: d.numDocs(),
1136 predicate: func(docID uint32) bool {
1137 fileName := d.fileName(docID)
1138 _, ok := s.Set[string(fileName)]
1139 return ok
1140 },
1141 }, nil
1142
1143 case *query.BranchesRepos:
1144 reposBranchesWant := make([]uint64, len(d.repoMetaData))
1145 for repoIdx := range d.repoMetaData {
1146 var mask uint64
1147 for _, br := range s.List {
1148 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1149 mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1150 }
1151 }
1152 reposBranchesWant[repoIdx] = mask
1153 }
1154 return &docMatchTree{
1155 reason: "BranchesRepos",
1156 numDocs: d.numDocs(),
1157 predicate: func(docID uint32) bool {
1158 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1159 },
1160 }, nil
1161
1162 case *query.RepoSet:
1163 reposWant := make([]bool, len(d.repoMetaData))
1164 for repoIdx, r := range d.repoMetaData {
1165 if _, ok := s.Set[r.Name]; ok {
1166 reposWant[repoIdx] = true
1167 }
1168 }
1169 return &docMatchTree{
1170 reason: "RepoSet",
1171 numDocs: d.numDocs(),
1172 predicate: func(docID uint32) bool {
1173 return reposWant[d.repos[docID]]
1174 },
1175 }, nil
1176
1177 case *query.RepoIDs:
1178 reposWant := make([]bool, len(d.repoMetaData))
1179 for repoIdx, r := range d.repoMetaData {
1180 if s.Repos.Contains(r.ID) {
1181 reposWant[repoIdx] = true
1182 }
1183 }
1184 return &docMatchTree{
1185 reason: "RepoIDs",
1186 numDocs: d.numDocs(),
1187 predicate: func(docID uint32) bool {
1188 return reposWant[d.repos[docID]]
1189 },
1190 }, nil
1191
1192 case *query.Repo:
1193 reposWant := make([]bool, len(d.repoMetaData))
1194 for repoIdx, r := range d.repoMetaData {
1195 if s.Regexp.MatchString(r.Name) {
1196 reposWant[repoIdx] = true
1197 }
1198 }
1199 return &docMatchTree{
1200 reason: "Repo",
1201 numDocs: d.numDocs(),
1202 predicate: func(docID uint32) bool {
1203 return reposWant[d.repos[docID]]
1204 },
1205 }, nil
1206
1207 case *query.RepoRegexp:
1208 reposWant := make([]bool, len(d.repoMetaData))
1209 for repoIdx, r := range d.repoMetaData {
1210 if s.Regexp.MatchString(r.Name) {
1211 reposWant[repoIdx] = true
1212 }
1213 }
1214 return &docMatchTree{
1215 reason: "RepoRegexp",
1216 numDocs: d.numDocs(),
1217 predicate: func(docID uint32) bool {
1218 return reposWant[d.repos[docID]]
1219 },
1220 }, nil
1221
1222 case query.RawConfig:
1223 return &docMatchTree{
1224 reason: s.String(),
1225 numDocs: d.numDocs(),
1226 predicate: func(docID uint32) bool {
1227 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1228 },
1229 }, nil
1230 }
1231 log.Panicf("type %T", q)
1232 return nil, nil
1233}
1234
1235func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1236 st := &substrMatchTree{
1237 query: s,
1238 caseSensitive: s.CaseSensitive,
1239 fileName: s.FileName,
1240 }
1241
1242 if utf8.RuneCountInString(s.Pattern) < ngramSize {
1243 return newRegexpMatchTree(&query.Regexp{
1244 Regexp: &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)},
1245 FileName: s.FileName,
1246 Content: s.Content,
1247 CaseSensitive: s.CaseSensitive,
1248 }), nil
1249 }
1250
1251 result, err := d.iterateNgrams(s)
1252 if err != nil {
1253 return nil, err
1254 }
1255 st.matchIterator = result
1256 return st, nil
1257}
1258
1259func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1260 if opt.DisableWordMatchOptimization {
1261 return nil, false
1262 }
1263 // Needs to be case sensitive
1264 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1265 return nil, false
1266 }
1267 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1268 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1269 return nil, false
1270 }
1271 sub := q.Regexp.Sub
1272 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1273 return nil, false
1274 }
1275
1276 return &wordMatchTree{
1277 word: string(sub[1].Rune),
1278 fileName: q.FileName,
1279 }, true
1280}
1281
1282// pruneMatchTree removes impossible branches from the matchTree, as indicated
1283// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1284func pruneMatchTree(mt matchTree) (matchTree, error) {
1285 var err error
1286 switch mt := mt.(type) {
1287 // leaf nodes that we test with the filter:
1288 case *substrMatchTree:
1289 if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1290 if _, ok := res.matchIterator.(*noMatchTree); ok {
1291 return nil, nil
1292 }
1293 }
1294 // recursive tree structures:
1295 case *andMatchTree:
1296 // Any branch of an and becoming impossible means the entire clause
1297 // is impossible. Otherwise, just handle rewrites.
1298 for i, child := range mt.children {
1299 newChild, err := pruneMatchTree(child)
1300 if err != nil {
1301 return nil, err
1302 }
1303 if newChild == nil {
1304 return nil, nil
1305 }
1306 mt.children[i] = newChild
1307 }
1308 case *orMatchTree:
1309 // *All* branches of an OR becoming impossible means the entire clause
1310 // is impossible. Otherwise, drop impossible subclauses and handle
1311 // rewrites, including simplifying to a singular resulting child branch.
1312 n := 0
1313 for _, child := range mt.children {
1314 newChild, err := pruneMatchTree(child)
1315 if err != nil {
1316 return nil, err
1317 }
1318 if newChild != nil {
1319 mt.children[n] = newChild
1320 n++
1321 }
1322 }
1323 mt.children = mt.children[:n]
1324 if len(mt.children) == 1 {
1325 return mt.children[0], nil
1326 } else if len(mt.children) == 0 {
1327 return nil, nil
1328 }
1329 case *noVisitMatchTree:
1330 mt.matchTree, err = pruneMatchTree(mt.matchTree)
1331 if err != nil {
1332 return nil, err
1333 }
1334 if mt.matchTree == nil {
1335 return nil, nil
1336 }
1337 case *fileNameMatchTree:
1338 mt.child, err = pruneMatchTree(mt.child)
1339 if err != nil {
1340 return nil, err
1341 }
1342 if mt.child == nil {
1343 return nil, nil
1344 }
1345 case *boostMatchTree:
1346 mt.child, err = pruneMatchTree(mt.child)
1347 if err != nil {
1348 return nil, err
1349 }
1350 if mt.child == nil {
1351 return nil, nil
1352 }
1353 case *andLineMatchTree:
1354 child, err := pruneMatchTree(&mt.andMatchTree)
1355 if err != nil {
1356 return nil, err
1357 }
1358 if child == nil {
1359 return nil, nil
1360 }
1361 if c, ok := child.(*andMatchTree); ok {
1362 mt.andMatchTree = *c
1363 } else {
1364 // the and was simplified to a single clause,
1365 // so the linematch portion is irrelevant.
1366 return mt, nil
1367 }
1368 case *notMatchTree:
1369 mt.child, err = pruneMatchTree(mt.child)
1370 if err != nil {
1371 return nil, err
1372 }
1373 if mt.child == nil {
1374 // not false => true
1375 return &bruteForceMatchTree{}, nil
1376 }
1377 // unhandled:
1378 case *docMatchTree:
1379 case *bruteForceMatchTree:
1380 case *regexpMatchTree:
1381 case *wordMatchTree:
1382 }
1383 return mt, err
1384}
1385
1386// isRegexpAll returns true if the query matches all possible lines.
1387//
1388// Note: it is possible for a funky regex to actually match all but this
1389// returns false. This returns true for normal looking regexes like ".*" or
1390// "(?-s:.*)".
1391func isRegexpAll(r *syntax.Regexp) bool {
1392 // Note: we don't care about any flags since we are looking for .* and we
1393 // don't mind if . matches all or everything but newline.
1394
1395 // for loop is for visiting children of OpCapture/OpConcat until we hit .*
1396 for {
1397 // Our main target: .*
1398 if r.Op == syntax.OpStar && len(r.Sub) == 1 { // *
1399 // (?s:.) or (?-s:.)
1400 return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL
1401 }
1402
1403 // Strip away expressions being wrapped in paranthesis
1404 if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 {
1405 r = r.Sub[0]
1406 continue
1407 }
1408
1409 return false
1410 }
1411}