fork of https://github.com/sourcegraph/zoekt
1// Copyright 2018 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "math"
22 "regexp/syntax"
23 "strings"
24 "unicode/utf8"
25
26 "github.com/grafana/regexp"
27
28 "github.com/sourcegraph/zoekt"
29 "github.com/sourcegraph/zoekt/internal/syntaxutil"
30 "github.com/sourcegraph/zoekt/query"
31)
32
33// A docIterator iterates over documents in order.
34type docIterator interface {
35 // provide the next document where we may find something interesting.
36 //
37 // This is like a "peek" and shouldn't mutate state. prepare is what should
38 // change state.
39 nextDoc() uint32
40
41 // clears any per-document state of the docIterator, and
42 // prepares for evaluating the given doc. The argument is
43 // strictly increasing over time.
44 prepare(nextDoc uint32)
45}
46
47// costs are passed in increasing order to matchTree.matches until they do not
48// return matchesRequiresHigherCost.
49const (
50 costConst = 0
51 costMemory = 1
52 costContent = 2
53 costRegexp = 3
54)
55
56const (
57 costMin = costConst
58 costMax = costRegexp
59)
60
61// matchesState is an enum for the state of a matchTree after a call to
62// matchTree.matches.
63type matchesState uint8
64
65const (
66 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done
67 // a search yet since the cost value is not high enough.
68 matchesRequiresHigherCost matchesState = iota
69
70 // matchesFound is returned when matchTree.matches has done a search and
71 // found one or more matches.
72 matchesFound
73
74 // matchesNone is returned when matchTree.matches has done a search and
75 // found nothing.
76 matchesNone
77)
78
79// matchesStatePred is a helper which returns matchesFound if b is true
80// otherwise returns matchesNone.
81func matchesStatePred(b bool) matchesState {
82 if b {
83 return matchesFound
84 }
85 return matchesNone
86}
87
88// matchesStateForSlice is a helper which returns matchesFound if v is
89// non-empty otherwise returns matchesNone.
90func matchesStateForSlice[T any](v []T) matchesState {
91 return matchesStatePred(len(v) > 0)
92}
93
94// An expression tree coupled with matches. The matchtree has two
95// functions:
96//
97// * it implements boolean combinations (and, or, not)
98//
99// * it implements shortcuts, where we skip documents (for example: if
100// there are no trigram matches, we can be sure there are no substring
101// matches). The matchtree iterates over the documents as they are
102// ordered in the shard.
103//
104// The general process for a given (shard, query) is:
105//
106// - construct matchTree for the query
107//
108// - find all different leaf matchTrees (substring, regexp, etc.)
109//
110// in a loop:
111//
112// - find next doc to process using nextDoc
113//
114// - evaluate atoms (leaf expressions that match text)
115//
116// - evaluate the tree using matches(), storing the result in map.
117//
118// - if the complete tree returns (matches() != matchesRequiresHigherCost)
119// for the document, collect all text matches by looking at leaf
120// matchTrees.
121type matchTree interface {
122 docIterator
123
124 // matches if cost is high enough. See documentation for matchesState's
125 // values.
126 //
127 // Note: Do not call this directly, rather use evalMatchTree which uses
128 // known to cache responses once the state transitions away from
129 // matchesRequiresHigherCost.
130 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
131}
132
133// docMatchTree iterates over documents for which predicate(docID) returns true.
134type docMatchTree struct {
135 // the number of documents in a shard.
136 numDocs uint32
137
138 predicate func(docID uint32) bool
139
140 // provides additional information about the reason why the docMatchTree was
141 // created.
142 reason string
143
144 // mutable
145 firstDone bool
146 docID uint32
147}
148
149type bruteForceMatchTree struct {
150 // mutable
151 firstDone bool
152 docID uint32
153}
154
155type andLineMatchTree struct {
156 andMatchTree
157}
158
159type andMatchTree struct {
160 children []matchTree
161}
162
163type orMatchTree struct {
164 children []matchTree
165}
166
167type notMatchTree struct {
168 child matchTree
169}
170
171// Returns only the filename of child matches.
172type fileNameMatchTree struct {
173 child matchTree
174}
175
176type boostMatchTree struct {
177 child matchTree
178 boost float64
179}
180
181// Don't visit this subtree for collecting matches.
182type noVisitMatchTree struct {
183 matchTree
184}
185
186type regexpMatchTree struct {
187 regexp *regexp.Regexp
188
189 // origRegexp is the original parsed regexp from the query structure. It
190 // does not include mutations such as case sensitivity.
191 origRegexp *syntax.Regexp
192
193 fileName bool
194
195 // mutable
196 reEvaluated bool
197 found []*candidateMatch
198
199 // nextDoc, prepare.
200 bruteForceMatchTree
201}
202
203func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree {
204 prefix := ""
205 if !s.CaseSensitive {
206 prefix = "(?i)"
207 }
208
209 return ®expMatchTree{
210 regexp: regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)),
211 origRegexp: s.Regexp,
212 fileName: s.FileName,
213 }
214}
215
216// \bLITERAL\b
217type wordMatchTree struct {
218 word string
219
220 fileName bool
221
222 // mutable
223 evaluated bool
224 found []*candidateMatch
225
226 // nextDoc, prepare.
227 bruteForceMatchTree
228}
229
230type substrMatchTree struct {
231 matchIterator
232
233 query *query.Substring
234 caseSensitive bool
235 fileName bool
236
237 // mutable
238 current []*candidateMatch
239 contEvaluated bool
240}
241
242type branchQueryMatchTree struct {
243 fileMasks []uint64
244 masks []uint64
245 repos []uint16
246
247 // mutable
248 firstDone bool
249 docID uint32
250}
251
252func (t *branchQueryMatchTree) branchMask() uint64 {
253 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
254}
255
256type symbolRegexpMatchTree struct {
257 matchTree
258 regexp *regexp.Regexp
259 all bool // skips regex match if .*
260
261 reEvaluated bool
262 found []*candidateMatch
263}
264
265func (t *symbolRegexpMatchTree) prepare(doc uint32) {
266 t.reEvaluated = false
267 t.found = t.found[:0]
268 t.matchTree.prepare(doc)
269}
270
271func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
272 if t.reEvaluated {
273 return matchesStateForSlice(t.found)
274 }
275
276 if cost < costRegexp {
277 return matchesRequiresHigherCost
278 }
279
280 sections := cp.docSections()
281 content := cp.data(false)
282
283 found := t.found[:0]
284 for i, sec := range sections {
285 var idx []int
286 if t.all {
287 idx = []int{0, int(sec.End - sec.Start)}
288 } else {
289 idx = t.regexp.FindIndex(content[sec.Start:sec.End])
290 if idx == nil {
291 continue
292 }
293 }
294
295 cm := &candidateMatch{
296 byteOffset: sec.Start + uint32(idx[0]),
297 byteMatchSz: uint32(idx[1] - idx[0]),
298 symbol: true,
299 symbolIdx: uint32(i),
300 }
301 found = append(found, cm)
302 }
303 t.found = found
304 t.reEvaluated = true
305
306 return matchesStateForSlice(t.found)
307}
308
309type symbolSubstrMatchTree struct {
310 *substrMatchTree
311
312 patternSize uint32
313 fileEndRunes []uint32
314 fileEndSymbol []uint32
315
316 doc uint32
317 sections []DocumentSection
318
319 secID uint32
320}
321
322func (t *symbolSubstrMatchTree) prepare(doc uint32) {
323 t.substrMatchTree.prepare(doc)
324 t.doc = doc
325
326 var fileStart uint32
327 if doc > 0 {
328 fileStart = t.fileEndRunes[doc-1]
329 }
330
331 var sections []DocumentSection
332 if len(t.sections) > 0 {
333 most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
334 if most == uint32(len(t.sections)) {
335 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
336 } else {
337 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
338 t.secID++
339 }
340
341 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
342 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
343 symbolEnd++
344 }
345
346 sections = t.sections[t.secID:symbolEnd]
347 }
348 }
349
350 secIdx := 0
351 trimmed := t.current[:0]
352 for len(sections) > secIdx && len(t.current) > 0 {
353 start := fileStart + t.current[0].runeOffset
354 end := start + t.patternSize
355 if start >= sections[secIdx].End {
356 secIdx++
357 continue
358 }
359
360 if start < sections[secIdx].Start {
361 t.current = t.current[1:]
362 continue
363 }
364
365 if end <= sections[secIdx].End {
366 t.current[0].symbol = true
367 t.current[0].symbolIdx = uint32(secIdx)
368 trimmed = append(trimmed, t.current[0])
369 }
370
371 t.current = t.current[1:]
372 }
373 t.current = trimmed
374}
375
376// all prepare methods
377
378func (t *bruteForceMatchTree) prepare(doc uint32) {
379 t.docID = doc
380 t.firstDone = true
381}
382
383func (t *docMatchTree) prepare(doc uint32) {
384 t.docID = doc
385 t.firstDone = true
386}
387
388func (t *andMatchTree) prepare(doc uint32) {
389 for _, c := range t.children {
390 c.prepare(doc)
391 }
392}
393
394func (t *regexpMatchTree) prepare(doc uint32) {
395 t.found = t.found[:0]
396 t.reEvaluated = false
397 t.bruteForceMatchTree.prepare(doc)
398}
399
400func (t *wordMatchTree) prepare(doc uint32) {
401 t.found = t.found[:0]
402 t.evaluated = false
403 t.bruteForceMatchTree.prepare(doc)
404}
405
406func (t *orMatchTree) prepare(doc uint32) {
407 for _, c := range t.children {
408 c.prepare(doc)
409 }
410}
411
412func (t *notMatchTree) prepare(doc uint32) {
413 t.child.prepare(doc)
414}
415
416func (t *fileNameMatchTree) prepare(doc uint32) {
417 t.child.prepare(doc)
418}
419
420func (t *boostMatchTree) prepare(doc uint32) {
421 t.child.prepare(doc)
422}
423
424func (t *substrMatchTree) prepare(nextDoc uint32) {
425 t.matchIterator.prepare(nextDoc)
426 t.current = t.matchIterator.candidates()
427 t.contEvaluated = false
428}
429
430func (t *branchQueryMatchTree) prepare(doc uint32) {
431 t.firstDone = true
432 t.docID = doc
433}
434
435// nextDoc
436
437func (t *docMatchTree) nextDoc() uint32 {
438 var start uint32
439 if t.firstDone {
440 start = t.docID + 1
441 }
442 for i := start; i < t.numDocs; i++ {
443 if t.predicate(i) {
444 return i
445 }
446 }
447 return math.MaxUint32
448}
449
450func (t *bruteForceMatchTree) nextDoc() uint32 {
451 if !t.firstDone {
452 return 0
453 }
454 return t.docID + 1
455}
456
457func (t *andMatchTree) nextDoc() uint32 {
458 var max uint32
459 for _, c := range t.children {
460 m := c.nextDoc()
461 if m > max {
462 max = m
463 }
464 }
465 return max
466}
467
468func (t *orMatchTree) nextDoc() uint32 {
469 min := uint32(math.MaxUint32)
470 for _, c := range t.children {
471 m := c.nextDoc()
472 if m < min {
473 min = m
474 }
475 }
476 return min
477}
478
479func (t *notMatchTree) nextDoc() uint32 {
480 return 0
481}
482
483func (t *fileNameMatchTree) nextDoc() uint32 {
484 return t.child.nextDoc()
485}
486
487func (t *boostMatchTree) nextDoc() uint32 {
488 return t.child.nextDoc()
489}
490
491func (t *branchQueryMatchTree) nextDoc() uint32 {
492 var start uint32
493 if t.firstDone {
494 start = t.docID + 1
495 }
496
497 for i := start; i < uint32(len(t.fileMasks)); i++ {
498 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
499 return i
500 }
501 }
502 return math.MaxUint32
503}
504
505// all String methods
506
507func (t *bruteForceMatchTree) String() string {
508 return "all"
509}
510
511func (t *docMatchTree) String() string {
512 return fmt.Sprintf("doc(%s)", t.reason)
513}
514
515func (t *andMatchTree) String() string {
516 return fmt.Sprintf("and%v", t.children)
517}
518
519func (t *regexpMatchTree) String() string {
520 f := ""
521 if t.fileName {
522 f = "f"
523 }
524 return fmt.Sprintf("%sre(%s)", f, t.regexp)
525}
526
527func (t *wordMatchTree) String() string {
528 f := ""
529 if t.fileName {
530 f = "f"
531 }
532 return fmt.Sprintf("%sword(%s)", f, t.word)
533}
534
535func (t *orMatchTree) String() string {
536 return fmt.Sprintf("or%v", t.children)
537}
538
539func (t *notMatchTree) String() string {
540 return fmt.Sprintf("not(%v)", t.child)
541}
542
543func (t *noVisitMatchTree) String() string {
544 return fmt.Sprintf("novisit(%v)", t.matchTree)
545}
546
547func (t *fileNameMatchTree) String() string {
548 return fmt.Sprintf("f(%v)", t.child)
549}
550
551func (t *boostMatchTree) String() string {
552 return fmt.Sprintf("boost(%f, %v)", t.boost, t.child)
553}
554
555func (t *substrMatchTree) String() string {
556 f := ""
557 if t.fileName {
558 f = "f"
559 }
560
561 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
562}
563
564func (t *branchQueryMatchTree) String() string {
565 return fmt.Sprintf("branch(%x)", t.masks)
566}
567
568func (t *symbolSubstrMatchTree) String() string {
569 return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
570}
571
572func (t *symbolRegexpMatchTree) String() string {
573 return fmt.Sprintf("symbol(%v)", t.matchTree)
574}
575
576// visitMatches visits all atoms in matchTree. Note: This visits
577// noVisitMatchTree. For collecting matches use visitMatches.
578func visitMatchTree(t matchTree, f func(matchTree)) {
579 switch s := t.(type) {
580 case *andMatchTree:
581 for _, ch := range s.children {
582 visitMatchTree(ch, f)
583 }
584 case *orMatchTree:
585 for _, ch := range s.children {
586 visitMatchTree(ch, f)
587 }
588 case *andLineMatchTree:
589 visitMatchTree(&s.andMatchTree, f)
590 case *noVisitMatchTree:
591 visitMatchTree(s.matchTree, f)
592 case *notMatchTree:
593 visitMatchTree(s.child, f)
594 case *fileNameMatchTree:
595 visitMatchTree(s.child, f)
596 case *boostMatchTree:
597 visitMatchTree(s.child, f)
598 case *symbolSubstrMatchTree:
599 visitMatchTree(s.substrMatchTree, f)
600 case *symbolRegexpMatchTree:
601 visitMatchTree(s.matchTree, f)
602 default:
603 f(t)
604 }
605}
606
607// updateMatchTreeStats calls updateStats on all atoms in mt which have that
608// function defined.
609func updateMatchTreeStats(mt matchTree, stats *zoekt.Stats) {
610 visitMatchTree(mt, func(mt matchTree) {
611 if atom, ok := mt.(interface{ updateStats(*zoekt.Stats) }); ok {
612 atom.updateStats(stats)
613 }
614 })
615}
616
617func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) {
618 visitMatches(t, known, 1, func(mt matchTree, _ float64) {
619 f(mt)
620 })
621}
622
623// visitMatches visits all atoms which can contribute matches. Note: This
624// skips noVisitMatchTree.
625func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) {
626 switch s := t.(type) {
627 case *andMatchTree:
628 for _, ch := range s.children {
629 if known[ch] {
630 visitMatches(ch, known, weight, f)
631 }
632 }
633 case *andLineMatchTree:
634 visitMatches(&s.andMatchTree, known, weight, f)
635 case *orMatchTree:
636 for _, ch := range s.children {
637 if known[ch] {
638 visitMatches(ch, known, weight, f)
639 }
640 }
641 case *boostMatchTree:
642 visitMatches(s.child, known, weight*s.boost, f)
643 case *symbolSubstrMatchTree:
644 visitMatches(s.substrMatchTree, known, weight, f)
645 case *notMatchTree:
646 case *noVisitMatchTree:
647 // don't collect into negative trees.
648 case *fileNameMatchTree:
649 // We will just gather the filename if we do not visit this tree.
650 default:
651 f(s, weight)
652 }
653}
654
655// all matches() methods.
656
657func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
658 return matchesStatePred(t.predicate(cp.idx))
659}
660
661func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
662 return matchesFound
663}
664
665// andLineMatchTree is a performance optimization of andMatchTree. For content
666// searches we don't want to run the regex engine if there is no line that
667// contains matches from all terms.
668func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
669 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
670 return state
671 }
672
673 // Invariant: all children have matches. If any line contains all of them we
674 // can return MatchesFound.
675
676 // find child with fewest candidates
677 minCount := math.MaxInt
678 fewestChildren := 0
679 for ix, child := range t.children {
680 v, ok := child.(*substrMatchTree)
681 // make sure we are running a content search and that all candidates are a
682 // substrMatchTree
683 if !ok || v.fileName {
684 return matchesFound
685 }
686 if len(v.current) < minCount {
687 minCount = len(v.current)
688 fewestChildren = ix
689 }
690 }
691
692 type lineRange struct {
693 start int
694 end int
695 }
696 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
697 prev := -1
698 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
699 line := cp.newlines().atOffset(candidate.byteOffset)
700 if line == prev {
701 continue
702 }
703 prev = line
704 byteStart := int(cp.newlines().lineStart(line))
705 byteEnd := int(cp.newlines().lineStart(line + 1))
706 lines = append(lines, lineRange{byteStart, byteEnd})
707 }
708
709 // children keeps track of the children's candidates we have already seen.
710 children := make([][]*candidateMatch, 0, len(t.children)-1)
711 for j, child := range t.children {
712 if j == fewestChildren {
713 continue
714 }
715 children = append(children, child.(*substrMatchTree).current)
716 }
717
718nextLine:
719 for i := 0; i < len(lines); i++ {
720 hits := 1
721 nextChild:
722 for j := range children {
723 nextCandidate:
724 for len(children[j]) > 0 {
725 candidate := children[j][0]
726 bo := int(cp.findOffset(false, candidate.runeOffset))
727 if bo < lines[i].start {
728 children[j] = children[j][1:]
729 continue nextCandidate
730 }
731 if bo < lines[i].end {
732 hits++
733 continue nextChild
734 }
735 // move the `lines` iterator forward until bo < line.end
736 for i < len(lines) && bo >= lines[i].end {
737 i++
738 }
739 i--
740 continue nextLine
741 }
742 }
743 // return early once we found any line that contains matches from all children
744 if hits == len(t.children) {
745 return matchesFound
746 }
747 }
748 return matchesNone
749}
750
751func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
752 // We have found matches unless a child needs to do more work or it hasn't
753 // found matches.
754 state := matchesFound
755
756 for _, ch := range t.children {
757 switch evalMatchTree(cp, cost, known, ch) {
758 case matchesRequiresHigherCost:
759 // keep evaluating other children incase we come across matchesNone
760 state = matchesRequiresHigherCost
761 case matchesFound:
762 // will return this if every child has this value
763 case matchesNone:
764 return matchesNone
765 }
766 }
767
768 return state
769}
770
771func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
772 // we could short-circuit, but we want to use the other possibilities as a
773 // ranking signal. So we always return the most conservative state.
774 state := matchesNone
775 for _, ch := range t.children {
776 switch evalMatchTree(cp, cost, known, ch) {
777 case matchesRequiresHigherCost:
778 state = matchesRequiresHigherCost
779 case matchesFound:
780 if state != matchesRequiresHigherCost {
781 state = matchesFound
782 }
783 case matchesNone:
784 // noop
785 }
786 }
787 return state
788}
789
790func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
791 return matchesStatePred(t.branchMask() != 0)
792}
793
794func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
795 if t.reEvaluated {
796 return matchesStateForSlice(t.found)
797 }
798
799 if cost < costRegexp {
800 return matchesRequiresHigherCost
801 }
802
803 cp.stats.RegexpsConsidered++
804 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
805 found := t.found[:0]
806 for _, idx := range idxs {
807 cm := &candidateMatch{
808 byteOffset: uint32(idx[0]),
809 byteMatchSz: uint32(idx[1] - idx[0]),
810 fileName: t.fileName,
811 }
812
813 found = append(found, cm)
814 }
815 t.found = found
816 t.reEvaluated = true
817
818 return matchesStateForSlice(t.found)
819}
820
821func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
822 if t.evaluated {
823 return matchesStateForSlice(t.found)
824 }
825
826 if cost < costRegexp {
827 return matchesRequiresHigherCost
828 }
829
830 data := cp.data(t.fileName)
831 offset := 0
832 found := t.found[:0]
833 for {
834 idx := bytes.Index(data[offset:], []byte(t.word))
835 if idx < 0 {
836 break
837 }
838
839 relStartOffset := offset + idx
840 relEndOffset := relStartOffset + len(t.word)
841
842 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
843 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
844 if startBoundary && endBoundary {
845 found = append(found, &candidateMatch{
846 byteOffset: uint32(offset + idx),
847 byteMatchSz: uint32(len(t.word)),
848 fileName: t.fileName,
849 })
850 }
851 offset += idx + len(t.word)
852 }
853
854 t.found = found
855 t.evaluated = true
856
857 return matchesStateForSlice(t.found)
858}
859
860// breakMatchesOnNewlines returns matches resulting from breaking each element
861// of cms on newlines within text.
862func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
863 var lineCMs []*candidateMatch
864 for _, cm := range cms {
865 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
866 }
867 return lineCMs
868}
869
870// breakOnNewlines returns matches resulting from breaking cm on newlines
871// within text.
872func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
873 var cms []*candidateMatch
874 addMe := &candidateMatch{}
875 *addMe = *cm
876 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
877 if text[i] == '\n' {
878 addMe.byteMatchSz = i - addMe.byteOffset
879 if addMe.byteMatchSz != 0 {
880 cms = append(cms, addMe)
881 }
882
883 addMe = &candidateMatch{}
884 *addMe = *cm
885 addMe.byteOffset = i + 1
886 }
887 }
888 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
889 if addMe.byteMatchSz != 0 {
890 cms = append(cms, addMe)
891 }
892 return cms
893}
894
895// evalMatchTree should be called instead of directly calling
896// matchTree.matches. It cache known values for future evaluation at higher
897// costs.
898func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
899 if v, ok := known[mt]; ok {
900 return matchesStatePred(v)
901 }
902
903 ms := mt.matches(cp, cost, known)
904 if ms != matchesRequiresHigherCost {
905 known[mt] = ms == matchesFound
906 }
907
908 return ms
909}
910
911func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
912 switch evalMatchTree(cp, cost, known, t.child) {
913 case matchesRequiresHigherCost:
914 return matchesRequiresHigherCost
915 case matchesFound:
916 return matchesNone
917 case matchesNone:
918 return matchesFound
919 default:
920 panic("unreachable")
921 }
922}
923
924func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
925 return evalMatchTree(cp, cost, known, t.child)
926}
927
928func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
929 return evalMatchTree(cp, cost, known, t.child)
930}
931
932func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
933 if t.contEvaluated {
934 return matchesStateForSlice(t.current)
935 }
936
937 if len(t.current) == 0 {
938 return matchesNone
939 }
940
941 if t.fileName && cost < costMemory {
942 return matchesRequiresHigherCost
943 }
944
945 if !t.fileName && cost < costContent {
946 return matchesRequiresHigherCost
947 }
948
949 pruned := t.current[:0]
950 for _, m := range t.current {
951 if m.byteOffset == 0 && m.runeOffset > 0 {
952 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
953 }
954 if m.matchContent(cp.data(m.fileName)) {
955 pruned = append(pruned, m)
956 }
957 }
958 t.current = pruned
959 t.contEvaluated = true
960
961 return matchesStateForSlice(t.current)
962}
963
964type matchTreeOpt struct {
965 // DisableWordMatchOptimization is used to disable the use of wordMatchTree.
966 // This was added since we do not support wordMatchTree with symbol search.
967 DisableWordMatchOptimization bool
968}
969
970func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
971 if q == nil {
972 return nil, fmt.Errorf("got nil (sub)query")
973 }
974 switch s := q.(type) {
975 case *query.Regexp:
976 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
977 // superset of the regexp. If the returned matchTree is equivalent to the
978 // original regexp, it returns true. An equivalent matchTree has the same
979 // behaviour as the original regexp and can be used instead.
980 //
981 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
982 if err != nil {
983 return nil, err
984 }
985 // if the query can be used in place of the regexp
986 // return the subtree
987 if isEq {
988 return subMT, nil
989 }
990
991 var tr matchTree
992 if wmt, ok := regexpToWordMatchTree(s, opt); ok {
993 // A common search we get is "\bLITERAL\b". Avoid the regex engine and
994 // provide something faster.
995 tr = wmt
996 } else {
997 tr = newRegexpMatchTree(s)
998 }
999
1000 return &andMatchTree{
1001 children: []matchTree{
1002 tr, &noVisitMatchTree{subMT},
1003 },
1004 }, nil
1005 case *query.And:
1006 var r []matchTree
1007 for _, ch := range s.Children {
1008 ct, err := d.newMatchTree(ch, opt)
1009 if err != nil {
1010 return nil, err
1011 }
1012 r = append(r, ct)
1013 }
1014 return &andMatchTree{r}, nil
1015 case *query.Or:
1016 var r []matchTree
1017 for _, ch := range s.Children {
1018 ct, err := d.newMatchTree(ch, opt)
1019 if err != nil {
1020 return nil, err
1021 }
1022 r = append(r, ct)
1023 }
1024 return &orMatchTree{r}, nil
1025 case *query.Not:
1026 ct, err := d.newMatchTree(s.Child, opt)
1027 return ¬MatchTree{
1028 child: ct,
1029 }, err
1030
1031 case *query.Type:
1032 if s.Type != query.TypeFileName {
1033 break
1034 }
1035
1036 ct, err := d.newMatchTree(s.Child, opt)
1037 if err != nil {
1038 return nil, err
1039 }
1040
1041 return &fileNameMatchTree{
1042 child: ct,
1043 }, nil
1044
1045 case *query.Boost:
1046 ct, err := d.newMatchTree(s.Child, opt)
1047 if err != nil {
1048 return nil, err
1049 }
1050
1051 return &boostMatchTree{
1052 child: ct,
1053 boost: s.Boost,
1054 }, nil
1055
1056 case *query.Substring:
1057 return d.newSubstringMatchTree(s)
1058
1059 case *query.Branch:
1060 masks := make([]uint64, 0, len(d.repoMetaData))
1061 if s.Pattern == "HEAD" {
1062 for range d.repoMetaData {
1063 masks = append(masks, 1)
1064 }
1065 } else {
1066 for _, branchIDs := range d.branchIDs {
1067 mask := uint64(0)
1068 for nm, m := range branchIDs {
1069 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1070 mask |= uint64(m)
1071 }
1072 }
1073 masks = append(masks, mask)
1074 }
1075 }
1076 return &branchQueryMatchTree{
1077 masks: masks,
1078 fileMasks: d.fileBranchMasks,
1079 repos: d.repos,
1080 }, nil
1081 case *query.Const:
1082 if s.Value {
1083 return &bruteForceMatchTree{}, nil
1084 } else {
1085 return &noMatchTree{Why: "const"}, nil
1086 }
1087 case *query.Language:
1088 code, ok := d.metaData.LanguageMap[s.Language]
1089 if !ok {
1090 return &noMatchTree{Why: "lang"}, nil
1091 }
1092 return &docMatchTree{
1093 reason: "language",
1094 numDocs: d.numDocs(),
1095 predicate: func(docID uint32) bool {
1096 return d.getLanguage(docID) == code
1097 },
1098 }, nil
1099
1100 case *query.Symbol:
1101 // Disable WordMatchTree since we don't support it in symbols yet.
1102 optCopy := opt
1103 optCopy.DisableWordMatchOptimization = true
1104
1105 subMT, err := d.newMatchTree(s.Expr, optCopy)
1106 if err != nil {
1107 return nil, err
1108 }
1109
1110 if substr, ok := subMT.(*substrMatchTree); ok {
1111 return &symbolSubstrMatchTree{
1112 substrMatchTree: substr,
1113 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)),
1114 fileEndRunes: d.fileEndRunes,
1115 fileEndSymbol: d.fileEndSymbol,
1116 sections: d.runeDocSections,
1117 }, nil
1118 }
1119
1120 var regexpMT *regexpMatchTree
1121 visitMatchTree(subMT, func(mt matchTree) {
1122 if t, ok := mt.(*regexpMatchTree); ok {
1123 regexpMT = t
1124 }
1125 })
1126 if regexpMT == nil {
1127 return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1128 }
1129
1130 return &symbolRegexpMatchTree{
1131 regexp: regexpMT.regexp,
1132 all: isRegexpAll(regexpMT.origRegexp),
1133 matchTree: subMT,
1134 }, nil
1135
1136 case *query.FileNameSet:
1137 return &docMatchTree{
1138 reason: "FileNameSet",
1139 numDocs: d.numDocs(),
1140 predicate: func(docID uint32) bool {
1141 fileName := d.fileName(docID)
1142 _, ok := s.Set[string(fileName)]
1143 return ok
1144 },
1145 }, nil
1146
1147 case *query.BranchesRepos:
1148 reposBranchesWant := make([]uint64, len(d.repoMetaData))
1149 for repoIdx := range d.repoMetaData {
1150 var mask uint64
1151 for _, br := range s.List {
1152 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1153 mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1154 }
1155 }
1156 reposBranchesWant[repoIdx] = mask
1157 }
1158 return &docMatchTree{
1159 reason: "BranchesRepos",
1160 numDocs: d.numDocs(),
1161 predicate: func(docID uint32) bool {
1162 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1163 },
1164 }, nil
1165
1166 case *query.RepoSet:
1167 reposWant := make([]bool, len(d.repoMetaData))
1168 for repoIdx, r := range d.repoMetaData {
1169 if _, ok := s.Set[r.Name]; ok {
1170 reposWant[repoIdx] = true
1171 }
1172 }
1173 return &docMatchTree{
1174 reason: "RepoSet",
1175 numDocs: d.numDocs(),
1176 predicate: func(docID uint32) bool {
1177 return reposWant[d.repos[docID]]
1178 },
1179 }, nil
1180
1181 case *query.RepoIDs:
1182 reposWant := make([]bool, len(d.repoMetaData))
1183 for repoIdx, r := range d.repoMetaData {
1184 if s.Repos.Contains(r.ID) {
1185 reposWant[repoIdx] = true
1186 }
1187 }
1188 return &docMatchTree{
1189 reason: "RepoIDs",
1190 numDocs: d.numDocs(),
1191 predicate: func(docID uint32) bool {
1192 return reposWant[d.repos[docID]]
1193 },
1194 }, nil
1195
1196 case *query.Repo:
1197 reposWant := make([]bool, len(d.repoMetaData))
1198 for repoIdx, r := range d.repoMetaData {
1199 if s.Regexp.MatchString(r.Name) {
1200 reposWant[repoIdx] = true
1201 }
1202 }
1203 return &docMatchTree{
1204 reason: "Repo",
1205 numDocs: d.numDocs(),
1206 predicate: func(docID uint32) bool {
1207 return reposWant[d.repos[docID]]
1208 },
1209 }, nil
1210
1211 case *query.RepoRegexp:
1212 reposWant := make([]bool, len(d.repoMetaData))
1213 for repoIdx, r := range d.repoMetaData {
1214 if s.Regexp.MatchString(r.Name) {
1215 reposWant[repoIdx] = true
1216 }
1217 }
1218 return &docMatchTree{
1219 reason: "RepoRegexp",
1220 numDocs: d.numDocs(),
1221 predicate: func(docID uint32) bool {
1222 return reposWant[d.repos[docID]]
1223 },
1224 }, nil
1225
1226 case query.RawConfig:
1227 return &docMatchTree{
1228 reason: s.String(),
1229 numDocs: d.numDocs(),
1230 predicate: func(docID uint32) bool {
1231 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1232 },
1233 }, nil
1234 }
1235 log.Panicf("type %T", q)
1236 return nil, nil
1237}
1238
1239func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1240 st := &substrMatchTree{
1241 query: s,
1242 caseSensitive: s.CaseSensitive,
1243 fileName: s.FileName,
1244 }
1245
1246 if utf8.RuneCountInString(s.Pattern) < ngramSize {
1247 return newRegexpMatchTree(&query.Regexp{
1248 Regexp: &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)},
1249 FileName: s.FileName,
1250 Content: s.Content,
1251 CaseSensitive: s.CaseSensitive,
1252 }), nil
1253 }
1254
1255 result, err := d.iterateNgrams(s)
1256 if err != nil {
1257 return nil, err
1258 }
1259 st.matchIterator = result
1260 return st, nil
1261}
1262
1263func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1264 if opt.DisableWordMatchOptimization {
1265 return nil, false
1266 }
1267 // Needs to be case sensitive
1268 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1269 return nil, false
1270 }
1271 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1272 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1273 return nil, false
1274 }
1275 sub := q.Regexp.Sub
1276 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1277 return nil, false
1278 }
1279
1280 return &wordMatchTree{
1281 word: string(sub[1].Rune),
1282 fileName: q.FileName,
1283 }, true
1284}
1285
1286// pruneMatchTree removes impossible branches from the matchTree, as indicated
1287// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1288func pruneMatchTree(mt matchTree) (matchTree, error) {
1289 var err error
1290 switch mt := mt.(type) {
1291 // leaf nodes that we test with the filter:
1292 case *substrMatchTree:
1293 if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1294 if _, ok := res.matchIterator.(*noMatchTree); ok {
1295 return nil, nil
1296 }
1297 }
1298 // recursive tree structures:
1299 case *andMatchTree:
1300 // Any branch of an and becoming impossible means the entire clause
1301 // is impossible. Otherwise, just handle rewrites.
1302 for i, child := range mt.children {
1303 newChild, err := pruneMatchTree(child)
1304 if err != nil {
1305 return nil, err
1306 }
1307 if newChild == nil {
1308 return nil, nil
1309 }
1310 mt.children[i] = newChild
1311 }
1312 case *orMatchTree:
1313 // *All* branches of an OR becoming impossible means the entire clause
1314 // is impossible. Otherwise, drop impossible subclauses and handle
1315 // rewrites, including simplifying to a singular resulting child branch.
1316 n := 0
1317 for _, child := range mt.children {
1318 newChild, err := pruneMatchTree(child)
1319 if err != nil {
1320 return nil, err
1321 }
1322 if newChild != nil {
1323 mt.children[n] = newChild
1324 n++
1325 }
1326 }
1327 mt.children = mt.children[:n]
1328 if len(mt.children) == 1 {
1329 return mt.children[0], nil
1330 } else if len(mt.children) == 0 {
1331 return nil, nil
1332 }
1333 case *noVisitMatchTree:
1334 mt.matchTree, err = pruneMatchTree(mt.matchTree)
1335 if err != nil {
1336 return nil, err
1337 }
1338 if mt.matchTree == nil {
1339 return nil, nil
1340 }
1341 case *fileNameMatchTree:
1342 mt.child, err = pruneMatchTree(mt.child)
1343 if err != nil {
1344 return nil, err
1345 }
1346 if mt.child == nil {
1347 return nil, nil
1348 }
1349 case *boostMatchTree:
1350 mt.child, err = pruneMatchTree(mt.child)
1351 if err != nil {
1352 return nil, err
1353 }
1354 if mt.child == nil {
1355 return nil, nil
1356 }
1357 case *andLineMatchTree:
1358 child, err := pruneMatchTree(&mt.andMatchTree)
1359 if err != nil {
1360 return nil, err
1361 }
1362 if child == nil {
1363 return nil, nil
1364 }
1365 if c, ok := child.(*andMatchTree); ok {
1366 mt.andMatchTree = *c
1367 } else {
1368 // the and was simplified to a single clause,
1369 // so the linematch portion is irrelevant.
1370 return mt, nil
1371 }
1372 case *notMatchTree:
1373 mt.child, err = pruneMatchTree(mt.child)
1374 if err != nil {
1375 return nil, err
1376 }
1377 if mt.child == nil {
1378 // not false => true
1379 return &bruteForceMatchTree{}, nil
1380 }
1381 // unhandled:
1382 case *docMatchTree:
1383 case *bruteForceMatchTree:
1384 case *regexpMatchTree:
1385 case *wordMatchTree:
1386 }
1387 return mt, err
1388}
1389
1390// isRegexpAll returns true if the query matches all possible lines.
1391//
1392// Note: it is possible for a funky regex to actually match all but this
1393// returns false. This returns true for normal looking regexes like ".*" or
1394// "(?-s:.*)".
1395func isRegexpAll(r *syntax.Regexp) bool {
1396 // Note: we don't care about any flags since we are looking for .* and we
1397 // don't mind if . matches all or everything but newline.
1398
1399 // for loop is for visiting children of OpCapture/OpConcat until we hit .*
1400 for {
1401 // Our main target: .*
1402 if r.Op == syntax.OpStar && len(r.Sub) == 1 { // *
1403 // (?s:.) or (?-s:.)
1404 return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL
1405 }
1406
1407 // Strip away expressions being wrapped in paranthesis
1408 if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 {
1409 r = r.Sub[0]
1410 continue
1411 }
1412
1413 return false
1414 }
1415}