fork of https://github.com/sourcegraph/zoekt
1// Copyright 2018 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "math"
22 "regexp/syntax"
23 "strings"
24 "unicode/utf8"
25
26 "github.com/cespare/xxhash/v2"
27 "github.com/grafana/regexp"
28
29 "github.com/sourcegraph/zoekt"
30 "github.com/sourcegraph/zoekt/internal/syntaxutil"
31 "github.com/sourcegraph/zoekt/query"
32)
33
34// A docIterator iterates over documents in order.
35type docIterator interface {
36 // provide the next document where we may find something interesting.
37 //
38 // This is like a "peek" and shouldn't mutate state. prepare is what should
39 // change state.
40 nextDoc() uint32
41
42 // clears any per-document state of the docIterator, and
43 // prepares for evaluating the given doc. The argument is
44 // strictly increasing over time.
45 prepare(nextDoc uint32)
46}
47
48// costs are passed in increasing order to matchTree.matches until they do not
49// return matchesRequiresHigherCost.
50const (
51 costConst = 0
52 costMemory = 1
53 costContent = 2
54 costRegexp = 3
55)
56
57const (
58 costMin = costConst
59 costMax = costRegexp
60)
61
62// matchesState is an enum for the state of a matchTree after a call to
63// matchTree.matches.
64type matchesState uint8
65
66const (
67 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done
68 // a search yet since the cost value is not high enough.
69 matchesRequiresHigherCost matchesState = iota
70
71 // matchesFound is returned when matchTree.matches has done a search and
72 // found one or more matches.
73 matchesFound
74
75 // matchesNone is returned when matchTree.matches has done a search and
76 // found nothing.
77 matchesNone
78)
79
80// matchesStatePred is a helper which returns matchesFound if b is true
81// otherwise returns matchesNone.
82func matchesStatePred(b bool) matchesState {
83 if b {
84 return matchesFound
85 }
86 return matchesNone
87}
88
89// matchesStateForSlice is a helper which returns matchesFound if v is
90// non-empty otherwise returns matchesNone.
91func matchesStateForSlice[T any](v []T) matchesState {
92 return matchesStatePred(len(v) > 0)
93}
94
95// An expression tree coupled with matches. The matchtree has two
96// functions:
97//
98// * it implements boolean combinations (and, or, not)
99//
100// * it implements shortcuts, where we skip documents (for example: if
101// there are no trigram matches, we can be sure there are no substring
102// matches). The matchtree iterates over the documents as they are
103// ordered in the shard.
104//
105// The general process for a given (shard, query) is:
106//
107// - construct matchTree for the query
108//
109// - find all different leaf matchTrees (substring, regexp, etc.)
110//
111// in a loop:
112//
113// - find next doc to process using nextDoc
114//
115// - evaluate atoms (leaf expressions that match text)
116//
117// - evaluate the tree using matches(), storing the result in map.
118//
119// - if the complete tree returns (matches() != matchesRequiresHigherCost)
120// for the document, collect all text matches by looking at leaf
121// matchTrees.
122type matchTree interface {
123 docIterator
124
125 // matches if cost is high enough. See documentation for matchesState's
126 // values.
127 //
128 // Note: Do not call this directly, rather use evalMatchTree which uses
129 // known to cache responses once the state transitions away from
130 // matchesRequiresHigherCost.
131 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
132}
133
134// docMatchTree iterates over documents for which predicate(docID) returns true.
135type docMatchTree struct {
136 // the number of documents in a shard.
137 numDocs uint32
138
139 predicate func(docID uint32) bool
140
141 // provides additional information about the reason why the docMatchTree was
142 // created.
143 reason string
144
145 // mutable
146 firstDone bool
147 docID uint32
148}
149
150type bruteForceMatchTree struct {
151 // mutable
152 firstDone bool
153 docID uint32
154}
155
156type andLineMatchTree struct {
157 andMatchTree
158}
159
160type andMatchTree struct {
161 children []matchTree
162}
163
164type orMatchTree struct {
165 children []matchTree
166}
167
168type notMatchTree struct {
169 child matchTree
170}
171
172// Returns only the filename of child matches.
173type fileNameMatchTree struct {
174 child matchTree
175}
176
177type boostMatchTree struct {
178 child matchTree
179 boost float64
180}
181
182// Don't visit this subtree for collecting matches.
183type noVisitMatchTree struct {
184 matchTree
185}
186
187type regexpMatchTree struct {
188 regexp *regexp.Regexp
189
190 // origRegexp is the original parsed regexp from the query structure. It
191 // does not include mutations such as case sensitivity.
192 origRegexp *syntax.Regexp
193
194 fileName bool
195
196 // mutable
197 reEvaluated bool
198 found []*candidateMatch
199
200 // nextDoc, prepare.
201 bruteForceMatchTree
202}
203
204func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree {
205 prefix := ""
206 if !s.CaseSensitive {
207 prefix = "(?i)"
208 }
209
210 return ®expMatchTree{
211 regexp: regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)),
212 origRegexp: s.Regexp,
213 fileName: s.FileName,
214 }
215}
216
217// \bLITERAL\b
218type wordMatchTree struct {
219 word string
220
221 fileName bool
222
223 // mutable
224 evaluated bool
225 found []*candidateMatch
226
227 // nextDoc, prepare.
228 bruteForceMatchTree
229}
230
231type substrMatchTree struct {
232 matchIterator
233
234 query *query.Substring
235 caseSensitive bool
236 fileName bool
237
238 // mutable
239 current []*candidateMatch
240 contEvaluated bool
241}
242
243type branchQueryMatchTree struct {
244 fileMasks []uint64
245 masks []uint64
246 repos []uint16
247
248 // mutable
249 firstDone bool
250 docID uint32
251}
252
253func (t *branchQueryMatchTree) branchMask() uint64 {
254 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
255}
256
257type symbolRegexpMatchTree struct {
258 matchTree
259 regexp *regexp.Regexp
260 all bool // skips regex match if .*
261
262 reEvaluated bool
263 found []*candidateMatch
264}
265
266func (t *symbolRegexpMatchTree) prepare(doc uint32) {
267 t.reEvaluated = false
268 t.found = t.found[:0]
269 t.matchTree.prepare(doc)
270}
271
272func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
273 if t.reEvaluated {
274 return matchesStateForSlice(t.found)
275 }
276
277 if cost < costRegexp {
278 return matchesRequiresHigherCost
279 }
280
281 sections := cp.docSections()
282 content := cp.data(false)
283
284 found := t.found[:0]
285 for i, sec := range sections {
286 var idx []int
287 if t.all {
288 idx = []int{0, int(sec.End - sec.Start)}
289 } else {
290 idx = t.regexp.FindIndex(content[sec.Start:sec.End])
291 if idx == nil {
292 continue
293 }
294 }
295
296 cm := &candidateMatch{
297 byteOffset: sec.Start + uint32(idx[0]),
298 byteMatchSz: uint32(idx[1] - idx[0]),
299 symbol: true,
300 symbolIdx: uint32(i),
301 }
302 found = append(found, cm)
303 }
304 t.found = found
305 t.reEvaluated = true
306
307 return matchesStateForSlice(t.found)
308}
309
310type symbolSubstrMatchTree struct {
311 *substrMatchTree
312
313 patternSize uint32
314 fileEndRunes []uint32
315 fileEndSymbol []uint32
316
317 doc uint32
318 sections []DocumentSection
319
320 secID uint32
321}
322
323func (t *symbolSubstrMatchTree) prepare(doc uint32) {
324 t.substrMatchTree.prepare(doc)
325 t.doc = doc
326
327 var fileStart uint32
328 if doc > 0 {
329 fileStart = t.fileEndRunes[doc-1]
330 }
331
332 var sections []DocumentSection
333 if len(t.sections) > 0 {
334 most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
335 if most == uint32(len(t.sections)) {
336 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
337 } else {
338 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
339 t.secID++
340 }
341
342 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
343 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
344 symbolEnd++
345 }
346
347 sections = t.sections[t.secID:symbolEnd]
348 }
349 }
350
351 secIdx := 0
352 trimmed := t.current[:0]
353 for len(sections) > secIdx && len(t.current) > 0 {
354 start := fileStart + t.current[0].runeOffset
355 end := start + t.patternSize
356 if start >= sections[secIdx].End {
357 secIdx++
358 continue
359 }
360
361 if start < sections[secIdx].Start {
362 t.current = t.current[1:]
363 continue
364 }
365
366 if end <= sections[secIdx].End {
367 t.current[0].symbol = true
368 t.current[0].symbolIdx = uint32(secIdx)
369 trimmed = append(trimmed, t.current[0])
370 }
371
372 t.current = t.current[1:]
373 }
374 t.current = trimmed
375}
376
377// all prepare methods
378
379func (t *bruteForceMatchTree) prepare(doc uint32) {
380 t.docID = doc
381 t.firstDone = true
382}
383
384func (t *docMatchTree) prepare(doc uint32) {
385 t.docID = doc
386 t.firstDone = true
387}
388
389func (t *andMatchTree) prepare(doc uint32) {
390 for _, c := range t.children {
391 c.prepare(doc)
392 }
393}
394
395func (t *regexpMatchTree) prepare(doc uint32) {
396 t.found = t.found[:0]
397 t.reEvaluated = false
398 t.bruteForceMatchTree.prepare(doc)
399}
400
401func (t *wordMatchTree) prepare(doc uint32) {
402 t.found = t.found[:0]
403 t.evaluated = false
404 t.bruteForceMatchTree.prepare(doc)
405}
406
407func (t *orMatchTree) prepare(doc uint32) {
408 for _, c := range t.children {
409 c.prepare(doc)
410 }
411}
412
413func (t *notMatchTree) prepare(doc uint32) {
414 t.child.prepare(doc)
415}
416
417func (t *fileNameMatchTree) prepare(doc uint32) {
418 t.child.prepare(doc)
419}
420
421func (t *boostMatchTree) prepare(doc uint32) {
422 t.child.prepare(doc)
423}
424
425func (t *substrMatchTree) prepare(nextDoc uint32) {
426 t.matchIterator.prepare(nextDoc)
427 t.current = t.matchIterator.candidates()
428 t.contEvaluated = false
429}
430
431func (t *branchQueryMatchTree) prepare(doc uint32) {
432 t.firstDone = true
433 t.docID = doc
434}
435
436// nextDoc
437
438func (t *docMatchTree) nextDoc() uint32 {
439 var start uint32
440 if t.firstDone {
441 start = t.docID + 1
442 }
443 for i := start; i < t.numDocs; i++ {
444 if t.predicate(i) {
445 return i
446 }
447 }
448 return math.MaxUint32
449}
450
451func (t *bruteForceMatchTree) nextDoc() uint32 {
452 if !t.firstDone {
453 return 0
454 }
455 return t.docID + 1
456}
457
458func (t *andMatchTree) nextDoc() uint32 {
459 var max uint32
460 for _, c := range t.children {
461 m := c.nextDoc()
462 if m > max {
463 max = m
464 }
465 }
466 return max
467}
468
469func (t *orMatchTree) nextDoc() uint32 {
470 min := uint32(math.MaxUint32)
471 for _, c := range t.children {
472 m := c.nextDoc()
473 if m < min {
474 min = m
475 }
476 }
477 return min
478}
479
480func (t *notMatchTree) nextDoc() uint32 {
481 return 0
482}
483
484func (t *fileNameMatchTree) nextDoc() uint32 {
485 return t.child.nextDoc()
486}
487
488func (t *boostMatchTree) nextDoc() uint32 {
489 return t.child.nextDoc()
490}
491
492func (t *branchQueryMatchTree) nextDoc() uint32 {
493 var start uint32
494 if t.firstDone {
495 start = t.docID + 1
496 }
497
498 for i := start; i < uint32(len(t.fileMasks)); i++ {
499 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
500 return i
501 }
502 }
503 return math.MaxUint32
504}
505
506// all String methods
507
508func (t *bruteForceMatchTree) String() string {
509 return "all"
510}
511
512func (t *docMatchTree) String() string {
513 return fmt.Sprintf("doc(%s)", t.reason)
514}
515
516func (t *andMatchTree) String() string {
517 return fmt.Sprintf("and%v", t.children)
518}
519
520func (t *regexpMatchTree) String() string {
521 f := ""
522 if t.fileName {
523 f = "f"
524 }
525 return fmt.Sprintf("%sre(%s)", f, t.regexp)
526}
527
528func (t *wordMatchTree) String() string {
529 f := ""
530 if t.fileName {
531 f = "f"
532 }
533 return fmt.Sprintf("%sword(%s)", f, t.word)
534}
535
536func (t *orMatchTree) String() string {
537 return fmt.Sprintf("or%v", t.children)
538}
539
540func (t *notMatchTree) String() string {
541 return fmt.Sprintf("not(%v)", t.child)
542}
543
544func (t *noVisitMatchTree) String() string {
545 return fmt.Sprintf("novisit(%v)", t.matchTree)
546}
547
548func (t *fileNameMatchTree) String() string {
549 return fmt.Sprintf("f(%v)", t.child)
550}
551
552func (t *boostMatchTree) String() string {
553 return fmt.Sprintf("boost(%f, %v)", t.boost, t.child)
554}
555
556func (t *substrMatchTree) String() string {
557 f := ""
558 if t.fileName {
559 f = "f"
560 }
561
562 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
563}
564
565func (t *branchQueryMatchTree) String() string {
566 return fmt.Sprintf("branch(%x)", t.masks)
567}
568
569func (t *symbolSubstrMatchTree) String() string {
570 return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
571}
572
573func (t *symbolRegexpMatchTree) String() string {
574 return fmt.Sprintf("symbol(%v)", t.matchTree)
575}
576
577// visitMatches visits all atoms in matchTree. Note: This visits
578// noVisitMatchTree. For collecting matches use visitMatches.
579func visitMatchTree(t matchTree, f func(matchTree)) {
580 switch s := t.(type) {
581 case *andMatchTree:
582 for _, ch := range s.children {
583 visitMatchTree(ch, f)
584 }
585 case *orMatchTree:
586 for _, ch := range s.children {
587 visitMatchTree(ch, f)
588 }
589 case *andLineMatchTree:
590 visitMatchTree(&s.andMatchTree, f)
591 case *noVisitMatchTree:
592 visitMatchTree(s.matchTree, f)
593 case *notMatchTree:
594 visitMatchTree(s.child, f)
595 case *fileNameMatchTree:
596 visitMatchTree(s.child, f)
597 case *boostMatchTree:
598 visitMatchTree(s.child, f)
599 case *symbolSubstrMatchTree:
600 visitMatchTree(s.substrMatchTree, f)
601 case *symbolRegexpMatchTree:
602 visitMatchTree(s.matchTree, f)
603 default:
604 f(t)
605 }
606}
607
608// updateMatchTreeStats calls updateStats on all atoms in mt which have that
609// function defined.
610func updateMatchTreeStats(mt matchTree, stats *zoekt.Stats) {
611 visitMatchTree(mt, func(mt matchTree) {
612 if atom, ok := mt.(interface{ updateStats(*zoekt.Stats) }); ok {
613 atom.updateStats(stats)
614 }
615 })
616}
617
618func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) {
619 visitMatches(t, known, 1, func(mt matchTree, _ float64) {
620 f(mt)
621 })
622}
623
624// visitMatches visits all atoms which can contribute matches. Note: This
625// skips noVisitMatchTree.
626func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) {
627 switch s := t.(type) {
628 case *andMatchTree:
629 for _, ch := range s.children {
630 if known[ch] {
631 visitMatches(ch, known, weight, f)
632 }
633 }
634 case *andLineMatchTree:
635 visitMatches(&s.andMatchTree, known, weight, f)
636 case *orMatchTree:
637 for _, ch := range s.children {
638 if known[ch] {
639 visitMatches(ch, known, weight, f)
640 }
641 }
642 case *boostMatchTree:
643 visitMatches(s.child, known, weight*s.boost, f)
644 case *symbolSubstrMatchTree:
645 visitMatches(s.substrMatchTree, known, weight, f)
646 case *notMatchTree:
647 case *noVisitMatchTree:
648 // don't collect into negative trees.
649 case *fileNameMatchTree:
650 // We will just gather the filename if we do not visit this tree.
651 default:
652 f(s, weight)
653 }
654}
655
656// all matches() methods.
657
658func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
659 return matchesStatePred(t.predicate(cp.idx))
660}
661
662func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
663 return matchesFound
664}
665
666// andLineMatchTree is a performance optimization of andMatchTree. For content
667// searches we don't want to run the regex engine if there is no line that
668// contains matches from all terms.
669func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
670 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
671 return state
672 }
673
674 // Invariant: all children have matches. If any line contains all of them we
675 // can return MatchesFound.
676
677 // find child with fewest candidates
678 minCount := math.MaxInt
679 fewestChildren := 0
680 for ix, child := range t.children {
681 v, ok := child.(*substrMatchTree)
682 // make sure we are running a content search and that all candidates are a
683 // substrMatchTree
684 if !ok || v.fileName {
685 return matchesFound
686 }
687 if len(v.current) < minCount {
688 minCount = len(v.current)
689 fewestChildren = ix
690 }
691 }
692
693 type lineRange struct {
694 start int
695 end int
696 }
697 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
698 prev := -1
699 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
700 line := cp.newlines().atOffset(candidate.byteOffset)
701 if line == prev {
702 continue
703 }
704 prev = line
705 byteStart := int(cp.newlines().lineStart(line))
706 byteEnd := int(cp.newlines().lineStart(line + 1))
707 lines = append(lines, lineRange{byteStart, byteEnd})
708 }
709
710 // children keeps track of the children's candidates we have already seen.
711 children := make([][]*candidateMatch, 0, len(t.children)-1)
712 for j, child := range t.children {
713 if j == fewestChildren {
714 continue
715 }
716 children = append(children, child.(*substrMatchTree).current)
717 }
718
719nextLine:
720 for i := 0; i < len(lines); i++ {
721 hits := 1
722 nextChild:
723 for j := range children {
724 nextCandidate:
725 for len(children[j]) > 0 {
726 candidate := children[j][0]
727 bo := int(cp.findOffset(false, candidate.runeOffset))
728 if bo < lines[i].start {
729 children[j] = children[j][1:]
730 continue nextCandidate
731 }
732 if bo < lines[i].end {
733 hits++
734 continue nextChild
735 }
736 // move the `lines` iterator forward until bo < line.end
737 for i < len(lines) && bo >= lines[i].end {
738 i++
739 }
740 i--
741 continue nextLine
742 }
743 }
744 // return early once we found any line that contains matches from all children
745 if hits == len(t.children) {
746 return matchesFound
747 }
748 }
749 return matchesNone
750}
751
752func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
753 // We have found matches unless a child needs to do more work or it hasn't
754 // found matches.
755 state := matchesFound
756
757 for _, ch := range t.children {
758 switch evalMatchTree(cp, cost, known, ch) {
759 case matchesRequiresHigherCost:
760 // keep evaluating other children incase we come across matchesNone
761 state = matchesRequiresHigherCost
762 case matchesFound:
763 // will return this if every child has this value
764 case matchesNone:
765 return matchesNone
766 }
767 }
768
769 return state
770}
771
772func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
773 // we could short-circuit, but we want to use the other possibilities as a
774 // ranking signal. So we always return the most conservative state.
775 state := matchesNone
776 for _, ch := range t.children {
777 switch evalMatchTree(cp, cost, known, ch) {
778 case matchesRequiresHigherCost:
779 state = matchesRequiresHigherCost
780 case matchesFound:
781 if state != matchesRequiresHigherCost {
782 state = matchesFound
783 }
784 case matchesNone:
785 // noop
786 }
787 }
788 return state
789}
790
791func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
792 return matchesStatePred(t.branchMask() != 0)
793}
794
795func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
796 if t.reEvaluated {
797 return matchesStateForSlice(t.found)
798 }
799
800 if cost < costRegexp {
801 return matchesRequiresHigherCost
802 }
803
804 cp.stats.RegexpsConsidered++
805 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
806 found := t.found[:0]
807 for _, idx := range idxs {
808 cm := &candidateMatch{
809 byteOffset: uint32(idx[0]),
810 byteMatchSz: uint32(idx[1] - idx[0]),
811 fileName: t.fileName,
812 }
813
814 found = append(found, cm)
815 }
816 t.found = found
817 t.reEvaluated = true
818
819 return matchesStateForSlice(t.found)
820}
821
822func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
823 if t.evaluated {
824 return matchesStateForSlice(t.found)
825 }
826
827 if cost < costRegexp {
828 return matchesRequiresHigherCost
829 }
830
831 data := cp.data(t.fileName)
832 offset := 0
833 found := t.found[:0]
834 for {
835 idx := bytes.Index(data[offset:], []byte(t.word))
836 if idx < 0 {
837 break
838 }
839
840 relStartOffset := offset + idx
841 relEndOffset := relStartOffset + len(t.word)
842
843 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
844 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
845 if startBoundary && endBoundary {
846 found = append(found, &candidateMatch{
847 byteOffset: uint32(offset + idx),
848 byteMatchSz: uint32(len(t.word)),
849 fileName: t.fileName,
850 })
851 }
852 offset += idx + len(t.word)
853 }
854
855 t.found = found
856 t.evaluated = true
857
858 return matchesStateForSlice(t.found)
859}
860
861// breakMatchesOnNewlines returns matches resulting from breaking each element
862// of cms on newlines within text.
863func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
864 var lineCMs []*candidateMatch
865 for _, cm := range cms {
866 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
867 }
868 return lineCMs
869}
870
871// breakOnNewlines returns matches resulting from breaking cm on newlines
872// within text.
873func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
874 var cms []*candidateMatch
875 addMe := &candidateMatch{}
876 *addMe = *cm
877 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
878 if text[i] == '\n' {
879 addMe.byteMatchSz = i - addMe.byteOffset
880 if addMe.byteMatchSz != 0 {
881 cms = append(cms, addMe)
882 }
883
884 addMe = &candidateMatch{}
885 *addMe = *cm
886 addMe.byteOffset = i + 1
887 }
888 }
889 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
890 if addMe.byteMatchSz != 0 {
891 cms = append(cms, addMe)
892 }
893 return cms
894}
895
896// evalMatchTree should be called instead of directly calling
897// matchTree.matches. It cache known values for future evaluation at higher
898// costs.
899func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
900 if v, ok := known[mt]; ok {
901 return matchesStatePred(v)
902 }
903
904 ms := mt.matches(cp, cost, known)
905 if ms != matchesRequiresHigherCost {
906 known[mt] = ms == matchesFound
907 }
908
909 return ms
910}
911
912func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
913 switch evalMatchTree(cp, cost, known, t.child) {
914 case matchesRequiresHigherCost:
915 return matchesRequiresHigherCost
916 case matchesFound:
917 return matchesNone
918 case matchesNone:
919 return matchesFound
920 default:
921 panic("unreachable")
922 }
923}
924
925func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
926 return evalMatchTree(cp, cost, known, t.child)
927}
928
929func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
930 return evalMatchTree(cp, cost, known, t.child)
931}
932
933func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
934 if t.contEvaluated {
935 return matchesStateForSlice(t.current)
936 }
937
938 if len(t.current) == 0 {
939 return matchesNone
940 }
941
942 if t.fileName && cost < costMemory {
943 return matchesRequiresHigherCost
944 }
945
946 if !t.fileName && cost < costContent {
947 return matchesRequiresHigherCost
948 }
949
950 pruned := t.current[:0]
951 for _, m := range t.current {
952 if m.byteOffset == 0 && m.runeOffset > 0 {
953 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
954 }
955 if m.matchContent(cp.data(m.fileName)) {
956 pruned = append(pruned, m)
957 }
958 }
959 t.current = pruned
960 t.contEvaluated = true
961
962 return matchesStateForSlice(t.current)
963}
964
965type matchTreeOpt struct {
966 // DisableWordMatchOptimization is used to disable the use of wordMatchTree.
967 // This was added since we do not support wordMatchTree with symbol search.
968 DisableWordMatchOptimization bool
969}
970
971func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
972 if q == nil {
973 return nil, fmt.Errorf("got nil (sub)query")
974 }
975
976 switch s := q.(type) {
977 case *query.Regexp:
978 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
979 // superset of the regexp. If the returned matchTree is equivalent to the
980 // original regexp, it returns true. An equivalent matchTree has the same
981 // behaviour as the original regexp and can be used instead.
982 //
983 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
984 if err != nil {
985 return nil, err
986 }
987 // if the query can be used in place of the regexp
988 // return the subtree
989 if isEq {
990 return subMT, nil
991 }
992
993 var tr matchTree
994 if wmt, ok := regexpToWordMatchTree(s, opt); ok {
995 // A common search we get is "\bLITERAL\b". Avoid the regex engine and
996 // provide something faster.
997 tr = wmt
998 } else {
999 tr = newRegexpMatchTree(s)
1000 }
1001
1002 return &andMatchTree{
1003 children: []matchTree{
1004 tr, &noVisitMatchTree{subMT},
1005 },
1006 }, nil
1007 case *query.And:
1008 var r []matchTree
1009 for _, ch := range s.Children {
1010 ct, err := d.newMatchTree(ch, opt)
1011 if err != nil {
1012 return nil, err
1013 }
1014 r = append(r, ct)
1015 }
1016 return &andMatchTree{r}, nil
1017 case *query.Or:
1018 var r []matchTree
1019 for _, ch := range s.Children {
1020 ct, err := d.newMatchTree(ch, opt)
1021 if err != nil {
1022 return nil, err
1023 }
1024 r = append(r, ct)
1025 }
1026 return &orMatchTree{r}, nil
1027 case *query.Not:
1028 ct, err := d.newMatchTree(s.Child, opt)
1029 return ¬MatchTree{
1030 child: ct,
1031 }, err
1032
1033 case *query.Type:
1034 if s.Type != query.TypeFileName {
1035 break
1036 }
1037
1038 ct, err := d.newMatchTree(s.Child, opt)
1039 if err != nil {
1040 return nil, err
1041 }
1042
1043 return &fileNameMatchTree{
1044 child: ct,
1045 }, nil
1046
1047 case *query.Boost:
1048 ct, err := d.newMatchTree(s.Child, opt)
1049 if err != nil {
1050 return nil, err
1051 }
1052
1053 return &boostMatchTree{
1054 child: ct,
1055 boost: s.Boost,
1056 }, nil
1057
1058 case *query.Meta:
1059 checksum := queryMetaChecksum(s.Field, s.Value)
1060 cacheKeyField := "Meta"
1061 if cached, ok := d.docMatchTreeCache.Get(cacheKeyField, checksum); ok {
1062 return cached, nil
1063 }
1064
1065 reposWant := make([]bool, len(d.repoMetaData))
1066 for repoIdx, r := range d.repoMetaData {
1067 if r.Metadata != nil {
1068 if val, ok := r.Metadata[s.Field]; ok && s.Value.MatchString(val) {
1069 reposWant[repoIdx] = true
1070 }
1071 }
1072 }
1073
1074 mt := &docMatchTree{
1075 reason: "Meta",
1076 numDocs: d.numDocs(),
1077 predicate: func(docID uint32) bool {
1078 repoIdx := d.repos[docID]
1079 if int(repoIdx) >= len(reposWant) {
1080 return false
1081 }
1082 return reposWant[repoIdx]
1083 },
1084 }
1085 d.docMatchTreeCache.Add(cacheKeyField, checksum, mt)
1086 return mt, nil
1087
1088 case *query.Substring:
1089 return d.newSubstringMatchTree(s)
1090
1091 case *query.Branch:
1092 masks := make([]uint64, 0, len(d.repoMetaData))
1093 if s.Pattern == "HEAD" {
1094 for range d.repoMetaData {
1095 masks = append(masks, 1)
1096 }
1097 } else {
1098 for _, branchIDs := range d.branchIDs {
1099 mask := uint64(0)
1100 for nm, m := range branchIDs {
1101 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1102 mask |= uint64(m)
1103 }
1104 }
1105 masks = append(masks, mask)
1106 }
1107 }
1108 return &branchQueryMatchTree{
1109 masks: masks,
1110 fileMasks: d.fileBranchMasks,
1111 repos: d.repos,
1112 }, nil
1113 case *query.Const:
1114 if s.Value {
1115 return &bruteForceMatchTree{}, nil
1116 } else {
1117 return &noMatchTree{Why: "const"}, nil
1118 }
1119 case *query.Language:
1120 code, ok := d.metaData.LanguageMap[s.Language]
1121 if !ok {
1122 return &noMatchTree{Why: "lang"}, nil
1123 }
1124 return &docMatchTree{
1125 reason: "language",
1126 numDocs: d.numDocs(),
1127 predicate: func(docID uint32) bool {
1128 return d.getLanguage(docID) == code
1129 },
1130 }, nil
1131
1132 case *query.Symbol:
1133 // Disable WordMatchTree since we don't support it in symbols yet.
1134 optCopy := opt
1135 optCopy.DisableWordMatchOptimization = true
1136
1137 subMT, err := d.newMatchTree(s.Expr, optCopy)
1138 if err != nil {
1139 return nil, err
1140 }
1141
1142 if substr, ok := subMT.(*substrMatchTree); ok {
1143 return &symbolSubstrMatchTree{
1144 substrMatchTree: substr,
1145 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)),
1146 fileEndRunes: d.fileEndRunes,
1147 fileEndSymbol: d.fileEndSymbol,
1148 sections: d.runeDocSections,
1149 }, nil
1150 }
1151
1152 var regexpMT *regexpMatchTree
1153 visitMatchTree(subMT, func(mt matchTree) {
1154 if t, ok := mt.(*regexpMatchTree); ok {
1155 regexpMT = t
1156 }
1157 })
1158 if regexpMT == nil {
1159 return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1160 }
1161
1162 return &symbolRegexpMatchTree{
1163 regexp: regexpMT.regexp,
1164 all: isRegexpAll(regexpMT.origRegexp),
1165 matchTree: subMT,
1166 }, nil
1167
1168 case *query.FileNameSet:
1169 return &docMatchTree{
1170 reason: "FileNameSet",
1171 numDocs: d.numDocs(),
1172 predicate: func(docID uint32) bool {
1173 fileName := d.fileName(docID)
1174 _, ok := s.Set[string(fileName)]
1175 return ok
1176 },
1177 }, nil
1178
1179 case *query.BranchesRepos:
1180 reposBranchesWant := make([]uint64, len(d.repoMetaData))
1181 for repoIdx := range d.repoMetaData {
1182 var mask uint64
1183 for _, br := range s.List {
1184 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1185 mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1186 }
1187 }
1188 reposBranchesWant[repoIdx] = mask
1189 }
1190 return &docMatchTree{
1191 reason: "BranchesRepos",
1192 numDocs: d.numDocs(),
1193 predicate: func(docID uint32) bool {
1194 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1195 },
1196 }, nil
1197
1198 case *query.RepoSet:
1199 reposWant := make([]bool, len(d.repoMetaData))
1200 for repoIdx, r := range d.repoMetaData {
1201 if _, ok := s.Set[r.Name]; ok {
1202 reposWant[repoIdx] = true
1203 }
1204 }
1205 return &docMatchTree{
1206 reason: "RepoSet",
1207 numDocs: d.numDocs(),
1208 predicate: func(docID uint32) bool {
1209 return reposWant[d.repos[docID]]
1210 },
1211 }, nil
1212
1213 case *query.RepoIDs:
1214 reposWant := make([]bool, len(d.repoMetaData))
1215 for repoIdx, r := range d.repoMetaData {
1216 if s.Repos.Contains(r.ID) {
1217 reposWant[repoIdx] = true
1218 }
1219 }
1220 return &docMatchTree{
1221 reason: "RepoIDs",
1222 numDocs: d.numDocs(),
1223 predicate: func(docID uint32) bool {
1224 return reposWant[d.repos[docID]]
1225 },
1226 }, nil
1227
1228 case *query.Repo:
1229 reposWant := make([]bool, len(d.repoMetaData))
1230 for repoIdx, r := range d.repoMetaData {
1231 if s.Regexp.MatchString(r.Name) {
1232 reposWant[repoIdx] = true
1233 }
1234 }
1235 return &docMatchTree{
1236 reason: "Repo",
1237 numDocs: d.numDocs(),
1238 predicate: func(docID uint32) bool {
1239 return reposWant[d.repos[docID]]
1240 },
1241 }, nil
1242
1243 case *query.RepoRegexp:
1244 reposWant := make([]bool, len(d.repoMetaData))
1245 for repoIdx, r := range d.repoMetaData {
1246 if s.Regexp.MatchString(r.Name) {
1247 reposWant[repoIdx] = true
1248 }
1249 }
1250 return &docMatchTree{
1251 reason: "RepoRegexp",
1252 numDocs: d.numDocs(),
1253 predicate: func(docID uint32) bool {
1254 return reposWant[d.repos[docID]]
1255 },
1256 }, nil
1257
1258 case query.RawConfig:
1259 return &docMatchTree{
1260 reason: s.String(),
1261 numDocs: d.numDocs(),
1262 predicate: func(docID uint32) bool {
1263 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1264 },
1265 }, nil
1266 }
1267 log.Panicf("type %T", q)
1268 return nil, nil
1269}
1270
1271func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1272 st := &substrMatchTree{
1273 query: s,
1274 caseSensitive: s.CaseSensitive,
1275 fileName: s.FileName,
1276 }
1277
1278 if utf8.RuneCountInString(s.Pattern) < ngramSize {
1279 return newRegexpMatchTree(&query.Regexp{
1280 Regexp: &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)},
1281 FileName: s.FileName,
1282 Content: s.Content,
1283 CaseSensitive: s.CaseSensitive,
1284 }), nil
1285 }
1286
1287 result, err := d.iterateNgrams(s)
1288 if err != nil {
1289 return nil, err
1290 }
1291 st.matchIterator = result
1292 return st, nil
1293}
1294
1295func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1296 if opt.DisableWordMatchOptimization {
1297 return nil, false
1298 }
1299 // Needs to be case sensitive
1300 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1301 return nil, false
1302 }
1303 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1304 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1305 return nil, false
1306 }
1307 sub := q.Regexp.Sub
1308 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1309 return nil, false
1310 }
1311
1312 return &wordMatchTree{
1313 word: string(sub[1].Rune),
1314 fileName: q.FileName,
1315 }, true
1316}
1317
1318// pruneMatchTree removes impossible branches from the matchTree, as indicated
1319// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1320func pruneMatchTree(mt matchTree) (matchTree, error) {
1321 var err error
1322 switch mt := mt.(type) {
1323 // leaf nodes that we test with the filter:
1324 case *substrMatchTree:
1325 if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1326 if _, ok := res.matchIterator.(*noMatchTree); ok {
1327 return nil, nil
1328 }
1329 }
1330 // recursive tree structures:
1331 case *andMatchTree:
1332 // Any branch of an and becoming impossible means the entire clause
1333 // is impossible. Otherwise, just handle rewrites.
1334 for i, child := range mt.children {
1335 newChild, err := pruneMatchTree(child)
1336 if err != nil {
1337 return nil, err
1338 }
1339 if newChild == nil {
1340 return nil, nil
1341 }
1342 mt.children[i] = newChild
1343 }
1344 case *orMatchTree:
1345 // *All* branches of an OR becoming impossible means the entire clause
1346 // is impossible. Otherwise, drop impossible subclauses and handle
1347 // rewrites, including simplifying to a singular resulting child branch.
1348 n := 0
1349 for _, child := range mt.children {
1350 newChild, err := pruneMatchTree(child)
1351 if err != nil {
1352 return nil, err
1353 }
1354 if newChild != nil {
1355 mt.children[n] = newChild
1356 n++
1357 }
1358 }
1359 mt.children = mt.children[:n]
1360 if len(mt.children) == 1 {
1361 return mt.children[0], nil
1362 } else if len(mt.children) == 0 {
1363 return nil, nil
1364 }
1365 case *noVisitMatchTree:
1366 mt.matchTree, err = pruneMatchTree(mt.matchTree)
1367 if err != nil {
1368 return nil, err
1369 }
1370 if mt.matchTree == nil {
1371 return nil, nil
1372 }
1373 case *fileNameMatchTree:
1374 mt.child, err = pruneMatchTree(mt.child)
1375 if err != nil {
1376 return nil, err
1377 }
1378 if mt.child == nil {
1379 return nil, nil
1380 }
1381 case *boostMatchTree:
1382 mt.child, err = pruneMatchTree(mt.child)
1383 if err != nil {
1384 return nil, err
1385 }
1386 if mt.child == nil {
1387 return nil, nil
1388 }
1389 case *andLineMatchTree:
1390 child, err := pruneMatchTree(&mt.andMatchTree)
1391 if err != nil {
1392 return nil, err
1393 }
1394 if child == nil {
1395 return nil, nil
1396 }
1397 if c, ok := child.(*andMatchTree); ok {
1398 mt.andMatchTree = *c
1399 } else {
1400 // the and was simplified to a single clause,
1401 // so the linematch portion is irrelevant.
1402 return mt, nil
1403 }
1404 case *notMatchTree:
1405 mt.child, err = pruneMatchTree(mt.child)
1406 if err != nil {
1407 return nil, err
1408 }
1409 if mt.child == nil {
1410 // not false => true
1411 return &bruteForceMatchTree{}, nil
1412 }
1413 // unhandled:
1414 case *docMatchTree:
1415 case *bruteForceMatchTree:
1416 case *regexpMatchTree:
1417 case *wordMatchTree:
1418 }
1419 return mt, err
1420}
1421
1422// isRegexpAll returns true if the query matches all possible lines.
1423//
1424// Note: it is possible for a funky regex to actually match all but this
1425// returns false. This returns true for normal looking regexes like ".*" or
1426// "(?-s:.*)".
1427func isRegexpAll(r *syntax.Regexp) bool {
1428 // Note: we don't care about any flags since we are looking for .* and we
1429 // don't mind if . matches all or everything but newline.
1430
1431 // for loop is for visiting children of OpCapture/OpConcat until we hit .*
1432 for {
1433 // Our main target: .*
1434 if r.Op == syntax.OpStar && len(r.Sub) == 1 { // *
1435 // (?s:.) or (?-s:.)
1436 return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL
1437 }
1438
1439 // Strip away expressions being wrapped in paranthesis
1440 if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 {
1441 r = r.Sub[0]
1442 continue
1443 }
1444
1445 return false
1446 }
1447}
1448
1449func queryMetaChecksum(field string, value *regexp.Regexp) string {
1450 h := xxhash.New()
1451 h.Write([]byte(field))
1452 h.Write([]byte{':'})
1453 h.Write([]byte(value.String()))
1454 return fmt.Sprintf("%x", h.Sum64())
1455}