fork of https://github.com/sourcegraph/zoekt
1// Copyright 2018 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "math"
22 "regexp/syntax"
23 "strings"
24 "unicode/utf8"
25
26 "github.com/cespare/xxhash/v2"
27 "github.com/grafana/regexp"
28
29 "github.com/sourcegraph/zoekt"
30 "github.com/sourcegraph/zoekt/internal/hybridre2"
31 "github.com/sourcegraph/zoekt/internal/syntaxutil"
32 "github.com/sourcegraph/zoekt/query"
33)
34
35// A docIterator iterates over documents in order.
36type docIterator interface {
37 // provide the next document where we may find something interesting.
38 //
39 // This is like a "peek" and shouldn't mutate state. prepare is what should
40 // change state.
41 nextDoc() uint32
42
43 // clears any per-document state of the docIterator, and
44 // prepares for evaluating the given doc. The argument is
45 // strictly increasing over time.
46 prepare(nextDoc uint32)
47}
48
49// costs are passed in increasing order to matchTree.matches until they do not
50// return matchesRequiresHigherCost.
51const (
52 costConst = 0
53 costMemory = 1
54 costContent = 2
55 costRegexp = 3
56)
57
58const (
59 costMin = costConst
60 costMax = costRegexp
61)
62
63// matchesState is an enum for the state of a matchTree after a call to
64// matchTree.matches.
65type matchesState uint8
66
67const (
68 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done
69 // a search yet since the cost value is not high enough.
70 matchesRequiresHigherCost matchesState = iota
71
72 // matchesFound is returned when matchTree.matches has done a search and
73 // found one or more matches.
74 matchesFound
75
76 // matchesNone is returned when matchTree.matches has done a search and
77 // found nothing.
78 matchesNone
79)
80
81// matchesStatePred is a helper which returns matchesFound if b is true
82// otherwise returns matchesNone.
83func matchesStatePred(b bool) matchesState {
84 if b {
85 return matchesFound
86 }
87 return matchesNone
88}
89
90// matchesStateForSlice is a helper which returns matchesFound if v is
91// non-empty otherwise returns matchesNone.
92func matchesStateForSlice[T any](v []T) matchesState {
93 return matchesStatePred(len(v) > 0)
94}
95
96// An expression tree coupled with matches. The matchtree has two
97// functions:
98//
99// * it implements boolean combinations (and, or, not)
100//
101// * it implements shortcuts, where we skip documents (for example: if
102// there are no trigram matches, we can be sure there are no substring
103// matches). The matchtree iterates over the documents as they are
104// ordered in the shard.
105//
106// The general process for a given (shard, query) is:
107//
108// - construct matchTree for the query
109//
110// - find all different leaf matchTrees (substring, regexp, etc.)
111//
112// in a loop:
113//
114// - find next doc to process using nextDoc
115//
116// - evaluate atoms (leaf expressions that match text)
117//
118// - evaluate the tree using matches(), storing the result in map.
119//
120// - if the complete tree returns (matches() != matchesRequiresHigherCost)
121// for the document, collect all text matches by looking at leaf
122// matchTrees.
123type matchTree interface {
124 docIterator
125
126 // matches if cost is high enough. See documentation for matchesState's
127 // values.
128 //
129 // Note: Do not call this directly, rather use evalMatchTree which uses
130 // known to cache responses once the state transitions away from
131 // matchesRequiresHigherCost.
132 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
133}
134
135// docMatchTree iterates over documents for which predicate(docID) returns true.
136type docMatchTree struct {
137 // the number of documents in a shard.
138 numDocs uint32
139
140 predicate func(docID uint32) bool
141
142 // provides additional information about the reason why the docMatchTree was
143 // created.
144 reason string
145
146 // mutable
147 firstDone bool
148 docID uint32
149}
150
151type bruteForceMatchTree struct {
152 // mutable
153 firstDone bool
154 docID uint32
155}
156
157type andLineMatchTree struct {
158 andMatchTree
159}
160
161type andMatchTree struct {
162 children []matchTree
163}
164
165type orMatchTree struct {
166 children []matchTree
167}
168
169type notMatchTree struct {
170 child matchTree
171}
172
173// Returns only the filename of child matches.
174type fileNameMatchTree struct {
175 child matchTree
176}
177
178type boostMatchTree struct {
179 child matchTree
180 boost float64
181}
182
183// Don't visit this subtree for collecting matches.
184type noVisitMatchTree struct {
185 matchTree
186}
187
188type regexpMatchTree struct {
189 regexp *regexp.Regexp
190
191 // hybridRegexp is a size-aware wrapper that dispatches to go-re2 for
192 // large file content when ZOEKT_RE2_THRESHOLD_BYTES is configured.
193 // For small inputs and filename matches, regexp is used directly.
194 hybridRegexp *hybridre2.Regexp
195
196 // origRegexp is the original parsed regexp from the query structure. It
197 // does not include mutations such as case sensitivity.
198 origRegexp *syntax.Regexp
199
200 fileName bool
201
202 // mutable
203 reEvaluated bool
204 found []*candidateMatch
205
206 // nextDoc, prepare.
207 bruteForceMatchTree
208}
209
210func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree {
211 prefix := ""
212 if !s.CaseSensitive {
213 prefix = "(?i)"
214 }
215
216 pattern := prefix + syntaxutil.RegexpString(s.Regexp)
217
218 // hybridRegexp is only used for file content matching; skip the RE2
219 // compilation overhead for filename-only regexps.
220 var hr *hybridre2.Regexp
221 if !s.FileName {
222 hr = hybridre2.MustCompile(pattern)
223 }
224 return ®expMatchTree{
225 regexp: regexp.MustCompile(pattern),
226 hybridRegexp: hr,
227 origRegexp: s.Regexp,
228 fileName: s.FileName,
229 }
230}
231
232// \bLITERAL\b
233type wordMatchTree struct {
234 word string
235
236 fileName bool
237
238 // mutable
239 evaluated bool
240 found []*candidateMatch
241
242 // nextDoc, prepare.
243 bruteForceMatchTree
244}
245
246type substrMatchTree struct {
247 matchIterator
248
249 query *query.Substring
250 caseSensitive bool
251 fileName bool
252
253 // mutable
254 current []*candidateMatch
255 contEvaluated bool
256}
257
258type branchQueryMatchTree struct {
259 fileMasks []uint64
260 masks []uint64
261 repos []uint16
262
263 // mutable
264 firstDone bool
265 docID uint32
266}
267
268func (t *branchQueryMatchTree) branchMask() uint64 {
269 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
270}
271
272type symbolRegexpMatchTree struct {
273 matchTree
274 regexp *regexp.Regexp
275 all bool // skips regex match if .*
276
277 reEvaluated bool
278 found []*candidateMatch
279}
280
281func (t *symbolRegexpMatchTree) prepare(doc uint32) {
282 t.reEvaluated = false
283 t.found = t.found[:0]
284 t.matchTree.prepare(doc)
285}
286
287func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
288 if t.reEvaluated {
289 return matchesStateForSlice(t.found)
290 }
291
292 if cost < costRegexp {
293 return matchesRequiresHigherCost
294 }
295
296 sections := cp.docSections()
297 content := cp.data(false)
298
299 found := t.found[:0]
300 for i, sec := range sections {
301 var idx []int
302 if t.all {
303 idx = []int{0, int(sec.End - sec.Start)}
304 } else {
305 idx = t.regexp.FindIndex(content[sec.Start:sec.End])
306 if idx == nil {
307 continue
308 }
309 }
310
311 cm := &candidateMatch{
312 byteOffset: sec.Start + uint32(idx[0]),
313 byteMatchSz: uint32(idx[1] - idx[0]),
314 symbol: true,
315 symbolIdx: uint32(i),
316 }
317 found = append(found, cm)
318 }
319 t.found = found
320 t.reEvaluated = true
321
322 return matchesStateForSlice(t.found)
323}
324
325type symbolSubstrMatchTree struct {
326 *substrMatchTree
327
328 patternSize uint32
329 fileEndRunes []uint32
330 fileEndSymbol []uint32
331
332 doc uint32
333 sections []DocumentSection
334
335 secID uint32
336}
337
338func (t *symbolSubstrMatchTree) prepare(doc uint32) {
339 t.substrMatchTree.prepare(doc)
340 t.doc = doc
341
342 var fileStart uint32
343 if doc > 0 {
344 fileStart = t.fileEndRunes[doc-1]
345 }
346
347 var sections []DocumentSection
348 if len(t.sections) > 0 {
349 most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
350 if most == uint32(len(t.sections)) {
351 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
352 } else {
353 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
354 t.secID++
355 }
356
357 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
358 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
359 symbolEnd++
360 }
361
362 sections = t.sections[t.secID:symbolEnd]
363 }
364 }
365
366 secIdx := 0
367 trimmed := t.current[:0]
368 for len(sections) > secIdx && len(t.current) > 0 {
369 start := fileStart + t.current[0].runeOffset
370 end := start + t.patternSize
371 if start >= sections[secIdx].End {
372 secIdx++
373 continue
374 }
375
376 if start < sections[secIdx].Start {
377 t.current = t.current[1:]
378 continue
379 }
380
381 if end <= sections[secIdx].End {
382 t.current[0].symbol = true
383 t.current[0].symbolIdx = uint32(secIdx)
384 trimmed = append(trimmed, t.current[0])
385 }
386
387 t.current = t.current[1:]
388 }
389 t.current = trimmed
390}
391
392// all prepare methods
393
394func (t *bruteForceMatchTree) prepare(doc uint32) {
395 t.docID = doc
396 t.firstDone = true
397}
398
399func (t *docMatchTree) prepare(doc uint32) {
400 t.docID = doc
401 t.firstDone = true
402}
403
404func (t *andMatchTree) prepare(doc uint32) {
405 for _, c := range t.children {
406 c.prepare(doc)
407 }
408}
409
410func (t *regexpMatchTree) prepare(doc uint32) {
411 t.found = t.found[:0]
412 t.reEvaluated = false
413 t.bruteForceMatchTree.prepare(doc)
414}
415
416func (t *wordMatchTree) prepare(doc uint32) {
417 t.found = t.found[:0]
418 t.evaluated = false
419 t.bruteForceMatchTree.prepare(doc)
420}
421
422func (t *orMatchTree) prepare(doc uint32) {
423 for _, c := range t.children {
424 c.prepare(doc)
425 }
426}
427
428func (t *notMatchTree) prepare(doc uint32) {
429 t.child.prepare(doc)
430}
431
432func (t *fileNameMatchTree) prepare(doc uint32) {
433 t.child.prepare(doc)
434}
435
436func (t *boostMatchTree) prepare(doc uint32) {
437 t.child.prepare(doc)
438}
439
440func (t *substrMatchTree) prepare(nextDoc uint32) {
441 t.matchIterator.prepare(nextDoc)
442 t.current = t.matchIterator.candidates()
443 t.contEvaluated = false
444}
445
446func (t *branchQueryMatchTree) prepare(doc uint32) {
447 t.firstDone = true
448 t.docID = doc
449}
450
451// nextDoc
452
453func (t *docMatchTree) nextDoc() uint32 {
454 var start uint32
455 if t.firstDone {
456 start = t.docID + 1
457 }
458 for i := start; i < t.numDocs; i++ {
459 if t.predicate(i) {
460 return i
461 }
462 }
463 return math.MaxUint32
464}
465
466func (t *bruteForceMatchTree) nextDoc() uint32 {
467 if !t.firstDone {
468 return 0
469 }
470 return t.docID + 1
471}
472
473func (t *andMatchTree) nextDoc() uint32 {
474 var max uint32
475 for _, c := range t.children {
476 m := c.nextDoc()
477 if m > max {
478 max = m
479 }
480 }
481 return max
482}
483
484func (t *orMatchTree) nextDoc() uint32 {
485 min := uint32(math.MaxUint32)
486 for _, c := range t.children {
487 m := c.nextDoc()
488 if m < min {
489 min = m
490 }
491 }
492 return min
493}
494
495func (t *notMatchTree) nextDoc() uint32 {
496 return 0
497}
498
499func (t *fileNameMatchTree) nextDoc() uint32 {
500 return t.child.nextDoc()
501}
502
503func (t *boostMatchTree) nextDoc() uint32 {
504 return t.child.nextDoc()
505}
506
507func (t *branchQueryMatchTree) nextDoc() uint32 {
508 var start uint32
509 if t.firstDone {
510 start = t.docID + 1
511 }
512
513 for i := start; i < uint32(len(t.fileMasks)); i++ {
514 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
515 return i
516 }
517 }
518 return math.MaxUint32
519}
520
521// all String methods
522
523func (t *bruteForceMatchTree) String() string {
524 return "all"
525}
526
527func (t *docMatchTree) String() string {
528 return fmt.Sprintf("doc(%s)", t.reason)
529}
530
531func (t *andMatchTree) String() string {
532 return fmt.Sprintf("and%v", t.children)
533}
534
535func (t *regexpMatchTree) String() string {
536 f := ""
537 if t.fileName {
538 f = "f"
539 }
540 return fmt.Sprintf("%sre(%s)", f, t.regexp)
541}
542
543func (t *wordMatchTree) String() string {
544 f := ""
545 if t.fileName {
546 f = "f"
547 }
548 return fmt.Sprintf("%sword(%s)", f, t.word)
549}
550
551func (t *orMatchTree) String() string {
552 return fmt.Sprintf("or%v", t.children)
553}
554
555func (t *notMatchTree) String() string {
556 return fmt.Sprintf("not(%v)", t.child)
557}
558
559func (t *noVisitMatchTree) String() string {
560 return fmt.Sprintf("novisit(%v)", t.matchTree)
561}
562
563func (t *fileNameMatchTree) String() string {
564 return fmt.Sprintf("f(%v)", t.child)
565}
566
567func (t *boostMatchTree) String() string {
568 return fmt.Sprintf("boost(%f, %v)", t.boost, t.child)
569}
570
571func (t *substrMatchTree) String() string {
572 f := ""
573 if t.fileName {
574 f = "f"
575 }
576
577 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
578}
579
580func (t *branchQueryMatchTree) String() string {
581 return fmt.Sprintf("branch(%x)", t.masks)
582}
583
584func (t *symbolSubstrMatchTree) String() string {
585 return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
586}
587
588func (t *symbolRegexpMatchTree) String() string {
589 return fmt.Sprintf("symbol(%v)", t.matchTree)
590}
591
592// visitMatches visits all atoms in matchTree. Note: This visits
593// noVisitMatchTree. For collecting matches use visitMatches.
594func visitMatchTree(t matchTree, f func(matchTree)) {
595 switch s := t.(type) {
596 case *andMatchTree:
597 for _, ch := range s.children {
598 visitMatchTree(ch, f)
599 }
600 case *orMatchTree:
601 for _, ch := range s.children {
602 visitMatchTree(ch, f)
603 }
604 case *andLineMatchTree:
605 visitMatchTree(&s.andMatchTree, f)
606 case *noVisitMatchTree:
607 visitMatchTree(s.matchTree, f)
608 case *notMatchTree:
609 visitMatchTree(s.child, f)
610 case *fileNameMatchTree:
611 visitMatchTree(s.child, f)
612 case *boostMatchTree:
613 visitMatchTree(s.child, f)
614 case *symbolSubstrMatchTree:
615 visitMatchTree(s.substrMatchTree, f)
616 case *symbolRegexpMatchTree:
617 visitMatchTree(s.matchTree, f)
618 default:
619 f(t)
620 }
621}
622
623// updateMatchTreeStats calls updateStats on all atoms in mt which have that
624// function defined.
625func updateMatchTreeStats(mt matchTree, stats *zoekt.Stats) {
626 visitMatchTree(mt, func(mt matchTree) {
627 if atom, ok := mt.(interface{ updateStats(*zoekt.Stats) }); ok {
628 atom.updateStats(stats)
629 }
630 })
631}
632
633func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) {
634 visitMatches(t, known, 1, func(mt matchTree, _ float64) {
635 f(mt)
636 })
637}
638
639// visitMatches visits all atoms which can contribute matches. Note: This
640// skips noVisitMatchTree.
641func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) {
642 switch s := t.(type) {
643 case *andMatchTree:
644 for _, ch := range s.children {
645 if known[ch] {
646 visitMatches(ch, known, weight, f)
647 }
648 }
649 case *andLineMatchTree:
650 visitMatches(&s.andMatchTree, known, weight, f)
651 case *orMatchTree:
652 for _, ch := range s.children {
653 if known[ch] {
654 visitMatches(ch, known, weight, f)
655 }
656 }
657 case *boostMatchTree:
658 visitMatches(s.child, known, weight*s.boost, f)
659 case *symbolSubstrMatchTree:
660 visitMatches(s.substrMatchTree, known, weight, f)
661 case *notMatchTree:
662 case *noVisitMatchTree:
663 // don't collect into negative trees.
664 case *fileNameMatchTree:
665 // We will just gather the filename if we do not visit this tree.
666 default:
667 f(s, weight)
668 }
669}
670
671// all matches() methods.
672
673func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
674 return matchesStatePred(t.predicate(cp.idx))
675}
676
677func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
678 return matchesFound
679}
680
681// andLineMatchTree is a performance optimization of andMatchTree. For content
682// searches we don't want to run the regex engine if there is no line that
683// contains matches from all terms.
684func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
685 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
686 return state
687 }
688
689 // Invariant: all children have matches. If any line contains all of them we
690 // can return MatchesFound.
691
692 // find child with fewest candidates
693 minCount := math.MaxInt
694 fewestChildren := 0
695 for ix, child := range t.children {
696 v, ok := child.(*substrMatchTree)
697 // make sure we are running a content search and that all candidates are a
698 // substrMatchTree
699 if !ok || v.fileName {
700 return matchesFound
701 }
702 if len(v.current) < minCount {
703 minCount = len(v.current)
704 fewestChildren = ix
705 }
706 }
707
708 type lineRange struct {
709 start int
710 end int
711 }
712 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
713 prev := -1
714 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
715 line := cp.newlines().atOffset(candidate.byteOffset)
716 if line == prev {
717 continue
718 }
719 prev = line
720 byteStart := int(cp.newlines().lineStart(line))
721 byteEnd := int(cp.newlines().lineStart(line + 1))
722 lines = append(lines, lineRange{byteStart, byteEnd})
723 }
724
725 // children keeps track of the children's candidates we have already seen.
726 children := make([][]*candidateMatch, 0, len(t.children)-1)
727 for j, child := range t.children {
728 if j == fewestChildren {
729 continue
730 }
731 children = append(children, child.(*substrMatchTree).current)
732 }
733
734nextLine:
735 for i := 0; i < len(lines); i++ {
736 hits := 1
737 nextChild:
738 for j := range children {
739 nextCandidate:
740 for len(children[j]) > 0 {
741 candidate := children[j][0]
742 bo := int(cp.findOffset(false, candidate.runeOffset))
743 if bo < lines[i].start {
744 children[j] = children[j][1:]
745 continue nextCandidate
746 }
747 if bo < lines[i].end {
748 hits++
749 continue nextChild
750 }
751 // move the `lines` iterator forward until bo < line.end
752 for i < len(lines) && bo >= lines[i].end {
753 i++
754 }
755 i--
756 continue nextLine
757 }
758 }
759 // return early once we found any line that contains matches from all children
760 if hits == len(t.children) {
761 return matchesFound
762 }
763 }
764 return matchesNone
765}
766
767func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
768 // We have found matches unless a child needs to do more work or it hasn't
769 // found matches.
770 state := matchesFound
771
772 for _, ch := range t.children {
773 switch evalMatchTree(cp, cost, known, ch) {
774 case matchesRequiresHigherCost:
775 // keep evaluating other children incase we come across matchesNone
776 state = matchesRequiresHigherCost
777 case matchesFound:
778 // will return this if every child has this value
779 case matchesNone:
780 return matchesNone
781 }
782 }
783
784 return state
785}
786
787func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
788 // we could short-circuit, but we want to use the other possibilities as a
789 // ranking signal. So we always return the most conservative state.
790 state := matchesNone
791 for _, ch := range t.children {
792 switch evalMatchTree(cp, cost, known, ch) {
793 case matchesRequiresHigherCost:
794 state = matchesRequiresHigherCost
795 case matchesFound:
796 if state != matchesRequiresHigherCost {
797 state = matchesFound
798 }
799 case matchesNone:
800 // noop
801 }
802 }
803 return state
804}
805
806func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
807 return matchesStatePred(t.branchMask() != 0)
808}
809
810func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
811 if t.reEvaluated {
812 return matchesStateForSlice(t.found)
813 }
814
815 if cost < costRegexp {
816 return matchesRequiresHigherCost
817 }
818
819 cp.stats.RegexpsConsidered++
820 data := cp.data(t.fileName)
821 // For file content, use hybridRegexp which dispatches to go-re2 when
822 // len(data) >= ZOEKT_RE2_THRESHOLD_BYTES. For filename matching, use
823 // grafana/regexp directly: filenames are always short, so the WASM
824 // call overhead of go-re2 outweighs any benefit.
825 var idxs [][]int
826 if t.fileName {
827 idxs = t.regexp.FindAllIndex(data, -1)
828 } else {
829 idxs = t.hybridRegexp.FindAllIndex(data, -1)
830 }
831 found := t.found[:0]
832 for _, idx := range idxs {
833 cm := &candidateMatch{
834 byteOffset: uint32(idx[0]),
835 byteMatchSz: uint32(idx[1] - idx[0]),
836 fileName: t.fileName,
837 }
838
839 found = append(found, cm)
840 }
841 t.found = found
842 t.reEvaluated = true
843
844 return matchesStateForSlice(t.found)
845}
846
847func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
848 if t.evaluated {
849 return matchesStateForSlice(t.found)
850 }
851
852 if cost < costRegexp {
853 return matchesRequiresHigherCost
854 }
855
856 data := cp.data(t.fileName)
857 offset := 0
858 found := t.found[:0]
859 for {
860 idx := bytes.Index(data[offset:], []byte(t.word))
861 if idx < 0 {
862 break
863 }
864
865 relStartOffset := offset + idx
866 relEndOffset := relStartOffset + len(t.word)
867
868 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
869 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
870 if startBoundary && endBoundary {
871 found = append(found, &candidateMatch{
872 byteOffset: uint32(offset + idx),
873 byteMatchSz: uint32(len(t.word)),
874 fileName: t.fileName,
875 })
876 }
877 offset += idx + len(t.word)
878 }
879
880 t.found = found
881 t.evaluated = true
882
883 return matchesStateForSlice(t.found)
884}
885
886// breakMatchesOnNewlines returns matches resulting from breaking each element
887// of cms on newlines within text.
888func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
889 var lineCMs []*candidateMatch
890 for _, cm := range cms {
891 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
892 }
893 return lineCMs
894}
895
896// breakOnNewlines returns matches resulting from breaking cm on newlines
897// within text.
898func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
899 var cms []*candidateMatch
900 addMe := &candidateMatch{}
901 *addMe = *cm
902 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
903 if text[i] == '\n' {
904 addMe.byteMatchSz = i - addMe.byteOffset
905 if addMe.byteMatchSz != 0 {
906 cms = append(cms, addMe)
907 }
908
909 addMe = &candidateMatch{}
910 *addMe = *cm
911 addMe.byteOffset = i + 1
912 }
913 }
914 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
915 if addMe.byteMatchSz != 0 {
916 cms = append(cms, addMe)
917 }
918 return cms
919}
920
921// evalMatchTree should be called instead of directly calling
922// matchTree.matches. It cache known values for future evaluation at higher
923// costs.
924func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
925 if v, ok := known[mt]; ok {
926 return matchesStatePred(v)
927 }
928
929 ms := mt.matches(cp, cost, known)
930 if ms != matchesRequiresHigherCost {
931 known[mt] = ms == matchesFound
932 }
933
934 return ms
935}
936
937func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
938 switch evalMatchTree(cp, cost, known, t.child) {
939 case matchesRequiresHigherCost:
940 return matchesRequiresHigherCost
941 case matchesFound:
942 return matchesNone
943 case matchesNone:
944 return matchesFound
945 default:
946 panic("unreachable")
947 }
948}
949
950func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
951 return evalMatchTree(cp, cost, known, t.child)
952}
953
954func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
955 return evalMatchTree(cp, cost, known, t.child)
956}
957
958func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
959 if t.contEvaluated {
960 return matchesStateForSlice(t.current)
961 }
962
963 if len(t.current) == 0 {
964 return matchesNone
965 }
966
967 if t.fileName && cost < costMemory {
968 return matchesRequiresHigherCost
969 }
970
971 if !t.fileName && cost < costContent {
972 return matchesRequiresHigherCost
973 }
974
975 pruned := t.current[:0]
976 for _, m := range t.current {
977 if m.byteOffset == 0 && m.runeOffset > 0 {
978 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
979 }
980 if m.matchContent(cp.data(m.fileName)) {
981 pruned = append(pruned, m)
982 }
983 }
984 t.current = pruned
985 t.contEvaluated = true
986
987 return matchesStateForSlice(t.current)
988}
989
990type matchTreeOpt struct {
991 // DisableWordMatchOptimization is used to disable the use of wordMatchTree.
992 // This was added since we do not support wordMatchTree with symbol search.
993 DisableWordMatchOptimization bool
994}
995
996func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
997 if q == nil {
998 return nil, fmt.Errorf("got nil (sub)query")
999 }
1000
1001 switch s := q.(type) {
1002 case *query.Regexp:
1003 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
1004 // superset of the regexp. If the returned matchTree is equivalent to the
1005 // original regexp, it returns true. An equivalent matchTree has the same
1006 // behaviour as the original regexp and can be used instead.
1007 //
1008 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
1009 if err != nil {
1010 return nil, err
1011 }
1012 // if the query can be used in place of the regexp
1013 // return the subtree
1014 if isEq {
1015 return subMT, nil
1016 }
1017
1018 var tr matchTree
1019 if wmt, ok := regexpToWordMatchTree(s, opt); ok {
1020 // A common search we get is "\bLITERAL\b". Avoid the regex engine and
1021 // provide something faster.
1022 tr = wmt
1023 } else {
1024 tr = newRegexpMatchTree(s)
1025 }
1026
1027 return &andMatchTree{
1028 children: []matchTree{
1029 tr, &noVisitMatchTree{subMT},
1030 },
1031 }, nil
1032 case *query.And:
1033 var r []matchTree
1034 for _, ch := range s.Children {
1035 ct, err := d.newMatchTree(ch, opt)
1036 if err != nil {
1037 return nil, err
1038 }
1039 r = append(r, ct)
1040 }
1041 return &andMatchTree{r}, nil
1042 case *query.Or:
1043 var r []matchTree
1044 for _, ch := range s.Children {
1045 ct, err := d.newMatchTree(ch, opt)
1046 if err != nil {
1047 return nil, err
1048 }
1049 r = append(r, ct)
1050 }
1051 return &orMatchTree{r}, nil
1052 case *query.Not:
1053 ct, err := d.newMatchTree(s.Child, opt)
1054 return ¬MatchTree{
1055 child: ct,
1056 }, err
1057
1058 case *query.Type:
1059 if s.Type != query.TypeFileName {
1060 break
1061 }
1062
1063 ct, err := d.newMatchTree(s.Child, opt)
1064 if err != nil {
1065 return nil, err
1066 }
1067
1068 return &fileNameMatchTree{
1069 child: ct,
1070 }, nil
1071
1072 case *query.Boost:
1073 ct, err := d.newMatchTree(s.Child, opt)
1074 if err != nil {
1075 return nil, err
1076 }
1077
1078 return &boostMatchTree{
1079 child: ct,
1080 boost: s.Boost,
1081 }, nil
1082
1083 case *query.Meta:
1084 checksum := queryMetaChecksum(s.Field, s.Value)
1085 cacheKeyField := "Meta"
1086 if cached, ok := d.docMatchTreeCache.Get(cacheKeyField, checksum); ok {
1087 return cached, nil
1088 }
1089
1090 reposWant := make([]bool, len(d.repoMetaData))
1091 for repoIdx, r := range d.repoMetaData {
1092 if r.Metadata != nil {
1093 if val, ok := r.Metadata[s.Field]; ok && s.Value.MatchString(val) {
1094 reposWant[repoIdx] = true
1095 }
1096 }
1097 }
1098
1099 mt := &docMatchTree{
1100 reason: "Meta",
1101 numDocs: d.numDocs(),
1102 predicate: func(docID uint32) bool {
1103 repoIdx := d.repos[docID]
1104 if int(repoIdx) >= len(reposWant) {
1105 return false
1106 }
1107 return reposWant[repoIdx]
1108 },
1109 }
1110 d.docMatchTreeCache.Add(cacheKeyField, checksum, mt)
1111 return mt, nil
1112
1113 case *query.Substring:
1114 return d.newSubstringMatchTree(s)
1115
1116 case *query.Branch:
1117 masks := make([]uint64, 0, len(d.repoMetaData))
1118 if s.Pattern == "HEAD" {
1119 for range d.repoMetaData {
1120 masks = append(masks, 1)
1121 }
1122 } else {
1123 for _, branchIDs := range d.branchIDs {
1124 mask := uint64(0)
1125 for nm, m := range branchIDs {
1126 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1127 mask |= uint64(m)
1128 }
1129 }
1130 masks = append(masks, mask)
1131 }
1132 }
1133 return &branchQueryMatchTree{
1134 masks: masks,
1135 fileMasks: d.fileBranchMasks,
1136 repos: d.repos,
1137 }, nil
1138 case *query.Const:
1139 if s.Value {
1140 return &bruteForceMatchTree{}, nil
1141 } else {
1142 return &noMatchTree{Why: "const"}, nil
1143 }
1144 case *query.Language:
1145 code, ok := d.metaData.LanguageMap[s.Language]
1146 if !ok {
1147 return &noMatchTree{Why: "lang"}, nil
1148 }
1149 return &docMatchTree{
1150 reason: "language",
1151 numDocs: d.numDocs(),
1152 predicate: func(docID uint32) bool {
1153 return d.getLanguage(docID) == code
1154 },
1155 }, nil
1156
1157 case *query.Symbol:
1158 // Disable WordMatchTree since we don't support it in symbols yet.
1159 optCopy := opt
1160 optCopy.DisableWordMatchOptimization = true
1161
1162 subMT, err := d.newMatchTree(s.Expr, optCopy)
1163 if err != nil {
1164 return nil, err
1165 }
1166
1167 if substr, ok := subMT.(*substrMatchTree); ok {
1168 return &symbolSubstrMatchTree{
1169 substrMatchTree: substr,
1170 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)),
1171 fileEndRunes: d.fileEndRunes,
1172 fileEndSymbol: d.fileEndSymbol,
1173 sections: d.runeDocSections,
1174 }, nil
1175 }
1176
1177 var regexpMT *regexpMatchTree
1178 visitMatchTree(subMT, func(mt matchTree) {
1179 if t, ok := mt.(*regexpMatchTree); ok {
1180 regexpMT = t
1181 }
1182 })
1183 if regexpMT == nil {
1184 return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1185 }
1186
1187 return &symbolRegexpMatchTree{
1188 regexp: regexpMT.regexp,
1189 all: isRegexpAll(regexpMT.origRegexp),
1190 matchTree: subMT,
1191 }, nil
1192
1193 case *query.FileNameSet:
1194 return &docMatchTree{
1195 reason: "FileNameSet",
1196 numDocs: d.numDocs(),
1197 predicate: func(docID uint32) bool {
1198 fileName := d.fileName(docID)
1199 _, ok := s.Set[string(fileName)]
1200 return ok
1201 },
1202 }, nil
1203
1204 case *query.BranchesRepos:
1205 reposBranchesWant := make([]uint64, len(d.repoMetaData))
1206 for repoIdx := range d.repoMetaData {
1207 var mask uint64
1208 for _, br := range s.List {
1209 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1210 mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1211 }
1212 }
1213 reposBranchesWant[repoIdx] = mask
1214 }
1215 return &docMatchTree{
1216 reason: "BranchesRepos",
1217 numDocs: d.numDocs(),
1218 predicate: func(docID uint32) bool {
1219 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1220 },
1221 }, nil
1222
1223 case *query.RepoSet:
1224 reposWant := make([]bool, len(d.repoMetaData))
1225 for repoIdx, r := range d.repoMetaData {
1226 if _, ok := s.Set[r.Name]; ok {
1227 reposWant[repoIdx] = true
1228 }
1229 }
1230 return &docMatchTree{
1231 reason: "RepoSet",
1232 numDocs: d.numDocs(),
1233 predicate: func(docID uint32) bool {
1234 return reposWant[d.repos[docID]]
1235 },
1236 }, nil
1237
1238 case *query.RepoIDs:
1239 reposWant := make([]bool, len(d.repoMetaData))
1240 for repoIdx, r := range d.repoMetaData {
1241 if s.Repos.Contains(r.ID) {
1242 reposWant[repoIdx] = true
1243 }
1244 }
1245 return &docMatchTree{
1246 reason: "RepoIDs",
1247 numDocs: d.numDocs(),
1248 predicate: func(docID uint32) bool {
1249 return reposWant[d.repos[docID]]
1250 },
1251 }, nil
1252
1253 case *query.Repo:
1254 reposWant := make([]bool, len(d.repoMetaData))
1255 for repoIdx, r := range d.repoMetaData {
1256 if s.Regexp.MatchString(r.Name) {
1257 reposWant[repoIdx] = true
1258 }
1259 }
1260 return &docMatchTree{
1261 reason: "Repo",
1262 numDocs: d.numDocs(),
1263 predicate: func(docID uint32) bool {
1264 return reposWant[d.repos[docID]]
1265 },
1266 }, nil
1267
1268 case *query.RepoRegexp:
1269 reposWant := make([]bool, len(d.repoMetaData))
1270 for repoIdx, r := range d.repoMetaData {
1271 if s.Regexp.MatchString(r.Name) {
1272 reposWant[repoIdx] = true
1273 }
1274 }
1275 return &docMatchTree{
1276 reason: "RepoRegexp",
1277 numDocs: d.numDocs(),
1278 predicate: func(docID uint32) bool {
1279 return reposWant[d.repos[docID]]
1280 },
1281 }, nil
1282
1283 case query.RawConfig:
1284 return &docMatchTree{
1285 reason: s.String(),
1286 numDocs: d.numDocs(),
1287 predicate: func(docID uint32) bool {
1288 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1289 },
1290 }, nil
1291 }
1292 log.Panicf("type %T", q)
1293 return nil, nil
1294}
1295
1296func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1297 st := &substrMatchTree{
1298 query: s,
1299 caseSensitive: s.CaseSensitive,
1300 fileName: s.FileName,
1301 }
1302
1303 if utf8.RuneCountInString(s.Pattern) < ngramSize {
1304 return newRegexpMatchTree(&query.Regexp{
1305 Regexp: &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)},
1306 FileName: s.FileName,
1307 Content: s.Content,
1308 CaseSensitive: s.CaseSensitive,
1309 }), nil
1310 }
1311
1312 result, err := d.iterateNgrams(s)
1313 if err != nil {
1314 return nil, err
1315 }
1316 st.matchIterator = result
1317 return st, nil
1318}
1319
1320func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1321 if opt.DisableWordMatchOptimization {
1322 return nil, false
1323 }
1324 // Needs to be case sensitive
1325 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1326 return nil, false
1327 }
1328 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1329 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1330 return nil, false
1331 }
1332 sub := q.Regexp.Sub
1333 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1334 return nil, false
1335 }
1336
1337 return &wordMatchTree{
1338 word: string(sub[1].Rune),
1339 fileName: q.FileName,
1340 }, true
1341}
1342
1343// pruneMatchTree removes impossible branches from the matchTree, as indicated
1344// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1345func pruneMatchTree(mt matchTree) (matchTree, error) {
1346 var err error
1347 switch mt := mt.(type) {
1348 // leaf nodes that we test with the filter:
1349 case *substrMatchTree:
1350 if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1351 if _, ok := res.matchIterator.(*noMatchTree); ok {
1352 return nil, nil
1353 }
1354 }
1355 // recursive tree structures:
1356 case *andMatchTree:
1357 // Any branch of an and becoming impossible means the entire clause
1358 // is impossible. Otherwise, just handle rewrites.
1359 for i, child := range mt.children {
1360 newChild, err := pruneMatchTree(child)
1361 if err != nil {
1362 return nil, err
1363 }
1364 if newChild == nil {
1365 return nil, nil
1366 }
1367 mt.children[i] = newChild
1368 }
1369 case *orMatchTree:
1370 // *All* branches of an OR becoming impossible means the entire clause
1371 // is impossible. Otherwise, drop impossible subclauses and handle
1372 // rewrites, including simplifying to a singular resulting child branch.
1373 n := 0
1374 for _, child := range mt.children {
1375 newChild, err := pruneMatchTree(child)
1376 if err != nil {
1377 return nil, err
1378 }
1379 if newChild != nil {
1380 mt.children[n] = newChild
1381 n++
1382 }
1383 }
1384 mt.children = mt.children[:n]
1385 if len(mt.children) == 1 {
1386 return mt.children[0], nil
1387 } else if len(mt.children) == 0 {
1388 return nil, nil
1389 }
1390 case *noVisitMatchTree:
1391 mt.matchTree, err = pruneMatchTree(mt.matchTree)
1392 if err != nil {
1393 return nil, err
1394 }
1395 if mt.matchTree == nil {
1396 return nil, nil
1397 }
1398 case *fileNameMatchTree:
1399 mt.child, err = pruneMatchTree(mt.child)
1400 if err != nil {
1401 return nil, err
1402 }
1403 if mt.child == nil {
1404 return nil, nil
1405 }
1406 case *boostMatchTree:
1407 mt.child, err = pruneMatchTree(mt.child)
1408 if err != nil {
1409 return nil, err
1410 }
1411 if mt.child == nil {
1412 return nil, nil
1413 }
1414 case *andLineMatchTree:
1415 child, err := pruneMatchTree(&mt.andMatchTree)
1416 if err != nil {
1417 return nil, err
1418 }
1419 if child == nil {
1420 return nil, nil
1421 }
1422 if c, ok := child.(*andMatchTree); ok {
1423 mt.andMatchTree = *c
1424 } else {
1425 // the and was simplified to a single clause,
1426 // so the linematch portion is irrelevant.
1427 return mt, nil
1428 }
1429 case *notMatchTree:
1430 mt.child, err = pruneMatchTree(mt.child)
1431 if err != nil {
1432 return nil, err
1433 }
1434 if mt.child == nil {
1435 // not false => true
1436 return &bruteForceMatchTree{}, nil
1437 }
1438 // unhandled:
1439 case *docMatchTree:
1440 case *bruteForceMatchTree:
1441 case *regexpMatchTree:
1442 case *wordMatchTree:
1443 }
1444 return mt, err
1445}
1446
1447// isRegexpAll returns true if the query matches all possible lines.
1448//
1449// Note: it is possible for a funky regex to actually match all but this
1450// returns false. This returns true for normal looking regexes like ".*" or
1451// "(?-s:.*)".
1452func isRegexpAll(r *syntax.Regexp) bool {
1453 // Note: we don't care about any flags since we are looking for .* and we
1454 // don't mind if . matches all or everything but newline.
1455
1456 // for loop is for visiting children of OpCapture/OpConcat until we hit .*
1457 for {
1458 // Our main target: .*
1459 if r.Op == syntax.OpStar && len(r.Sub) == 1 { // *
1460 // (?s:.) or (?-s:.)
1461 return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL
1462 }
1463
1464 // Strip away expressions being wrapped in paranthesis
1465 if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 {
1466 r = r.Sub[0]
1467 continue
1468 }
1469
1470 return false
1471 }
1472}
1473
1474func queryMetaChecksum(field string, value *regexp.Regexp) string {
1475 h := xxhash.New()
1476 h.Write([]byte(field))
1477 h.Write([]byte{':'})
1478 h.Write([]byte(value.String()))
1479 return fmt.Sprintf("%x", h.Sum64())
1480}