fork of https://github.com/sourcegraph/zoekt
1// Copyright 2018 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "math"
22 "regexp/syntax"
23 "strings"
24 "unicode/utf8"
25
26 "github.com/grafana/regexp"
27 "github.com/sourcegraph/zoekt"
28 "github.com/sourcegraph/zoekt/internal/syntaxutil"
29 "github.com/sourcegraph/zoekt/query"
30)
31
32// A docIterator iterates over documents in order.
33type docIterator interface {
34 // provide the next document where we may find something interesting.
35 //
36 // This is like a "peek" and shouldn't mutate state. prepare is what should
37 // change state.
38 nextDoc() uint32
39
40 // clears any per-document state of the docIterator, and
41 // prepares for evaluating the given doc. The argument is
42 // strictly increasing over time.
43 prepare(nextDoc uint32)
44}
45
46// costs are passed in increasing order to matchTree.matches until they do not
47// return matchesRequiresHigherCost.
48const (
49 costConst = 0
50 costMemory = 1
51 costContent = 2
52 costRegexp = 3
53)
54
55const (
56 costMin = costConst
57 costMax = costRegexp
58)
59
60// matchesState is an enum for the state of a matchTree after a call to
61// matchTree.matches.
62type matchesState uint8
63
64const (
65 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done
66 // a search yet since the cost value is not high enough.
67 matchesRequiresHigherCost matchesState = iota
68
69 // matchesFound is returned when matchTree.matches has done a search and
70 // found one or more matches.
71 matchesFound
72
73 // matchesNone is returned when matchTree.matches has done a search and
74 // found nothing.
75 matchesNone
76)
77
78// matchesStatePred is a helper which returns matchesFound if b is true
79// otherwise returns matchesNone.
80func matchesStatePred(b bool) matchesState {
81 if b {
82 return matchesFound
83 }
84 return matchesNone
85}
86
87// matchesStateForSlice is a helper which returns matchesFound if v is
88// non-empty otherwise returns matchesNone.
89func matchesStateForSlice[T any](v []T) matchesState {
90 return matchesStatePred(len(v) > 0)
91}
92
93// An expression tree coupled with matches. The matchtree has two
94// functions:
95//
96// * it implements boolean combinations (and, or, not)
97//
98// * it implements shortcuts, where we skip documents (for example: if
99// there are no trigram matches, we can be sure there are no substring
100// matches). The matchtree iterates over the documents as they are
101// ordered in the shard.
102//
103// The general process for a given (shard, query) is:
104//
105// - construct matchTree for the query
106//
107// - find all different leaf matchTrees (substring, regexp, etc.)
108//
109// in a loop:
110//
111// - find next doc to process using nextDoc
112//
113// - evaluate atoms (leaf expressions that match text)
114//
115// - evaluate the tree using matches(), storing the result in map.
116//
117// - if the complete tree returns (matches() != matchesRequiresHigherCost)
118// for the document, collect all text matches by looking at leaf
119// matchTrees.
120type matchTree interface {
121 docIterator
122
123 // matches if cost is high enough. See documentation for matchesState's
124 // values.
125 //
126 // Note: Do not call this directly, rather use evalMatchTree which uses
127 // known to cache responses once the state transitions away from
128 // matchesRequiresHigherCost.
129 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
130}
131
132// docMatchTree iterates over documents for which predicate(docID) returns true.
133type docMatchTree struct {
134 // the number of documents in a shard.
135 numDocs uint32
136
137 predicate func(docID uint32) bool
138
139 // provides additional information about the reason why the docMatchTree was
140 // created.
141 reason string
142
143 // mutable
144 firstDone bool
145 docID uint32
146}
147
148type bruteForceMatchTree struct {
149 // mutable
150 firstDone bool
151 docID uint32
152}
153
154type andLineMatchTree struct {
155 andMatchTree
156}
157
158type andMatchTree struct {
159 children []matchTree
160}
161
162type orMatchTree struct {
163 children []matchTree
164}
165
166type notMatchTree struct {
167 child matchTree
168}
169
170// Returns only the filename of child matches.
171type fileNameMatchTree struct {
172 child matchTree
173}
174
175type boostMatchTree struct {
176 child matchTree
177 boost float64
178}
179
180// Don't visit this subtree for collecting matches.
181type noVisitMatchTree struct {
182 matchTree
183}
184
185type regexpMatchTree struct {
186 regexp *regexp.Regexp
187
188 // origRegexp is the original parsed regexp from the query structure. It
189 // does not include mutations such as case sensitivity.
190 origRegexp *syntax.Regexp
191
192 fileName bool
193
194 // mutable
195 reEvaluated bool
196 found []*candidateMatch
197
198 // nextDoc, prepare.
199 bruteForceMatchTree
200}
201
202func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree {
203 prefix := ""
204 if !s.CaseSensitive {
205 prefix = "(?i)"
206 }
207
208 return ®expMatchTree{
209 regexp: regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)),
210 origRegexp: s.Regexp,
211 fileName: s.FileName,
212 }
213}
214
215// \bLITERAL\b
216type wordMatchTree struct {
217 word string
218
219 fileName bool
220
221 // mutable
222 evaluated bool
223 found []*candidateMatch
224
225 // nextDoc, prepare.
226 bruteForceMatchTree
227}
228
229type substrMatchTree struct {
230 matchIterator
231
232 query *query.Substring
233 caseSensitive bool
234 fileName bool
235
236 // mutable
237 current []*candidateMatch
238 contEvaluated bool
239}
240
241type branchQueryMatchTree struct {
242 fileMasks []uint64
243 masks []uint64
244 repos []uint16
245
246 // mutable
247 firstDone bool
248 docID uint32
249}
250
251func (t *branchQueryMatchTree) branchMask() uint64 {
252 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
253}
254
255type symbolRegexpMatchTree struct {
256 matchTree
257 regexp *regexp.Regexp
258 all bool // skips regex match if .*
259
260 reEvaluated bool
261 found []*candidateMatch
262}
263
264func (t *symbolRegexpMatchTree) prepare(doc uint32) {
265 t.reEvaluated = false
266 t.found = t.found[:0]
267 t.matchTree.prepare(doc)
268}
269
270func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
271 if t.reEvaluated {
272 return matchesStateForSlice(t.found)
273 }
274
275 if cost < costRegexp {
276 return matchesRequiresHigherCost
277 }
278
279 sections := cp.docSections()
280 content := cp.data(false)
281
282 found := t.found[:0]
283 for i, sec := range sections {
284 var idx []int
285 if t.all {
286 idx = []int{0, int(sec.End - sec.Start)}
287 } else {
288 idx = t.regexp.FindIndex(content[sec.Start:sec.End])
289 if idx == nil {
290 continue
291 }
292 }
293
294 cm := &candidateMatch{
295 byteOffset: sec.Start + uint32(idx[0]),
296 byteMatchSz: uint32(idx[1] - idx[0]),
297 symbol: true,
298 symbolIdx: uint32(i),
299 }
300 found = append(found, cm)
301 }
302 t.found = found
303 t.reEvaluated = true
304
305 return matchesStateForSlice(t.found)
306}
307
308type symbolSubstrMatchTree struct {
309 *substrMatchTree
310
311 patternSize uint32
312 fileEndRunes []uint32
313 fileEndSymbol []uint32
314
315 doc uint32
316 sections []DocumentSection
317
318 secID uint32
319}
320
321func (t *symbolSubstrMatchTree) prepare(doc uint32) {
322 t.substrMatchTree.prepare(doc)
323 t.doc = doc
324
325 var fileStart uint32
326 if doc > 0 {
327 fileStart = t.fileEndRunes[doc-1]
328 }
329
330 var sections []DocumentSection
331 if len(t.sections) > 0 {
332 most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
333 if most == uint32(len(t.sections)) {
334 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
335 } else {
336 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
337 t.secID++
338 }
339
340 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
341 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
342 symbolEnd++
343 }
344
345 sections = t.sections[t.secID:symbolEnd]
346 }
347 }
348
349 secIdx := 0
350 trimmed := t.current[:0]
351 for len(sections) > secIdx && len(t.current) > 0 {
352 start := fileStart + t.current[0].runeOffset
353 end := start + t.patternSize
354 if start >= sections[secIdx].End {
355 secIdx++
356 continue
357 }
358
359 if start < sections[secIdx].Start {
360 t.current = t.current[1:]
361 continue
362 }
363
364 if end <= sections[secIdx].End {
365 t.current[0].symbol = true
366 t.current[0].symbolIdx = uint32(secIdx)
367 trimmed = append(trimmed, t.current[0])
368 }
369
370 t.current = t.current[1:]
371 }
372 t.current = trimmed
373}
374
375// all prepare methods
376
377func (t *bruteForceMatchTree) prepare(doc uint32) {
378 t.docID = doc
379 t.firstDone = true
380}
381
382func (t *docMatchTree) prepare(doc uint32) {
383 t.docID = doc
384 t.firstDone = true
385}
386
387func (t *andMatchTree) prepare(doc uint32) {
388 for _, c := range t.children {
389 c.prepare(doc)
390 }
391}
392
393func (t *regexpMatchTree) prepare(doc uint32) {
394 t.found = t.found[:0]
395 t.reEvaluated = false
396 t.bruteForceMatchTree.prepare(doc)
397}
398
399func (t *wordMatchTree) prepare(doc uint32) {
400 t.found = t.found[:0]
401 t.evaluated = false
402 t.bruteForceMatchTree.prepare(doc)
403}
404
405func (t *orMatchTree) prepare(doc uint32) {
406 for _, c := range t.children {
407 c.prepare(doc)
408 }
409}
410
411func (t *notMatchTree) prepare(doc uint32) {
412 t.child.prepare(doc)
413}
414
415func (t *fileNameMatchTree) prepare(doc uint32) {
416 t.child.prepare(doc)
417}
418
419func (t *boostMatchTree) prepare(doc uint32) {
420 t.child.prepare(doc)
421}
422
423func (t *substrMatchTree) prepare(nextDoc uint32) {
424 t.matchIterator.prepare(nextDoc)
425 t.current = t.matchIterator.candidates()
426 t.contEvaluated = false
427}
428
429func (t *branchQueryMatchTree) prepare(doc uint32) {
430 t.firstDone = true
431 t.docID = doc
432}
433
434// nextDoc
435
436func (t *docMatchTree) nextDoc() uint32 {
437 var start uint32
438 if t.firstDone {
439 start = t.docID + 1
440 }
441 for i := start; i < t.numDocs; i++ {
442 if t.predicate(i) {
443 return i
444 }
445 }
446 return math.MaxUint32
447}
448
449func (t *bruteForceMatchTree) nextDoc() uint32 {
450 if !t.firstDone {
451 return 0
452 }
453 return t.docID + 1
454}
455
456func (t *andMatchTree) nextDoc() uint32 {
457 var max uint32
458 for _, c := range t.children {
459 m := c.nextDoc()
460 if m > max {
461 max = m
462 }
463 }
464 return max
465}
466
467func (t *orMatchTree) nextDoc() uint32 {
468 min := uint32(math.MaxUint32)
469 for _, c := range t.children {
470 m := c.nextDoc()
471 if m < min {
472 min = m
473 }
474 }
475 return min
476}
477
478func (t *notMatchTree) nextDoc() uint32 {
479 return 0
480}
481
482func (t *fileNameMatchTree) nextDoc() uint32 {
483 return t.child.nextDoc()
484}
485
486func (t *boostMatchTree) nextDoc() uint32 {
487 return t.child.nextDoc()
488}
489
490func (t *branchQueryMatchTree) nextDoc() uint32 {
491 var start uint32
492 if t.firstDone {
493 start = t.docID + 1
494 }
495
496 for i := start; i < uint32(len(t.fileMasks)); i++ {
497 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
498 return i
499 }
500 }
501 return math.MaxUint32
502}
503
504// all String methods
505
506func (t *bruteForceMatchTree) String() string {
507 return "all"
508}
509
510func (t *docMatchTree) String() string {
511 return fmt.Sprintf("doc(%s)", t.reason)
512}
513
514func (t *andMatchTree) String() string {
515 return fmt.Sprintf("and%v", t.children)
516}
517
518func (t *regexpMatchTree) String() string {
519 f := ""
520 if t.fileName {
521 f = "f"
522 }
523 return fmt.Sprintf("%sre(%s)", f, t.regexp)
524}
525
526func (t *wordMatchTree) String() string {
527 f := ""
528 if t.fileName {
529 f = "f"
530 }
531 return fmt.Sprintf("%sword(%s)", f, t.word)
532}
533
534func (t *orMatchTree) String() string {
535 return fmt.Sprintf("or%v", t.children)
536}
537
538func (t *notMatchTree) String() string {
539 return fmt.Sprintf("not(%v)", t.child)
540}
541
542func (t *noVisitMatchTree) String() string {
543 return fmt.Sprintf("novisit(%v)", t.matchTree)
544}
545
546func (t *fileNameMatchTree) String() string {
547 return fmt.Sprintf("f(%v)", t.child)
548}
549
550func (t *boostMatchTree) String() string {
551 return fmt.Sprintf("boost(%f, %v)", t.boost, t.child)
552}
553
554func (t *substrMatchTree) String() string {
555 f := ""
556 if t.fileName {
557 f = "f"
558 }
559
560 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
561}
562
563func (t *branchQueryMatchTree) String() string {
564 return fmt.Sprintf("branch(%x)", t.masks)
565}
566
567func (t *symbolSubstrMatchTree) String() string {
568 return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
569}
570
571func (t *symbolRegexpMatchTree) String() string {
572 return fmt.Sprintf("symbol(%v)", t.matchTree)
573}
574
575// visitMatches visits all atoms in matchTree. Note: This visits
576// noVisitMatchTree. For collecting matches use visitMatches.
577func visitMatchTree(t matchTree, f func(matchTree)) {
578 switch s := t.(type) {
579 case *andMatchTree:
580 for _, ch := range s.children {
581 visitMatchTree(ch, f)
582 }
583 case *orMatchTree:
584 for _, ch := range s.children {
585 visitMatchTree(ch, f)
586 }
587 case *andLineMatchTree:
588 visitMatchTree(&s.andMatchTree, f)
589 case *noVisitMatchTree:
590 visitMatchTree(s.matchTree, f)
591 case *notMatchTree:
592 visitMatchTree(s.child, f)
593 case *fileNameMatchTree:
594 visitMatchTree(s.child, f)
595 case *boostMatchTree:
596 visitMatchTree(s.child, f)
597 case *symbolSubstrMatchTree:
598 visitMatchTree(s.substrMatchTree, f)
599 case *symbolRegexpMatchTree:
600 visitMatchTree(s.matchTree, f)
601 default:
602 f(t)
603 }
604}
605
606// updateMatchTreeStats calls updateStats on all atoms in mt which have that
607// function defined.
608func updateMatchTreeStats(mt matchTree, stats *zoekt.Stats) {
609 visitMatchTree(mt, func(mt matchTree) {
610 if atom, ok := mt.(interface{ updateStats(*zoekt.Stats) }); ok {
611 atom.updateStats(stats)
612 }
613 })
614}
615
616func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) {
617 visitMatches(t, known, 1, func(mt matchTree, _ float64) {
618 f(mt)
619 })
620}
621
622// visitMatches visits all atoms which can contribute matches. Note: This
623// skips noVisitMatchTree.
624func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) {
625 switch s := t.(type) {
626 case *andMatchTree:
627 for _, ch := range s.children {
628 if known[ch] {
629 visitMatches(ch, known, weight, f)
630 }
631 }
632 case *andLineMatchTree:
633 visitMatches(&s.andMatchTree, known, weight, f)
634 case *orMatchTree:
635 for _, ch := range s.children {
636 if known[ch] {
637 visitMatches(ch, known, weight, f)
638 }
639 }
640 case *boostMatchTree:
641 visitMatches(s.child, known, weight*s.boost, f)
642 case *symbolSubstrMatchTree:
643 visitMatches(s.substrMatchTree, known, weight, f)
644 case *notMatchTree:
645 case *noVisitMatchTree:
646 // don't collect into negative trees.
647 case *fileNameMatchTree:
648 // We will just gather the filename if we do not visit this tree.
649 default:
650 f(s, weight)
651 }
652}
653
654// all matches() methods.
655
656func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
657 return matchesStatePred(t.predicate(cp.idx))
658}
659
660func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
661 return matchesFound
662}
663
664// andLineMatchTree is a performance optimization of andMatchTree. For content
665// searches we don't want to run the regex engine if there is no line that
666// contains matches from all terms.
667func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
668 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
669 return state
670 }
671
672 // Invariant: all children have matches. If any line contains all of them we
673 // can return MatchesFound.
674
675 // find child with fewest candidates
676 minCount := math.MaxInt
677 fewestChildren := 0
678 for ix, child := range t.children {
679 v, ok := child.(*substrMatchTree)
680 // make sure we are running a content search and that all candidates are a
681 // substrMatchTree
682 if !ok || v.fileName {
683 return matchesFound
684 }
685 if len(v.current) < minCount {
686 minCount = len(v.current)
687 fewestChildren = ix
688 }
689 }
690
691 type lineRange struct {
692 start int
693 end int
694 }
695 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
696 prev := -1
697 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
698 line := cp.newlines().atOffset(candidate.byteOffset)
699 if line == prev {
700 continue
701 }
702 prev = line
703 byteStart := int(cp.newlines().lineStart(line))
704 byteEnd := int(cp.newlines().lineStart(line + 1))
705 lines = append(lines, lineRange{byteStart, byteEnd})
706 }
707
708 // children keeps track of the children's candidates we have already seen.
709 children := make([][]*candidateMatch, 0, len(t.children)-1)
710 for j, child := range t.children {
711 if j == fewestChildren {
712 continue
713 }
714 children = append(children, child.(*substrMatchTree).current)
715 }
716
717nextLine:
718 for i := 0; i < len(lines); i++ {
719 hits := 1
720 nextChild:
721 for j := range children {
722 nextCandidate:
723 for len(children[j]) > 0 {
724 candidate := children[j][0]
725 bo := int(cp.findOffset(false, candidate.runeOffset))
726 if bo < lines[i].start {
727 children[j] = children[j][1:]
728 continue nextCandidate
729 }
730 if bo < lines[i].end {
731 hits++
732 continue nextChild
733 }
734 // move the `lines` iterator forward until bo < line.end
735 for i < len(lines) && bo >= lines[i].end {
736 i++
737 }
738 i--
739 continue nextLine
740 }
741 }
742 // return early once we found any line that contains matches from all children
743 if hits == len(t.children) {
744 return matchesFound
745 }
746 }
747 return matchesNone
748}
749
750func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
751 // We have found matches unless a child needs to do more work or it hasn't
752 // found matches.
753 state := matchesFound
754
755 for _, ch := range t.children {
756 switch evalMatchTree(cp, cost, known, ch) {
757 case matchesRequiresHigherCost:
758 // keep evaluating other children incase we come across matchesNone
759 state = matchesRequiresHigherCost
760 case matchesFound:
761 // will return this if every child has this value
762 case matchesNone:
763 return matchesNone
764 }
765 }
766
767 return state
768}
769
770func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
771 // we could short-circuit, but we want to use the other possibilities as a
772 // ranking signal. So we always return the most conservative state.
773 state := matchesNone
774 for _, ch := range t.children {
775 switch evalMatchTree(cp, cost, known, ch) {
776 case matchesRequiresHigherCost:
777 state = matchesRequiresHigherCost
778 case matchesFound:
779 if state != matchesRequiresHigherCost {
780 state = matchesFound
781 }
782 case matchesNone:
783 // noop
784 }
785 }
786 return state
787}
788
789func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
790 return matchesStatePred(t.branchMask() != 0)
791}
792
793func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
794 if t.reEvaluated {
795 return matchesStateForSlice(t.found)
796 }
797
798 if cost < costRegexp {
799 return matchesRequiresHigherCost
800 }
801
802 cp.stats.RegexpsConsidered++
803 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
804 found := t.found[:0]
805 for _, idx := range idxs {
806 cm := &candidateMatch{
807 byteOffset: uint32(idx[0]),
808 byteMatchSz: uint32(idx[1] - idx[0]),
809 fileName: t.fileName,
810 }
811
812 found = append(found, cm)
813 }
814 t.found = found
815 t.reEvaluated = true
816
817 return matchesStateForSlice(t.found)
818}
819
820func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
821 if t.evaluated {
822 return matchesStateForSlice(t.found)
823 }
824
825 if cost < costRegexp {
826 return matchesRequiresHigherCost
827 }
828
829 data := cp.data(t.fileName)
830 offset := 0
831 found := t.found[:0]
832 for {
833 idx := bytes.Index(data[offset:], []byte(t.word))
834 if idx < 0 {
835 break
836 }
837
838 relStartOffset := offset + idx
839 relEndOffset := relStartOffset + len(t.word)
840
841 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
842 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
843 if startBoundary && endBoundary {
844 found = append(found, &candidateMatch{
845 byteOffset: uint32(offset + idx),
846 byteMatchSz: uint32(len(t.word)),
847 fileName: t.fileName,
848 })
849 }
850 offset += idx + len(t.word)
851 }
852
853 t.found = found
854 t.evaluated = true
855
856 return matchesStateForSlice(t.found)
857}
858
859// breakMatchesOnNewlines returns matches resulting from breaking each element
860// of cms on newlines within text.
861func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
862 var lineCMs []*candidateMatch
863 for _, cm := range cms {
864 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
865 }
866 return lineCMs
867}
868
869// breakOnNewlines returns matches resulting from breaking cm on newlines
870// within text.
871func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
872 var cms []*candidateMatch
873 addMe := &candidateMatch{}
874 *addMe = *cm
875 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
876 if text[i] == '\n' {
877 addMe.byteMatchSz = i - addMe.byteOffset
878 if addMe.byteMatchSz != 0 {
879 cms = append(cms, addMe)
880 }
881
882 addMe = &candidateMatch{}
883 *addMe = *cm
884 addMe.byteOffset = i + 1
885 }
886 }
887 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
888 if addMe.byteMatchSz != 0 {
889 cms = append(cms, addMe)
890 }
891 return cms
892}
893
894// evalMatchTree should be called instead of directly calling
895// matchTree.matches. It cache known values for future evaluation at higher
896// costs.
897func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
898 if v, ok := known[mt]; ok {
899 return matchesStatePred(v)
900 }
901
902 ms := mt.matches(cp, cost, known)
903 if ms != matchesRequiresHigherCost {
904 known[mt] = ms == matchesFound
905 }
906
907 return ms
908}
909
910func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
911 switch evalMatchTree(cp, cost, known, t.child) {
912 case matchesRequiresHigherCost:
913 return matchesRequiresHigherCost
914 case matchesFound:
915 return matchesNone
916 case matchesNone:
917 return matchesFound
918 default:
919 panic("unreachable")
920 }
921}
922
923func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
924 return evalMatchTree(cp, cost, known, t.child)
925}
926
927func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
928 return evalMatchTree(cp, cost, known, t.child)
929}
930
931func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
932 if t.contEvaluated {
933 return matchesStateForSlice(t.current)
934 }
935
936 if len(t.current) == 0 {
937 return matchesNone
938 }
939
940 if t.fileName && cost < costMemory {
941 return matchesRequiresHigherCost
942 }
943
944 if !t.fileName && cost < costContent {
945 return matchesRequiresHigherCost
946 }
947
948 pruned := t.current[:0]
949 for _, m := range t.current {
950 if m.byteOffset == 0 && m.runeOffset > 0 {
951 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
952 }
953 if m.matchContent(cp.data(m.fileName)) {
954 pruned = append(pruned, m)
955 }
956 }
957 t.current = pruned
958 t.contEvaluated = true
959
960 return matchesStateForSlice(t.current)
961}
962
963type matchTreeOpt struct {
964 // DisableWordMatchOptimization is used to disable the use of wordMatchTree.
965 // This was added since we do not support wordMatchTree with symbol search.
966 DisableWordMatchOptimization bool
967}
968
969func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
970 if q == nil {
971 return nil, fmt.Errorf("got nil (sub)query")
972 }
973 switch s := q.(type) {
974 case *query.Regexp:
975 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
976 // superset of the regexp. If the returned matchTree is equivalent to the
977 // original regexp, it returns true. An equivalent matchTree has the same
978 // behaviour as the original regexp and can be used instead.
979 //
980 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
981 if err != nil {
982 return nil, err
983 }
984 // if the query can be used in place of the regexp
985 // return the subtree
986 if isEq {
987 return subMT, nil
988 }
989
990 var tr matchTree
991 if wmt, ok := regexpToWordMatchTree(s, opt); ok {
992 // A common search we get is "\bLITERAL\b". Avoid the regex engine and
993 // provide something faster.
994 tr = wmt
995 } else {
996 tr = newRegexpMatchTree(s)
997 }
998
999 return &andMatchTree{
1000 children: []matchTree{
1001 tr, &noVisitMatchTree{subMT},
1002 },
1003 }, nil
1004 case *query.And:
1005 var r []matchTree
1006 for _, ch := range s.Children {
1007 ct, err := d.newMatchTree(ch, opt)
1008 if err != nil {
1009 return nil, err
1010 }
1011 r = append(r, ct)
1012 }
1013 return &andMatchTree{r}, nil
1014 case *query.Or:
1015 var r []matchTree
1016 for _, ch := range s.Children {
1017 ct, err := d.newMatchTree(ch, opt)
1018 if err != nil {
1019 return nil, err
1020 }
1021 r = append(r, ct)
1022 }
1023 return &orMatchTree{r}, nil
1024 case *query.Not:
1025 ct, err := d.newMatchTree(s.Child, opt)
1026 return ¬MatchTree{
1027 child: ct,
1028 }, err
1029
1030 case *query.Type:
1031 if s.Type != query.TypeFileName {
1032 break
1033 }
1034
1035 ct, err := d.newMatchTree(s.Child, opt)
1036 if err != nil {
1037 return nil, err
1038 }
1039
1040 return &fileNameMatchTree{
1041 child: ct,
1042 }, nil
1043
1044 case *query.Boost:
1045 ct, err := d.newMatchTree(s.Child, opt)
1046 if err != nil {
1047 return nil, err
1048 }
1049
1050 return &boostMatchTree{
1051 child: ct,
1052 boost: s.Boost,
1053 }, nil
1054
1055 case *query.Substring:
1056 return d.newSubstringMatchTree(s)
1057
1058 case *query.Branch:
1059 masks := make([]uint64, 0, len(d.repoMetaData))
1060 if s.Pattern == "HEAD" {
1061 for range d.repoMetaData {
1062 masks = append(masks, 1)
1063 }
1064 } else {
1065 for _, branchIDs := range d.branchIDs {
1066 mask := uint64(0)
1067 for nm, m := range branchIDs {
1068 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1069 mask |= uint64(m)
1070 }
1071 }
1072 masks = append(masks, mask)
1073 }
1074 }
1075 return &branchQueryMatchTree{
1076 masks: masks,
1077 fileMasks: d.fileBranchMasks,
1078 repos: d.repos,
1079 }, nil
1080 case *query.Const:
1081 if s.Value {
1082 return &bruteForceMatchTree{}, nil
1083 } else {
1084 return &noMatchTree{Why: "const"}, nil
1085 }
1086 case *query.Language:
1087 code, ok := d.metaData.LanguageMap[s.Language]
1088 if !ok {
1089 return &noMatchTree{Why: "lang"}, nil
1090 }
1091 return &docMatchTree{
1092 reason: "language",
1093 numDocs: d.numDocs(),
1094 predicate: func(docID uint32) bool {
1095 return d.getLanguage(docID) == code
1096 },
1097 }, nil
1098
1099 case *query.Symbol:
1100 // Disable WordMatchTree since we don't support it in symbols yet.
1101 optCopy := opt
1102 optCopy.DisableWordMatchOptimization = true
1103
1104 subMT, err := d.newMatchTree(s.Expr, optCopy)
1105 if err != nil {
1106 return nil, err
1107 }
1108
1109 if substr, ok := subMT.(*substrMatchTree); ok {
1110 return &symbolSubstrMatchTree{
1111 substrMatchTree: substr,
1112 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)),
1113 fileEndRunes: d.fileEndRunes,
1114 fileEndSymbol: d.fileEndSymbol,
1115 sections: d.runeDocSections,
1116 }, nil
1117 }
1118
1119 var regexpMT *regexpMatchTree
1120 visitMatchTree(subMT, func(mt matchTree) {
1121 if t, ok := mt.(*regexpMatchTree); ok {
1122 regexpMT = t
1123 }
1124 })
1125 if regexpMT == nil {
1126 return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1127 }
1128
1129 return &symbolRegexpMatchTree{
1130 regexp: regexpMT.regexp,
1131 all: isRegexpAll(regexpMT.origRegexp),
1132 matchTree: subMT,
1133 }, nil
1134
1135 case *query.FileNameSet:
1136 return &docMatchTree{
1137 reason: "FileNameSet",
1138 numDocs: d.numDocs(),
1139 predicate: func(docID uint32) bool {
1140 fileName := d.fileName(docID)
1141 _, ok := s.Set[string(fileName)]
1142 return ok
1143 },
1144 }, nil
1145
1146 case *query.BranchesRepos:
1147 reposBranchesWant := make([]uint64, len(d.repoMetaData))
1148 for repoIdx := range d.repoMetaData {
1149 var mask uint64
1150 for _, br := range s.List {
1151 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1152 mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1153 }
1154 }
1155 reposBranchesWant[repoIdx] = mask
1156 }
1157 return &docMatchTree{
1158 reason: "BranchesRepos",
1159 numDocs: d.numDocs(),
1160 predicate: func(docID uint32) bool {
1161 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1162 },
1163 }, nil
1164
1165 case *query.RepoSet:
1166 reposWant := make([]bool, len(d.repoMetaData))
1167 for repoIdx, r := range d.repoMetaData {
1168 if _, ok := s.Set[r.Name]; ok {
1169 reposWant[repoIdx] = true
1170 }
1171 }
1172 return &docMatchTree{
1173 reason: "RepoSet",
1174 numDocs: d.numDocs(),
1175 predicate: func(docID uint32) bool {
1176 return reposWant[d.repos[docID]]
1177 },
1178 }, nil
1179
1180 case *query.RepoIDs:
1181 reposWant := make([]bool, len(d.repoMetaData))
1182 for repoIdx, r := range d.repoMetaData {
1183 if s.Repos.Contains(r.ID) {
1184 reposWant[repoIdx] = true
1185 }
1186 }
1187 return &docMatchTree{
1188 reason: "RepoIDs",
1189 numDocs: d.numDocs(),
1190 predicate: func(docID uint32) bool {
1191 return reposWant[d.repos[docID]]
1192 },
1193 }, nil
1194
1195 case *query.Repo:
1196 reposWant := make([]bool, len(d.repoMetaData))
1197 for repoIdx, r := range d.repoMetaData {
1198 if s.Regexp.MatchString(r.Name) {
1199 reposWant[repoIdx] = true
1200 }
1201 }
1202 return &docMatchTree{
1203 reason: "Repo",
1204 numDocs: d.numDocs(),
1205 predicate: func(docID uint32) bool {
1206 return reposWant[d.repos[docID]]
1207 },
1208 }, nil
1209
1210 case *query.RepoRegexp:
1211 reposWant := make([]bool, len(d.repoMetaData))
1212 for repoIdx, r := range d.repoMetaData {
1213 if s.Regexp.MatchString(r.Name) {
1214 reposWant[repoIdx] = true
1215 }
1216 }
1217 return &docMatchTree{
1218 reason: "RepoRegexp",
1219 numDocs: d.numDocs(),
1220 predicate: func(docID uint32) bool {
1221 return reposWant[d.repos[docID]]
1222 },
1223 }, nil
1224
1225 case query.RawConfig:
1226 return &docMatchTree{
1227 reason: s.String(),
1228 numDocs: d.numDocs(),
1229 predicate: func(docID uint32) bool {
1230 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1231 },
1232 }, nil
1233 }
1234 log.Panicf("type %T", q)
1235 return nil, nil
1236}
1237
1238func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1239 st := &substrMatchTree{
1240 query: s,
1241 caseSensitive: s.CaseSensitive,
1242 fileName: s.FileName,
1243 }
1244
1245 if utf8.RuneCountInString(s.Pattern) < ngramSize {
1246 return newRegexpMatchTree(&query.Regexp{
1247 Regexp: &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)},
1248 FileName: s.FileName,
1249 Content: s.Content,
1250 CaseSensitive: s.CaseSensitive,
1251 }), nil
1252 }
1253
1254 result, err := d.iterateNgrams(s)
1255 if err != nil {
1256 return nil, err
1257 }
1258 st.matchIterator = result
1259 return st, nil
1260}
1261
1262func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1263 if opt.DisableWordMatchOptimization {
1264 return nil, false
1265 }
1266 // Needs to be case sensitive
1267 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1268 return nil, false
1269 }
1270 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1271 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1272 return nil, false
1273 }
1274 sub := q.Regexp.Sub
1275 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1276 return nil, false
1277 }
1278
1279 return &wordMatchTree{
1280 word: string(sub[1].Rune),
1281 fileName: q.FileName,
1282 }, true
1283}
1284
1285// pruneMatchTree removes impossible branches from the matchTree, as indicated
1286// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1287func pruneMatchTree(mt matchTree) (matchTree, error) {
1288 var err error
1289 switch mt := mt.(type) {
1290 // leaf nodes that we test with the filter:
1291 case *substrMatchTree:
1292 if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1293 if _, ok := res.matchIterator.(*noMatchTree); ok {
1294 return nil, nil
1295 }
1296 }
1297 // recursive tree structures:
1298 case *andMatchTree:
1299 // Any branch of an and becoming impossible means the entire clause
1300 // is impossible. Otherwise, just handle rewrites.
1301 for i, child := range mt.children {
1302 newChild, err := pruneMatchTree(child)
1303 if err != nil {
1304 return nil, err
1305 }
1306 if newChild == nil {
1307 return nil, nil
1308 }
1309 mt.children[i] = newChild
1310 }
1311 case *orMatchTree:
1312 // *All* branches of an OR becoming impossible means the entire clause
1313 // is impossible. Otherwise, drop impossible subclauses and handle
1314 // rewrites, including simplifying to a singular resulting child branch.
1315 n := 0
1316 for _, child := range mt.children {
1317 newChild, err := pruneMatchTree(child)
1318 if err != nil {
1319 return nil, err
1320 }
1321 if newChild != nil {
1322 mt.children[n] = newChild
1323 n++
1324 }
1325 }
1326 mt.children = mt.children[:n]
1327 if len(mt.children) == 1 {
1328 return mt.children[0], nil
1329 } else if len(mt.children) == 0 {
1330 return nil, nil
1331 }
1332 case *noVisitMatchTree:
1333 mt.matchTree, err = pruneMatchTree(mt.matchTree)
1334 if err != nil {
1335 return nil, err
1336 }
1337 if mt.matchTree == nil {
1338 return nil, nil
1339 }
1340 case *fileNameMatchTree:
1341 mt.child, err = pruneMatchTree(mt.child)
1342 if err != nil {
1343 return nil, err
1344 }
1345 if mt.child == nil {
1346 return nil, nil
1347 }
1348 case *boostMatchTree:
1349 mt.child, err = pruneMatchTree(mt.child)
1350 if err != nil {
1351 return nil, err
1352 }
1353 if mt.child == nil {
1354 return nil, nil
1355 }
1356 case *andLineMatchTree:
1357 child, err := pruneMatchTree(&mt.andMatchTree)
1358 if err != nil {
1359 return nil, err
1360 }
1361 if child == nil {
1362 return nil, nil
1363 }
1364 if c, ok := child.(*andMatchTree); ok {
1365 mt.andMatchTree = *c
1366 } else {
1367 // the and was simplified to a single clause,
1368 // so the linematch portion is irrelevant.
1369 return mt, nil
1370 }
1371 case *notMatchTree:
1372 mt.child, err = pruneMatchTree(mt.child)
1373 if err != nil {
1374 return nil, err
1375 }
1376 if mt.child == nil {
1377 // not false => true
1378 return &bruteForceMatchTree{}, nil
1379 }
1380 // unhandled:
1381 case *docMatchTree:
1382 case *bruteForceMatchTree:
1383 case *regexpMatchTree:
1384 case *wordMatchTree:
1385 }
1386 return mt, err
1387}
1388
1389// isRegexpAll returns true if the query matches all possible lines.
1390//
1391// Note: it is possible for a funky regex to actually match all but this
1392// returns false. This returns true for normal looking regexes like ".*" or
1393// "(?-s:.*)".
1394func isRegexpAll(r *syntax.Regexp) bool {
1395 // Note: we don't care about any flags since we are looking for .* and we
1396 // don't mind if . matches all or everything but newline.
1397
1398 // for loop is for visiting children of OpCapture/OpConcat until we hit .*
1399 for {
1400 // Our main target: .*
1401 if r.Op == syntax.OpStar && len(r.Sub) == 1 { // *
1402 // (?s:.) or (?-s:.)
1403 return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL
1404 }
1405
1406 // Strip away expressions being wrapped in paranthesis
1407 if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 {
1408 r = r.Sub[0]
1409 continue
1410 }
1411
1412 return false
1413 }
1414}