fork of https://github.com/sourcegraph/zoekt
1// Copyright 2018 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "regexp/syntax"
22 "strings"
23 "unicode/utf8"
24
25 "github.com/grafana/regexp"
26 "github.com/sourcegraph/zoekt"
27 "github.com/sourcegraph/zoekt/internal/syntaxutil"
28 "github.com/sourcegraph/zoekt/query"
29)
30
31// A docIterator iterates over documents in order.
32type docIterator interface {
33 // provide the next document where we may find something interesting.
34 //
35 // This is like a "peek" and shouldn't mutate state. prepare is what should
36 // change state.
37 nextDoc() uint32
38
39 // clears any per-document state of the docIterator, and
40 // prepares for evaluating the given doc. The argument is
41 // strictly increasing over time.
42 prepare(nextDoc uint32)
43}
44
45// costs are passed in increasing order to matchTree.matches until they do not
46// return matchesRequiresHigherCost.
47const (
48 costConst = 0
49 costMemory = 1
50 costContent = 2
51 costRegexp = 3
52)
53
54const (
55 costMin = costConst
56 costMax = costRegexp
57)
58
59// matchesState is an enum for the state of a matchTree after a call to
60// matchTree.matches.
61type matchesState uint8
62
63const (
64 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done
65 // a search yet since the cost value is not high enough.
66 matchesRequiresHigherCost matchesState = iota
67
68 // matchesFound is returned when matchTree.matches has done a search and
69 // found one or more matches.
70 matchesFound
71
72 // matchesNone is returned when matchTree.matches has done a search and
73 // found nothing.
74 matchesNone
75)
76
77// matchesStatePred is a helper which returns matchesFound if b is true
78// otherwise returns matchesNone.
79func matchesStatePred(b bool) matchesState {
80 if b {
81 return matchesFound
82 }
83 return matchesNone
84}
85
86// matchesStateForSlice is a helper which returns matchesFound if v is
87// non-empty otherwise returns matchesNone.
88func matchesStateForSlice[T any](v []T) matchesState {
89 return matchesStatePred(len(v) > 0)
90}
91
92// An expression tree coupled with matches. The matchtree has two
93// functions:
94//
95// * it implements boolean combinations (and, or, not)
96//
97// * it implements shortcuts, where we skip documents (for example: if
98// there are no trigram matches, we can be sure there are no substring
99// matches). The matchtree iterates over the documents as they are
100// ordered in the shard.
101//
102// The general process for a given (shard, query) is:
103//
104// - construct matchTree for the query
105//
106// - find all different leaf matchTrees (substring, regexp, etc.)
107//
108// in a loop:
109//
110// - find next doc to process using nextDoc
111//
112// - evaluate atoms (leaf expressions that match text)
113//
114// - evaluate the tree using matches(), storing the result in map.
115//
116// - if the complete tree returns (matches() != matchesRequiresHigherCost)
117// for the document, collect all text matches by looking at leaf
118// matchTrees.
119type matchTree interface {
120 docIterator
121
122 // matches if cost is high enough. See documentation for matchesState's
123 // values.
124 //
125 // Note: Do not call this directly, rather use evalMatchTree which uses
126 // known to cache responses once the state transitions away from
127 // matchesRequiresHigherCost.
128 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
129}
130
131// docMatchTree iterates over documents for which predicate(docID) returns true.
132type docMatchTree struct {
133 // the number of documents in a shard.
134 numDocs uint32
135
136 predicate func(docID uint32) bool
137
138 // provides additional information about the reason why the docMatchTree was
139 // created.
140 reason string
141
142 // mutable
143 firstDone bool
144 docID uint32
145}
146
147type bruteForceMatchTree struct {
148 // mutable
149 firstDone bool
150 docID uint32
151}
152
153type andLineMatchTree struct {
154 andMatchTree
155}
156
157type andMatchTree struct {
158 children []matchTree
159}
160
161type orMatchTree struct {
162 children []matchTree
163}
164
165type notMatchTree struct {
166 child matchTree
167}
168
169// Returns only the filename of child matches.
170type fileNameMatchTree struct {
171 child matchTree
172}
173
174type boostMatchTree struct {
175 child matchTree
176 boost float64
177}
178
179// Don't visit this subtree for collecting matches.
180type noVisitMatchTree struct {
181 matchTree
182}
183
184type regexpMatchTree struct {
185 regexp *regexp.Regexp
186
187 // origRegexp is the original parsed regexp from the query structure. It
188 // does not include mutations such as case sensitivity.
189 origRegexp *syntax.Regexp
190
191 fileName bool
192
193 // mutable
194 reEvaluated bool
195 found []*candidateMatch
196
197 // nextDoc, prepare.
198 bruteForceMatchTree
199}
200
201func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree {
202 prefix := ""
203 if !s.CaseSensitive {
204 prefix = "(?i)"
205 }
206
207 return ®expMatchTree{
208 regexp: regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)),
209 origRegexp: s.Regexp,
210 fileName: s.FileName,
211 }
212}
213
214// \bLITERAL\b
215type wordMatchTree struct {
216 word string
217
218 fileName bool
219
220 // mutable
221 evaluated bool
222 found []*candidateMatch
223
224 // nextDoc, prepare.
225 bruteForceMatchTree
226}
227
228type substrMatchTree struct {
229 matchIterator
230
231 query *query.Substring
232 caseSensitive bool
233 fileName bool
234
235 // mutable
236 current []*candidateMatch
237 contEvaluated bool
238}
239
240type branchQueryMatchTree struct {
241 fileMasks []uint64
242 masks []uint64
243 repos []uint16
244
245 // mutable
246 firstDone bool
247 docID uint32
248}
249
250func (t *branchQueryMatchTree) branchMask() uint64 {
251 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
252}
253
254type symbolRegexpMatchTree struct {
255 matchTree
256 regexp *regexp.Regexp
257 all bool // skips regex match if .*
258
259 reEvaluated bool
260 found []*candidateMatch
261}
262
263func (t *symbolRegexpMatchTree) prepare(doc uint32) {
264 t.reEvaluated = false
265 t.found = t.found[:0]
266 t.matchTree.prepare(doc)
267}
268
269func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
270 if t.reEvaluated {
271 return matchesStateForSlice(t.found)
272 }
273
274 if cost < costRegexp {
275 return matchesRequiresHigherCost
276 }
277
278 sections := cp.docSections()
279 content := cp.data(false)
280
281 found := t.found[:0]
282 for i, sec := range sections {
283 var idx []int
284 if t.all {
285 idx = []int{0, int(sec.End - sec.Start)}
286 } else {
287 idx = t.regexp.FindIndex(content[sec.Start:sec.End])
288 if idx == nil {
289 continue
290 }
291 }
292
293 cm := &candidateMatch{
294 byteOffset: sec.Start + uint32(idx[0]),
295 byteMatchSz: uint32(idx[1] - idx[0]),
296 symbol: true,
297 symbolIdx: uint32(i),
298 }
299 found = append(found, cm)
300 }
301 t.found = found
302 t.reEvaluated = true
303
304 return matchesStateForSlice(t.found)
305}
306
307type symbolSubstrMatchTree struct {
308 *substrMatchTree
309
310 patternSize uint32
311 fileEndRunes []uint32
312 fileEndSymbol []uint32
313
314 doc uint32
315 sections []DocumentSection
316
317 secID uint32
318}
319
320func (t *symbolSubstrMatchTree) prepare(doc uint32) {
321 t.substrMatchTree.prepare(doc)
322 t.doc = doc
323
324 var fileStart uint32
325 if doc > 0 {
326 fileStart = t.fileEndRunes[doc-1]
327 }
328
329 var sections []DocumentSection
330 if len(t.sections) > 0 {
331 most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
332 if most == uint32(len(t.sections)) {
333 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
334 } else {
335 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
336 t.secID++
337 }
338
339 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
340 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
341 symbolEnd++
342 }
343
344 sections = t.sections[t.secID:symbolEnd]
345 }
346 }
347
348 secIdx := 0
349 trimmed := t.current[:0]
350 for len(sections) > secIdx && len(t.current) > 0 {
351 start := fileStart + t.current[0].runeOffset
352 end := start + t.patternSize
353 if start >= sections[secIdx].End {
354 secIdx++
355 continue
356 }
357
358 if start < sections[secIdx].Start {
359 t.current = t.current[1:]
360 continue
361 }
362
363 if end <= sections[secIdx].End {
364 t.current[0].symbol = true
365 t.current[0].symbolIdx = uint32(secIdx)
366 trimmed = append(trimmed, t.current[0])
367 }
368
369 t.current = t.current[1:]
370 }
371 t.current = trimmed
372}
373
374// all prepare methods
375
376func (t *bruteForceMatchTree) prepare(doc uint32) {
377 t.docID = doc
378 t.firstDone = true
379}
380
381func (t *docMatchTree) prepare(doc uint32) {
382 t.docID = doc
383 t.firstDone = true
384}
385
386func (t *andMatchTree) prepare(doc uint32) {
387 for _, c := range t.children {
388 c.prepare(doc)
389 }
390}
391
392func (t *regexpMatchTree) prepare(doc uint32) {
393 t.found = t.found[:0]
394 t.reEvaluated = false
395 t.bruteForceMatchTree.prepare(doc)
396}
397
398func (t *wordMatchTree) prepare(doc uint32) {
399 t.found = t.found[:0]
400 t.evaluated = false
401 t.bruteForceMatchTree.prepare(doc)
402}
403
404func (t *orMatchTree) prepare(doc uint32) {
405 for _, c := range t.children {
406 c.prepare(doc)
407 }
408}
409
410func (t *notMatchTree) prepare(doc uint32) {
411 t.child.prepare(doc)
412}
413
414func (t *fileNameMatchTree) prepare(doc uint32) {
415 t.child.prepare(doc)
416}
417
418func (t *boostMatchTree) prepare(doc uint32) {
419 t.child.prepare(doc)
420}
421
422func (t *substrMatchTree) prepare(nextDoc uint32) {
423 t.matchIterator.prepare(nextDoc)
424 t.current = t.matchIterator.candidates()
425 t.contEvaluated = false
426}
427
428func (t *branchQueryMatchTree) prepare(doc uint32) {
429 t.firstDone = true
430 t.docID = doc
431}
432
433// nextDoc
434
435func (t *docMatchTree) nextDoc() uint32 {
436 var start uint32
437 if t.firstDone {
438 start = t.docID + 1
439 }
440 for i := start; i < t.numDocs; i++ {
441 if t.predicate(i) {
442 return i
443 }
444 }
445 return maxUInt32
446}
447
448func (t *bruteForceMatchTree) nextDoc() uint32 {
449 if !t.firstDone {
450 return 0
451 }
452 return t.docID + 1
453}
454
455func (t *andMatchTree) nextDoc() uint32 {
456 var max uint32
457 for _, c := range t.children {
458 m := c.nextDoc()
459 if m > max {
460 max = m
461 }
462 }
463 return max
464}
465
466func (t *orMatchTree) nextDoc() uint32 {
467 min := uint32(maxUInt32)
468 for _, c := range t.children {
469 m := c.nextDoc()
470 if m < min {
471 min = m
472 }
473 }
474 return min
475}
476
477func (t *notMatchTree) nextDoc() uint32 {
478 return 0
479}
480
481func (t *fileNameMatchTree) nextDoc() uint32 {
482 return t.child.nextDoc()
483}
484
485func (t *boostMatchTree) nextDoc() uint32 {
486 return t.child.nextDoc()
487}
488
489func (t *branchQueryMatchTree) nextDoc() uint32 {
490 var start uint32
491 if t.firstDone {
492 start = t.docID + 1
493 }
494
495 for i := start; i < uint32(len(t.fileMasks)); i++ {
496 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
497 return i
498 }
499 }
500 return maxUInt32
501}
502
503// all String methods
504
505func (t *bruteForceMatchTree) String() string {
506 return "all"
507}
508
509func (t *docMatchTree) String() string {
510 return fmt.Sprintf("doc(%s)", t.reason)
511}
512
513func (t *andMatchTree) String() string {
514 return fmt.Sprintf("and%v", t.children)
515}
516
517func (t *regexpMatchTree) String() string {
518 f := ""
519 if t.fileName {
520 f = "f"
521 }
522 return fmt.Sprintf("%sre(%s)", f, t.regexp)
523}
524
525func (t *wordMatchTree) String() string {
526 f := ""
527 if t.fileName {
528 f = "f"
529 }
530 return fmt.Sprintf("%sword(%s)", f, t.word)
531}
532
533func (t *orMatchTree) String() string {
534 return fmt.Sprintf("or%v", t.children)
535}
536
537func (t *notMatchTree) String() string {
538 return fmt.Sprintf("not(%v)", t.child)
539}
540
541func (t *noVisitMatchTree) String() string {
542 return fmt.Sprintf("novisit(%v)", t.matchTree)
543}
544
545func (t *fileNameMatchTree) String() string {
546 return fmt.Sprintf("f(%v)", t.child)
547}
548
549func (t *boostMatchTree) String() string {
550 return fmt.Sprintf("boost(%f, %v)", t.boost, t.child)
551}
552
553func (t *substrMatchTree) String() string {
554 f := ""
555 if t.fileName {
556 f = "f"
557 }
558
559 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
560}
561
562func (t *branchQueryMatchTree) String() string {
563 return fmt.Sprintf("branch(%x)", t.masks)
564}
565
566func (t *symbolSubstrMatchTree) String() string {
567 return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
568}
569
570func (t *symbolRegexpMatchTree) String() string {
571 return fmt.Sprintf("symbol(%v)", t.matchTree)
572}
573
574// visitMatches visits all atoms in matchTree. Note: This visits
575// noVisitMatchTree. For collecting matches use visitMatches.
576func visitMatchTree(t matchTree, f func(matchTree)) {
577 switch s := t.(type) {
578 case *andMatchTree:
579 for _, ch := range s.children {
580 visitMatchTree(ch, f)
581 }
582 case *orMatchTree:
583 for _, ch := range s.children {
584 visitMatchTree(ch, f)
585 }
586 case *andLineMatchTree:
587 visitMatchTree(&s.andMatchTree, f)
588 case *noVisitMatchTree:
589 visitMatchTree(s.matchTree, f)
590 case *notMatchTree:
591 visitMatchTree(s.child, f)
592 case *fileNameMatchTree:
593 visitMatchTree(s.child, f)
594 case *boostMatchTree:
595 visitMatchTree(s.child, f)
596 case *symbolSubstrMatchTree:
597 visitMatchTree(s.substrMatchTree, f)
598 case *symbolRegexpMatchTree:
599 visitMatchTree(s.matchTree, f)
600 default:
601 f(t)
602 }
603}
604
605// updateMatchTreeStats calls updateStats on all atoms in mt which have that
606// function defined.
607func updateMatchTreeStats(mt matchTree, stats *zoekt.Stats) {
608 visitMatchTree(mt, func(mt matchTree) {
609 if atom, ok := mt.(interface{ updateStats(*zoekt.Stats) }); ok {
610 atom.updateStats(stats)
611 }
612 })
613}
614
615func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) {
616 visitMatches(t, known, 1, func(mt matchTree, _ float64) {
617 f(mt)
618 })
619}
620
621// visitMatches visits all atoms which can contribute matches. Note: This
622// skips noVisitMatchTree.
623func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) {
624 switch s := t.(type) {
625 case *andMatchTree:
626 for _, ch := range s.children {
627 if known[ch] {
628 visitMatches(ch, known, weight, f)
629 }
630 }
631 case *andLineMatchTree:
632 visitMatches(&s.andMatchTree, known, weight, f)
633 case *orMatchTree:
634 for _, ch := range s.children {
635 if known[ch] {
636 visitMatches(ch, known, weight, f)
637 }
638 }
639 case *boostMatchTree:
640 visitMatches(s.child, known, weight*s.boost, f)
641 case *symbolSubstrMatchTree:
642 visitMatches(s.substrMatchTree, known, weight, f)
643 case *notMatchTree:
644 case *noVisitMatchTree:
645 // don't collect into negative trees.
646 case *fileNameMatchTree:
647 // We will just gather the filename if we do not visit this tree.
648 default:
649 f(s, weight)
650 }
651}
652
653// all matches() methods.
654
655func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
656 return matchesStatePred(t.predicate(cp.idx))
657}
658
659func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
660 return matchesFound
661}
662
663// andLineMatchTree is a performance optimization of andMatchTree. For content
664// searches we don't want to run the regex engine if there is no line that
665// contains matches from all terms.
666func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
667 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
668 return state
669 }
670
671 // Invariant: all children have matches. If any line contains all of them we
672 // can return MatchesFound.
673
674 // find child with fewest candidates
675 min := maxUInt32
676 fewestChildren := 0
677 for ix, child := range t.children {
678 v, ok := child.(*substrMatchTree)
679 // make sure we are running a content search and that all candidates are a
680 // substrMatchTree
681 if !ok || v.fileName {
682 return matchesFound
683 }
684 if len(v.current) < min {
685 min = len(v.current)
686 fewestChildren = ix
687 }
688 }
689
690 type lineRange struct {
691 start int
692 end int
693 }
694 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
695 prev := -1
696 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
697 line := cp.newlines().atOffset(candidate.byteOffset)
698 if line == prev {
699 continue
700 }
701 prev = line
702 byteStart := int(cp.newlines().lineStart(line))
703 byteEnd := int(cp.newlines().lineStart(line + 1))
704 lines = append(lines, lineRange{byteStart, byteEnd})
705 }
706
707 // children keeps track of the children's candidates we have already seen.
708 children := make([][]*candidateMatch, 0, len(t.children)-1)
709 for j, child := range t.children {
710 if j == fewestChildren {
711 continue
712 }
713 children = append(children, child.(*substrMatchTree).current)
714 }
715
716nextLine:
717 for i := 0; i < len(lines); i++ {
718 hits := 1
719 nextChild:
720 for j := range children {
721 nextCandidate:
722 for len(children[j]) > 0 {
723 candidate := children[j][0]
724 bo := int(cp.findOffset(false, candidate.runeOffset))
725 if bo < lines[i].start {
726 children[j] = children[j][1:]
727 continue nextCandidate
728 }
729 if bo < lines[i].end {
730 hits++
731 continue nextChild
732 }
733 // move the `lines` iterator forward until bo < line.end
734 for i < len(lines) && bo >= lines[i].end {
735 i++
736 }
737 i--
738 continue nextLine
739 }
740 }
741 // return early once we found any line that contains matches from all children
742 if hits == len(t.children) {
743 return matchesFound
744 }
745 }
746 return matchesNone
747}
748
749func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
750 // We have found matches unless a child needs to do more work or it hasn't
751 // found matches.
752 state := matchesFound
753
754 for _, ch := range t.children {
755 switch evalMatchTree(cp, cost, known, ch) {
756 case matchesRequiresHigherCost:
757 // keep evaluating other children incase we come across matchesNone
758 state = matchesRequiresHigherCost
759 case matchesFound:
760 // will return this if every child has this value
761 case matchesNone:
762 return matchesNone
763 }
764 }
765
766 return state
767}
768
769func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
770 // we could short-circuit, but we want to use the other possibilities as a
771 // ranking signal. So we always return the most conservative state.
772 state := matchesNone
773 for _, ch := range t.children {
774 switch evalMatchTree(cp, cost, known, ch) {
775 case matchesRequiresHigherCost:
776 state = matchesRequiresHigherCost
777 case matchesFound:
778 if state != matchesRequiresHigherCost {
779 state = matchesFound
780 }
781 case matchesNone:
782 // noop
783 }
784 }
785 return state
786}
787
788func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
789 return matchesStatePred(t.branchMask() != 0)
790}
791
792func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
793 if t.reEvaluated {
794 return matchesStateForSlice(t.found)
795 }
796
797 if cost < costRegexp {
798 return matchesRequiresHigherCost
799 }
800
801 cp.stats.RegexpsConsidered++
802 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
803 found := t.found[:0]
804 for _, idx := range idxs {
805 cm := &candidateMatch{
806 byteOffset: uint32(idx[0]),
807 byteMatchSz: uint32(idx[1] - idx[0]),
808 fileName: t.fileName,
809 }
810
811 found = append(found, cm)
812 }
813 t.found = found
814 t.reEvaluated = true
815
816 return matchesStateForSlice(t.found)
817}
818
819func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
820 if t.evaluated {
821 return matchesStateForSlice(t.found)
822 }
823
824 if cost < costRegexp {
825 return matchesRequiresHigherCost
826 }
827
828 data := cp.data(t.fileName)
829 offset := 0
830 found := t.found[:0]
831 for {
832 idx := bytes.Index(data[offset:], []byte(t.word))
833 if idx < 0 {
834 break
835 }
836
837 relStartOffset := offset + idx
838 relEndOffset := relStartOffset + len(t.word)
839
840 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
841 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
842 if startBoundary && endBoundary {
843 found = append(found, &candidateMatch{
844 byteOffset: uint32(offset + idx),
845 byteMatchSz: uint32(len(t.word)),
846 fileName: t.fileName,
847 })
848 }
849 offset += idx + len(t.word)
850 }
851
852 t.found = found
853 t.evaluated = true
854
855 return matchesStateForSlice(t.found)
856}
857
858// breakMatchesOnNewlines returns matches resulting from breaking each element
859// of cms on newlines within text.
860func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
861 var lineCMs []*candidateMatch
862 for _, cm := range cms {
863 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
864 }
865 return lineCMs
866}
867
868// breakOnNewlines returns matches resulting from breaking cm on newlines
869// within text.
870func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
871 var cms []*candidateMatch
872 addMe := &candidateMatch{}
873 *addMe = *cm
874 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
875 if text[i] == '\n' {
876 addMe.byteMatchSz = i - addMe.byteOffset
877 if addMe.byteMatchSz != 0 {
878 cms = append(cms, addMe)
879 }
880
881 addMe = &candidateMatch{}
882 *addMe = *cm
883 addMe.byteOffset = i + 1
884 }
885 }
886 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
887 if addMe.byteMatchSz != 0 {
888 cms = append(cms, addMe)
889 }
890 return cms
891}
892
893// evalMatchTree should be called instead of directly calling
894// matchTree.matches. It cache known values for future evaluation at higher
895// costs.
896func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
897 if v, ok := known[mt]; ok {
898 return matchesStatePred(v)
899 }
900
901 ms := mt.matches(cp, cost, known)
902 if ms != matchesRequiresHigherCost {
903 known[mt] = ms == matchesFound
904 }
905
906 return ms
907}
908
909func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
910 switch evalMatchTree(cp, cost, known, t.child) {
911 case matchesRequiresHigherCost:
912 return matchesRequiresHigherCost
913 case matchesFound:
914 return matchesNone
915 case matchesNone:
916 return matchesFound
917 default:
918 panic("unreachable")
919 }
920}
921
922func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
923 return evalMatchTree(cp, cost, known, t.child)
924}
925
926func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
927 return evalMatchTree(cp, cost, known, t.child)
928}
929
930func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
931 if t.contEvaluated {
932 return matchesStateForSlice(t.current)
933 }
934
935 if len(t.current) == 0 {
936 return matchesNone
937 }
938
939 if t.fileName && cost < costMemory {
940 return matchesRequiresHigherCost
941 }
942
943 if !t.fileName && cost < costContent {
944 return matchesRequiresHigherCost
945 }
946
947 pruned := t.current[:0]
948 for _, m := range t.current {
949 if m.byteOffset == 0 && m.runeOffset > 0 {
950 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
951 }
952 if m.matchContent(cp.data(m.fileName)) {
953 pruned = append(pruned, m)
954 }
955 }
956 t.current = pruned
957 t.contEvaluated = true
958
959 return matchesStateForSlice(t.current)
960}
961
962type matchTreeOpt struct {
963 // DisableWordMatchOptimization is used to disable the use of wordMatchTree.
964 // This was added since we do not support wordMatchTree with symbol search.
965 DisableWordMatchOptimization bool
966}
967
968func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
969 if q == nil {
970 return nil, fmt.Errorf("got nil (sub)query")
971 }
972 switch s := q.(type) {
973 case *query.Regexp:
974 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
975 // superset of the regexp. If the returned matchTree is equivalent to the
976 // original regexp, it returns true. An equivalent matchTree has the same
977 // behaviour as the original regexp and can be used instead.
978 //
979 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
980 if err != nil {
981 return nil, err
982 }
983 // if the query can be used in place of the regexp
984 // return the subtree
985 if isEq {
986 return subMT, nil
987 }
988
989 var tr matchTree
990 if wmt, ok := regexpToWordMatchTree(s, opt); ok {
991 // A common search we get is "\bLITERAL\b". Avoid the regex engine and
992 // provide something faster.
993 tr = wmt
994 } else {
995 tr = newRegexpMatchTree(s)
996 }
997
998 return &andMatchTree{
999 children: []matchTree{
1000 tr, &noVisitMatchTree{subMT},
1001 },
1002 }, nil
1003 case *query.And:
1004 var r []matchTree
1005 for _, ch := range s.Children {
1006 ct, err := d.newMatchTree(ch, opt)
1007 if err != nil {
1008 return nil, err
1009 }
1010 r = append(r, ct)
1011 }
1012 return &andMatchTree{r}, nil
1013 case *query.Or:
1014 var r []matchTree
1015 for _, ch := range s.Children {
1016 ct, err := d.newMatchTree(ch, opt)
1017 if err != nil {
1018 return nil, err
1019 }
1020 r = append(r, ct)
1021 }
1022 return &orMatchTree{r}, nil
1023 case *query.Not:
1024 ct, err := d.newMatchTree(s.Child, opt)
1025 return ¬MatchTree{
1026 child: ct,
1027 }, err
1028
1029 case *query.Type:
1030 if s.Type != query.TypeFileName {
1031 break
1032 }
1033
1034 ct, err := d.newMatchTree(s.Child, opt)
1035 if err != nil {
1036 return nil, err
1037 }
1038
1039 return &fileNameMatchTree{
1040 child: ct,
1041 }, nil
1042
1043 case *query.Boost:
1044 ct, err := d.newMatchTree(s.Child, opt)
1045 if err != nil {
1046 return nil, err
1047 }
1048
1049 return &boostMatchTree{
1050 child: ct,
1051 boost: s.Boost,
1052 }, nil
1053
1054 case *query.Substring:
1055 return d.newSubstringMatchTree(s)
1056
1057 case *query.Branch:
1058 masks := make([]uint64, 0, len(d.repoMetaData))
1059 if s.Pattern == "HEAD" {
1060 for i := 0; i < len(d.repoMetaData); i++ {
1061 masks = append(masks, 1)
1062 }
1063 } else {
1064 for _, branchIDs := range d.branchIDs {
1065 mask := uint64(0)
1066 for nm, m := range branchIDs {
1067 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1068 mask |= uint64(m)
1069 }
1070 }
1071 masks = append(masks, mask)
1072 }
1073 }
1074 return &branchQueryMatchTree{
1075 masks: masks,
1076 fileMasks: d.fileBranchMasks,
1077 repos: d.repos,
1078 }, nil
1079 case *query.Const:
1080 if s.Value {
1081 return &bruteForceMatchTree{}, nil
1082 } else {
1083 return &noMatchTree{Why: "const"}, nil
1084 }
1085 case *query.Language:
1086 code, ok := d.metaData.LanguageMap[s.Language]
1087 if !ok {
1088 return &noMatchTree{Why: "lang"}, nil
1089 }
1090 return &docMatchTree{
1091 reason: "language",
1092 numDocs: d.numDocs(),
1093 predicate: func(docID uint32) bool {
1094 return d.getLanguage(docID) == code
1095 },
1096 }, nil
1097
1098 case *query.Symbol:
1099 // Disable WordMatchTree since we don't support it in symbols yet.
1100 optCopy := opt
1101 optCopy.DisableWordMatchOptimization = true
1102
1103 subMT, err := d.newMatchTree(s.Expr, optCopy)
1104 if err != nil {
1105 return nil, err
1106 }
1107
1108 if substr, ok := subMT.(*substrMatchTree); ok {
1109 return &symbolSubstrMatchTree{
1110 substrMatchTree: substr,
1111 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)),
1112 fileEndRunes: d.fileEndRunes,
1113 fileEndSymbol: d.fileEndSymbol,
1114 sections: d.runeDocSections,
1115 }, nil
1116 }
1117
1118 var regexpMT *regexpMatchTree
1119 visitMatchTree(subMT, func(mt matchTree) {
1120 if t, ok := mt.(*regexpMatchTree); ok {
1121 regexpMT = t
1122 }
1123 })
1124 if regexpMT == nil {
1125 return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1126 }
1127
1128 return &symbolRegexpMatchTree{
1129 regexp: regexpMT.regexp,
1130 all: isRegexpAll(regexpMT.origRegexp),
1131 matchTree: subMT,
1132 }, nil
1133
1134 case *query.FileNameSet:
1135 return &docMatchTree{
1136 reason: "FileNameSet",
1137 numDocs: d.numDocs(),
1138 predicate: func(docID uint32) bool {
1139 fileName := d.fileName(docID)
1140 _, ok := s.Set[string(fileName)]
1141 return ok
1142 },
1143 }, nil
1144
1145 case *query.BranchesRepos:
1146 reposBranchesWant := make([]uint64, len(d.repoMetaData))
1147 for repoIdx := range d.repoMetaData {
1148 var mask uint64
1149 for _, br := range s.List {
1150 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1151 mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1152 }
1153 }
1154 reposBranchesWant[repoIdx] = mask
1155 }
1156 return &docMatchTree{
1157 reason: "BranchesRepos",
1158 numDocs: d.numDocs(),
1159 predicate: func(docID uint32) bool {
1160 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1161 },
1162 }, nil
1163
1164 case *query.RepoSet:
1165 reposWant := make([]bool, len(d.repoMetaData))
1166 for repoIdx, r := range d.repoMetaData {
1167 if _, ok := s.Set[r.Name]; ok {
1168 reposWant[repoIdx] = true
1169 }
1170 }
1171 return &docMatchTree{
1172 reason: "RepoSet",
1173 numDocs: d.numDocs(),
1174 predicate: func(docID uint32) bool {
1175 return reposWant[d.repos[docID]]
1176 },
1177 }, nil
1178
1179 case *query.RepoIDs:
1180 reposWant := make([]bool, len(d.repoMetaData))
1181 for repoIdx, r := range d.repoMetaData {
1182 if s.Repos.Contains(r.ID) {
1183 reposWant[repoIdx] = true
1184 }
1185 }
1186 return &docMatchTree{
1187 reason: "RepoIDs",
1188 numDocs: d.numDocs(),
1189 predicate: func(docID uint32) bool {
1190 return reposWant[d.repos[docID]]
1191 },
1192 }, nil
1193
1194 case *query.Repo:
1195 reposWant := make([]bool, len(d.repoMetaData))
1196 for repoIdx, r := range d.repoMetaData {
1197 if s.Regexp.MatchString(r.Name) {
1198 reposWant[repoIdx] = true
1199 }
1200 }
1201 return &docMatchTree{
1202 reason: "Repo",
1203 numDocs: d.numDocs(),
1204 predicate: func(docID uint32) bool {
1205 return reposWant[d.repos[docID]]
1206 },
1207 }, nil
1208
1209 case *query.RepoRegexp:
1210 reposWant := make([]bool, len(d.repoMetaData))
1211 for repoIdx, r := range d.repoMetaData {
1212 if s.Regexp.MatchString(r.Name) {
1213 reposWant[repoIdx] = true
1214 }
1215 }
1216 return &docMatchTree{
1217 reason: "RepoRegexp",
1218 numDocs: d.numDocs(),
1219 predicate: func(docID uint32) bool {
1220 return reposWant[d.repos[docID]]
1221 },
1222 }, nil
1223
1224 case query.RawConfig:
1225 return &docMatchTree{
1226 reason: s.String(),
1227 numDocs: d.numDocs(),
1228 predicate: func(docID uint32) bool {
1229 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1230 },
1231 }, nil
1232 }
1233 log.Panicf("type %T", q)
1234 return nil, nil
1235}
1236
1237func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1238 st := &substrMatchTree{
1239 query: s,
1240 caseSensitive: s.CaseSensitive,
1241 fileName: s.FileName,
1242 }
1243
1244 if utf8.RuneCountInString(s.Pattern) < ngramSize {
1245 return newRegexpMatchTree(&query.Regexp{
1246 Regexp: &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)},
1247 FileName: s.FileName,
1248 Content: s.Content,
1249 CaseSensitive: s.CaseSensitive,
1250 }), nil
1251 }
1252
1253 result, err := d.iterateNgrams(s)
1254 if err != nil {
1255 return nil, err
1256 }
1257 st.matchIterator = result
1258 return st, nil
1259}
1260
1261func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1262 if opt.DisableWordMatchOptimization {
1263 return nil, false
1264 }
1265 // Needs to be case sensitive
1266 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1267 return nil, false
1268 }
1269 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1270 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1271 return nil, false
1272 }
1273 sub := q.Regexp.Sub
1274 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1275 return nil, false
1276 }
1277
1278 return &wordMatchTree{
1279 word: string(sub[1].Rune),
1280 fileName: q.FileName,
1281 }, true
1282}
1283
1284// pruneMatchTree removes impossible branches from the matchTree, as indicated
1285// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1286func pruneMatchTree(mt matchTree) (matchTree, error) {
1287 var err error
1288 switch mt := mt.(type) {
1289 // leaf nodes that we test with the filter:
1290 case *substrMatchTree:
1291 if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1292 if _, ok := res.matchIterator.(*noMatchTree); ok {
1293 return nil, nil
1294 }
1295 }
1296 // recursive tree structures:
1297 case *andMatchTree:
1298 // Any branch of an and becoming impossible means the entire clause
1299 // is impossible. Otherwise, just handle rewrites.
1300 for i, child := range mt.children {
1301 newChild, err := pruneMatchTree(child)
1302 if err != nil {
1303 return nil, err
1304 }
1305 if newChild == nil {
1306 return nil, nil
1307 }
1308 mt.children[i] = newChild
1309 }
1310 case *orMatchTree:
1311 // *All* branches of an OR becoming impossible means the entire clause
1312 // is impossible. Otherwise, drop impossible subclauses and handle
1313 // rewrites, including simplifying to a singular resulting child branch.
1314 n := 0
1315 for _, child := range mt.children {
1316 newChild, err := pruneMatchTree(child)
1317 if err != nil {
1318 return nil, err
1319 }
1320 if newChild != nil {
1321 mt.children[n] = newChild
1322 n++
1323 }
1324 }
1325 mt.children = mt.children[:n]
1326 if len(mt.children) == 1 {
1327 return mt.children[0], nil
1328 } else if len(mt.children) == 0 {
1329 return nil, nil
1330 }
1331 case *noVisitMatchTree:
1332 mt.matchTree, err = pruneMatchTree(mt.matchTree)
1333 if err != nil {
1334 return nil, err
1335 }
1336 if mt.matchTree == nil {
1337 return nil, nil
1338 }
1339 case *fileNameMatchTree:
1340 mt.child, err = pruneMatchTree(mt.child)
1341 if err != nil {
1342 return nil, err
1343 }
1344 if mt.child == nil {
1345 return nil, nil
1346 }
1347 case *boostMatchTree:
1348 mt.child, err = pruneMatchTree(mt.child)
1349 if err != nil {
1350 return nil, err
1351 }
1352 if mt.child == nil {
1353 return nil, nil
1354 }
1355 case *andLineMatchTree:
1356 child, err := pruneMatchTree(&mt.andMatchTree)
1357 if err != nil {
1358 return nil, err
1359 }
1360 if child == nil {
1361 return nil, nil
1362 }
1363 if c, ok := child.(*andMatchTree); ok {
1364 mt.andMatchTree = *c
1365 } else {
1366 // the and was simplified to a single clause,
1367 // so the linematch portion is irrelevant.
1368 return mt, nil
1369 }
1370 case *notMatchTree:
1371 mt.child, err = pruneMatchTree(mt.child)
1372 if err != nil {
1373 return nil, err
1374 }
1375 if mt.child == nil {
1376 // not false => true
1377 return &bruteForceMatchTree{}, nil
1378 }
1379 // unhandled:
1380 case *docMatchTree:
1381 case *bruteForceMatchTree:
1382 case *regexpMatchTree:
1383 case *wordMatchTree:
1384 }
1385 return mt, err
1386}
1387
1388// isRegexpAll returns true if the query matches all possible lines.
1389//
1390// Note: it is possible for a funky regex to actually match all but this
1391// returns false. This returns true for normal looking regexes like ".*" or
1392// "(?-s:.*)".
1393func isRegexpAll(r *syntax.Regexp) bool {
1394 // Note: we don't care about any flags since we are looking for .* and we
1395 // don't mind if . matches all or everything but newline.
1396
1397 // for loop is for visiting children of OpCapture/OpConcat until we hit .*
1398 for {
1399 // Our main target: .*
1400 if r.Op == syntax.OpStar && len(r.Sub) == 1 { // *
1401 // (?s:.) or (?-s:.)
1402 return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL
1403 }
1404
1405 // Strip away expressions being wrapped in paranthesis
1406 if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 {
1407 r = r.Sub[0]
1408 continue
1409 }
1410
1411 return false
1412 }
1413}