fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package query
16
17import (
18 "bytes"
19 "encoding/gob"
20 "encoding/json"
21 "fmt"
22 "log"
23 "reflect"
24 "regexp/syntax"
25 "sort"
26 "strconv"
27 "strings"
28 "sync"
29
30 "github.com/RoaringBitmap/roaring"
31 "github.com/grafana/regexp"
32)
33
34var _ = log.Println
35
36// Q is a representation for a possibly hierarchical search query.
37type Q interface {
38 String() string
39}
40
41// RPCUnwrap processes q to remove RPC specific elements from q. This is
42// needed because gob isn't flexible enough for us. This should be called by
43// RPC servers at the client/server boundary so that q works with the rest of
44// zoekt.
45func RPCUnwrap(q Q) Q {
46 if cache, ok := q.(*GobCache); ok {
47 return cache.Q
48 }
49 return q
50}
51
52// RawConfig filters repositories based on their encoded RawConfig map.
53type RawConfig uint64
54
55const (
56 RcOnlyPublic RawConfig = 1
57 RcOnlyPrivate RawConfig = 2
58 RcOnlyForks RawConfig = 1 << 2
59 RcNoForks RawConfig = 2 << 2
60 RcOnlyArchived RawConfig = 1 << 4
61 RcNoArchived RawConfig = 2 << 4
62)
63
64var flagNames = []struct {
65 Mask RawConfig
66 Label string
67}{
68 {RcOnlyPublic, "RcOnlyPublic"},
69 {RcOnlyPrivate, "RcOnlyPrivate"},
70 {RcOnlyForks, "RcOnlyForks"},
71 {RcNoForks, "RcNoForks"},
72 {RcOnlyArchived, "RcOnlyArchived"},
73 {RcNoArchived, "RcNoArchived"},
74}
75
76func (r RawConfig) String() string {
77 var s []string
78 for _, fn := range flagNames {
79 if r&fn.Mask != 0 {
80 s = append(s, fn.Label)
81 }
82 }
83 return fmt.Sprintf("rawConfig:%s", strings.Join(s, "|"))
84}
85
86// RegexpQuery is a query looking for regular expressions matches.
87type Regexp struct {
88 Regexp *syntax.Regexp
89 FileName bool
90 Content bool
91 CaseSensitive bool
92}
93
94func (q *Regexp) String() string {
95 pref := ""
96 if q.FileName {
97 pref = "file_"
98 }
99 if q.CaseSensitive {
100 pref = "case_" + pref
101 }
102 return fmt.Sprintf("%sregex:%q", pref, q.Regexp.String())
103}
104
105// gobRegexp wraps Regexp to make it gob-encodable/decodable. Regexp contains syntax.Regexp, which
106// contains slices/arrays with possibly nil elements, which gob doesn't support
107// (https://github.com/golang/go/issues/1501).
108type gobRegexp struct {
109 Regexp // Regexp.Regexp (*syntax.Regexp) is set to nil and its string is set in RegexpString
110 RegexpString string
111}
112
113// GobEncode implements gob.Encoder.
114func (q Regexp) GobEncode() ([]byte, error) {
115 gobq := gobRegexp{Regexp: q, RegexpString: q.Regexp.String()}
116 gobq.Regexp.Regexp = nil // can't be gob-encoded/decoded
117 return json.Marshal(gobq)
118}
119
120// GobDecode implements gob.Decoder.
121func (q *Regexp) GobDecode(data []byte) error {
122 var gobq gobRegexp
123 err := json.Unmarshal(data, &gobq)
124 if err != nil {
125 return err
126 }
127 gobq.Regexp.Regexp, err = syntax.Parse(gobq.RegexpString, regexpFlags)
128 if err != nil {
129 return err
130 }
131 *q = gobq.Regexp
132 return nil
133}
134
135// Symbol finds a string that is a symbol.
136type Symbol struct {
137 Expr Q
138}
139
140func (s *Symbol) String() string {
141 return fmt.Sprintf("sym:%s", s.Expr)
142}
143
144type caseQ struct {
145 Flavor string
146}
147
148func (c *caseQ) String() string {
149 return "case:" + c.Flavor
150}
151
152type Language struct {
153 Language string
154}
155
156func (l *Language) String() string {
157 return "lang:" + l.Language
158}
159
160type Const struct {
161 Value bool
162}
163
164func (q *Const) String() string {
165 if q.Value {
166 return "TRUE"
167 }
168 return "FALSE"
169}
170
171type Repo struct {
172 Regexp *regexp.Regexp
173}
174
175func (q *Repo) String() string {
176 return fmt.Sprintf("repo:%s", q.Regexp.String())
177}
178
179func (q Repo) GobEncode() ([]byte, error) {
180 return []byte(q.Regexp.String()), nil
181}
182
183func (q *Repo) GobDecode(data []byte) error {
184 var err error
185 q.Regexp, err = regexp.Compile(string(data))
186 return err
187}
188
189// RepoRegexp is a Sourcegraph addition which searches documents where the
190// repository name matches Regexp.
191type RepoRegexp struct {
192 Regexp *regexp.Regexp
193}
194
195func (q *RepoRegexp) String() string {
196 return fmt.Sprintf("reporegex:%q", q.Regexp.String())
197}
198
199// GobEncode implements gob.Encoder.
200func (q *RepoRegexp) GobEncode() ([]byte, error) {
201 // gob can't encode syntax.Regexp
202 return []byte(q.Regexp.String()), nil
203}
204
205// GobDecode implements gob.Decoder.
206func (q *RepoRegexp) GobDecode(data []byte) error {
207 var err error
208 q.Regexp, err = regexp.Compile(string(data))
209 return err
210}
211
212// BranchesRepos is a slice of BranchRepos to match. It is a Sourcegraph
213// addition and only used in the RPC interface for efficient checking of large
214// repo lists.
215type BranchesRepos struct {
216 List []BranchRepos
217}
218
219// NewSingleBranchesRepos is a helper for creating a BranchesRepos which
220// searches a single branch.
221func NewSingleBranchesRepos(branch string, ids ...uint32) *BranchesRepos {
222 return &BranchesRepos{List: []BranchRepos{
223 {branch, roaring.BitmapOf(ids...)},
224 }}
225}
226
227func (q *BranchesRepos) String() string {
228 var sb strings.Builder
229
230 sb.WriteString("(branchesrepos")
231
232 for _, br := range q.List {
233 if size := br.Repos.GetCardinality(); size > 1 {
234 sb.WriteString(" " + br.Branch + ":" + strconv.FormatUint(size, 10))
235 } else {
236 sb.WriteString(" " + br.Branch + "=" + br.Repos.String())
237 }
238 }
239
240 sb.WriteString(")")
241 return sb.String()
242}
243
244// NewRepoIDs is a helper for creating a RepoIDs which
245// searches only the matched repos.
246func NewRepoIDs(ids ...uint32) *RepoIDs {
247 return &RepoIDs{Repos: roaring.BitmapOf(ids...)}
248}
249
250func (q *RepoIDs) String() string {
251 var sb strings.Builder
252
253 sb.WriteString("(repoids ")
254
255 if size := q.Repos.GetCardinality(); size > 1 {
256 sb.WriteString("count:" + strconv.FormatUint(size, 10))
257 } else {
258 sb.WriteString("repoid=" + q.Repos.String())
259 }
260
261 sb.WriteString(")")
262 return sb.String()
263}
264
265// MarshalBinary implements a specialized encoder for BranchesRepos.
266func (q BranchesRepos) MarshalBinary() ([]byte, error) {
267 return branchesReposEncode(q.List)
268}
269
270// UnmarshalBinary implements a specialized decoder for BranchesRepos.
271func (q *BranchesRepos) UnmarshalBinary(b []byte) (err error) {
272 q.List, err = branchesReposDecode(b)
273 return err
274}
275
276// BranchRepos is a (branch, sourcegraph repo ids bitmap) tuple. It is a
277// Sourcegraph addition.
278type BranchRepos struct {
279 Branch string
280 Repos *roaring.Bitmap
281}
282
283// Similar to BranchRepos but will be used to match only by repoid and
284// therefore matches all branches
285type RepoIDs struct {
286 Repos *roaring.Bitmap
287}
288
289// RepoSet is a list of repos to match. It is a Sourcegraph addition and only
290// used in the RPC interface for efficient checking of large repo lists.
291type RepoSet struct {
292 Set map[string]bool
293}
294
295func (q *RepoSet) String() string {
296 var detail string
297 if len(q.Set) > 5 {
298 // Large sets being output are not useful
299 detail = fmt.Sprintf("size=%d", len(q.Set))
300 } else {
301 repos := make([]string, len(q.Set))
302 i := 0
303 for repo := range q.Set {
304 repos[i] = repo
305 i++
306 }
307 sort.Strings(repos)
308 detail = strings.Join(repos, " ")
309 }
310 return fmt.Sprintf("(reposet %s)", detail)
311}
312
313func NewRepoSet(repo ...string) *RepoSet {
314 s := &RepoSet{Set: make(map[string]bool)}
315 for _, r := range repo {
316 s.Set[r] = true
317 }
318 return s
319}
320
321// FileNameSet is a list of file names to match. It is a Sourcegraph addition
322// and only used in the RPC interface for efficient checking of large file
323// lists.
324type FileNameSet struct {
325 Set map[string]struct{}
326}
327
328// MarshalBinary implements a specialized encoder for FileNameSet.
329func (q *FileNameSet) MarshalBinary() ([]byte, error) {
330 return stringSetEncode(q.Set)
331}
332
333// UnmarshalBinary implements a specialized decoder for FileNameSet.
334func (q *FileNameSet) UnmarshalBinary(b []byte) error {
335 var err error
336 q.Set, err = stringSetDecode(b)
337 return err
338}
339
340func (q *FileNameSet) String() string {
341 var detail string
342 if len(q.Set) > 5 {
343 // Large sets being output are not useful
344 detail = fmt.Sprintf("size=%d", len(q.Set))
345 } else {
346 values := make([]string, 0, len(q.Set))
347 for v := range q.Set {
348 values = append(values, v)
349 }
350 sort.Strings(values)
351 detail = strings.Join(values, " ")
352 }
353 return fmt.Sprintf("(filenameset %s)", detail)
354}
355
356func NewFileNameSet(fileNames ...string) *FileNameSet {
357 s := &FileNameSet{Set: make(map[string]struct{})}
358 for _, r := range fileNames {
359 s.Set[r] = struct{}{}
360 }
361 return s
362}
363
364const (
365 TypeFileMatch uint8 = iota
366 TypeFileName
367 TypeRepo
368)
369
370// Type changes the result type returned.
371type Type struct {
372 Child Q
373 Type uint8
374}
375
376func (q *Type) String() string {
377 switch q.Type {
378 case TypeFileMatch:
379 return fmt.Sprintf("(type:filematch %s)", q.Child)
380 case TypeFileName:
381 return fmt.Sprintf("(type:filename %s)", q.Child)
382 case TypeRepo:
383 return fmt.Sprintf("(type:repo %s)", q.Child)
384 default:
385 return fmt.Sprintf("(type:UNKNOWN %s)", q.Child)
386 }
387}
388
389// Boost scales the contribution to score of descendents.
390type Boost struct {
391 Child Q
392 // Boost will multiply the score of its descendents. Values less than 1 will
393 // give less importance while values greater than 1 will give more
394 // importance.
395 Boost float64
396}
397
398func (q *Boost) String() string {
399 return fmt.Sprintf("(boost %0.2f %s)", q.Boost, q.Child)
400}
401
402// Substring is the most basic query: a query for a substring.
403type Substring struct {
404 Pattern string
405 CaseSensitive bool
406
407 // Match only filename
408 FileName bool
409
410 // Match only content
411 Content bool
412}
413
414func (q *Substring) String() string {
415 s := ""
416
417 t := ""
418 if q.FileName {
419 t = "file_"
420 } else if q.Content {
421 t = "content_"
422 }
423
424 s += fmt.Sprintf("%ssubstr:%q", t, q.Pattern)
425 if q.CaseSensitive {
426 s = "case_" + s
427 }
428 return s
429}
430
431type setCaser interface {
432 setCase(string)
433}
434
435func (q *Substring) setCase(k string) {
436 switch k {
437 case "yes":
438 q.CaseSensitive = true
439 case "no":
440 q.CaseSensitive = false
441 case "auto":
442 // TODO - unicode
443 q.CaseSensitive = (q.Pattern != string(toLower([]byte(q.Pattern))))
444 }
445}
446
447func (q *Symbol) setCase(k string) {
448 if sc, ok := q.Expr.(setCaser); ok {
449 sc.setCase(k)
450 }
451}
452
453func (q *Regexp) setCase(k string) {
454 switch k {
455 case "yes":
456 q.CaseSensitive = true
457 case "no":
458 q.CaseSensitive = false
459 case "auto":
460 q.CaseSensitive = (q.Regexp.String() != LowerRegexp(q.Regexp).String())
461 }
462}
463
464// GobCache exists so we only pay the cost of marshalling a query once when we
465// aggregate it out over all the replicas.
466//
467// Our query and eval layer do not support GobCache. Instead, at the gob
468// boundaries (RPC and Streaming) we check if the Q is a GobCache and unwrap
469// it.
470//
471// "I wish we could get rid of this code soon enough" - tomas
472type GobCache struct {
473 Q
474
475 once sync.Once
476 data []byte
477 err error
478}
479
480// GobEncode implements gob.Encoder.
481func (q *GobCache) GobEncode() ([]byte, error) {
482 q.once.Do(func() {
483 var buf bytes.Buffer
484 enc := gob.NewEncoder(&buf)
485 q.err = enc.Encode(&gobWrapper{
486 WrappedQ: q.Q,
487 })
488 q.data = buf.Bytes()
489 })
490 return q.data, q.err
491}
492
493// GobDecode implements gob.Decoder.
494func (q *GobCache) GobDecode(data []byte) error {
495 dec := gob.NewDecoder(bytes.NewBuffer(data))
496 var w gobWrapper
497 err := dec.Decode(&w)
498 if err != nil {
499 return err
500 }
501 q.Q = w.WrappedQ
502 return nil
503}
504
505// gobWrapper is needed so the gob decoder works.
506type gobWrapper struct {
507 WrappedQ Q
508}
509
510func (q *GobCache) String() string {
511 return fmt.Sprintf("GobCache(%s)", q.Q)
512}
513
514// Or is matched when any of its children is matched.
515type Or struct {
516 Children []Q
517}
518
519func (q *Or) String() string {
520 var sub []string
521 for _, ch := range q.Children {
522 sub = append(sub, ch.String())
523 }
524 return fmt.Sprintf("(or %s)", strings.Join(sub, " "))
525}
526
527// Not inverts the meaning of its child.
528type Not struct {
529 Child Q
530}
531
532func (q *Not) String() string {
533 return fmt.Sprintf("(not %s)", q.Child)
534}
535
536// And is matched when all its children are.
537type And struct {
538 Children []Q
539}
540
541func (q *And) String() string {
542 var sub []string
543 for _, ch := range q.Children {
544 sub = append(sub, ch.String())
545 }
546 return fmt.Sprintf("(and %s)", strings.Join(sub, " "))
547}
548
549// NewAnd is syntactic sugar for constructing And queries.
550func NewAnd(qs ...Q) Q {
551 return &And{Children: qs}
552}
553
554// NewOr is syntactic sugar for constructing Or queries.
555func NewOr(qs ...Q) Q {
556 return &Or{Children: qs}
557}
558
559// Branch limits search to a specific branch.
560type Branch struct {
561 Pattern string
562
563 // exact is true if we want to Pattern to equal branch.
564 Exact bool
565}
566
567func (q *Branch) String() string {
568 if q.Exact {
569 return fmt.Sprintf("branch=%q", q.Pattern)
570 }
571 return fmt.Sprintf("branch:%q", q.Pattern)
572}
573
574func queryChildren(q Q) []Q {
575 switch s := q.(type) {
576 case *And:
577 return s.Children
578 case *Or:
579 return s.Children
580 }
581 return nil
582}
583
584func flattenAndOr(children []Q, typ Q) ([]Q, bool) {
585 var flat []Q
586 changed := false
587 for _, ch := range children {
588 ch, subChanged := flatten(ch)
589 changed = changed || subChanged
590 if reflect.TypeOf(ch) == reflect.TypeOf(typ) {
591 changed = true
592 subChildren := queryChildren(ch)
593 if subChildren != nil {
594 flat = append(flat, subChildren...)
595 }
596 } else {
597 flat = append(flat, ch)
598 }
599 }
600
601 return flat, changed
602}
603
604// (and (and x y) z) => (and x y z) , the same for "or"
605func flatten(q Q) (Q, bool) {
606 switch s := q.(type) {
607 case *And:
608 if len(s.Children) == 1 {
609 return s.Children[0], true
610 }
611 flatChildren, changed := flattenAndOr(s.Children, s)
612 return &And{flatChildren}, changed
613 case *Or:
614 if len(s.Children) == 1 {
615 return s.Children[0], true
616 }
617 flatChildren, changed := flattenAndOr(s.Children, s)
618 return &Or{flatChildren}, changed
619 case *Not:
620 child, changed := flatten(s.Child)
621 return &Not{child}, changed
622 case *Type:
623 child, changed := flatten(s.Child)
624 return &Type{Child: child, Type: s.Type}, changed
625 case *Boost:
626 child, changed := flatten(s.Child)
627 return &Boost{Child: child, Boost: s.Boost}, changed
628 default:
629 return q, false
630 }
631}
632
633func mapQueryList(qs []Q, f func(Q) Q) []Q {
634 neg := make([]Q, len(qs))
635 for i, sub := range qs {
636 neg[i] = Map(sub, f)
637 }
638 return neg
639}
640
641func invertConst(q Q) Q {
642 c, ok := q.(*Const)
643 if ok {
644 return &Const{!c.Value}
645 }
646 return q
647}
648
649func evalAndOrConstants(q Q, children []Q) Q {
650 _, isAnd := q.(*And)
651
652 children = mapQueryList(children, evalConstants)
653
654 newCH := children[:0]
655 for _, ch := range children {
656 c, ok := ch.(*Const)
657 if ok {
658 if c.Value == isAnd {
659 continue
660 } else {
661 return ch
662 }
663 }
664 newCH = append(newCH, ch)
665 }
666 if len(newCH) == 0 {
667 return &Const{isAnd}
668 }
669 if isAnd {
670 return &And{newCH}
671 }
672 return &Or{newCH}
673}
674
675func evalConstants(q Q) Q {
676 switch s := q.(type) {
677 case *And:
678 return evalAndOrConstants(q, s.Children)
679 case *Or:
680 return evalAndOrConstants(q, s.Children)
681 case *Not:
682 ch := evalConstants(s.Child)
683 if _, ok := ch.(*Const); ok {
684 return invertConst(ch)
685 }
686 return &Not{ch}
687 case *Type:
688 ch := evalConstants(s.Child)
689 if _, ok := ch.(*Const); ok {
690 // If q is the root query, then evaluating this to a const changes
691 // the type of result we will return. However, the only case this
692 // makes sense is `type:repo TRUE` to return all repos or
693 // `type:file TRUE` to return all filenames. For other cases we
694 // want to do this constant folding though, so we allow the
695 // unexpected behaviour mentioned previously.
696 return ch
697 }
698 return &Type{Child: ch, Type: s.Type}
699 case *Boost:
700 ch := evalConstants(s.Child)
701 if _, ok := ch.(*Const); ok {
702 return ch
703 }
704 return &Boost{Boost: s.Boost, Child: ch}
705 case *Substring:
706 if len(s.Pattern) == 0 {
707 return &Const{true}
708 }
709 case *Regexp:
710 if s.Regexp.Op == syntax.OpEmptyMatch {
711 return &Const{true}
712 }
713 case *Branch:
714 if s.Pattern == "" {
715 return &Const{true}
716 }
717 case *RepoSet:
718 if len(s.Set) == 0 {
719 return &Const{false}
720 }
721 case *FileNameSet:
722 if len(s.Set) == 0 {
723 return &Const{false}
724 }
725 }
726 return q
727}
728
729func Simplify(q Q) Q {
730 q = evalConstants(q)
731 for {
732 var changed bool
733 q, changed = flatten(q)
734 if !changed {
735 break
736 }
737 }
738
739 return q
740}
741
742// Map runs f over the q.
743func Map(q Q, f func(q Q) Q) Q {
744 switch s := q.(type) {
745 case *And:
746 q = &And{Children: mapQueryList(s.Children, f)}
747 case *Or:
748 q = &Or{Children: mapQueryList(s.Children, f)}
749 case *Not:
750 q = &Not{Child: Map(s.Child, f)}
751 case *Type:
752 q = &Type{Type: s.Type, Child: Map(s.Child, f)}
753 case *Boost:
754 q = &Boost{Boost: s.Boost, Child: Map(s.Child, f)}
755 }
756 return f(q)
757}
758
759// Expand expands Substr queries into (OR file_substr content_substr)
760// queries, and the same for Regexp queries..
761func ExpandFileContent(q Q) Q {
762 switch s := q.(type) {
763 case *Substring:
764 if s.FileName == s.Content {
765 f := *s
766 f.FileName = true
767 f.Content = false
768 c := *s
769 c.FileName = false
770 c.Content = true
771 return NewOr(&f, &c)
772 }
773 case *Regexp:
774 if s.FileName == s.Content {
775 f := *s
776 f.FileName = true
777 f.Content = false
778 c := *s
779 c.FileName = false
780 c.Content = true
781 return NewOr(&f, &c)
782 }
783 }
784 return q
785}
786
787// VisitAtoms runs `v` on all atom queries within `q`.
788func VisitAtoms(q Q, v func(q Q)) {
789 Map(q, func(iQ Q) Q {
790 switch iQ.(type) {
791 case *And:
792 case *Or:
793 case *Not:
794 case *Type:
795 case *Boost:
796 default:
797 v(iQ)
798 }
799 return iQ
800 })
801}