fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package query
16
17import (
18 "bytes"
19 "encoding/gob"
20 "encoding/json"
21 "fmt"
22 "log"
23 "reflect"
24 "regexp/syntax"
25 "sort"
26 "strconv"
27 "strings"
28 "sync"
29
30 "github.com/RoaringBitmap/roaring"
31 "github.com/grafana/regexp"
32 "github.com/sourcegraph/zoekt/internal/syntaxutil"
33)
34
35var _ = log.Println
36
37// Q is a representation for a possibly hierarchical search query.
38type Q interface {
39 String() string
40}
41
42// RPCUnwrap processes q to remove RPC specific elements from q. This is
43// needed because gob isn't flexible enough for us. This should be called by
44// RPC servers at the client/server boundary so that q works with the rest of
45// zoekt.
46func RPCUnwrap(q Q) Q {
47 if cache, ok := q.(*GobCache); ok {
48 return cache.Q
49 }
50 return q
51}
52
53// RawConfig filters repositories based on their encoded RawConfig map.
54type RawConfig uint64
55
56const (
57 RcOnlyPublic RawConfig = 1
58 RcOnlyPrivate RawConfig = 2
59 RcOnlyForks RawConfig = 1 << 2
60 RcNoForks RawConfig = 2 << 2
61 RcOnlyArchived RawConfig = 1 << 4
62 RcNoArchived RawConfig = 2 << 4
63)
64
65var flagNames = []struct {
66 Mask RawConfig
67 Label string
68}{
69 {RcOnlyPublic, "RcOnlyPublic"},
70 {RcOnlyPrivate, "RcOnlyPrivate"},
71 {RcOnlyForks, "RcOnlyForks"},
72 {RcNoForks, "RcNoForks"},
73 {RcOnlyArchived, "RcOnlyArchived"},
74 {RcNoArchived, "RcNoArchived"},
75}
76
77func (r RawConfig) String() string {
78 var s []string
79 for _, fn := range flagNames {
80 if r&fn.Mask != 0 {
81 s = append(s, fn.Label)
82 }
83 }
84 return fmt.Sprintf("rawConfig:%s", strings.Join(s, "|"))
85}
86
87// RegexpQuery is a query looking for regular expressions matches.
88type Regexp struct {
89 Regexp *syntax.Regexp
90 FileName bool
91 Content bool
92 CaseSensitive bool
93}
94
95func (q *Regexp) String() string {
96 pref := ""
97 if q.FileName {
98 pref = "file_"
99 }
100 if q.CaseSensitive {
101 pref = "case_" + pref
102 }
103 return fmt.Sprintf("%sregex:%q", pref, syntaxutil.RegexpString(q.Regexp))
104}
105
106// gobRegexp wraps Regexp to make it gob-encodable/decodable. Regexp contains syntax.Regexp, which
107// contains slices/arrays with possibly nil elements, which gob doesn't support
108// (https://github.com/golang/go/issues/1501).
109type gobRegexp struct {
110 Regexp // Regexp.Regexp (*syntax.Regexp) is set to nil and its string is set in RegexpString
111 RegexpString string
112}
113
114// GobEncode implements gob.Encoder.
115func (q Regexp) GobEncode() ([]byte, error) {
116 gobq := gobRegexp{Regexp: q, RegexpString: syntaxutil.RegexpString(q.Regexp)}
117 gobq.Regexp.Regexp = nil // can't be gob-encoded/decoded
118 return json.Marshal(gobq)
119}
120
121// GobDecode implements gob.Decoder.
122func (q *Regexp) GobDecode(data []byte) error {
123 var gobq gobRegexp
124 err := json.Unmarshal(data, &gobq)
125 if err != nil {
126 return err
127 }
128 gobq.Regexp.Regexp, err = syntax.Parse(gobq.RegexpString, regexpFlags)
129 if err != nil {
130 return err
131 }
132 *q = gobq.Regexp
133 return nil
134}
135
136// Symbol finds a string that is a symbol.
137type Symbol struct {
138 Expr Q
139}
140
141func (s *Symbol) String() string {
142 return fmt.Sprintf("sym:%s", s.Expr)
143}
144
145type caseQ struct {
146 Flavor string
147}
148
149func (c *caseQ) String() string {
150 return "case:" + c.Flavor
151}
152
153type Language struct {
154 Language string
155}
156
157func (l *Language) String() string {
158 return "lang:" + l.Language
159}
160
161type Const struct {
162 Value bool
163}
164
165func (q *Const) String() string {
166 if q.Value {
167 return "TRUE"
168 }
169 return "FALSE"
170}
171
172type Repo struct {
173 Regexp *regexp.Regexp
174}
175
176func (q *Repo) String() string {
177 return fmt.Sprintf("repo:%s", q.Regexp.String())
178}
179
180func (q Repo) GobEncode() ([]byte, error) {
181 return []byte(q.Regexp.String()), nil
182}
183
184func (q *Repo) GobDecode(data []byte) error {
185 var err error
186 q.Regexp, err = regexp.Compile(string(data))
187 return err
188}
189
190// RepoRegexp is a Sourcegraph addition which searches documents where the
191// repository name matches Regexp.
192type RepoRegexp struct {
193 Regexp *regexp.Regexp
194}
195
196func (q *RepoRegexp) String() string {
197 return fmt.Sprintf("reporegex:%q", q.Regexp.String())
198}
199
200// GobEncode implements gob.Encoder.
201func (q *RepoRegexp) GobEncode() ([]byte, error) {
202 // gob can't encode syntax.Regexp
203 return []byte(q.Regexp.String()), nil
204}
205
206// GobDecode implements gob.Decoder.
207func (q *RepoRegexp) GobDecode(data []byte) error {
208 var err error
209 q.Regexp, err = regexp.Compile(string(data))
210 return err
211}
212
213// BranchesRepos is a slice of BranchRepos to match. It is a Sourcegraph
214// addition and only used in the RPC interface for efficient checking of large
215// repo lists.
216type BranchesRepos struct {
217 List []BranchRepos
218}
219
220// NewSingleBranchesRepos is a helper for creating a BranchesRepos which
221// searches a single branch.
222func NewSingleBranchesRepos(branch string, ids ...uint32) *BranchesRepos {
223 return &BranchesRepos{List: []BranchRepos{
224 {branch, roaring.BitmapOf(ids...)},
225 }}
226}
227
228func (q *BranchesRepos) String() string {
229 var sb strings.Builder
230
231 sb.WriteString("(branchesrepos")
232
233 for _, br := range q.List {
234 if size := br.Repos.GetCardinality(); size > 1 {
235 sb.WriteString(" " + br.Branch + ":" + strconv.FormatUint(size, 10))
236 } else {
237 sb.WriteString(" " + br.Branch + "=" + br.Repos.String())
238 }
239 }
240
241 sb.WriteString(")")
242 return sb.String()
243}
244
245// NewRepoIDs is a helper for creating a RepoIDs which
246// searches only the matched repos.
247func NewRepoIDs(ids ...uint32) *RepoIDs {
248 return &RepoIDs{Repos: roaring.BitmapOf(ids...)}
249}
250
251func (q *RepoIDs) String() string {
252 var sb strings.Builder
253
254 sb.WriteString("(repoids ")
255
256 if size := q.Repos.GetCardinality(); size > 1 {
257 sb.WriteString("count:" + strconv.FormatUint(size, 10))
258 } else {
259 sb.WriteString("repoid=" + q.Repos.String())
260 }
261
262 sb.WriteString(")")
263 return sb.String()
264}
265
266// MarshalBinary implements a specialized encoder for BranchesRepos.
267func (q BranchesRepos) MarshalBinary() ([]byte, error) {
268 return branchesReposEncode(q.List)
269}
270
271// UnmarshalBinary implements a specialized decoder for BranchesRepos.
272func (q *BranchesRepos) UnmarshalBinary(b []byte) (err error) {
273 q.List, err = branchesReposDecode(b)
274 return err
275}
276
277// BranchRepos is a (branch, sourcegraph repo ids bitmap) tuple. It is a
278// Sourcegraph addition.
279type BranchRepos struct {
280 Branch string
281 Repos *roaring.Bitmap
282}
283
284// Similar to BranchRepos but will be used to match only by repoid and
285// therefore matches all branches
286type RepoIDs struct {
287 Repos *roaring.Bitmap
288}
289
290// RepoSet is a list of repos to match. It is a Sourcegraph addition and only
291// used in the RPC interface for efficient checking of large repo lists.
292type RepoSet struct {
293 Set map[string]bool
294}
295
296func (q *RepoSet) String() string {
297 var detail string
298 if len(q.Set) > 5 {
299 // Large sets being output are not useful
300 detail = fmt.Sprintf("size=%d", len(q.Set))
301 } else {
302 repos := make([]string, len(q.Set))
303 i := 0
304 for repo := range q.Set {
305 repos[i] = repo
306 i++
307 }
308 sort.Strings(repos)
309 detail = strings.Join(repos, " ")
310 }
311 return fmt.Sprintf("(reposet %s)", detail)
312}
313
314func NewRepoSet(repo ...string) *RepoSet {
315 s := &RepoSet{Set: make(map[string]bool)}
316 for _, r := range repo {
317 s.Set[r] = true
318 }
319 return s
320}
321
322// FileNameSet is a list of file names to match. It is a Sourcegraph addition
323// and only used in the RPC interface for efficient checking of large file
324// lists.
325type FileNameSet struct {
326 Set map[string]struct{}
327}
328
329// MarshalBinary implements a specialized encoder for FileNameSet.
330func (q *FileNameSet) MarshalBinary() ([]byte, error) {
331 return stringSetEncode(q.Set)
332}
333
334// UnmarshalBinary implements a specialized decoder for FileNameSet.
335func (q *FileNameSet) UnmarshalBinary(b []byte) error {
336 var err error
337 q.Set, err = stringSetDecode(b)
338 return err
339}
340
341func (q *FileNameSet) String() string {
342 var detail string
343 if len(q.Set) > 5 {
344 // Large sets being output are not useful
345 detail = fmt.Sprintf("size=%d", len(q.Set))
346 } else {
347 values := make([]string, 0, len(q.Set))
348 for v := range q.Set {
349 values = append(values, v)
350 }
351 sort.Strings(values)
352 detail = strings.Join(values, " ")
353 }
354 return fmt.Sprintf("(filenameset %s)", detail)
355}
356
357func NewFileNameSet(fileNames ...string) *FileNameSet {
358 s := &FileNameSet{Set: make(map[string]struct{})}
359 for _, r := range fileNames {
360 s.Set[r] = struct{}{}
361 }
362 return s
363}
364
365const (
366 TypeFileMatch uint8 = iota
367 TypeFileName
368 TypeRepo
369)
370
371// Type changes the result type returned.
372type Type struct {
373 Child Q
374 Type uint8
375}
376
377func (q *Type) String() string {
378 switch q.Type {
379 case TypeFileMatch:
380 return fmt.Sprintf("(type:filematch %s)", q.Child)
381 case TypeFileName:
382 return fmt.Sprintf("(type:filename %s)", q.Child)
383 case TypeRepo:
384 return fmt.Sprintf("(type:repo %s)", q.Child)
385 default:
386 return fmt.Sprintf("(type:UNKNOWN %s)", q.Child)
387 }
388}
389
390// Boost scales the contribution to score of descendents.
391type Boost struct {
392 Child Q
393 // Boost will multiply the score of its descendents. Values less than 1 will
394 // give less importance while values greater than 1 will give more
395 // importance.
396 Boost float64
397}
398
399func (q *Boost) String() string {
400 return fmt.Sprintf("(boost %0.2f %s)", q.Boost, q.Child)
401}
402
403// Substring is the most basic query: a query for a substring.
404type Substring struct {
405 Pattern string
406 CaseSensitive bool
407
408 // Match only filename
409 FileName bool
410
411 // Match only content
412 Content bool
413}
414
415func (q *Substring) String() string {
416 s := ""
417
418 t := ""
419 if q.FileName {
420 t = "file_"
421 } else if q.Content {
422 t = "content_"
423 }
424
425 s += fmt.Sprintf("%ssubstr:%q", t, q.Pattern)
426 if q.CaseSensitive {
427 s = "case_" + s
428 }
429 return s
430}
431
432type setCaser interface {
433 setCase(string)
434}
435
436func (q *Substring) setCase(k string) {
437 switch k {
438 case "yes":
439 q.CaseSensitive = true
440 case "no":
441 q.CaseSensitive = false
442 case "auto":
443 // TODO - unicode
444 q.CaseSensitive = (q.Pattern != string(toLower([]byte(q.Pattern))))
445 }
446}
447
448func (q *Symbol) setCase(k string) {
449 if sc, ok := q.Expr.(setCaser); ok {
450 sc.setCase(k)
451 }
452}
453
454func (q *Regexp) setCase(k string) {
455 switch k {
456 case "yes":
457 q.CaseSensitive = true
458 case "no":
459 q.CaseSensitive = false
460 case "auto":
461 q.CaseSensitive = !q.Regexp.Equal(LowerRegexp(q.Regexp))
462 }
463}
464
465// GobCache exists so we only pay the cost of marshalling a query once when we
466// aggregate it out over all the replicas.
467//
468// Our query and eval layer do not support GobCache. Instead, at the gob
469// boundaries (RPC and Streaming) we check if the Q is a GobCache and unwrap
470// it.
471//
472// "I wish we could get rid of this code soon enough" - tomas
473type GobCache struct {
474 Q
475
476 once sync.Once
477 data []byte
478 err error
479}
480
481// GobEncode implements gob.Encoder.
482func (q *GobCache) GobEncode() ([]byte, error) {
483 q.once.Do(func() {
484 var buf bytes.Buffer
485 enc := gob.NewEncoder(&buf)
486 q.err = enc.Encode(&gobWrapper{
487 WrappedQ: q.Q,
488 })
489 q.data = buf.Bytes()
490 })
491 return q.data, q.err
492}
493
494// GobDecode implements gob.Decoder.
495func (q *GobCache) GobDecode(data []byte) error {
496 dec := gob.NewDecoder(bytes.NewBuffer(data))
497 var w gobWrapper
498 err := dec.Decode(&w)
499 if err != nil {
500 return err
501 }
502 q.Q = w.WrappedQ
503 return nil
504}
505
506// gobWrapper is needed so the gob decoder works.
507type gobWrapper struct {
508 WrappedQ Q
509}
510
511func (q *GobCache) String() string {
512 return fmt.Sprintf("GobCache(%s)", q.Q)
513}
514
515// Or is matched when any of its children is matched.
516type Or struct {
517 Children []Q
518}
519
520func (q *Or) String() string {
521 var sub []string
522 for _, ch := range q.Children {
523 sub = append(sub, ch.String())
524 }
525 return fmt.Sprintf("(or %s)", strings.Join(sub, " "))
526}
527
528// Not inverts the meaning of its child.
529type Not struct {
530 Child Q
531}
532
533func (q *Not) String() string {
534 return fmt.Sprintf("(not %s)", q.Child)
535}
536
537// And is matched when all its children are.
538type And struct {
539 Children []Q
540}
541
542func (q *And) String() string {
543 var sub []string
544 for _, ch := range q.Children {
545 sub = append(sub, ch.String())
546 }
547 return fmt.Sprintf("(and %s)", strings.Join(sub, " "))
548}
549
550// NewAnd is syntactic sugar for constructing And queries.
551func NewAnd(qs ...Q) Q {
552 return &And{Children: qs}
553}
554
555// NewOr is syntactic sugar for constructing Or queries.
556func NewOr(qs ...Q) Q {
557 return &Or{Children: qs}
558}
559
560// Branch limits search to a specific branch.
561type Branch struct {
562 Pattern string
563
564 // exact is true if we want to Pattern to equal branch.
565 Exact bool
566}
567
568func (q *Branch) String() string {
569 if q.Exact {
570 return fmt.Sprintf("branch=%q", q.Pattern)
571 }
572 return fmt.Sprintf("branch:%q", q.Pattern)
573}
574
575func queryChildren(q Q) []Q {
576 switch s := q.(type) {
577 case *And:
578 return s.Children
579 case *Or:
580 return s.Children
581 }
582 return nil
583}
584
585func flattenAndOr(children []Q, typ Q) ([]Q, bool) {
586 var flat []Q
587 changed := false
588 for _, ch := range children {
589 ch, subChanged := flatten(ch)
590 changed = changed || subChanged
591 if reflect.TypeOf(ch) == reflect.TypeOf(typ) {
592 changed = true
593 subChildren := queryChildren(ch)
594 if subChildren != nil {
595 flat = append(flat, subChildren...)
596 }
597 } else {
598 flat = append(flat, ch)
599 }
600 }
601
602 return flat, changed
603}
604
605// (and (and x y) z) => (and x y z) , the same for "or"
606func flatten(q Q) (Q, bool) {
607 switch s := q.(type) {
608 case *And:
609 if len(s.Children) == 1 {
610 return s.Children[0], true
611 }
612 flatChildren, changed := flattenAndOr(s.Children, s)
613 return &And{flatChildren}, changed
614 case *Or:
615 if len(s.Children) == 1 {
616 return s.Children[0], true
617 }
618 flatChildren, changed := flattenAndOr(s.Children, s)
619 return &Or{flatChildren}, changed
620 case *Not:
621 child, changed := flatten(s.Child)
622 return &Not{child}, changed
623 case *Type:
624 child, changed := flatten(s.Child)
625 return &Type{Child: child, Type: s.Type}, changed
626 case *Boost:
627 child, changed := flatten(s.Child)
628 return &Boost{Child: child, Boost: s.Boost}, changed
629 default:
630 return q, false
631 }
632}
633
634func mapQueryList(qs []Q, f func(Q) Q) []Q {
635 neg := make([]Q, len(qs))
636 for i, sub := range qs {
637 neg[i] = Map(sub, f)
638 }
639 return neg
640}
641
642func invertConst(q Q) Q {
643 c, ok := q.(*Const)
644 if ok {
645 return &Const{!c.Value}
646 }
647 return q
648}
649
650func evalAndOrConstants(q Q, children []Q) Q {
651 _, isAnd := q.(*And)
652
653 children = mapQueryList(children, evalConstants)
654
655 newCH := children[:0]
656 for _, ch := range children {
657 c, ok := ch.(*Const)
658 if ok {
659 if c.Value == isAnd {
660 continue
661 } else {
662 return ch
663 }
664 }
665 newCH = append(newCH, ch)
666 }
667 if len(newCH) == 0 {
668 return &Const{isAnd}
669 }
670 if isAnd {
671 return &And{newCH}
672 }
673 return &Or{newCH}
674}
675
676func evalConstants(q Q) Q {
677 switch s := q.(type) {
678 case *And:
679 return evalAndOrConstants(q, s.Children)
680 case *Or:
681 return evalAndOrConstants(q, s.Children)
682 case *Not:
683 ch := evalConstants(s.Child)
684 if _, ok := ch.(*Const); ok {
685 return invertConst(ch)
686 }
687 return &Not{ch}
688 case *Type:
689 ch := evalConstants(s.Child)
690 if _, ok := ch.(*Const); ok {
691 // If q is the root query, then evaluating this to a const changes
692 // the type of result we will return. However, the only case this
693 // makes sense is `type:repo TRUE` to return all repos or
694 // `type:file TRUE` to return all filenames. For other cases we
695 // want to do this constant folding though, so we allow the
696 // unexpected behaviour mentioned previously.
697 return ch
698 }
699 return &Type{Child: ch, Type: s.Type}
700 case *Boost:
701 ch := evalConstants(s.Child)
702 if _, ok := ch.(*Const); ok {
703 return ch
704 }
705 return &Boost{Boost: s.Boost, Child: ch}
706 case *Substring:
707 if len(s.Pattern) == 0 {
708 return &Const{true}
709 }
710 case *Regexp:
711 if s.Regexp.Op == syntax.OpEmptyMatch {
712 return &Const{true}
713 }
714 case *Branch:
715 if s.Pattern == "" {
716 return &Const{true}
717 }
718 case *RepoSet:
719 if len(s.Set) == 0 {
720 return &Const{false}
721 }
722 case *FileNameSet:
723 if len(s.Set) == 0 {
724 return &Const{false}
725 }
726 }
727 return q
728}
729
730func Simplify(q Q) Q {
731 q = evalConstants(q)
732 for {
733 var changed bool
734 q, changed = flatten(q)
735 if !changed {
736 break
737 }
738 }
739
740 return q
741}
742
743// Map runs f over the q.
744func Map(q Q, f func(q Q) Q) Q {
745 switch s := q.(type) {
746 case *And:
747 q = &And{Children: mapQueryList(s.Children, f)}
748 case *Or:
749 q = &Or{Children: mapQueryList(s.Children, f)}
750 case *Not:
751 q = &Not{Child: Map(s.Child, f)}
752 case *Type:
753 q = &Type{Type: s.Type, Child: Map(s.Child, f)}
754 case *Boost:
755 q = &Boost{Boost: s.Boost, Child: Map(s.Child, f)}
756 }
757 return f(q)
758}
759
760// Expand expands Substr queries into (OR file_substr content_substr)
761// queries, and the same for Regexp queries..
762func ExpandFileContent(q Q) Q {
763 switch s := q.(type) {
764 case *Substring:
765 if s.FileName == s.Content {
766 f := *s
767 f.FileName = true
768 f.Content = false
769 c := *s
770 c.FileName = false
771 c.Content = true
772 return NewOr(&f, &c)
773 }
774 case *Regexp:
775 if s.FileName == s.Content {
776 f := *s
777 f.FileName = true
778 f.Content = false
779 c := *s
780 c.FileName = false
781 c.Content = true
782 return NewOr(&f, &c)
783 }
784 }
785 return q
786}
787
788// VisitAtoms runs `v` on all atom queries within `q`.
789func VisitAtoms(q Q, v func(q Q)) {
790 Map(q, func(iQ Q) Q {
791 switch iQ.(type) {
792 case *And:
793 case *Or:
794 case *Not:
795 case *Type:
796 case *Boost:
797 default:
798 v(iQ)
799 }
800 return iQ
801 })
802}