fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package query
16
17import (
18 "encoding/json"
19 "fmt"
20 "log"
21 "reflect"
22 "regexp/syntax"
23 "sort"
24 "strconv"
25 "strings"
26
27 "github.com/RoaringBitmap/roaring"
28 "github.com/grafana/regexp"
29
30 "github.com/sourcegraph/zoekt/internal/syntaxutil"
31)
32
33var _ = log.Println
34
35// Q is a representation for a possibly hierarchical search query.
36type Q interface {
37 String() string
38}
39
40// RawConfig filters repositories based on their encoded RawConfig map.
41type RawConfig uint64
42
43const (
44 RcOnlyPublic RawConfig = 1
45 RcOnlyPrivate RawConfig = 2
46 RcOnlyForks RawConfig = 1 << 2
47 RcNoForks RawConfig = 2 << 2
48 RcOnlyArchived RawConfig = 1 << 4
49 RcNoArchived RawConfig = 2 << 4
50)
51
52var flagNames = []struct {
53 Mask RawConfig
54 Label string
55}{
56 {RcOnlyPublic, "RcOnlyPublic"},
57 {RcOnlyPrivate, "RcOnlyPrivate"},
58 {RcOnlyForks, "RcOnlyForks"},
59 {RcNoForks, "RcNoForks"},
60 {RcOnlyArchived, "RcOnlyArchived"},
61 {RcNoArchived, "RcNoArchived"},
62}
63
64func (r RawConfig) String() string {
65 var s []string
66 for _, fn := range flagNames {
67 if r&fn.Mask != 0 {
68 s = append(s, fn.Label)
69 }
70 }
71 return fmt.Sprintf("rawConfig:%s", strings.Join(s, "|"))
72}
73
74// RegexpQuery is a query looking for regular expressions matches.
75type Regexp struct {
76 Regexp *syntax.Regexp
77 FileName bool
78 Content bool
79 CaseSensitive bool
80}
81
82// RegexpString returns the marshaled raw regexp pattern for q.
83func (q *Regexp) RegexpString() string {
84 return syntaxutil.RegexpString(q.Regexp)
85}
86
87func (q *Regexp) String() string {
88 pref := ""
89 if q.FileName {
90 pref = "file_"
91 }
92 if q.CaseSensitive {
93 pref = "case_" + pref
94 }
95 return fmt.Sprintf("%sregex:%q", pref, q.RegexpString())
96}
97
98// gobRegexp wraps Regexp to make it gob-encodable/decodable. Regexp contains syntax.Regexp, which
99// contains slices/arrays with possibly nil elements, which gob doesn't support
100// (https://github.com/golang/go/issues/1501).
101type gobRegexp struct {
102 Regexp // Regexp.Regexp (*syntax.Regexp) is set to nil and its string is set in RegexpString
103 RegexpString string
104}
105
106// GobEncode implements gob.Encoder.
107func (q Regexp) GobEncode() ([]byte, error) {
108 gobq := gobRegexp{Regexp: q, RegexpString: q.RegexpString()}
109 gobq.Regexp.Regexp = nil // can't be gob-encoded/decoded
110 return json.Marshal(gobq)
111}
112
113// GobDecode implements gob.Decoder.
114func (q *Regexp) GobDecode(data []byte) error {
115 var gobq gobRegexp
116 err := json.Unmarshal(data, &gobq)
117 if err != nil {
118 return err
119 }
120 gobq.Regexp.Regexp, err = syntax.Parse(gobq.RegexpString, regexpFlags)
121 if err != nil {
122 return err
123 }
124 *q = gobq.Regexp
125 return nil
126}
127
128// Symbol finds a string that is a symbol.
129type Symbol struct {
130 Expr Q
131}
132
133func (s *Symbol) String() string {
134 return fmt.Sprintf("sym:%s", s.Expr)
135}
136
137type caseQ struct {
138 Flavor string
139}
140
141func (c *caseQ) String() string {
142 return "case:" + c.Flavor
143}
144
145type Language struct {
146 Language string
147}
148
149func (l *Language) String() string {
150 return "lang:" + l.Language
151}
152
153type Const struct {
154 Value bool
155}
156
157func (q *Const) String() string {
158 if q.Value {
159 return "TRUE"
160 }
161 return "FALSE"
162}
163
164type Repo struct {
165 Regexp *regexp.Regexp
166}
167
168func (q *Repo) String() string {
169 return fmt.Sprintf("repo:%s", q.Regexp.String())
170}
171
172func (q Repo) GobEncode() ([]byte, error) {
173 return []byte(q.Regexp.String()), nil
174}
175
176func (q *Repo) GobDecode(data []byte) error {
177 var err error
178 q.Regexp, err = regexp.Compile(string(data))
179 return err
180}
181
182// RepoRegexp is a Sourcegraph addition which searches documents where the
183// repository name matches Regexp.
184type RepoRegexp struct {
185 Regexp *regexp.Regexp
186}
187
188func (q *RepoRegexp) String() string {
189 return fmt.Sprintf("reporegex:%q", q.Regexp.String())
190}
191
192// GobEncode implements gob.Encoder.
193func (q *RepoRegexp) GobEncode() ([]byte, error) {
194 // gob can't encode syntax.Regexp
195 return []byte(q.Regexp.String()), nil
196}
197
198// GobDecode implements gob.Decoder.
199func (q *RepoRegexp) GobDecode(data []byte) error {
200 var err error
201 q.Regexp, err = regexp.Compile(string(data))
202 return err
203}
204
205// BranchesRepos is a slice of BranchRepos to match. It is a Sourcegraph
206// addition and only used in the RPC interface for efficient checking of large
207// repo lists.
208type BranchesRepos struct {
209 List []BranchRepos
210}
211
212// NewSingleBranchesRepos is a helper for creating a BranchesRepos which
213// searches a single branch.
214func NewSingleBranchesRepos(branch string, ids ...uint32) *BranchesRepos {
215 return &BranchesRepos{List: []BranchRepos{
216 {branch, roaring.BitmapOf(ids...)},
217 }}
218}
219
220func (q *BranchesRepos) String() string {
221 var sb strings.Builder
222
223 sb.WriteString("(branchesrepos")
224
225 for _, br := range q.List {
226 if size := br.Repos.GetCardinality(); size > 1 {
227 sb.WriteString(" " + br.Branch + ":" + strconv.FormatUint(size, 10))
228 } else {
229 sb.WriteString(" " + br.Branch + "=" + br.Repos.String())
230 }
231 }
232
233 sb.WriteString(")")
234 return sb.String()
235}
236
237// NewRepoIDs is a helper for creating a RepoIDs which
238// searches only the matched repos.
239func NewRepoIDs(ids ...uint32) *RepoIDs {
240 return &RepoIDs{Repos: roaring.BitmapOf(ids...)}
241}
242
243func (q *RepoIDs) String() string {
244 var sb strings.Builder
245
246 sb.WriteString("(repoids ")
247
248 if size := q.Repos.GetCardinality(); size > 1 {
249 sb.WriteString("count:" + strconv.FormatUint(size, 10))
250 } else {
251 sb.WriteString("repoid=" + q.Repos.String())
252 }
253
254 sb.WriteString(")")
255 return sb.String()
256}
257
258// MarshalBinary implements a specialized encoder for BranchesRepos.
259func (q BranchesRepos) MarshalBinary() ([]byte, error) {
260 return branchesReposEncode(q.List)
261}
262
263// UnmarshalBinary implements a specialized decoder for BranchesRepos.
264func (q *BranchesRepos) UnmarshalBinary(b []byte) (err error) {
265 q.List, err = branchesReposDecode(b)
266 return err
267}
268
269// BranchRepos is a (branch, sourcegraph repo ids bitmap) tuple. It is a
270// Sourcegraph addition.
271type BranchRepos struct {
272 Branch string
273 Repos *roaring.Bitmap
274}
275
276// Similar to BranchRepos but will be used to match only by repoid and
277// therefore matches all branches
278type RepoIDs struct {
279 Repos *roaring.Bitmap
280}
281
282// RepoSet is a list of repos to match. It is a Sourcegraph addition and only
283// used in the RPC interface for efficient checking of large repo lists.
284type RepoSet struct {
285 Set map[string]bool
286}
287
288func (q *RepoSet) String() string {
289 var detail string
290 if len(q.Set) > 5 {
291 // Large sets being output are not useful
292 detail = fmt.Sprintf("size=%d", len(q.Set))
293 } else {
294 repos := make([]string, len(q.Set))
295 i := 0
296 for repo := range q.Set {
297 repos[i] = repo
298 i++
299 }
300 sort.Strings(repos)
301 detail = strings.Join(repos, " ")
302 }
303 return fmt.Sprintf("(reposet %s)", detail)
304}
305
306func NewRepoSet(repo ...string) *RepoSet {
307 s := &RepoSet{Set: make(map[string]bool)}
308 for _, r := range repo {
309 s.Set[r] = true
310 }
311 return s
312}
313
314// FileNameSet is a list of file names to match. It is a Sourcegraph addition
315// and only used in the RPC interface for efficient checking of large file
316// lists.
317type FileNameSet struct {
318 Set map[string]struct{}
319}
320
321// MarshalBinary implements a specialized encoder for FileNameSet.
322func (q *FileNameSet) MarshalBinary() ([]byte, error) {
323 return stringSetEncode(q.Set)
324}
325
326// UnmarshalBinary implements a specialized decoder for FileNameSet.
327func (q *FileNameSet) UnmarshalBinary(b []byte) error {
328 var err error
329 q.Set, err = stringSetDecode(b)
330 return err
331}
332
333func (q *FileNameSet) String() string {
334 var detail string
335 if len(q.Set) > 5 {
336 // Large sets being output are not useful
337 detail = fmt.Sprintf("size=%d", len(q.Set))
338 } else {
339 values := make([]string, 0, len(q.Set))
340 for v := range q.Set {
341 values = append(values, v)
342 }
343 sort.Strings(values)
344 detail = strings.Join(values, " ")
345 }
346 return fmt.Sprintf("(filenameset %s)", detail)
347}
348
349func NewFileNameSet(fileNames ...string) *FileNameSet {
350 s := &FileNameSet{Set: make(map[string]struct{})}
351 for _, r := range fileNames {
352 s.Set[r] = struct{}{}
353 }
354 return s
355}
356
357const (
358 TypeFileMatch uint8 = iota
359 TypeFileName
360 TypeRepo
361)
362
363// Type changes the result type returned.
364type Type struct {
365 Child Q
366 Type uint8
367}
368
369func (q *Type) String() string {
370 switch q.Type {
371 case TypeFileMatch:
372 return fmt.Sprintf("(type:filematch %s)", q.Child)
373 case TypeFileName:
374 return fmt.Sprintf("(type:filename %s)", q.Child)
375 case TypeRepo:
376 return fmt.Sprintf("(type:repo %s)", q.Child)
377 default:
378 return fmt.Sprintf("(type:UNKNOWN %s)", q.Child)
379 }
380}
381
382// Boost scales the contribution to score of descendents.
383type Boost struct {
384 Child Q
385 // Boost will multiply the score of its descendents. Values less than 1 will
386 // give less importance while values greater than 1 will give more
387 // importance.
388 Boost float64
389}
390
391func (q *Boost) String() string {
392 return fmt.Sprintf("(boost %0.2f %s)", q.Boost, q.Child)
393}
394
395// Substring is the most basic query: a query for a substring.
396type Substring struct {
397 Pattern string
398 CaseSensitive bool
399
400 // Match only filename
401 FileName bool
402
403 // Match only content
404 Content bool
405}
406
407func (q *Substring) String() string {
408 s := ""
409
410 t := ""
411 if q.FileName {
412 t = "file_"
413 } else if q.Content {
414 t = "content_"
415 }
416
417 s += fmt.Sprintf("%ssubstr:%q", t, q.Pattern)
418 if q.CaseSensitive {
419 s = "case_" + s
420 }
421 return s
422}
423
424type setCaser interface {
425 setCase(string)
426}
427
428func (q *Substring) setCase(k string) {
429 switch k {
430 case "yes":
431 q.CaseSensitive = true
432 case "no":
433 q.CaseSensitive = false
434 case "auto":
435 // TODO - unicode
436 q.CaseSensitive = (q.Pattern != string(toLower([]byte(q.Pattern))))
437 }
438}
439
440func (q *Symbol) setCase(k string) {
441 if sc, ok := q.Expr.(setCaser); ok {
442 sc.setCase(k)
443 }
444}
445
446func (q *Regexp) setCase(k string) {
447 switch k {
448 case "yes":
449 q.CaseSensitive = true
450 case "no":
451 q.CaseSensitive = false
452 case "auto":
453 q.CaseSensitive = !q.Regexp.Equal(LowerRegexp(q.Regexp))
454 }
455}
456
457// Or is matched when any of its children is matched.
458type Or struct {
459 Children []Q
460}
461
462func (q *Or) String() string {
463 var sub []string
464 for _, ch := range q.Children {
465 sub = append(sub, ch.String())
466 }
467 return fmt.Sprintf("(or %s)", strings.Join(sub, " "))
468}
469
470// Not inverts the meaning of its child.
471type Not struct {
472 Child Q
473}
474
475func (q *Not) String() string {
476 return fmt.Sprintf("(not %s)", q.Child)
477}
478
479// And is matched when all its children are.
480type And struct {
481 Children []Q
482}
483
484func (q *And) String() string {
485 var sub []string
486 for _, ch := range q.Children {
487 sub = append(sub, ch.String())
488 }
489 return fmt.Sprintf("(and %s)", strings.Join(sub, " "))
490}
491
492// NewAnd is syntactic sugar for constructing And queries.
493func NewAnd(qs ...Q) Q {
494 return &And{Children: qs}
495}
496
497// NewOr is syntactic sugar for constructing Or queries.
498func NewOr(qs ...Q) Q {
499 return &Or{Children: qs}
500}
501
502// Branch limits search to a specific branch.
503type Branch struct {
504 Pattern string
505
506 // exact is true if we want to Pattern to equal branch.
507 Exact bool
508}
509
510func (q *Branch) String() string {
511 if q.Exact {
512 return fmt.Sprintf("branch=%q", q.Pattern)
513 }
514 return fmt.Sprintf("branch:%q", q.Pattern)
515}
516
517// Meta represents a query for metadata fields.
518type Meta struct {
519 Field string // The metadata field name
520 Value *regexp.Regexp // The value to match
521}
522
523// String returns a string representation of the Meta query.
524func (m *Meta) String() string {
525 return fmt.Sprintf("meta.%s:%s", m.Field, m.Value)
526}
527
528func queryChildren(q Q) []Q {
529 switch s := q.(type) {
530 case *And:
531 return s.Children
532 case *Or:
533 return s.Children
534 }
535 return nil
536}
537
538func flattenAndOr(children []Q, typ Q) ([]Q, bool) {
539 var flat []Q
540 changed := false
541 for _, ch := range children {
542 ch, subChanged := flatten(ch)
543 changed = changed || subChanged
544 if reflect.TypeOf(ch) == reflect.TypeOf(typ) {
545 changed = true
546 subChildren := queryChildren(ch)
547 if subChildren != nil {
548 flat = append(flat, subChildren...)
549 }
550 } else {
551 flat = append(flat, ch)
552 }
553 }
554
555 return flat, changed
556}
557
558// (and (and x y) z) => (and x y z) , the same for "or"
559func flatten(q Q) (Q, bool) {
560 switch s := q.(type) {
561 case *And:
562 if len(s.Children) == 1 {
563 return s.Children[0], true
564 }
565 flatChildren, changed := flattenAndOr(s.Children, s)
566 return &And{flatChildren}, changed
567 case *Or:
568 if len(s.Children) == 1 {
569 return s.Children[0], true
570 }
571 flatChildren, changed := flattenAndOr(s.Children, s)
572 return &Or{flatChildren}, changed
573 case *Not:
574 child, changed := flatten(s.Child)
575 return &Not{child}, changed
576 case *Type:
577 child, changed := flatten(s.Child)
578 return &Type{Child: child, Type: s.Type}, changed
579 case *Boost:
580 child, changed := flatten(s.Child)
581 return &Boost{Child: child, Boost: s.Boost}, changed
582 default:
583 return q, false
584 }
585}
586
587func mapQueryList(qs []Q, f func(Q) Q) []Q {
588 neg := make([]Q, len(qs))
589 for i, sub := range qs {
590 neg[i] = Map(sub, f)
591 }
592 return neg
593}
594
595func invertConst(q Q) Q {
596 c, ok := q.(*Const)
597 if ok {
598 return &Const{!c.Value}
599 }
600 return q
601}
602
603func evalAndOrConstants(q Q, children []Q) Q {
604 _, isAnd := q.(*And)
605
606 children = mapQueryList(children, evalConstants)
607
608 newCH := children[:0]
609 for _, ch := range children {
610 c, ok := ch.(*Const)
611 if ok {
612 if c.Value == isAnd {
613 continue
614 } else {
615 return ch
616 }
617 }
618 newCH = append(newCH, ch)
619 }
620 if len(newCH) == 0 {
621 return &Const{isAnd}
622 }
623 if isAnd {
624 return &And{newCH}
625 }
626 return &Or{newCH}
627}
628
629func evalConstants(q Q) Q {
630 switch s := q.(type) {
631 case *And:
632 return evalAndOrConstants(q, s.Children)
633 case *Or:
634 return evalAndOrConstants(q, s.Children)
635 case *Not:
636 ch := evalConstants(s.Child)
637 if _, ok := ch.(*Const); ok {
638 return invertConst(ch)
639 }
640 return &Not{ch}
641 case *Type:
642 ch := evalConstants(s.Child)
643 if _, ok := ch.(*Const); ok {
644 // If q is the root query, then evaluating this to a const changes
645 // the type of result we will return. However, the only case this
646 // makes sense is `type:repo TRUE` to return all repos or
647 // `type:file TRUE` to return all filenames. For other cases we
648 // want to do this constant folding though, so we allow the
649 // unexpected behaviour mentioned previously.
650 return ch
651 }
652 return &Type{Child: ch, Type: s.Type}
653 case *Boost:
654 ch := evalConstants(s.Child)
655 if _, ok := ch.(*Const); ok {
656 return ch
657 }
658 return &Boost{Boost: s.Boost, Child: ch}
659 case *Substring:
660 if len(s.Pattern) == 0 {
661 return &Const{true}
662 }
663 case *Regexp:
664 if s.Regexp.Op == syntax.OpEmptyMatch {
665 return &Const{true}
666 }
667 case *Branch:
668 if s.Pattern == "" {
669 return &Const{true}
670 }
671 case *BranchesRepos:
672 for _, br := range s.List {
673 if !br.Repos.IsEmpty() {
674 return q
675 }
676 }
677 return &Const{false}
678 case *RepoIDs:
679 if s.Repos.IsEmpty() {
680 return &Const{false}
681 }
682 case *RepoSet:
683 if len(s.Set) == 0 {
684 return &Const{false}
685 }
686 case *FileNameSet:
687 if len(s.Set) == 0 {
688 return &Const{false}
689 }
690 }
691 return q
692}
693
694func Simplify(q Q) Q {
695 q = evalConstants(q)
696 for {
697 var changed bool
698 q, changed = flatten(q)
699 if !changed {
700 break
701 }
702 }
703
704 return q
705}
706
707// Map runs f over the q.
708func Map(q Q, f func(q Q) Q) Q {
709 switch s := q.(type) {
710 case *And:
711 q = &And{Children: mapQueryList(s.Children, f)}
712 case *Or:
713 q = &Or{Children: mapQueryList(s.Children, f)}
714 case *Not:
715 q = &Not{Child: Map(s.Child, f)}
716 case *Type:
717 q = &Type{Type: s.Type, Child: Map(s.Child, f)}
718 case *Boost:
719 q = &Boost{Boost: s.Boost, Child: Map(s.Child, f)}
720 }
721 return f(q)
722}
723
724// Expand expands Substr queries into (OR file_substr content_substr)
725// queries, and the same for Regexp queries..
726func ExpandFileContent(q Q) Q {
727 switch s := q.(type) {
728 case *Substring:
729 if s.FileName == s.Content {
730 f := *s
731 f.FileName = true
732 f.Content = false
733 c := *s
734 c.FileName = false
735 c.Content = true
736 return NewOr(&f, &c)
737 }
738 case *Regexp:
739 if s.FileName == s.Content {
740 f := *s
741 f.FileName = true
742 f.Content = false
743 c := *s
744 c.FileName = false
745 c.Content = true
746 return NewOr(&f, &c)
747 }
748 }
749 return q
750}
751
752// VisitAtoms runs `v` on all atom queries within `q`.
753func VisitAtoms(q Q, v func(q Q)) {
754 Map(q, func(iQ Q) Q {
755 switch iQ.(type) {
756 case *And:
757 case *Or:
758 case *Not:
759 case *Type:
760 case *Boost:
761 default:
762 v(iQ)
763 }
764 return iQ
765 })
766}