fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package query
16
17import (
18 "encoding/json"
19 "fmt"
20 "log"
21 "reflect"
22 "regexp/syntax"
23 "sort"
24 "strconv"
25 "strings"
26
27 "github.com/RoaringBitmap/roaring"
28 "github.com/grafana/regexp"
29
30 "github.com/sourcegraph/zoekt/internal/syntaxutil"
31)
32
33var _ = log.Println
34
35// Q is a representation for a possibly hierarchical search query.
36type Q interface {
37 String() string
38}
39
40// RawConfig filters repositories based on their encoded RawConfig map.
41type RawConfig uint64
42
43const (
44 RcOnlyPublic RawConfig = 1
45 RcOnlyPrivate RawConfig = 2
46 RcOnlyForks RawConfig = 1 << 2
47 RcNoForks RawConfig = 2 << 2
48 RcOnlyArchived RawConfig = 1 << 4
49 RcNoArchived RawConfig = 2 << 4
50)
51
52var flagNames = []struct {
53 Mask RawConfig
54 Label string
55}{
56 {RcOnlyPublic, "RcOnlyPublic"},
57 {RcOnlyPrivate, "RcOnlyPrivate"},
58 {RcOnlyForks, "RcOnlyForks"},
59 {RcNoForks, "RcNoForks"},
60 {RcOnlyArchived, "RcOnlyArchived"},
61 {RcNoArchived, "RcNoArchived"},
62}
63
64func (r RawConfig) String() string {
65 var s []string
66 for _, fn := range flagNames {
67 if r&fn.Mask != 0 {
68 s = append(s, fn.Label)
69 }
70 }
71 return fmt.Sprintf("rawConfig:%s", strings.Join(s, "|"))
72}
73
74// RegexpQuery is a query looking for regular expressions matches.
75type Regexp struct {
76 Regexp *syntax.Regexp
77 FileName bool
78 Content bool
79 CaseSensitive bool
80}
81
82func (q *Regexp) String() string {
83 pref := ""
84 if q.FileName {
85 pref = "file_"
86 }
87 if q.CaseSensitive {
88 pref = "case_" + pref
89 }
90 return fmt.Sprintf("%sregex:%q", pref, syntaxutil.RegexpString(q.Regexp))
91}
92
93// gobRegexp wraps Regexp to make it gob-encodable/decodable. Regexp contains syntax.Regexp, which
94// contains slices/arrays with possibly nil elements, which gob doesn't support
95// (https://github.com/golang/go/issues/1501).
96type gobRegexp struct {
97 Regexp // Regexp.Regexp (*syntax.Regexp) is set to nil and its string is set in RegexpString
98 RegexpString string
99}
100
101// GobEncode implements gob.Encoder.
102func (q Regexp) GobEncode() ([]byte, error) {
103 gobq := gobRegexp{Regexp: q, RegexpString: syntaxutil.RegexpString(q.Regexp)}
104 gobq.Regexp.Regexp = nil // can't be gob-encoded/decoded
105 return json.Marshal(gobq)
106}
107
108// GobDecode implements gob.Decoder.
109func (q *Regexp) GobDecode(data []byte) error {
110 var gobq gobRegexp
111 err := json.Unmarshal(data, &gobq)
112 if err != nil {
113 return err
114 }
115 gobq.Regexp.Regexp, err = syntax.Parse(gobq.RegexpString, regexpFlags)
116 if err != nil {
117 return err
118 }
119 *q = gobq.Regexp
120 return nil
121}
122
123// Symbol finds a string that is a symbol.
124type Symbol struct {
125 Expr Q
126}
127
128func (s *Symbol) String() string {
129 return fmt.Sprintf("sym:%s", s.Expr)
130}
131
132type caseQ struct {
133 Flavor string
134}
135
136func (c *caseQ) String() string {
137 return "case:" + c.Flavor
138}
139
140type Language struct {
141 Language string
142}
143
144func (l *Language) String() string {
145 return "lang:" + l.Language
146}
147
148type Const struct {
149 Value bool
150}
151
152func (q *Const) String() string {
153 if q.Value {
154 return "TRUE"
155 }
156 return "FALSE"
157}
158
159type Repo struct {
160 Regexp *regexp.Regexp
161}
162
163func (q *Repo) String() string {
164 return fmt.Sprintf("repo:%s", q.Regexp.String())
165}
166
167func (q Repo) GobEncode() ([]byte, error) {
168 return []byte(q.Regexp.String()), nil
169}
170
171func (q *Repo) GobDecode(data []byte) error {
172 var err error
173 q.Regexp, err = regexp.Compile(string(data))
174 return err
175}
176
177// RepoRegexp is a Sourcegraph addition which searches documents where the
178// repository name matches Regexp.
179type RepoRegexp struct {
180 Regexp *regexp.Regexp
181}
182
183func (q *RepoRegexp) String() string {
184 return fmt.Sprintf("reporegex:%q", q.Regexp.String())
185}
186
187// GobEncode implements gob.Encoder.
188func (q *RepoRegexp) GobEncode() ([]byte, error) {
189 // gob can't encode syntax.Regexp
190 return []byte(q.Regexp.String()), nil
191}
192
193// GobDecode implements gob.Decoder.
194func (q *RepoRegexp) GobDecode(data []byte) error {
195 var err error
196 q.Regexp, err = regexp.Compile(string(data))
197 return err
198}
199
200// BranchesRepos is a slice of BranchRepos to match. It is a Sourcegraph
201// addition and only used in the RPC interface for efficient checking of large
202// repo lists.
203type BranchesRepos struct {
204 List []BranchRepos
205}
206
207// NewSingleBranchesRepos is a helper for creating a BranchesRepos which
208// searches a single branch.
209func NewSingleBranchesRepos(branch string, ids ...uint32) *BranchesRepos {
210 return &BranchesRepos{List: []BranchRepos{
211 {branch, roaring.BitmapOf(ids...)},
212 }}
213}
214
215func (q *BranchesRepos) String() string {
216 var sb strings.Builder
217
218 sb.WriteString("(branchesrepos")
219
220 for _, br := range q.List {
221 if size := br.Repos.GetCardinality(); size > 1 {
222 sb.WriteString(" " + br.Branch + ":" + strconv.FormatUint(size, 10))
223 } else {
224 sb.WriteString(" " + br.Branch + "=" + br.Repos.String())
225 }
226 }
227
228 sb.WriteString(")")
229 return sb.String()
230}
231
232// NewRepoIDs is a helper for creating a RepoIDs which
233// searches only the matched repos.
234func NewRepoIDs(ids ...uint32) *RepoIDs {
235 return &RepoIDs{Repos: roaring.BitmapOf(ids...)}
236}
237
238func (q *RepoIDs) String() string {
239 var sb strings.Builder
240
241 sb.WriteString("(repoids ")
242
243 if size := q.Repos.GetCardinality(); size > 1 {
244 sb.WriteString("count:" + strconv.FormatUint(size, 10))
245 } else {
246 sb.WriteString("repoid=" + q.Repos.String())
247 }
248
249 sb.WriteString(")")
250 return sb.String()
251}
252
253// MarshalBinary implements a specialized encoder for BranchesRepos.
254func (q BranchesRepos) MarshalBinary() ([]byte, error) {
255 return branchesReposEncode(q.List)
256}
257
258// UnmarshalBinary implements a specialized decoder for BranchesRepos.
259func (q *BranchesRepos) UnmarshalBinary(b []byte) (err error) {
260 q.List, err = branchesReposDecode(b)
261 return err
262}
263
264// BranchRepos is a (branch, sourcegraph repo ids bitmap) tuple. It is a
265// Sourcegraph addition.
266type BranchRepos struct {
267 Branch string
268 Repos *roaring.Bitmap
269}
270
271// Similar to BranchRepos but will be used to match only by repoid and
272// therefore matches all branches
273type RepoIDs struct {
274 Repos *roaring.Bitmap
275}
276
277// RepoSet is a list of repos to match. It is a Sourcegraph addition and only
278// used in the RPC interface for efficient checking of large repo lists.
279type RepoSet struct {
280 Set map[string]bool
281}
282
283func (q *RepoSet) String() string {
284 var detail string
285 if len(q.Set) > 5 {
286 // Large sets being output are not useful
287 detail = fmt.Sprintf("size=%d", len(q.Set))
288 } else {
289 repos := make([]string, len(q.Set))
290 i := 0
291 for repo := range q.Set {
292 repos[i] = repo
293 i++
294 }
295 sort.Strings(repos)
296 detail = strings.Join(repos, " ")
297 }
298 return fmt.Sprintf("(reposet %s)", detail)
299}
300
301func NewRepoSet(repo ...string) *RepoSet {
302 s := &RepoSet{Set: make(map[string]bool)}
303 for _, r := range repo {
304 s.Set[r] = true
305 }
306 return s
307}
308
309// FileNameSet is a list of file names to match. It is a Sourcegraph addition
310// and only used in the RPC interface for efficient checking of large file
311// lists.
312type FileNameSet struct {
313 Set map[string]struct{}
314}
315
316// MarshalBinary implements a specialized encoder for FileNameSet.
317func (q *FileNameSet) MarshalBinary() ([]byte, error) {
318 return stringSetEncode(q.Set)
319}
320
321// UnmarshalBinary implements a specialized decoder for FileNameSet.
322func (q *FileNameSet) UnmarshalBinary(b []byte) error {
323 var err error
324 q.Set, err = stringSetDecode(b)
325 return err
326}
327
328func (q *FileNameSet) String() string {
329 var detail string
330 if len(q.Set) > 5 {
331 // Large sets being output are not useful
332 detail = fmt.Sprintf("size=%d", len(q.Set))
333 } else {
334 values := make([]string, 0, len(q.Set))
335 for v := range q.Set {
336 values = append(values, v)
337 }
338 sort.Strings(values)
339 detail = strings.Join(values, " ")
340 }
341 return fmt.Sprintf("(filenameset %s)", detail)
342}
343
344func NewFileNameSet(fileNames ...string) *FileNameSet {
345 s := &FileNameSet{Set: make(map[string]struct{})}
346 for _, r := range fileNames {
347 s.Set[r] = struct{}{}
348 }
349 return s
350}
351
352const (
353 TypeFileMatch uint8 = iota
354 TypeFileName
355 TypeRepo
356)
357
358// Type changes the result type returned.
359type Type struct {
360 Child Q
361 Type uint8
362}
363
364func (q *Type) String() string {
365 switch q.Type {
366 case TypeFileMatch:
367 return fmt.Sprintf("(type:filematch %s)", q.Child)
368 case TypeFileName:
369 return fmt.Sprintf("(type:filename %s)", q.Child)
370 case TypeRepo:
371 return fmt.Sprintf("(type:repo %s)", q.Child)
372 default:
373 return fmt.Sprintf("(type:UNKNOWN %s)", q.Child)
374 }
375}
376
377// Boost scales the contribution to score of descendents.
378type Boost struct {
379 Child Q
380 // Boost will multiply the score of its descendents. Values less than 1 will
381 // give less importance while values greater than 1 will give more
382 // importance.
383 Boost float64
384}
385
386func (q *Boost) String() string {
387 return fmt.Sprintf("(boost %0.2f %s)", q.Boost, q.Child)
388}
389
390// Substring is the most basic query: a query for a substring.
391type Substring struct {
392 Pattern string
393 CaseSensitive bool
394
395 // Match only filename
396 FileName bool
397
398 // Match only content
399 Content bool
400}
401
402func (q *Substring) String() string {
403 s := ""
404
405 t := ""
406 if q.FileName {
407 t = "file_"
408 } else if q.Content {
409 t = "content_"
410 }
411
412 s += fmt.Sprintf("%ssubstr:%q", t, q.Pattern)
413 if q.CaseSensitive {
414 s = "case_" + s
415 }
416 return s
417}
418
419type setCaser interface {
420 setCase(string)
421}
422
423func (q *Substring) setCase(k string) {
424 switch k {
425 case "yes":
426 q.CaseSensitive = true
427 case "no":
428 q.CaseSensitive = false
429 case "auto":
430 // TODO - unicode
431 q.CaseSensitive = (q.Pattern != string(toLower([]byte(q.Pattern))))
432 }
433}
434
435func (q *Symbol) setCase(k string) {
436 if sc, ok := q.Expr.(setCaser); ok {
437 sc.setCase(k)
438 }
439}
440
441func (q *Regexp) setCase(k string) {
442 switch k {
443 case "yes":
444 q.CaseSensitive = true
445 case "no":
446 q.CaseSensitive = false
447 case "auto":
448 q.CaseSensitive = !q.Regexp.Equal(LowerRegexp(q.Regexp))
449 }
450}
451
452// Or is matched when any of its children is matched.
453type Or struct {
454 Children []Q
455}
456
457func (q *Or) String() string {
458 var sub []string
459 for _, ch := range q.Children {
460 sub = append(sub, ch.String())
461 }
462 return fmt.Sprintf("(or %s)", strings.Join(sub, " "))
463}
464
465// Not inverts the meaning of its child.
466type Not struct {
467 Child Q
468}
469
470func (q *Not) String() string {
471 return fmt.Sprintf("(not %s)", q.Child)
472}
473
474// And is matched when all its children are.
475type And struct {
476 Children []Q
477}
478
479func (q *And) String() string {
480 var sub []string
481 for _, ch := range q.Children {
482 sub = append(sub, ch.String())
483 }
484 return fmt.Sprintf("(and %s)", strings.Join(sub, " "))
485}
486
487// NewAnd is syntactic sugar for constructing And queries.
488func NewAnd(qs ...Q) Q {
489 return &And{Children: qs}
490}
491
492// NewOr is syntactic sugar for constructing Or queries.
493func NewOr(qs ...Q) Q {
494 return &Or{Children: qs}
495}
496
497// Branch limits search to a specific branch.
498type Branch struct {
499 Pattern string
500
501 // exact is true if we want to Pattern to equal branch.
502 Exact bool
503}
504
505func (q *Branch) String() string {
506 if q.Exact {
507 return fmt.Sprintf("branch=%q", q.Pattern)
508 }
509 return fmt.Sprintf("branch:%q", q.Pattern)
510}
511
512// Meta represents a query for metadata fields.
513type Meta struct {
514 Field string // The metadata field name
515 Value *regexp.Regexp // The value to match
516}
517
518// String returns a string representation of the Meta query.
519func (m *Meta) String() string {
520 return fmt.Sprintf("meta.%s:%s", m.Field, m.Value)
521}
522
523func queryChildren(q Q) []Q {
524 switch s := q.(type) {
525 case *And:
526 return s.Children
527 case *Or:
528 return s.Children
529 }
530 return nil
531}
532
533func flattenAndOr(children []Q, typ Q) ([]Q, bool) {
534 var flat []Q
535 changed := false
536 for _, ch := range children {
537 ch, subChanged := flatten(ch)
538 changed = changed || subChanged
539 if reflect.TypeOf(ch) == reflect.TypeOf(typ) {
540 changed = true
541 subChildren := queryChildren(ch)
542 if subChildren != nil {
543 flat = append(flat, subChildren...)
544 }
545 } else {
546 flat = append(flat, ch)
547 }
548 }
549
550 return flat, changed
551}
552
553// (and (and x y) z) => (and x y z) , the same for "or"
554func flatten(q Q) (Q, bool) {
555 switch s := q.(type) {
556 case *And:
557 if len(s.Children) == 1 {
558 return s.Children[0], true
559 }
560 flatChildren, changed := flattenAndOr(s.Children, s)
561 return &And{flatChildren}, changed
562 case *Or:
563 if len(s.Children) == 1 {
564 return s.Children[0], true
565 }
566 flatChildren, changed := flattenAndOr(s.Children, s)
567 return &Or{flatChildren}, changed
568 case *Not:
569 child, changed := flatten(s.Child)
570 return &Not{child}, changed
571 case *Type:
572 child, changed := flatten(s.Child)
573 return &Type{Child: child, Type: s.Type}, changed
574 case *Boost:
575 child, changed := flatten(s.Child)
576 return &Boost{Child: child, Boost: s.Boost}, changed
577 default:
578 return q, false
579 }
580}
581
582func mapQueryList(qs []Q, f func(Q) Q) []Q {
583 neg := make([]Q, len(qs))
584 for i, sub := range qs {
585 neg[i] = Map(sub, f)
586 }
587 return neg
588}
589
590func invertConst(q Q) Q {
591 c, ok := q.(*Const)
592 if ok {
593 return &Const{!c.Value}
594 }
595 return q
596}
597
598func evalAndOrConstants(q Q, children []Q) Q {
599 _, isAnd := q.(*And)
600
601 children = mapQueryList(children, evalConstants)
602
603 newCH := children[:0]
604 for _, ch := range children {
605 c, ok := ch.(*Const)
606 if ok {
607 if c.Value == isAnd {
608 continue
609 } else {
610 return ch
611 }
612 }
613 newCH = append(newCH, ch)
614 }
615 if len(newCH) == 0 {
616 return &Const{isAnd}
617 }
618 if isAnd {
619 return &And{newCH}
620 }
621 return &Or{newCH}
622}
623
624func evalConstants(q Q) Q {
625 switch s := q.(type) {
626 case *And:
627 return evalAndOrConstants(q, s.Children)
628 case *Or:
629 return evalAndOrConstants(q, s.Children)
630 case *Not:
631 ch := evalConstants(s.Child)
632 if _, ok := ch.(*Const); ok {
633 return invertConst(ch)
634 }
635 return &Not{ch}
636 case *Type:
637 ch := evalConstants(s.Child)
638 if _, ok := ch.(*Const); ok {
639 // If q is the root query, then evaluating this to a const changes
640 // the type of result we will return. However, the only case this
641 // makes sense is `type:repo TRUE` to return all repos or
642 // `type:file TRUE` to return all filenames. For other cases we
643 // want to do this constant folding though, so we allow the
644 // unexpected behaviour mentioned previously.
645 return ch
646 }
647 return &Type{Child: ch, Type: s.Type}
648 case *Boost:
649 ch := evalConstants(s.Child)
650 if _, ok := ch.(*Const); ok {
651 return ch
652 }
653 return &Boost{Boost: s.Boost, Child: ch}
654 case *Substring:
655 if len(s.Pattern) == 0 {
656 return &Const{true}
657 }
658 case *Regexp:
659 if s.Regexp.Op == syntax.OpEmptyMatch {
660 return &Const{true}
661 }
662 case *Branch:
663 if s.Pattern == "" {
664 return &Const{true}
665 }
666 case *BranchesRepos:
667 for _, br := range s.List {
668 if !br.Repos.IsEmpty() {
669 return q
670 }
671 }
672 return &Const{false}
673 case *RepoIDs:
674 if s.Repos.IsEmpty() {
675 return &Const{false}
676 }
677 case *RepoSet:
678 if len(s.Set) == 0 {
679 return &Const{false}
680 }
681 case *FileNameSet:
682 if len(s.Set) == 0 {
683 return &Const{false}
684 }
685 }
686 return q
687}
688
689func Simplify(q Q) Q {
690 q = evalConstants(q)
691 for {
692 var changed bool
693 q, changed = flatten(q)
694 if !changed {
695 break
696 }
697 }
698
699 return q
700}
701
702// Map runs f over the q.
703func Map(q Q, f func(q Q) Q) Q {
704 switch s := q.(type) {
705 case *And:
706 q = &And{Children: mapQueryList(s.Children, f)}
707 case *Or:
708 q = &Or{Children: mapQueryList(s.Children, f)}
709 case *Not:
710 q = &Not{Child: Map(s.Child, f)}
711 case *Type:
712 q = &Type{Type: s.Type, Child: Map(s.Child, f)}
713 case *Boost:
714 q = &Boost{Boost: s.Boost, Child: Map(s.Child, f)}
715 }
716 return f(q)
717}
718
719// Expand expands Substr queries into (OR file_substr content_substr)
720// queries, and the same for Regexp queries..
721func ExpandFileContent(q Q) Q {
722 switch s := q.(type) {
723 case *Substring:
724 if s.FileName == s.Content {
725 f := *s
726 f.FileName = true
727 f.Content = false
728 c := *s
729 c.FileName = false
730 c.Content = true
731 return NewOr(&f, &c)
732 }
733 case *Regexp:
734 if s.FileName == s.Content {
735 f := *s
736 f.FileName = true
737 f.Content = false
738 c := *s
739 c.FileName = false
740 c.Content = true
741 return NewOr(&f, &c)
742 }
743 }
744 return q
745}
746
747// VisitAtoms runs `v` on all atom queries within `q`.
748func VisitAtoms(q Q, v func(q Q)) {
749 Map(q, func(iQ Q) Q {
750 switch iQ.(type) {
751 case *And:
752 case *Or:
753 case *Not:
754 case *Type:
755 case *Boost:
756 default:
757 v(iQ)
758 }
759 return iQ
760 })
761}