fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package query
16
17import (
18 "encoding/json"
19 "fmt"
20 "log"
21 "reflect"
22 "regexp/syntax"
23 "sort"
24 "strconv"
25 "strings"
26
27 "github.com/RoaringBitmap/roaring"
28 "github.com/grafana/regexp"
29 "github.com/sourcegraph/zoekt/internal/syntaxutil"
30)
31
32var _ = log.Println
33
34// Q is a representation for a possibly hierarchical search query.
35type Q interface {
36 String() string
37}
38
39// RawConfig filters repositories based on their encoded RawConfig map.
40type RawConfig uint64
41
42const (
43 RcOnlyPublic RawConfig = 1
44 RcOnlyPrivate RawConfig = 2
45 RcOnlyForks RawConfig = 1 << 2
46 RcNoForks RawConfig = 2 << 2
47 RcOnlyArchived RawConfig = 1 << 4
48 RcNoArchived RawConfig = 2 << 4
49)
50
51var flagNames = []struct {
52 Mask RawConfig
53 Label string
54}{
55 {RcOnlyPublic, "RcOnlyPublic"},
56 {RcOnlyPrivate, "RcOnlyPrivate"},
57 {RcOnlyForks, "RcOnlyForks"},
58 {RcNoForks, "RcNoForks"},
59 {RcOnlyArchived, "RcOnlyArchived"},
60 {RcNoArchived, "RcNoArchived"},
61}
62
63func (r RawConfig) String() string {
64 var s []string
65 for _, fn := range flagNames {
66 if r&fn.Mask != 0 {
67 s = append(s, fn.Label)
68 }
69 }
70 return fmt.Sprintf("rawConfig:%s", strings.Join(s, "|"))
71}
72
73// RegexpQuery is a query looking for regular expressions matches.
74type Regexp struct {
75 Regexp *syntax.Regexp
76 FileName bool
77 Content bool
78 CaseSensitive bool
79}
80
81func (q *Regexp) String() string {
82 pref := ""
83 if q.FileName {
84 pref = "file_"
85 }
86 if q.CaseSensitive {
87 pref = "case_" + pref
88 }
89 return fmt.Sprintf("%sregex:%q", pref, syntaxutil.RegexpString(q.Regexp))
90}
91
92// gobRegexp wraps Regexp to make it gob-encodable/decodable. Regexp contains syntax.Regexp, which
93// contains slices/arrays with possibly nil elements, which gob doesn't support
94// (https://github.com/golang/go/issues/1501).
95type gobRegexp struct {
96 Regexp // Regexp.Regexp (*syntax.Regexp) is set to nil and its string is set in RegexpString
97 RegexpString string
98}
99
100// GobEncode implements gob.Encoder.
101func (q Regexp) GobEncode() ([]byte, error) {
102 gobq := gobRegexp{Regexp: q, RegexpString: syntaxutil.RegexpString(q.Regexp)}
103 gobq.Regexp.Regexp = nil // can't be gob-encoded/decoded
104 return json.Marshal(gobq)
105}
106
107// GobDecode implements gob.Decoder.
108func (q *Regexp) GobDecode(data []byte) error {
109 var gobq gobRegexp
110 err := json.Unmarshal(data, &gobq)
111 if err != nil {
112 return err
113 }
114 gobq.Regexp.Regexp, err = syntax.Parse(gobq.RegexpString, regexpFlags)
115 if err != nil {
116 return err
117 }
118 *q = gobq.Regexp
119 return nil
120}
121
122// Symbol finds a string that is a symbol.
123type Symbol struct {
124 Expr Q
125}
126
127func (s *Symbol) String() string {
128 return fmt.Sprintf("sym:%s", s.Expr)
129}
130
131type caseQ struct {
132 Flavor string
133}
134
135func (c *caseQ) String() string {
136 return "case:" + c.Flavor
137}
138
139type Language struct {
140 Language string
141}
142
143func (l *Language) String() string {
144 return "lang:" + l.Language
145}
146
147type Const struct {
148 Value bool
149}
150
151func (q *Const) String() string {
152 if q.Value {
153 return "TRUE"
154 }
155 return "FALSE"
156}
157
158type Repo struct {
159 Regexp *regexp.Regexp
160}
161
162func (q *Repo) String() string {
163 return fmt.Sprintf("repo:%s", q.Regexp.String())
164}
165
166func (q Repo) GobEncode() ([]byte, error) {
167 return []byte(q.Regexp.String()), nil
168}
169
170func (q *Repo) GobDecode(data []byte) error {
171 var err error
172 q.Regexp, err = regexp.Compile(string(data))
173 return err
174}
175
176// RepoRegexp is a Sourcegraph addition which searches documents where the
177// repository name matches Regexp.
178type RepoRegexp struct {
179 Regexp *regexp.Regexp
180}
181
182func (q *RepoRegexp) String() string {
183 return fmt.Sprintf("reporegex:%q", q.Regexp.String())
184}
185
186// GobEncode implements gob.Encoder.
187func (q *RepoRegexp) GobEncode() ([]byte, error) {
188 // gob can't encode syntax.Regexp
189 return []byte(q.Regexp.String()), nil
190}
191
192// GobDecode implements gob.Decoder.
193func (q *RepoRegexp) GobDecode(data []byte) error {
194 var err error
195 q.Regexp, err = regexp.Compile(string(data))
196 return err
197}
198
199// BranchesRepos is a slice of BranchRepos to match. It is a Sourcegraph
200// addition and only used in the RPC interface for efficient checking of large
201// repo lists.
202type BranchesRepos struct {
203 List []BranchRepos
204}
205
206// NewSingleBranchesRepos is a helper for creating a BranchesRepos which
207// searches a single branch.
208func NewSingleBranchesRepos(branch string, ids ...uint32) *BranchesRepos {
209 return &BranchesRepos{List: []BranchRepos{
210 {branch, roaring.BitmapOf(ids...)},
211 }}
212}
213
214func (q *BranchesRepos) String() string {
215 var sb strings.Builder
216
217 sb.WriteString("(branchesrepos")
218
219 for _, br := range q.List {
220 if size := br.Repos.GetCardinality(); size > 1 {
221 sb.WriteString(" " + br.Branch + ":" + strconv.FormatUint(size, 10))
222 } else {
223 sb.WriteString(" " + br.Branch + "=" + br.Repos.String())
224 }
225 }
226
227 sb.WriteString(")")
228 return sb.String()
229}
230
231// NewRepoIDs is a helper for creating a RepoIDs which
232// searches only the matched repos.
233func NewRepoIDs(ids ...uint32) *RepoIDs {
234 return &RepoIDs{Repos: roaring.BitmapOf(ids...)}
235}
236
237func (q *RepoIDs) String() string {
238 var sb strings.Builder
239
240 sb.WriteString("(repoids ")
241
242 if size := q.Repos.GetCardinality(); size > 1 {
243 sb.WriteString("count:" + strconv.FormatUint(size, 10))
244 } else {
245 sb.WriteString("repoid=" + q.Repos.String())
246 }
247
248 sb.WriteString(")")
249 return sb.String()
250}
251
252// MarshalBinary implements a specialized encoder for BranchesRepos.
253func (q BranchesRepos) MarshalBinary() ([]byte, error) {
254 return branchesReposEncode(q.List)
255}
256
257// UnmarshalBinary implements a specialized decoder for BranchesRepos.
258func (q *BranchesRepos) UnmarshalBinary(b []byte) (err error) {
259 q.List, err = branchesReposDecode(b)
260 return err
261}
262
263// BranchRepos is a (branch, sourcegraph repo ids bitmap) tuple. It is a
264// Sourcegraph addition.
265type BranchRepos struct {
266 Branch string
267 Repos *roaring.Bitmap
268}
269
270// Similar to BranchRepos but will be used to match only by repoid and
271// therefore matches all branches
272type RepoIDs struct {
273 Repos *roaring.Bitmap
274}
275
276// RepoSet is a list of repos to match. It is a Sourcegraph addition and only
277// used in the RPC interface for efficient checking of large repo lists.
278type RepoSet struct {
279 Set map[string]bool
280}
281
282func (q *RepoSet) String() string {
283 var detail string
284 if len(q.Set) > 5 {
285 // Large sets being output are not useful
286 detail = fmt.Sprintf("size=%d", len(q.Set))
287 } else {
288 repos := make([]string, len(q.Set))
289 i := 0
290 for repo := range q.Set {
291 repos[i] = repo
292 i++
293 }
294 sort.Strings(repos)
295 detail = strings.Join(repos, " ")
296 }
297 return fmt.Sprintf("(reposet %s)", detail)
298}
299
300func NewRepoSet(repo ...string) *RepoSet {
301 s := &RepoSet{Set: make(map[string]bool)}
302 for _, r := range repo {
303 s.Set[r] = true
304 }
305 return s
306}
307
308// FileNameSet is a list of file names to match. It is a Sourcegraph addition
309// and only used in the RPC interface for efficient checking of large file
310// lists.
311type FileNameSet struct {
312 Set map[string]struct{}
313}
314
315// MarshalBinary implements a specialized encoder for FileNameSet.
316func (q *FileNameSet) MarshalBinary() ([]byte, error) {
317 return stringSetEncode(q.Set)
318}
319
320// UnmarshalBinary implements a specialized decoder for FileNameSet.
321func (q *FileNameSet) UnmarshalBinary(b []byte) error {
322 var err error
323 q.Set, err = stringSetDecode(b)
324 return err
325}
326
327func (q *FileNameSet) String() string {
328 var detail string
329 if len(q.Set) > 5 {
330 // Large sets being output are not useful
331 detail = fmt.Sprintf("size=%d", len(q.Set))
332 } else {
333 values := make([]string, 0, len(q.Set))
334 for v := range q.Set {
335 values = append(values, v)
336 }
337 sort.Strings(values)
338 detail = strings.Join(values, " ")
339 }
340 return fmt.Sprintf("(filenameset %s)", detail)
341}
342
343func NewFileNameSet(fileNames ...string) *FileNameSet {
344 s := &FileNameSet{Set: make(map[string]struct{})}
345 for _, r := range fileNames {
346 s.Set[r] = struct{}{}
347 }
348 return s
349}
350
351const (
352 TypeFileMatch uint8 = iota
353 TypeFileName
354 TypeRepo
355)
356
357// Type changes the result type returned.
358type Type struct {
359 Child Q
360 Type uint8
361}
362
363func (q *Type) String() string {
364 switch q.Type {
365 case TypeFileMatch:
366 return fmt.Sprintf("(type:filematch %s)", q.Child)
367 case TypeFileName:
368 return fmt.Sprintf("(type:filename %s)", q.Child)
369 case TypeRepo:
370 return fmt.Sprintf("(type:repo %s)", q.Child)
371 default:
372 return fmt.Sprintf("(type:UNKNOWN %s)", q.Child)
373 }
374}
375
376// Boost scales the contribution to score of descendents.
377type Boost struct {
378 Child Q
379 // Boost will multiply the score of its descendents. Values less than 1 will
380 // give less importance while values greater than 1 will give more
381 // importance.
382 Boost float64
383}
384
385func (q *Boost) String() string {
386 return fmt.Sprintf("(boost %0.2f %s)", q.Boost, q.Child)
387}
388
389// Substring is the most basic query: a query for a substring.
390type Substring struct {
391 Pattern string
392 CaseSensitive bool
393
394 // Match only filename
395 FileName bool
396
397 // Match only content
398 Content bool
399}
400
401func (q *Substring) String() string {
402 s := ""
403
404 t := ""
405 if q.FileName {
406 t = "file_"
407 } else if q.Content {
408 t = "content_"
409 }
410
411 s += fmt.Sprintf("%ssubstr:%q", t, q.Pattern)
412 if q.CaseSensitive {
413 s = "case_" + s
414 }
415 return s
416}
417
418type setCaser interface {
419 setCase(string)
420}
421
422func (q *Substring) setCase(k string) {
423 switch k {
424 case "yes":
425 q.CaseSensitive = true
426 case "no":
427 q.CaseSensitive = false
428 case "auto":
429 // TODO - unicode
430 q.CaseSensitive = (q.Pattern != string(toLower([]byte(q.Pattern))))
431 }
432}
433
434func (q *Symbol) setCase(k string) {
435 if sc, ok := q.Expr.(setCaser); ok {
436 sc.setCase(k)
437 }
438}
439
440func (q *Regexp) setCase(k string) {
441 switch k {
442 case "yes":
443 q.CaseSensitive = true
444 case "no":
445 q.CaseSensitive = false
446 case "auto":
447 q.CaseSensitive = !q.Regexp.Equal(LowerRegexp(q.Regexp))
448 }
449}
450
451// Or is matched when any of its children is matched.
452type Or struct {
453 Children []Q
454}
455
456func (q *Or) String() string {
457 var sub []string
458 for _, ch := range q.Children {
459 sub = append(sub, ch.String())
460 }
461 return fmt.Sprintf("(or %s)", strings.Join(sub, " "))
462}
463
464// Not inverts the meaning of its child.
465type Not struct {
466 Child Q
467}
468
469func (q *Not) String() string {
470 return fmt.Sprintf("(not %s)", q.Child)
471}
472
473// And is matched when all its children are.
474type And struct {
475 Children []Q
476}
477
478func (q *And) String() string {
479 var sub []string
480 for _, ch := range q.Children {
481 sub = append(sub, ch.String())
482 }
483 return fmt.Sprintf("(and %s)", strings.Join(sub, " "))
484}
485
486// NewAnd is syntactic sugar for constructing And queries.
487func NewAnd(qs ...Q) Q {
488 return &And{Children: qs}
489}
490
491// NewOr is syntactic sugar for constructing Or queries.
492func NewOr(qs ...Q) Q {
493 return &Or{Children: qs}
494}
495
496// Branch limits search to a specific branch.
497type Branch struct {
498 Pattern string
499
500 // exact is true if we want to Pattern to equal branch.
501 Exact bool
502}
503
504func (q *Branch) String() string {
505 if q.Exact {
506 return fmt.Sprintf("branch=%q", q.Pattern)
507 }
508 return fmt.Sprintf("branch:%q", q.Pattern)
509}
510
511func queryChildren(q Q) []Q {
512 switch s := q.(type) {
513 case *And:
514 return s.Children
515 case *Or:
516 return s.Children
517 }
518 return nil
519}
520
521func flattenAndOr(children []Q, typ Q) ([]Q, bool) {
522 var flat []Q
523 changed := false
524 for _, ch := range children {
525 ch, subChanged := flatten(ch)
526 changed = changed || subChanged
527 if reflect.TypeOf(ch) == reflect.TypeOf(typ) {
528 changed = true
529 subChildren := queryChildren(ch)
530 if subChildren != nil {
531 flat = append(flat, subChildren...)
532 }
533 } else {
534 flat = append(flat, ch)
535 }
536 }
537
538 return flat, changed
539}
540
541// (and (and x y) z) => (and x y z) , the same for "or"
542func flatten(q Q) (Q, bool) {
543 switch s := q.(type) {
544 case *And:
545 if len(s.Children) == 1 {
546 return s.Children[0], true
547 }
548 flatChildren, changed := flattenAndOr(s.Children, s)
549 return &And{flatChildren}, changed
550 case *Or:
551 if len(s.Children) == 1 {
552 return s.Children[0], true
553 }
554 flatChildren, changed := flattenAndOr(s.Children, s)
555 return &Or{flatChildren}, changed
556 case *Not:
557 child, changed := flatten(s.Child)
558 return &Not{child}, changed
559 case *Type:
560 child, changed := flatten(s.Child)
561 return &Type{Child: child, Type: s.Type}, changed
562 case *Boost:
563 child, changed := flatten(s.Child)
564 return &Boost{Child: child, Boost: s.Boost}, changed
565 default:
566 return q, false
567 }
568}
569
570func mapQueryList(qs []Q, f func(Q) Q) []Q {
571 neg := make([]Q, len(qs))
572 for i, sub := range qs {
573 neg[i] = Map(sub, f)
574 }
575 return neg
576}
577
578func invertConst(q Q) Q {
579 c, ok := q.(*Const)
580 if ok {
581 return &Const{!c.Value}
582 }
583 return q
584}
585
586func evalAndOrConstants(q Q, children []Q) Q {
587 _, isAnd := q.(*And)
588
589 children = mapQueryList(children, evalConstants)
590
591 newCH := children[:0]
592 for _, ch := range children {
593 c, ok := ch.(*Const)
594 if ok {
595 if c.Value == isAnd {
596 continue
597 } else {
598 return ch
599 }
600 }
601 newCH = append(newCH, ch)
602 }
603 if len(newCH) == 0 {
604 return &Const{isAnd}
605 }
606 if isAnd {
607 return &And{newCH}
608 }
609 return &Or{newCH}
610}
611
612func evalConstants(q Q) Q {
613 switch s := q.(type) {
614 case *And:
615 return evalAndOrConstants(q, s.Children)
616 case *Or:
617 return evalAndOrConstants(q, s.Children)
618 case *Not:
619 ch := evalConstants(s.Child)
620 if _, ok := ch.(*Const); ok {
621 return invertConst(ch)
622 }
623 return &Not{ch}
624 case *Type:
625 ch := evalConstants(s.Child)
626 if _, ok := ch.(*Const); ok {
627 // If q is the root query, then evaluating this to a const changes
628 // the type of result we will return. However, the only case this
629 // makes sense is `type:repo TRUE` to return all repos or
630 // `type:file TRUE` to return all filenames. For other cases we
631 // want to do this constant folding though, so we allow the
632 // unexpected behaviour mentioned previously.
633 return ch
634 }
635 return &Type{Child: ch, Type: s.Type}
636 case *Boost:
637 ch := evalConstants(s.Child)
638 if _, ok := ch.(*Const); ok {
639 return ch
640 }
641 return &Boost{Boost: s.Boost, Child: ch}
642 case *Substring:
643 if len(s.Pattern) == 0 {
644 return &Const{true}
645 }
646 case *Regexp:
647 if s.Regexp.Op == syntax.OpEmptyMatch {
648 return &Const{true}
649 }
650 case *Branch:
651 if s.Pattern == "" {
652 return &Const{true}
653 }
654 case *BranchesRepos:
655 for _, br := range s.List {
656 if !br.Repos.IsEmpty() {
657 return q
658 }
659 }
660 return &Const{false}
661 case *RepoIDs:
662 if s.Repos.IsEmpty() {
663 return &Const{false}
664 }
665 case *RepoSet:
666 if len(s.Set) == 0 {
667 return &Const{false}
668 }
669 case *FileNameSet:
670 if len(s.Set) == 0 {
671 return &Const{false}
672 }
673 }
674 return q
675}
676
677func Simplify(q Q) Q {
678 q = evalConstants(q)
679 for {
680 var changed bool
681 q, changed = flatten(q)
682 if !changed {
683 break
684 }
685 }
686
687 return q
688}
689
690// Map runs f over the q.
691func Map(q Q, f func(q Q) Q) Q {
692 switch s := q.(type) {
693 case *And:
694 q = &And{Children: mapQueryList(s.Children, f)}
695 case *Or:
696 q = &Or{Children: mapQueryList(s.Children, f)}
697 case *Not:
698 q = &Not{Child: Map(s.Child, f)}
699 case *Type:
700 q = &Type{Type: s.Type, Child: Map(s.Child, f)}
701 case *Boost:
702 q = &Boost{Boost: s.Boost, Child: Map(s.Child, f)}
703 }
704 return f(q)
705}
706
707// Expand expands Substr queries into (OR file_substr content_substr)
708// queries, and the same for Regexp queries..
709func ExpandFileContent(q Q) Q {
710 switch s := q.(type) {
711 case *Substring:
712 if s.FileName == s.Content {
713 f := *s
714 f.FileName = true
715 f.Content = false
716 c := *s
717 c.FileName = false
718 c.Content = true
719 return NewOr(&f, &c)
720 }
721 case *Regexp:
722 if s.FileName == s.Content {
723 f := *s
724 f.FileName = true
725 f.Content = false
726 c := *s
727 c.FileName = false
728 c.Content = true
729 return NewOr(&f, &c)
730 }
731 }
732 return q
733}
734
735// VisitAtoms runs `v` on all atom queries within `q`.
736func VisitAtoms(q Q, v func(q Q)) {
737 Map(q, func(iQ Q) Q {
738 switch iQ.(type) {
739 case *And:
740 case *Or:
741 case *Not:
742 case *Type:
743 case *Boost:
744 default:
745 v(iQ)
746 }
747 return iQ
748 })
749}