fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package query
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "regexp/syntax"
22
23 "github.com/grafana/regexp"
24
25 "github.com/sourcegraph/zoekt/languages"
26)
27
28var _ = log.Printf
29
30// parseStringLiteral parses a string literal, consumes the starting
31// quote too.
32func parseStringLiteral(in []byte) (lit []byte, n int, err error) {
33 left := in[1:]
34 found := false
35
36loop:
37 for len(left) > 0 {
38 c := left[0]
39 left = left[1:]
40 switch c {
41 case '"':
42 found = true
43 break loop
44 case '\\':
45 // TODO - other escape sequences.
46 if len(left) == 0 {
47 return nil, 0, fmt.Errorf("query: missing char after \\")
48 }
49 c = left[0]
50 left = left[1:]
51
52 lit = append(lit, c)
53 default:
54 lit = append(lit, c)
55 }
56 }
57 if !found {
58 return nil, 0, fmt.Errorf("query: unterminated quoted string")
59 }
60 return lit, len(in) - len(left), nil
61}
62
63// orOperator is a placeholder intermediate so we can represent [A,
64// or, B] before we convert it to Or{A, B}
65type orOperator struct{}
66
67func (o *orOperator) String() string {
68 return "orOp"
69}
70
71// caseScopeQ is a parse-time wrapper used to prevent case directives from an
72// outer expression list from overriding an explicitly scoped inner `case:`.
73type caseScopeQ struct {
74 Child Q
75}
76
77func (c *caseScopeQ) String() string {
78 return c.Child.String()
79}
80
81func stripCaseScopesList(qs []Q) []Q {
82 stripped := make([]Q, len(qs))
83 for i, q := range qs {
84 stripped[i] = stripCaseScopes(q)
85 }
86 return stripped
87}
88
89func stripCaseScopes(q Q) Q {
90 switch s := q.(type) {
91 case *And:
92 return &And{Children: stripCaseScopesList(s.Children)}
93 case *Or:
94 return &Or{Children: stripCaseScopesList(s.Children)}
95 case *Not:
96 return &Not{Child: stripCaseScopes(s.Child)}
97 case *Type:
98 return &Type{Type: s.Type, Child: stripCaseScopes(s.Child)}
99 case *Boost:
100 return &Boost{Boost: s.Boost, Child: stripCaseScopes(s.Child)}
101 case *caseScopeQ:
102 return stripCaseScopes(s.Child)
103 default:
104 return q
105 }
106}
107
108func isSpace(c byte) bool {
109 return c == ' ' || c == '\t'
110}
111
112// Parse parses a string into a query.
113func Parse(qStr string) (Q, error) {
114 b := []byte(qStr)
115
116 qs, n, err := parseExprList(b)
117 if err != nil {
118 return nil, err
119 }
120
121 if n != len(b) {
122 return nil, fmt.Errorf("query: extra tokens found at end input: %q", b[n:])
123 }
124
125 q, err := parseOperators(qs)
126 if err != nil {
127 return nil, err
128 }
129
130 q = stripCaseScopes(q)
131
132 return Simplify(q), nil
133}
134
135// parseExpr parses a single expression, returning the result, and the
136// number of bytes consumed.
137func parseExpr(in []byte) (Q, int, error) {
138 b := in[:]
139 var expr Q
140 for len(b) > 0 && isSpace(b[0]) {
141 b = b[1:]
142 }
143
144 tok, err := nextToken(b)
145 if err != nil {
146 return nil, 0, err
147 }
148 if tok == nil {
149 return nil, 0, nil
150 }
151 b = b[len(tok.Input):]
152
153 text := string(tok.Text)
154 switch tok.Type {
155 case tokCase:
156 switch text {
157 case "yes":
158 case "no":
159 case "auto":
160 default:
161 return nil, 0, fmt.Errorf("query: unknown case argument %q, want {yes,no,auto}", text)
162 }
163 expr = &caseQ{text}
164 case tokRepo:
165 r, err := regexp.Compile(text)
166 if err != nil {
167 return nil, 0, err
168 }
169
170 expr = &Repo{r}
171 case tokArchived:
172 switch text {
173 case "yes":
174 expr = RawConfig(RcOnlyArchived)
175 case "no":
176 expr = RawConfig(RcNoArchived)
177 default:
178 return nil, 0, fmt.Errorf("query: unknown archived argument %q, want {yes,no}", text)
179 }
180 case tokFork:
181 switch text {
182 case "yes":
183 expr = RawConfig(RcOnlyForks)
184 case "no":
185 expr = RawConfig(RcNoForks)
186 default:
187 return nil, 0, fmt.Errorf("query: unknown fork argument %q, want {yes,no}", text)
188 }
189 case tokPublic:
190 switch text {
191 case "yes":
192 expr = RawConfig(RcOnlyPublic)
193 case "no":
194 expr = RawConfig(RcOnlyPrivate)
195 default:
196 return nil, 0, fmt.Errorf("query: unknown public argument %q, want {yes,no}", text)
197 }
198 case tokBranch:
199 expr = &Branch{Pattern: text}
200 case tokText, tokRegex:
201 q, err := RegexpQuery(text, false, false)
202 if err != nil {
203 return nil, 0, err
204 }
205 expr = q
206 case tokFile:
207 q, err := RegexpQuery(text, false, true)
208 if err != nil {
209 return nil, 0, err
210 }
211 expr = q
212 case tokContent:
213 q, err := RegexpQuery(text, true, false)
214 if err != nil {
215 return nil, 0, err
216 }
217 expr = q
218 case tokLang:
219 canonical, ok := languages.GetLanguageByNameOrAlias(text)
220 if !ok {
221 expr = &Const{false}
222 } else {
223 expr = &Language{Language: canonical}
224 }
225
226 case tokSym:
227 if text == "" {
228 return nil, 0, fmt.Errorf("the sym: atom must have an argument")
229 }
230
231 q, err := RegexpQuery(text, false, false)
232 if err != nil {
233 return nil, 0, err
234 }
235
236 expr = &Symbol{q}
237 case tokParenClose:
238 // Caller must consume paren.
239 expr = nil
240
241 case tokParenOpen:
242 qs, n, err := parseExprList(b)
243 b = b[n:]
244 if err != nil {
245 return nil, 0, err
246 }
247
248 pTok, err := nextToken(b)
249 if err != nil {
250 return nil, 0, err
251 }
252 if pTok == nil || pTok.Type != tokParenClose {
253 return nil, 0, fmt.Errorf("query: missing close paren, got token %v", pTok)
254 }
255
256 b = b[len(pTok.Input):]
257 expr, err = parseOperators(qs)
258 if err != nil {
259 return nil, 0, err
260 }
261 case tokNegate:
262 subQ, n, err := parseExpr(b)
263 if err != nil {
264 return nil, 0, err
265 }
266 if subQ == nil {
267 return nil, 0, fmt.Errorf("query: '-' operator needs an argument")
268 }
269 b = b[n:]
270 expr = &Not{subQ}
271
272 case tokType:
273 var t uint8
274 switch text {
275 case "filematch":
276 t = TypeFileMatch
277 case "filename", "file":
278 t = TypeFileName
279 case "repo":
280 t = TypeRepo
281 default:
282 return nil, 0, fmt.Errorf("query: unknown type argument %q, want {filematch,filename,repo}", text)
283 }
284 // Later we will lift this into a root, like we do for caseQ
285 expr = &Type{Type: t, Child: nil}
286 case tokMeta:
287 // Split on ':' to separate field and value
288 parts := bytes.SplitN([]byte(text), []byte(":"), 2)
289 if len(parts) != 2 {
290 return nil, 0, fmt.Errorf("query: invalid meta field syntax %q", text)
291 }
292 field := string(parts[0])
293 valuePattern := string(parts[1])
294 re, err := regexp.Compile(valuePattern)
295 if err != nil {
296 return nil, 0, fmt.Errorf("query: invalid regexp in meta value: %v", err)
297 }
298 expr = &Meta{
299 Field: field,
300 Value: re,
301 }
302 }
303
304 return expr, len(in) - len(b), nil
305}
306
307const regexpFlags syntax.Flags = syntax.ClassNL | syntax.PerlX | syntax.UnicodeGroups
308
309// RegexpQuery parses an atom into either a regular expression, or a
310// simple substring atom.
311func RegexpQuery(text string, content, file bool) (Q, error) {
312 var expr Q
313
314 r, err := syntax.Parse(text, regexpFlags)
315 if err != nil {
316 return nil, err
317 }
318
319 r = OptimizeRegexp(r, regexpFlags)
320
321 if r.Op == syntax.OpLiteral {
322 expr = &Substring{
323 Pattern: string(r.Rune),
324 FileName: file,
325 Content: content,
326 }
327 } else {
328 expr = &Regexp{
329 Regexp: r,
330 FileName: file,
331 Content: content,
332 }
333 }
334
335 return expr, nil
336}
337
338// parseOperators interprets the orOperator in a list of queries.
339func parseOperators(in []Q) (Q, error) {
340 top := &Or{}
341 cur := &And{}
342
343 seenOr := false
344 for _, q := range in {
345 if _, ok := q.(*orOperator); ok {
346 seenOr = true
347 if len(cur.Children) == 0 {
348 return nil, fmt.Errorf("query: OR operator should have operand")
349 }
350 top.Children = append(top.Children, cur)
351 cur = &And{}
352 } else {
353 cur.Children = append(cur.Children, q)
354 }
355 }
356
357 if seenOr && len(cur.Children) == 0 {
358 return nil, fmt.Errorf("query: OR operator should have operand")
359 }
360 top.Children = append(top.Children, cur)
361 return top, nil
362}
363
364// parseExprList parses a list of query expressions. It is the
365// workhorse of the Parse function.
366func parseExprList(in []byte) ([]Q, int, error) {
367 b := in[:]
368 var qs []Q
369 for len(b) > 0 {
370 for len(b) > 0 && isSpace(b[0]) {
371 b = b[1:]
372 }
373 tok, _ := nextToken(b)
374 if tok != nil && tok.Type == tokParenClose {
375 break
376 } else if tok != nil && tok.Type == tokOr {
377 qs = append(qs, &orOperator{})
378 b = b[len(tok.Input):]
379 continue
380 }
381
382 q, n, err := parseExpr(b)
383 if err != nil {
384 return nil, 0, err
385 }
386
387 if q == nil {
388 // eof or a ')'
389 break
390 }
391 qs = append(qs, q)
392 b = b[n:]
393 }
394
395 setCase := "auto"
396 hasCaseScope := false
397 newQS := qs[:0]
398 typeT := uint8(100)
399 for _, q := range qs {
400 switch s := q.(type) {
401 case *caseQ:
402 setCase = s.Flavor
403 hasCaseScope = true
404 case *Type:
405 if s.Type < typeT {
406 typeT = s.Type
407 }
408 default:
409 newQS = append(newQS, q)
410 }
411 }
412 qs = mapQueryList(newQS, func(q Q) Q {
413 if sc, ok := q.(setCaser); ok {
414 sc.setCase(setCase)
415 }
416 return q
417 })
418 if typeT != 100 {
419 typedQ, err := parseOperators(qs)
420 if err != nil {
421 return nil, 0, err
422 }
423 qs = []Q{&Type{Type: typeT, Child: typedQ}}
424 }
425
426 if hasCaseScope {
427 scoped := make([]Q, 0, len(qs))
428 for _, q := range qs {
429 if _, isOrOperator := q.(*orOperator); isOrOperator {
430 scoped = append(scoped, q)
431 continue
432 }
433 scoped = append(scoped, &caseScopeQ{Child: q})
434 }
435 qs = scoped
436 }
437
438 return qs, len(in) - len(b), nil
439}
440
441type token struct {
442 Type int
443 // The value of the token
444 Text []byte
445
446 // The input that we consumed to form the token.
447 Input []byte
448}
449
450func (t *token) String() string {
451 return fmt.Sprintf("%s:%q", tokNames[t.Type], t.Text)
452}
453
454// token types.
455const (
456 tokText = 0
457 tokFile = 1
458 tokRepo = 2
459 tokCase = 3
460 tokBranch = 4
461 tokParenOpen = 5
462 tokParenClose = 6
463 tokError = 7
464 tokNegate = 8
465 tokRegex = 9
466 tokOr = 10
467 tokContent = 11
468 tokLang = 12
469 tokSym = 13
470 tokType = 14
471 tokArchived = 15
472 tokPublic = 16
473 tokFork = 17
474 tokMeta = 18
475)
476
477var tokNames = map[int]string{
478 tokArchived: "Archived",
479 tokBranch: "Branch",
480 tokCase: "Case",
481 tokError: "Error",
482 tokFile: "File",
483 tokFork: "Fork",
484 tokNegate: "Negate",
485 tokOr: "Or",
486 tokParenClose: "ParenClose",
487 tokParenOpen: "ParenOpen",
488 tokPublic: "Public",
489 tokRegex: "Regex",
490 tokRepo: "Repo",
491 tokText: "Text",
492 tokLang: "Language",
493 tokSym: "Symbol",
494 tokType: "Type",
495 tokMeta: "Meta",
496}
497
498var prefixes = map[string]int{
499 "archived:": tokArchived,
500 "b:": tokBranch,
501 "branch:": tokBranch,
502 "c:": tokContent,
503 "case:": tokCase,
504 "content:": tokContent,
505 "f:": tokFile,
506 "file:": tokFile,
507 "fork:": tokFork,
508 "public:": tokPublic,
509 "r:": tokRepo,
510 "regex:": tokRegex,
511 "repo:": tokRepo,
512 "lang:": tokLang,
513 "sym:": tokSym,
514 "t:": tokType,
515 "type:": tokType,
516 "meta.": tokMeta,
517}
518
519var reservedWords = map[string]int{
520 "or": tokOr,
521}
522
523func (t *token) setType() {
524 // After we consumed the input, we have to interpret some of the text,
525 // eg. to distinguish between ")" the text and ) the query grouping
526 // parenthesis.
527 if len(t.Text) == 1 && t.Text[0] == '(' {
528 t.Type = tokParenOpen
529 }
530 if len(t.Text) == 1 && t.Text[0] == ')' {
531 t.Type = tokParenClose
532 }
533
534 for w, typ := range reservedWords {
535 if string(t.Text) == w && string(t.Input) == w {
536 t.Type = typ
537 break
538 }
539 }
540
541 for pref, typ := range prefixes {
542 if !bytes.HasPrefix(t.Input, []byte(pref)) {
543 continue
544 }
545
546 t.Text = t.Text[len(pref):]
547 t.Type = typ
548 break
549 }
550}
551
552// nextToken returns the next token from the given input.
553func nextToken(in []byte) (*token, error) {
554 left := in[:]
555 parenCount := 0
556 var cur token
557 if len(left) == 0 {
558 return nil, nil
559 }
560
561 if left[0] == '-' {
562 return &token{
563 Type: tokNegate,
564 Text: []byte{'-'},
565 Input: in[:1],
566 }, nil
567 }
568
569 foundSpace := false
570
571loop:
572 for len(left) > 0 {
573 c := left[0]
574 switch c {
575 case '(':
576 parenCount++
577 cur.Text = append(cur.Text, c)
578 left = left[1:]
579 case ')':
580 if parenCount == 0 {
581 if len(cur.Text) == 0 {
582 cur.Text = []byte{')'}
583 left = left[1:]
584 }
585 break loop
586 }
587
588 cur.Text = append(cur.Text, c)
589 left = left[1:]
590 parenCount--
591
592 case '"':
593 t, n, err := parseStringLiteral(left)
594 if err != nil {
595 return nil, err
596 }
597 cur.Text = append(cur.Text, t...)
598 left = left[n:]
599 case '\\':
600 left = left[1:]
601 if len(left) == 0 {
602 return nil, fmt.Errorf("query: lone \\ at end")
603 }
604 c := left[0]
605 cur.Text = append(cur.Text, '\\', c)
606 left = left[1:]
607
608 case ' ', '\n', '\t':
609 if parenCount > 0 {
610 foundSpace = true
611 }
612 break loop
613 default:
614 cur.Text = append(cur.Text, c)
615 left = left[1:]
616 }
617 }
618
619 if len(cur.Text) == 0 {
620 return nil, nil
621 }
622
623 if foundSpace && cur.Text[0] == '(' {
624 cur.Text = cur.Text[:1]
625 cur.Input = in[:1]
626 } else {
627 cur.Input = in[:len(in)-len(left)]
628 }
629 cur.setType()
630 return &cur, nil
631}