fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package query
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "regexp/syntax"
22
23 "github.com/grafana/regexp"
24
25 "github.com/sourcegraph/zoekt/languages"
26)
27
28var _ = log.Printf
29
30// parseStringLiteral parses a string literal, consumes the starting
31// quote too.
32func parseStringLiteral(in []byte) (lit []byte, n int, err error) {
33 left := in[1:]
34 found := false
35
36loop:
37 for len(left) > 0 {
38 c := left[0]
39 left = left[1:]
40 switch c {
41 case '"':
42 found = true
43 break loop
44 case '\\':
45 // TODO - other escape sequences.
46 if len(left) == 0 {
47 return nil, 0, fmt.Errorf("query: missing char after \\")
48 }
49 c = left[0]
50 left = left[1:]
51
52 lit = append(lit, c)
53 default:
54 lit = append(lit, c)
55 }
56 }
57 if !found {
58 return nil, 0, fmt.Errorf("query: unterminated quoted string")
59 }
60 return lit, len(in) - len(left), nil
61}
62
63// orOperator is a placeholder intermediate so we can represent [A,
64// or, B] before we convert it to Or{A, B}
65type orOperator struct{}
66
67func (o *orOperator) String() string {
68 return "orOp"
69}
70
71// caseScopeQ is a parse-time wrapper used to prevent case directives from an
72// outer expression list from overriding an explicitly scoped inner `case:`.
73type caseScopeQ struct {
74 Child Q
75}
76
77func (c *caseScopeQ) String() string {
78 return c.Child.String()
79}
80
81func stripCaseScopesList(qs []Q) []Q {
82 stripped := make([]Q, len(qs))
83 for i, q := range qs {
84 stripped[i] = stripCaseScopes(q)
85 }
86 return stripped
87}
88
89func stripCaseScopes(q Q) Q {
90 switch s := q.(type) {
91 case *And:
92 return &And{Children: stripCaseScopesList(s.Children)}
93 case *Or:
94 return &Or{Children: stripCaseScopesList(s.Children)}
95 case *Not:
96 return &Not{Child: stripCaseScopes(s.Child)}
97 case *Type:
98 return &Type{Type: s.Type, Child: stripCaseScopes(s.Child)}
99 case *Boost:
100 return &Boost{Boost: s.Boost, Child: stripCaseScopes(s.Child)}
101 case *caseScopeQ:
102 return stripCaseScopes(s.Child)
103 default:
104 return q
105 }
106}
107
108func isSpace(c byte) bool {
109 return c == ' ' || c == '\t'
110}
111
112// Parse parses a string into a query.
113func Parse(qStr string) (Q, error) {
114 b := []byte(qStr)
115
116 qs, n, err := parseExprList(b)
117 if err != nil {
118 return nil, err
119 }
120
121 if n != len(b) {
122 return nil, fmt.Errorf("query: extra tokens found at end input: %q", b[n:])
123 }
124
125 q, err := parseOperators(qs)
126 if err != nil {
127 return nil, err
128 }
129
130 q = stripCaseScopes(q)
131
132 return Simplify(q), nil
133}
134
135// parseExpr parses a single expression, returning the result, and the
136// number of bytes consumed.
137func parseExpr(in []byte) (Q, int, error) {
138 b := in[:]
139 var expr Q
140 for len(b) > 0 && isSpace(b[0]) {
141 b = b[1:]
142 }
143
144 tok, err := nextToken(b)
145 if err != nil {
146 return nil, 0, err
147 }
148 if tok == nil {
149 return nil, 0, nil
150 }
151 b = b[len(tok.Input):]
152
153 text := string(tok.Text)
154 switch tok.Type {
155 case tokCase:
156 switch text {
157 case "yes":
158 case "no":
159 case "auto":
160 default:
161 return nil, 0, fmt.Errorf("query: unknown case argument %q, want {yes,no,auto}", text)
162 }
163 expr = &caseQ{text}
164 case tokRepo:
165 r, err := regexp.Compile(text)
166 if err != nil {
167 return nil, 0, err
168 }
169
170 expr = &Repo{r}
171 case tokArchived:
172 switch text {
173 case "yes":
174 expr = RawConfig(RcOnlyArchived)
175 case "no":
176 expr = RawConfig(RcNoArchived)
177 default:
178 return nil, 0, fmt.Errorf("query: unknown archived argument %q, want {yes,no}", text)
179 }
180 case tokFork:
181 switch text {
182 case "yes":
183 expr = RawConfig(RcOnlyForks)
184 case "no":
185 expr = RawConfig(RcNoForks)
186 default:
187 return nil, 0, fmt.Errorf("query: unknown fork argument %q, want {yes,no}", text)
188 }
189 case tokPublic:
190 switch text {
191 case "yes":
192 expr = RawConfig(RcOnlyPublic)
193 case "no":
194 expr = RawConfig(RcOnlyPrivate)
195 default:
196 return nil, 0, fmt.Errorf("query: unknown public argument %q, want {yes,no}", text)
197 }
198 case tokBranch:
199 expr = &Branch{Pattern: text}
200 case tokText, tokRegex:
201 q, err := RegexpQuery(text, false, false)
202 if err != nil {
203 return nil, 0, err
204 }
205 expr = q
206 case tokFile:
207 q, err := RegexpQuery(text, false, true)
208 if err != nil {
209 return nil, 0, err
210 }
211 expr = q
212 case tokContent:
213 q, err := RegexpQuery(text, true, false)
214 if err != nil {
215 return nil, 0, err
216 }
217 expr = q
218 case tokLang:
219 canonical, ok := languages.GetLanguageByNameOrAlias(text)
220 if !ok {
221 expr = &Const{false}
222 } else {
223 expr = &Language{Language: canonical}
224 }
225
226 case tokSym:
227 if text == "" {
228 return nil, 0, fmt.Errorf("the sym: atom must have an argument")
229 }
230
231 q, err := RegexpQuery(text, false, false)
232 if err != nil {
233 return nil, 0, err
234 }
235
236 expr = &Symbol{q}
237 case tokParenClose:
238 // Caller must consume paren.
239 expr = nil
240
241 case tokParenOpen:
242 qs, n, err := parseExprList(b)
243 b = b[n:]
244 if err != nil {
245 return nil, 0, err
246 }
247
248 pTok, err := nextToken(b)
249 if err != nil {
250 return nil, 0, err
251 }
252 if pTok == nil || pTok.Type != tokParenClose {
253 return nil, 0, fmt.Errorf("query: missing close paren, got token %v", pTok)
254 }
255
256 b = b[len(pTok.Input):]
257 expr, err = parseOperators(qs)
258 if err != nil {
259 return nil, 0, err
260 }
261 case tokNegate:
262 subQ, n, err := parseExpr(b)
263 if err != nil {
264 return nil, 0, err
265 }
266 if subQ == nil {
267 return nil, 0, fmt.Errorf("query: '-' operator needs an argument")
268 }
269 b = b[n:]
270 expr = &Not{subQ}
271
272 case tokType:
273 var t uint8
274 switch text {
275 case "filematch":
276 t = TypeFileMatch
277 case "filename", "file":
278 t = TypeFileName
279 case "repo":
280 t = TypeRepo
281 default:
282 return nil, 0, fmt.Errorf("query: unknown type argument %q, want {filematch,filename,repo}", text)
283 }
284 // Later we will lift this into a root, like we do for caseQ
285 expr = &Type{Type: t, Child: nil}
286 case tokMeta:
287 // Split on ':' to separate field and value
288 parts := bytes.SplitN([]byte(text), []byte(":"), 2)
289 if len(parts) != 2 {
290 return nil, 0, fmt.Errorf("query: invalid meta field syntax %q", text)
291 }
292 field := string(parts[0])
293 valuePattern := string(parts[1])
294 re, err := regexp.Compile(valuePattern)
295 if err != nil {
296 return nil, 0, fmt.Errorf("query: invalid regexp in meta value: %v", err)
297 }
298 expr = &Meta{
299 Field: field,
300 Value: re,
301 }
302 }
303
304 return expr, len(in) - len(b), nil
305}
306
307const regexpFlags syntax.Flags = syntax.ClassNL | syntax.PerlX | syntax.UnicodeGroups
308
309// RegexpQuery parses an atom into either a regular expression, or a
310// simple substring atom.
311func RegexpQuery(text string, content, file bool) (Q, error) {
312 var expr Q
313
314 r, err := syntax.Parse(text, regexpFlags)
315 if err != nil {
316 return nil, err
317 }
318
319 r = OptimizeRegexp(r, regexpFlags)
320
321 if r.Op == syntax.OpLiteral {
322 expr = &Substring{
323 Pattern: string(r.Rune),
324 FileName: file,
325 Content: content,
326 }
327 } else {
328 expr = &Regexp{
329 Regexp: r,
330 FileName: file,
331 Content: content,
332 }
333 }
334
335 return expr, nil
336}
337
338// parseOperators interprets the orOperator in a list of queries.
339func parseOperators(in []Q) (Q, error) {
340 top := &Or{}
341 cur := &And{}
342
343 seenOr := false
344 for _, q := range in {
345 if _, ok := q.(*orOperator); ok {
346 seenOr = true
347 if len(cur.Children) == 0 {
348 return nil, fmt.Errorf("query: OR operator should have operand")
349 }
350 top.Children = append(top.Children, cur)
351 cur = &And{}
352 } else {
353 cur.Children = append(cur.Children, q)
354 }
355 }
356
357 if seenOr && len(cur.Children) == 0 {
358 return nil, fmt.Errorf("query: OR operator should have operand")
359 }
360 top.Children = append(top.Children, cur)
361 return top, nil
362}
363
364// parseExprList parses a list of query expressions. It is the
365// workhorse of the Parse function.
366func parseExprList(in []byte) ([]Q, int, error) {
367 b := in[:]
368 var qs []Q
369 for len(b) > 0 {
370 for len(b) > 0 && isSpace(b[0]) {
371 b = b[1:]
372 }
373 tok, _ := nextToken(b)
374 if tok != nil && tok.Type == tokParenClose {
375 break
376 } else if tok != nil && tok.Type == tokOr {
377 qs = append(qs, &orOperator{})
378 b = b[len(tok.Input):]
379 continue
380 }
381
382 q, n, err := parseExpr(b)
383 if err != nil {
384 return nil, 0, err
385 }
386
387 if q == nil {
388 // eof or a ')'
389 break
390 }
391 qs = append(qs, q)
392 b = b[n:]
393 }
394
395 setCase := "auto"
396 hasCaseScope := false
397 newQS := qs[:0]
398 typeT := uint8(100)
399 for _, q := range qs {
400 switch s := q.(type) {
401 case *caseQ:
402 setCase = s.Flavor
403 hasCaseScope = true
404 case *Type:
405 if s.Type < typeT {
406 typeT = s.Type
407 }
408 default:
409 newQS = append(newQS, q)
410 }
411 }
412 qs = mapQueryList(newQS, func(q Q) Q {
413 if sc, ok := q.(setCaser); ok {
414 sc.setCase(setCase)
415 }
416 return q
417 })
418 if typeT != 100 {
419 qs = []Q{&Type{Type: typeT, Child: NewAnd(qs...)}}
420 }
421
422 if hasCaseScope {
423 scoped := make([]Q, 0, len(qs))
424 for _, q := range qs {
425 if _, isOrOperator := q.(*orOperator); isOrOperator {
426 scoped = append(scoped, q)
427 continue
428 }
429 scoped = append(scoped, &caseScopeQ{Child: q})
430 }
431 qs = scoped
432 }
433
434 return qs, len(in) - len(b), nil
435}
436
437type token struct {
438 Type int
439 // The value of the token
440 Text []byte
441
442 // The input that we consumed to form the token.
443 Input []byte
444}
445
446func (t *token) String() string {
447 return fmt.Sprintf("%s:%q", tokNames[t.Type], t.Text)
448}
449
450// token types.
451const (
452 tokText = 0
453 tokFile = 1
454 tokRepo = 2
455 tokCase = 3
456 tokBranch = 4
457 tokParenOpen = 5
458 tokParenClose = 6
459 tokError = 7
460 tokNegate = 8
461 tokRegex = 9
462 tokOr = 10
463 tokContent = 11
464 tokLang = 12
465 tokSym = 13
466 tokType = 14
467 tokArchived = 15
468 tokPublic = 16
469 tokFork = 17
470 tokMeta = 18
471)
472
473var tokNames = map[int]string{
474 tokArchived: "Archived",
475 tokBranch: "Branch",
476 tokCase: "Case",
477 tokError: "Error",
478 tokFile: "File",
479 tokFork: "Fork",
480 tokNegate: "Negate",
481 tokOr: "Or",
482 tokParenClose: "ParenClose",
483 tokParenOpen: "ParenOpen",
484 tokPublic: "Public",
485 tokRegex: "Regex",
486 tokRepo: "Repo",
487 tokText: "Text",
488 tokLang: "Language",
489 tokSym: "Symbol",
490 tokType: "Type",
491 tokMeta: "Meta",
492}
493
494var prefixes = map[string]int{
495 "archived:": tokArchived,
496 "b:": tokBranch,
497 "branch:": tokBranch,
498 "c:": tokContent,
499 "case:": tokCase,
500 "content:": tokContent,
501 "f:": tokFile,
502 "file:": tokFile,
503 "fork:": tokFork,
504 "public:": tokPublic,
505 "r:": tokRepo,
506 "regex:": tokRegex,
507 "repo:": tokRepo,
508 "lang:": tokLang,
509 "sym:": tokSym,
510 "t:": tokType,
511 "type:": tokType,
512 "meta.": tokMeta,
513}
514
515var reservedWords = map[string]int{
516 "or": tokOr,
517}
518
519func (t *token) setType() {
520 // After we consumed the input, we have to interpret some of the text,
521 // eg. to distinguish between ")" the text and ) the query grouping
522 // parenthesis.
523 if len(t.Text) == 1 && t.Text[0] == '(' {
524 t.Type = tokParenOpen
525 }
526 if len(t.Text) == 1 && t.Text[0] == ')' {
527 t.Type = tokParenClose
528 }
529
530 for w, typ := range reservedWords {
531 if string(t.Text) == w && string(t.Input) == w {
532 t.Type = typ
533 break
534 }
535 }
536
537 for pref, typ := range prefixes {
538 if !bytes.HasPrefix(t.Input, []byte(pref)) {
539 continue
540 }
541
542 t.Text = t.Text[len(pref):]
543 t.Type = typ
544 break
545 }
546}
547
548// nextToken returns the next token from the given input.
549func nextToken(in []byte) (*token, error) {
550 left := in[:]
551 parenCount := 0
552 var cur token
553 if len(left) == 0 {
554 return nil, nil
555 }
556
557 if left[0] == '-' {
558 return &token{
559 Type: tokNegate,
560 Text: []byte{'-'},
561 Input: in[:1],
562 }, nil
563 }
564
565 foundSpace := false
566
567loop:
568 for len(left) > 0 {
569 c := left[0]
570 switch c {
571 case '(':
572 parenCount++
573 cur.Text = append(cur.Text, c)
574 left = left[1:]
575 case ')':
576 if parenCount == 0 {
577 if len(cur.Text) == 0 {
578 cur.Text = []byte{')'}
579 left = left[1:]
580 }
581 break loop
582 }
583
584 cur.Text = append(cur.Text, c)
585 left = left[1:]
586 parenCount--
587
588 case '"':
589 t, n, err := parseStringLiteral(left)
590 if err != nil {
591 return nil, err
592 }
593 cur.Text = append(cur.Text, t...)
594 left = left[n:]
595 case '\\':
596 left = left[1:]
597 if len(left) == 0 {
598 return nil, fmt.Errorf("query: lone \\ at end")
599 }
600 c := left[0]
601 cur.Text = append(cur.Text, '\\', c)
602 left = left[1:]
603
604 case ' ', '\n', '\t':
605 if parenCount > 0 {
606 foundSpace = true
607 }
608 break loop
609 default:
610 cur.Text = append(cur.Text, c)
611 left = left[1:]
612 }
613 }
614
615 if len(cur.Text) == 0 {
616 return nil, nil
617 }
618
619 if foundSpace && cur.Text[0] == '(' {
620 cur.Text = cur.Text[:1]
621 cur.Input = in[:1]
622 } else {
623 cur.Input = in[:len(in)-len(left)]
624 }
625 cur.setType()
626 return &cur, nil
627}