fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package query
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "regexp/syntax"
22
23 "github.com/grafana/regexp"
24
25 "github.com/sourcegraph/zoekt/internal/languages"
26)
27
28var _ = log.Printf
29
30// parseStringLiteral parses a string literal, consumes the starting
31// quote too.
32func parseStringLiteral(in []byte) (lit []byte, n int, err error) {
33 left := in[1:]
34 found := false
35
36loop:
37 for len(left) > 0 {
38 c := left[0]
39 left = left[1:]
40 switch c {
41 case '"':
42 found = true
43 break loop
44 case '\\':
45 // TODO - other escape sequences.
46 if len(left) == 0 {
47 return nil, 0, fmt.Errorf("query: missing char after \\")
48 }
49 c = left[0]
50 left = left[1:]
51
52 lit = append(lit, c)
53 default:
54 lit = append(lit, c)
55 }
56 }
57 if !found {
58 return nil, 0, fmt.Errorf("query: unterminated quoted string")
59 }
60 return lit, len(in) - len(left), nil
61}
62
63// orOperator is a placeholder intermediate so we can represent [A,
64// or, B] before we convert it to Or{A, B}
65type orOperator struct{}
66
67func (o *orOperator) String() string {
68 return "orOp"
69}
70
71func isSpace(c byte) bool {
72 return c == ' ' || c == '\t'
73}
74
75// Parse parses a string into a query.
76func Parse(qStr string) (Q, error) {
77 b := []byte(qStr)
78
79 qs, n, err := parseExprList(b)
80 if err != nil {
81 return nil, err
82 }
83
84 if n != len(b) {
85 return nil, fmt.Errorf("query: extra tokens found at end input: %q", b[n:])
86 }
87
88 q, err := parseOperators(qs)
89 if err != nil {
90 return nil, err
91 }
92
93 return Simplify(q), nil
94}
95
96// parseExpr parses a single expression, returning the result, and the
97// number of bytes consumed.
98func parseExpr(in []byte) (Q, int, error) {
99 b := in[:]
100 var expr Q
101 for len(b) > 0 && isSpace(b[0]) {
102 b = b[1:]
103 }
104
105 tok, err := nextToken(b)
106 if err != nil {
107 return nil, 0, err
108 }
109 if tok == nil {
110 return nil, 0, nil
111 }
112 b = b[len(tok.Input):]
113
114 text := string(tok.Text)
115 switch tok.Type {
116 case tokCase:
117 switch text {
118 case "yes":
119 case "no":
120 case "auto":
121 default:
122 return nil, 0, fmt.Errorf("query: unknown case argument %q, want {yes,no,auto}", text)
123 }
124 expr = &caseQ{text}
125 case tokRepo:
126 r, err := regexp.Compile(text)
127 if err != nil {
128 return nil, 0, err
129 }
130
131 expr = &Repo{r}
132 case tokArchived:
133 switch text {
134 case "yes":
135 expr = RawConfig(RcOnlyArchived)
136 case "no":
137 expr = RawConfig(RcNoArchived)
138 default:
139 return nil, 0, fmt.Errorf("query: unknown archived argument %q, want {yes,no}", text)
140 }
141 case tokFork:
142 switch text {
143 case "yes":
144 expr = RawConfig(RcOnlyForks)
145 case "no":
146 expr = RawConfig(RcNoForks)
147 default:
148 return nil, 0, fmt.Errorf("query: unknown fork argument %q, want {yes,no}", text)
149 }
150 case tokPublic:
151 switch text {
152 case "yes":
153 expr = RawConfig(RcOnlyPublic)
154 case "no":
155 expr = RawConfig(RcOnlyPrivate)
156 default:
157 return nil, 0, fmt.Errorf("query: unknown public argument %q, want {yes,no}", text)
158 }
159 case tokBranch:
160 expr = &Branch{Pattern: text}
161 case tokText, tokRegex:
162 q, err := RegexpQuery(text, false, false)
163 if err != nil {
164 return nil, 0, err
165 }
166 expr = q
167 case tokFile:
168 q, err := RegexpQuery(text, false, true)
169 if err != nil {
170 return nil, 0, err
171 }
172 expr = q
173 case tokContent:
174 q, err := RegexpQuery(text, true, false)
175 if err != nil {
176 return nil, 0, err
177 }
178 expr = q
179 case tokLang:
180 canonical, ok := languages.GetLanguageByAlias(text)
181 if !ok {
182 expr = &Const{false}
183 } else {
184 expr = &Language{Language: canonical}
185 }
186
187 case tokSym:
188 if text == "" {
189 return nil, 0, fmt.Errorf("the sym: atom must have an argument")
190 }
191
192 q, err := RegexpQuery(text, false, false)
193 if err != nil {
194 return nil, 0, err
195 }
196
197 expr = &Symbol{q}
198 case tokParenClose:
199 // Caller must consume paren.
200 expr = nil
201
202 case tokParenOpen:
203 qs, n, err := parseExprList(b)
204 b = b[n:]
205 if err != nil {
206 return nil, 0, err
207 }
208
209 pTok, err := nextToken(b)
210 if err != nil {
211 return nil, 0, err
212 }
213 if pTok == nil || pTok.Type != tokParenClose {
214 return nil, 0, fmt.Errorf("query: missing close paren, got token %v", pTok)
215 }
216
217 b = b[len(pTok.Input):]
218 expr, err = parseOperators(qs)
219 if err != nil {
220 return nil, 0, err
221 }
222 case tokNegate:
223 subQ, n, err := parseExpr(b)
224 if err != nil {
225 return nil, 0, err
226 }
227 if subQ == nil {
228 return nil, 0, fmt.Errorf("query: '-' operator needs an argument")
229 }
230 b = b[n:]
231 expr = &Not{subQ}
232
233 case tokType:
234 var t uint8
235 switch text {
236 case "filematch":
237 t = TypeFileMatch
238 case "filename", "file":
239 t = TypeFileName
240 case "repo":
241 t = TypeRepo
242 default:
243 return nil, 0, fmt.Errorf("query: unknown type argument %q, want {filematch,filename,repo}", text)
244 }
245 // Later we will lift this into a root, like we do for caseQ
246 expr = &Type{Type: t, Child: nil}
247 case tokMeta:
248 // Split on ':' to separate field and value
249 parts := bytes.SplitN([]byte(text), []byte(":"), 2)
250 if len(parts) != 2 {
251 return nil, 0, fmt.Errorf("query: invalid meta field syntax %q", text)
252 }
253 field := string(parts[0])
254 valuePattern := string(parts[1])
255 re, err := regexp.Compile(valuePattern)
256 if err != nil {
257 return nil, 0, fmt.Errorf("query: invalid regexp in meta value: %v", err)
258 }
259 expr = &Meta{
260 Field: field,
261 Value: re,
262 }
263 }
264
265 return expr, len(in) - len(b), nil
266}
267
268const regexpFlags syntax.Flags = syntax.ClassNL | syntax.PerlX | syntax.UnicodeGroups
269
270// RegexpQuery parses an atom into either a regular expression, or a
271// simple substring atom.
272func RegexpQuery(text string, content, file bool) (Q, error) {
273 var expr Q
274
275 r, err := syntax.Parse(text, regexpFlags)
276 if err != nil {
277 return nil, err
278 }
279
280 r = OptimizeRegexp(r, regexpFlags)
281
282 if r.Op == syntax.OpLiteral {
283 expr = &Substring{
284 Pattern: string(r.Rune),
285 FileName: file,
286 Content: content,
287 }
288 } else {
289 expr = &Regexp{
290 Regexp: r,
291 FileName: file,
292 Content: content,
293 }
294 }
295
296 return expr, nil
297}
298
299// parseOperators interprets the orOperator in a list of queries.
300func parseOperators(in []Q) (Q, error) {
301 top := &Or{}
302 cur := &And{}
303
304 seenOr := false
305 for _, q := range in {
306 if _, ok := q.(*orOperator); ok {
307 seenOr = true
308 if len(cur.Children) == 0 {
309 return nil, fmt.Errorf("query: OR operator should have operand")
310 }
311 top.Children = append(top.Children, cur)
312 cur = &And{}
313 } else {
314 cur.Children = append(cur.Children, q)
315 }
316 }
317
318 if seenOr && len(cur.Children) == 0 {
319 return nil, fmt.Errorf("query: OR operator should have operand")
320 }
321 top.Children = append(top.Children, cur)
322 return top, nil
323}
324
325// parseExprList parses a list of query expressions. It is the
326// workhorse of the Parse function.
327func parseExprList(in []byte) ([]Q, int, error) {
328 b := in[:]
329 var qs []Q
330 for len(b) > 0 {
331 for len(b) > 0 && isSpace(b[0]) {
332 b = b[1:]
333 }
334 tok, _ := nextToken(b)
335 if tok != nil && tok.Type == tokParenClose {
336 break
337 } else if tok != nil && tok.Type == tokOr {
338 qs = append(qs, &orOperator{})
339 b = b[len(tok.Input):]
340 continue
341 }
342
343 q, n, err := parseExpr(b)
344 if err != nil {
345 return nil, 0, err
346 }
347
348 if q == nil {
349 // eof or a ')'
350 break
351 }
352 qs = append(qs, q)
353 b = b[n:]
354 }
355
356 setCase := "auto"
357 newQS := qs[:0]
358 typeT := uint8(100)
359 for _, q := range qs {
360 switch s := q.(type) {
361 case *caseQ:
362 setCase = s.Flavor
363 case *Type:
364 if s.Type < typeT {
365 typeT = s.Type
366 }
367 default:
368 newQS = append(newQS, q)
369 }
370 }
371 qs = mapQueryList(newQS, func(q Q) Q {
372 if sc, ok := q.(setCaser); ok {
373 sc.setCase(setCase)
374 }
375 return q
376 })
377 if typeT != 100 {
378 qs = []Q{&Type{Type: typeT, Child: NewAnd(qs...)}}
379 }
380 return qs, len(in) - len(b), nil
381}
382
383type token struct {
384 Type int
385 // The value of the token
386 Text []byte
387
388 // The input that we consumed to form the token.
389 Input []byte
390}
391
392func (t *token) String() string {
393 return fmt.Sprintf("%s:%q", tokNames[t.Type], t.Text)
394}
395
396// token types.
397const (
398 tokText = 0
399 tokFile = 1
400 tokRepo = 2
401 tokCase = 3
402 tokBranch = 4
403 tokParenOpen = 5
404 tokParenClose = 6
405 tokError = 7
406 tokNegate = 8
407 tokRegex = 9
408 tokOr = 10
409 tokContent = 11
410 tokLang = 12
411 tokSym = 13
412 tokType = 14
413 tokArchived = 15
414 tokPublic = 16
415 tokFork = 17
416 tokMeta = 18
417)
418
419var tokNames = map[int]string{
420 tokArchived: "Archived",
421 tokBranch: "Branch",
422 tokCase: "Case",
423 tokError: "Error",
424 tokFile: "File",
425 tokFork: "Fork",
426 tokNegate: "Negate",
427 tokOr: "Or",
428 tokParenClose: "ParenClose",
429 tokParenOpen: "ParenOpen",
430 tokPublic: "Public",
431 tokRegex: "Regex",
432 tokRepo: "Repo",
433 tokText: "Text",
434 tokLang: "Language",
435 tokSym: "Symbol",
436 tokType: "Type",
437 tokMeta: "Meta",
438}
439
440var prefixes = map[string]int{
441 "archived:": tokArchived,
442 "b:": tokBranch,
443 "branch:": tokBranch,
444 "c:": tokContent,
445 "case:": tokCase,
446 "content:": tokContent,
447 "f:": tokFile,
448 "file:": tokFile,
449 "fork:": tokFork,
450 "public:": tokPublic,
451 "r:": tokRepo,
452 "regex:": tokRegex,
453 "repo:": tokRepo,
454 "lang:": tokLang,
455 "sym:": tokSym,
456 "t:": tokType,
457 "type:": tokType,
458 "meta.": tokMeta,
459}
460
461var reservedWords = map[string]int{
462 "or": tokOr,
463}
464
465func (t *token) setType() {
466 // After we consumed the input, we have to interpret some of the text,
467 // eg. to distinguish between ")" the text and ) the query grouping
468 // parenthesis.
469 if len(t.Text) == 1 && t.Text[0] == '(' {
470 t.Type = tokParenOpen
471 }
472 if len(t.Text) == 1 && t.Text[0] == ')' {
473 t.Type = tokParenClose
474 }
475
476 for w, typ := range reservedWords {
477 if string(t.Text) == w && string(t.Input) == w {
478 t.Type = typ
479 break
480 }
481 }
482
483 for pref, typ := range prefixes {
484 if !bytes.HasPrefix(t.Input, []byte(pref)) {
485 continue
486 }
487
488 t.Text = t.Text[len(pref):]
489 t.Type = typ
490 break
491 }
492}
493
494// nextToken returns the next token from the given input.
495func nextToken(in []byte) (*token, error) {
496 left := in[:]
497 parenCount := 0
498 var cur token
499 if len(left) == 0 {
500 return nil, nil
501 }
502
503 if left[0] == '-' {
504 return &token{
505 Type: tokNegate,
506 Text: []byte{'-'},
507 Input: in[:1],
508 }, nil
509 }
510
511 foundSpace := false
512
513loop:
514 for len(left) > 0 {
515 c := left[0]
516 switch c {
517 case '(':
518 parenCount++
519 cur.Text = append(cur.Text, c)
520 left = left[1:]
521 case ')':
522 if parenCount == 0 {
523 if len(cur.Text) == 0 {
524 cur.Text = []byte{')'}
525 left = left[1:]
526 }
527 break loop
528 }
529
530 cur.Text = append(cur.Text, c)
531 left = left[1:]
532 parenCount--
533
534 case '"':
535 t, n, err := parseStringLiteral(left)
536 if err != nil {
537 return nil, err
538 }
539 cur.Text = append(cur.Text, t...)
540 left = left[n:]
541 case '\\':
542 left = left[1:]
543 if len(left) == 0 {
544 return nil, fmt.Errorf("query: lone \\ at end")
545 }
546 c := left[0]
547 cur.Text = append(cur.Text, '\\', c)
548 left = left[1:]
549
550 case ' ', '\n', '\t':
551 if parenCount > 0 {
552 foundSpace = true
553 }
554 break loop
555 default:
556 cur.Text = append(cur.Text, c)
557 left = left[1:]
558 }
559 }
560
561 if len(cur.Text) == 0 {
562 return nil, nil
563 }
564
565 if foundSpace && cur.Text[0] == '(' {
566 cur.Text = cur.Text[:1]
567 cur.Input = in[:1]
568 } else {
569 cur.Input = in[:len(in)-len(left)]
570 }
571 cur.setType()
572 return &cur, nil
573}