fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package query
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "regexp/syntax"
22
23 "github.com/go-enry/go-enry/v2"
24 "github.com/grafana/regexp"
25)
26
27var _ = log.Printf
28
29// parseStringLiteral parses a string literal, consumes the starting
30// quote too.
31func parseStringLiteral(in []byte) (lit []byte, n int, err error) {
32 left := in[1:]
33 found := false
34
35loop:
36 for len(left) > 0 {
37 c := left[0]
38 left = left[1:]
39 switch c {
40 case '"':
41 found = true
42 break loop
43 case '\\':
44 // TODO - other escape sequences.
45 if len(left) == 0 {
46 return nil, 0, fmt.Errorf("query: missing char after \\")
47 }
48 c = left[0]
49 left = left[1:]
50
51 lit = append(lit, c)
52 default:
53 lit = append(lit, c)
54 }
55 }
56 if !found {
57 return nil, 0, fmt.Errorf("query: unterminated quoted string")
58 }
59 return lit, len(in) - len(left), nil
60}
61
62// orOperator is a placeholder intermediate so we can represent [A,
63// or, B] before we convert it to Or{A, B}
64type orOperator struct{}
65
66func (o *orOperator) String() string {
67 return "orOp"
68}
69
70func isSpace(c byte) bool {
71 return c == ' ' || c == '\t'
72}
73
74// Parse parses a string into a query.
75func Parse(qStr string) (Q, error) {
76 b := []byte(qStr)
77
78 qs, _, err := parseExprList(b)
79 if err != nil {
80 return nil, err
81 }
82
83 q, err := parseOperators(qs)
84 if err != nil {
85 return nil, err
86 }
87
88 return Simplify(q), nil
89}
90
91// parseExpr parses a single expression, returning the result, and the
92// number of bytes consumed.
93func parseExpr(in []byte) (Q, int, error) {
94 b := in[:]
95 var expr Q
96 for len(b) > 0 && isSpace(b[0]) {
97 b = b[1:]
98 }
99
100 tok, err := nextToken(b)
101 if err != nil {
102 return nil, 0, err
103 }
104 if tok == nil {
105 return nil, 0, nil
106 }
107 b = b[len(tok.Input):]
108
109 text := string(tok.Text)
110 switch tok.Type {
111 case tokCase:
112 switch text {
113 case "yes":
114 case "no":
115 case "auto":
116 default:
117 return nil, 0, fmt.Errorf("query: unknown case argument %q, want {yes,no,auto}", text)
118 }
119 expr = &caseQ{text}
120 case tokRepo:
121 r, err := regexp.Compile(text)
122
123 if err != nil {
124 return nil, 0, err
125 }
126
127 expr = &Repo{r}
128 case tokArchived:
129 switch text {
130 case "yes":
131 expr = RawConfig(RcOnlyArchived)
132 case "no":
133 expr = RawConfig(RcNoArchived)
134 default:
135 return nil, 0, fmt.Errorf("query: unknown archived argument %q, want {yes,no}", text)
136 }
137 case tokFork:
138 switch text {
139 case "yes":
140 expr = RawConfig(RcOnlyForks)
141 case "no":
142 expr = RawConfig(RcNoForks)
143 default:
144 return nil, 0, fmt.Errorf("query: unknown fork argument %q, want {yes,no}", text)
145 }
146 case tokPublic:
147 switch text {
148 case "yes":
149 expr = RawConfig(RcOnlyPublic)
150 case "no":
151 expr = RawConfig(RcOnlyPrivate)
152 default:
153 return nil, 0, fmt.Errorf("query: unknown public argument %q, want {yes,no}", text)
154 }
155 case tokBranch:
156 expr = &Branch{Pattern: text}
157 case tokText, tokRegex:
158 q, err := RegexpQuery(text, false, false)
159 if err != nil {
160 return nil, 0, err
161 }
162 expr = q
163 case tokFile:
164 q, err := RegexpQuery(text, false, true)
165 if err != nil {
166 return nil, 0, err
167 }
168 expr = q
169 case tokContent:
170 q, err := RegexpQuery(text, true, false)
171 if err != nil {
172 return nil, 0, err
173 }
174 expr = q
175 case tokLang:
176 canonical, ok := enry.GetLanguageByAlias(text)
177 if !ok {
178 expr = &Const{false}
179 } else {
180 expr = &Language{Language: canonical}
181 }
182
183 case tokSym:
184 if text == "" {
185 return nil, 0, fmt.Errorf("the sym: atom must have an argument")
186 }
187
188 q, err := RegexpQuery(text, false, false)
189 if err != nil {
190 return nil, 0, err
191 }
192
193 expr = &Symbol{q}
194 case tokParenClose:
195 // Caller must consume paren.
196 expr = nil
197
198 case tokParenOpen:
199 qs, n, err := parseExprList(b)
200 b = b[n:]
201 if err != nil {
202 return nil, 0, err
203 }
204
205 pTok, err := nextToken(b)
206 if err != nil {
207 return nil, 0, err
208 }
209 if pTok == nil || pTok.Type != tokParenClose {
210 return nil, 0, fmt.Errorf("query: missing close paren, got token %v", pTok)
211 }
212
213 b = b[len(pTok.Input):]
214 expr, err = parseOperators(qs)
215 if err != nil {
216 return nil, 0, err
217 }
218 case tokNegate:
219 subQ, n, err := parseExpr(b)
220 if err != nil {
221 return nil, 0, err
222 }
223 if subQ == nil {
224 return nil, 0, fmt.Errorf("query: '-' operator needs an argument")
225 }
226 b = b[n:]
227 expr = &Not{subQ}
228
229 case tokType:
230 var t uint8
231 switch text {
232 case "filematch":
233 t = TypeFileMatch
234 case "filename", "file":
235 t = TypeFileName
236 case "repo":
237 t = TypeRepo
238 default:
239 return nil, 0, fmt.Errorf("query: unknown type argument %q, want {filematch,filename,repo}", text)
240 }
241 // Later we will lift this into a root, like we do for caseQ
242 expr = &Type{Type: t, Child: nil}
243 }
244
245 return expr, len(in) - len(b), nil
246}
247
248const regexpFlags syntax.Flags = syntax.ClassNL | syntax.PerlX | syntax.UnicodeGroups
249
250// RegexpQuery parses an atom into either a regular expression, or a
251// simple substring atom.
252func RegexpQuery(text string, content, file bool) (Q, error) {
253 var expr Q
254
255 r, err := syntax.Parse(text, regexpFlags)
256 if err != nil {
257 return nil, err
258 }
259
260 r = OptimizeRegexp(r, regexpFlags)
261
262 if r.Op == syntax.OpLiteral {
263 expr = &Substring{
264 Pattern: string(r.Rune),
265 FileName: file,
266 Content: content,
267 }
268 } else {
269 expr = &Regexp{
270 Regexp: r,
271 FileName: file,
272 Content: content,
273 }
274 }
275
276 return expr, nil
277}
278
279// parseOperators interprets the orOperator in a list of queries.
280func parseOperators(in []Q) (Q, error) {
281 top := &Or{}
282 cur := &And{}
283
284 seenOr := false
285 for _, q := range in {
286 if _, ok := q.(*orOperator); ok {
287 seenOr = true
288 if len(cur.Children) == 0 {
289 return nil, fmt.Errorf("query: OR operator should have operand")
290 }
291 top.Children = append(top.Children, cur)
292 cur = &And{}
293 } else {
294 cur.Children = append(cur.Children, q)
295 }
296 }
297
298 if seenOr && len(cur.Children) == 0 {
299 return nil, fmt.Errorf("query: OR operator should have operand")
300 }
301 top.Children = append(top.Children, cur)
302 return top, nil
303}
304
305// parseExprList parses a list of query expressions. It is the
306// workhorse of the Parse function.
307func parseExprList(in []byte) ([]Q, int, error) {
308 b := in[:]
309 var qs []Q
310 for len(b) > 0 {
311 for len(b) > 0 && isSpace(b[0]) {
312 b = b[1:]
313 }
314 tok, _ := nextToken(b)
315 if tok != nil && tok.Type == tokParenClose {
316 break
317 } else if tok != nil && tok.Type == tokOr {
318 qs = append(qs, &orOperator{})
319 b = b[len(tok.Input):]
320 continue
321 }
322
323 q, n, err := parseExpr(b)
324 if err != nil {
325 return nil, 0, err
326 }
327
328 if q == nil {
329 // eof or a ')'
330 break
331 }
332 qs = append(qs, q)
333 b = b[n:]
334 }
335
336 setCase := "auto"
337 newQS := qs[:0]
338 typeT := uint8(100)
339 for _, q := range qs {
340 switch s := q.(type) {
341 case *caseQ:
342 setCase = s.Flavor
343 case *Type:
344 if s.Type < typeT {
345 typeT = s.Type
346 }
347 default:
348 newQS = append(newQS, q)
349 }
350 }
351 qs = mapQueryList(newQS, func(q Q) Q {
352 if sc, ok := q.(setCaser); ok {
353 sc.setCase(setCase)
354 }
355 return q
356 })
357 if typeT != 100 {
358 qs = []Q{&Type{Type: typeT, Child: NewAnd(qs...)}}
359 }
360 return qs, len(in) - len(b), nil
361}
362
363type token struct {
364 Type int
365 // The value of the token
366 Text []byte
367
368 // The input that we consumed to form the token.
369 Input []byte
370}
371
372func (t *token) String() string {
373 return fmt.Sprintf("%s:%q", tokNames[t.Type], t.Text)
374}
375
376// token types.
377const (
378 tokText = 0
379 tokFile = 1
380 tokRepo = 2
381 tokCase = 3
382 tokBranch = 4
383 tokParenOpen = 5
384 tokParenClose = 6
385 tokError = 7
386 tokNegate = 8
387 tokRegex = 9
388 tokOr = 10
389 tokContent = 11
390 tokLang = 12
391 tokSym = 13
392 tokType = 14
393 tokArchived = 15
394 tokPublic = 16
395 tokFork = 17
396)
397
398var tokNames = map[int]string{
399 tokArchived: "Archived",
400 tokBranch: "Branch",
401 tokCase: "Case",
402 tokError: "Error",
403 tokFile: "File",
404 tokFork: "Fork",
405 tokNegate: "Negate",
406 tokOr: "Or",
407 tokParenClose: "ParenClose",
408 tokParenOpen: "ParenOpen",
409 tokPublic: "Public",
410 tokRegex: "Regex",
411 tokRepo: "Repo",
412 tokText: "Text",
413 tokLang: "Language",
414 tokSym: "Symbol",
415 tokType: "Type",
416}
417
418var prefixes = map[string]int{
419 "archived:": tokArchived,
420 "b:": tokBranch,
421 "branch:": tokBranch,
422 "c:": tokContent,
423 "case:": tokCase,
424 "content:": tokContent,
425 "f:": tokFile,
426 "file:": tokFile,
427 "fork:": tokFork,
428 "public:": tokPublic,
429 "r:": tokRepo,
430 "regex:": tokRegex,
431 "repo:": tokRepo,
432 "lang:": tokLang,
433 "sym:": tokSym,
434 "t:": tokType,
435 "type:": tokType,
436}
437
438var reservedWords = map[string]int{
439 "or": tokOr,
440}
441
442func (t *token) setType() {
443 // After we consumed the input, we have to interpret some of the text,
444 // eg. to distinguish between ")" the text and ) the query grouping
445 // parenthesis.
446 if len(t.Text) == 1 && t.Text[0] == '(' {
447 t.Type = tokParenOpen
448 }
449 if len(t.Text) == 1 && t.Text[0] == ')' {
450 t.Type = tokParenClose
451 }
452
453 for w, typ := range reservedWords {
454 if string(t.Text) == w && string(t.Input) == w {
455 t.Type = typ
456 break
457 }
458 }
459
460 for pref, typ := range prefixes {
461 if !bytes.HasPrefix(t.Input, []byte(pref)) {
462 continue
463 }
464
465 t.Text = t.Text[len(pref):]
466 t.Type = typ
467 break
468 }
469}
470
471// nextToken returns the next token from the given input.
472func nextToken(in []byte) (*token, error) {
473 left := in[:]
474 parenCount := 0
475 var cur token
476 if len(left) == 0 {
477 return nil, nil
478 }
479
480 if left[0] == '-' {
481 return &token{
482 Type: tokNegate,
483 Text: []byte{'-'},
484 Input: in[:1],
485 }, nil
486 }
487
488 foundSpace := false
489
490loop:
491 for len(left) > 0 {
492 c := left[0]
493 switch c {
494 case '(':
495 parenCount++
496 cur.Text = append(cur.Text, c)
497 left = left[1:]
498 case ')':
499 if parenCount == 0 {
500 if len(cur.Text) == 0 {
501 cur.Text = []byte{')'}
502 left = left[1:]
503 }
504 break loop
505 }
506
507 cur.Text = append(cur.Text, c)
508 left = left[1:]
509 parenCount--
510
511 case '"':
512 t, n, err := parseStringLiteral(left)
513 if err != nil {
514 return nil, err
515 }
516 cur.Text = append(cur.Text, t...)
517 left = left[n:]
518 case '\\':
519 left = left[1:]
520 if len(left) == 0 {
521 return nil, fmt.Errorf("query: lone \\ at end")
522 }
523 c := left[0]
524 cur.Text = append(cur.Text, '\\', c)
525 left = left[1:]
526
527 case ' ', '\n', '\t':
528 if parenCount > 0 {
529 foundSpace = true
530 }
531 break loop
532 default:
533 cur.Text = append(cur.Text, c)
534 left = left[1:]
535 }
536 }
537
538 if len(cur.Text) == 0 {
539 return nil, nil
540 }
541
542 if foundSpace && cur.Text[0] == '(' {
543 cur.Text = cur.Text[:1]
544 cur.Input = in[:1]
545 } else {
546 cur.Input = in[:len(in)-len(left)]
547 }
548 cur.setType()
549 return &cur, nil
550}