fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package query
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "regexp/syntax"
22
23 "github.com/grafana/regexp"
24
25 "github.com/sourcegraph/zoekt/internal/languages"
26)
27
28var _ = log.Printf
29
30// parseStringLiteral parses a string literal, consumes the starting
31// quote too.
32func parseStringLiteral(in []byte) (lit []byte, n int, err error) {
33 left := in[1:]
34 found := false
35
36loop:
37 for len(left) > 0 {
38 c := left[0]
39 left = left[1:]
40 switch c {
41 case '"':
42 found = true
43 break loop
44 case '\\':
45 // TODO - other escape sequences.
46 if len(left) == 0 {
47 return nil, 0, fmt.Errorf("query: missing char after \\")
48 }
49 c = left[0]
50 left = left[1:]
51
52 lit = append(lit, c)
53 default:
54 lit = append(lit, c)
55 }
56 }
57 if !found {
58 return nil, 0, fmt.Errorf("query: unterminated quoted string")
59 }
60 return lit, len(in) - len(left), nil
61}
62
63// orOperator is a placeholder intermediate so we can represent [A,
64// or, B] before we convert it to Or{A, B}
65type orOperator struct{}
66
67func (o *orOperator) String() string {
68 return "orOp"
69}
70
71func isSpace(c byte) bool {
72 return c == ' ' || c == '\t'
73}
74
75// Parse parses a string into a query.
76func Parse(qStr string) (Q, error) {
77 b := []byte(qStr)
78
79 qs, _, err := parseExprList(b)
80 if err != nil {
81 return nil, err
82 }
83
84 q, err := parseOperators(qs)
85 if err != nil {
86 return nil, err
87 }
88
89 return Simplify(q), nil
90}
91
92// parseExpr parses a single expression, returning the result, and the
93// number of bytes consumed.
94func parseExpr(in []byte) (Q, int, error) {
95 b := in[:]
96 var expr Q
97 for len(b) > 0 && isSpace(b[0]) {
98 b = b[1:]
99 }
100
101 tok, err := nextToken(b)
102 if err != nil {
103 return nil, 0, err
104 }
105 if tok == nil {
106 return nil, 0, nil
107 }
108 b = b[len(tok.Input):]
109
110 text := string(tok.Text)
111 switch tok.Type {
112 case tokCase:
113 switch text {
114 case "yes":
115 case "no":
116 case "auto":
117 default:
118 return nil, 0, fmt.Errorf("query: unknown case argument %q, want {yes,no,auto}", text)
119 }
120 expr = &caseQ{text}
121 case tokRepo:
122 r, err := regexp.Compile(text)
123 if err != nil {
124 return nil, 0, err
125 }
126
127 expr = &Repo{r}
128 case tokArchived:
129 switch text {
130 case "yes":
131 expr = RawConfig(RcOnlyArchived)
132 case "no":
133 expr = RawConfig(RcNoArchived)
134 default:
135 return nil, 0, fmt.Errorf("query: unknown archived argument %q, want {yes,no}", text)
136 }
137 case tokFork:
138 switch text {
139 case "yes":
140 expr = RawConfig(RcOnlyForks)
141 case "no":
142 expr = RawConfig(RcNoForks)
143 default:
144 return nil, 0, fmt.Errorf("query: unknown fork argument %q, want {yes,no}", text)
145 }
146 case tokPublic:
147 switch text {
148 case "yes":
149 expr = RawConfig(RcOnlyPublic)
150 case "no":
151 expr = RawConfig(RcOnlyPrivate)
152 default:
153 return nil, 0, fmt.Errorf("query: unknown public argument %q, want {yes,no}", text)
154 }
155 case tokBranch:
156 expr = &Branch{Pattern: text}
157 case tokText, tokRegex:
158 q, err := RegexpQuery(text, false, false)
159 if err != nil {
160 return nil, 0, err
161 }
162 expr = q
163 case tokFile:
164 q, err := RegexpQuery(text, false, true)
165 if err != nil {
166 return nil, 0, err
167 }
168 expr = q
169 case tokContent:
170 q, err := RegexpQuery(text, true, false)
171 if err != nil {
172 return nil, 0, err
173 }
174 expr = q
175 case tokLang:
176 canonical, ok := languages.GetLanguageByAlias(text)
177 if !ok {
178 expr = &Const{false}
179 } else {
180 expr = &Language{Language: canonical}
181 }
182
183 case tokSym:
184 if text == "" {
185 return nil, 0, fmt.Errorf("the sym: atom must have an argument")
186 }
187
188 q, err := RegexpQuery(text, false, false)
189 if err != nil {
190 return nil, 0, err
191 }
192
193 expr = &Symbol{q}
194 case tokParenClose:
195 // Caller must consume paren.
196 expr = nil
197
198 case tokParenOpen:
199 qs, n, err := parseExprList(b)
200 b = b[n:]
201 if err != nil {
202 return nil, 0, err
203 }
204
205 pTok, err := nextToken(b)
206 if err != nil {
207 return nil, 0, err
208 }
209 if pTok == nil || pTok.Type != tokParenClose {
210 return nil, 0, fmt.Errorf("query: missing close paren, got token %v", pTok)
211 }
212
213 b = b[len(pTok.Input):]
214 expr, err = parseOperators(qs)
215 if err != nil {
216 return nil, 0, err
217 }
218 case tokNegate:
219 subQ, n, err := parseExpr(b)
220 if err != nil {
221 return nil, 0, err
222 }
223 if subQ == nil {
224 return nil, 0, fmt.Errorf("query: '-' operator needs an argument")
225 }
226 b = b[n:]
227 expr = &Not{subQ}
228
229 case tokType:
230 var t uint8
231 switch text {
232 case "filematch":
233 t = TypeFileMatch
234 case "filename", "file":
235 t = TypeFileName
236 case "repo":
237 t = TypeRepo
238 default:
239 return nil, 0, fmt.Errorf("query: unknown type argument %q, want {filematch,filename,repo}", text)
240 }
241 // Later we will lift this into a root, like we do for caseQ
242 expr = &Type{Type: t, Child: nil}
243 case tokMeta:
244 // Split on ':' to separate field and value
245 parts := bytes.SplitN([]byte(text), []byte(":"), 2)
246 if len(parts) != 2 {
247 return nil, 0, fmt.Errorf("query: invalid meta field syntax %q", text)
248 }
249 field := string(parts[0])
250 valuePattern := string(parts[1])
251 re, err := regexp.Compile(valuePattern)
252 if err != nil {
253 return nil, 0, fmt.Errorf("query: invalid regexp in meta value: %v", err)
254 }
255 expr = &Meta{
256 Field: field,
257 Value: re,
258 }
259 }
260
261 return expr, len(in) - len(b), nil
262}
263
264const regexpFlags syntax.Flags = syntax.ClassNL | syntax.PerlX | syntax.UnicodeGroups
265
266// RegexpQuery parses an atom into either a regular expression, or a
267// simple substring atom.
268func RegexpQuery(text string, content, file bool) (Q, error) {
269 var expr Q
270
271 r, err := syntax.Parse(text, regexpFlags)
272 if err != nil {
273 return nil, err
274 }
275
276 r = OptimizeRegexp(r, regexpFlags)
277
278 if r.Op == syntax.OpLiteral {
279 expr = &Substring{
280 Pattern: string(r.Rune),
281 FileName: file,
282 Content: content,
283 }
284 } else {
285 expr = &Regexp{
286 Regexp: r,
287 FileName: file,
288 Content: content,
289 }
290 }
291
292 return expr, nil
293}
294
295// parseOperators interprets the orOperator in a list of queries.
296func parseOperators(in []Q) (Q, error) {
297 top := &Or{}
298 cur := &And{}
299
300 seenOr := false
301 for _, q := range in {
302 if _, ok := q.(*orOperator); ok {
303 seenOr = true
304 if len(cur.Children) == 0 {
305 return nil, fmt.Errorf("query: OR operator should have operand")
306 }
307 top.Children = append(top.Children, cur)
308 cur = &And{}
309 } else {
310 cur.Children = append(cur.Children, q)
311 }
312 }
313
314 if seenOr && len(cur.Children) == 0 {
315 return nil, fmt.Errorf("query: OR operator should have operand")
316 }
317 top.Children = append(top.Children, cur)
318 return top, nil
319}
320
321// parseExprList parses a list of query expressions. It is the
322// workhorse of the Parse function.
323func parseExprList(in []byte) ([]Q, int, error) {
324 b := in[:]
325 var qs []Q
326 for len(b) > 0 {
327 for len(b) > 0 && isSpace(b[0]) {
328 b = b[1:]
329 }
330 tok, _ := nextToken(b)
331 if tok != nil && tok.Type == tokParenClose {
332 break
333 } else if tok != nil && tok.Type == tokOr {
334 qs = append(qs, &orOperator{})
335 b = b[len(tok.Input):]
336 continue
337 }
338
339 q, n, err := parseExpr(b)
340 if err != nil {
341 return nil, 0, err
342 }
343
344 if q == nil {
345 // eof or a ')'
346 break
347 }
348 qs = append(qs, q)
349 b = b[n:]
350 }
351
352 setCase := "auto"
353 newQS := qs[:0]
354 typeT := uint8(100)
355 for _, q := range qs {
356 switch s := q.(type) {
357 case *caseQ:
358 setCase = s.Flavor
359 case *Type:
360 if s.Type < typeT {
361 typeT = s.Type
362 }
363 default:
364 newQS = append(newQS, q)
365 }
366 }
367 qs = mapQueryList(newQS, func(q Q) Q {
368 if sc, ok := q.(setCaser); ok {
369 sc.setCase(setCase)
370 }
371 return q
372 })
373 if typeT != 100 {
374 qs = []Q{&Type{Type: typeT, Child: NewAnd(qs...)}}
375 }
376 return qs, len(in) - len(b), nil
377}
378
379type token struct {
380 Type int
381 // The value of the token
382 Text []byte
383
384 // The input that we consumed to form the token.
385 Input []byte
386}
387
388func (t *token) String() string {
389 return fmt.Sprintf("%s:%q", tokNames[t.Type], t.Text)
390}
391
392// token types.
393const (
394 tokText = 0
395 tokFile = 1
396 tokRepo = 2
397 tokCase = 3
398 tokBranch = 4
399 tokParenOpen = 5
400 tokParenClose = 6
401 tokError = 7
402 tokNegate = 8
403 tokRegex = 9
404 tokOr = 10
405 tokContent = 11
406 tokLang = 12
407 tokSym = 13
408 tokType = 14
409 tokArchived = 15
410 tokPublic = 16
411 tokFork = 17
412 tokMeta = 18
413)
414
415var tokNames = map[int]string{
416 tokArchived: "Archived",
417 tokBranch: "Branch",
418 tokCase: "Case",
419 tokError: "Error",
420 tokFile: "File",
421 tokFork: "Fork",
422 tokNegate: "Negate",
423 tokOr: "Or",
424 tokParenClose: "ParenClose",
425 tokParenOpen: "ParenOpen",
426 tokPublic: "Public",
427 tokRegex: "Regex",
428 tokRepo: "Repo",
429 tokText: "Text",
430 tokLang: "Language",
431 tokSym: "Symbol",
432 tokType: "Type",
433 tokMeta: "Meta",
434}
435
436var prefixes = map[string]int{
437 "archived:": tokArchived,
438 "b:": tokBranch,
439 "branch:": tokBranch,
440 "c:": tokContent,
441 "case:": tokCase,
442 "content:": tokContent,
443 "f:": tokFile,
444 "file:": tokFile,
445 "fork:": tokFork,
446 "public:": tokPublic,
447 "r:": tokRepo,
448 "regex:": tokRegex,
449 "repo:": tokRepo,
450 "lang:": tokLang,
451 "sym:": tokSym,
452 "t:": tokType,
453 "type:": tokType,
454 "meta.": tokMeta,
455}
456
457var reservedWords = map[string]int{
458 "or": tokOr,
459}
460
461func (t *token) setType() {
462 // After we consumed the input, we have to interpret some of the text,
463 // eg. to distinguish between ")" the text and ) the query grouping
464 // parenthesis.
465 if len(t.Text) == 1 && t.Text[0] == '(' {
466 t.Type = tokParenOpen
467 }
468 if len(t.Text) == 1 && t.Text[0] == ')' {
469 t.Type = tokParenClose
470 }
471
472 for w, typ := range reservedWords {
473 if string(t.Text) == w && string(t.Input) == w {
474 t.Type = typ
475 break
476 }
477 }
478
479 for pref, typ := range prefixes {
480 if !bytes.HasPrefix(t.Input, []byte(pref)) {
481 continue
482 }
483
484 t.Text = t.Text[len(pref):]
485 t.Type = typ
486 break
487 }
488}
489
490// nextToken returns the next token from the given input.
491func nextToken(in []byte) (*token, error) {
492 left := in[:]
493 parenCount := 0
494 var cur token
495 if len(left) == 0 {
496 return nil, nil
497 }
498
499 if left[0] == '-' {
500 return &token{
501 Type: tokNegate,
502 Text: []byte{'-'},
503 Input: in[:1],
504 }, nil
505 }
506
507 foundSpace := false
508
509loop:
510 for len(left) > 0 {
511 c := left[0]
512 switch c {
513 case '(':
514 parenCount++
515 cur.Text = append(cur.Text, c)
516 left = left[1:]
517 case ')':
518 if parenCount == 0 {
519 if len(cur.Text) == 0 {
520 cur.Text = []byte{')'}
521 left = left[1:]
522 }
523 break loop
524 }
525
526 cur.Text = append(cur.Text, c)
527 left = left[1:]
528 parenCount--
529
530 case '"':
531 t, n, err := parseStringLiteral(left)
532 if err != nil {
533 return nil, err
534 }
535 cur.Text = append(cur.Text, t...)
536 left = left[n:]
537 case '\\':
538 left = left[1:]
539 if len(left) == 0 {
540 return nil, fmt.Errorf("query: lone \\ at end")
541 }
542 c := left[0]
543 cur.Text = append(cur.Text, '\\', c)
544 left = left[1:]
545
546 case ' ', '\n', '\t':
547 if parenCount > 0 {
548 foundSpace = true
549 }
550 break loop
551 default:
552 cur.Text = append(cur.Text, c)
553 left = left[1:]
554 }
555 }
556
557 if len(cur.Text) == 0 {
558 return nil, nil
559 }
560
561 if foundSpace && cur.Text[0] == '(' {
562 cur.Text = cur.Text[:1]
563 cur.Input = in[:1]
564 } else {
565 cur.Input = in[:len(in)-len(left)]
566 }
567 cur.setType()
568 return &cur, nil
569}