fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package query
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "regexp/syntax"
22
23 "github.com/grafana/regexp"
24 "github.com/sourcegraph/zoekt/internal/languages"
25)
26
27var _ = log.Printf
28
29// parseStringLiteral parses a string literal, consumes the starting
30// quote too.
31func parseStringLiteral(in []byte) (lit []byte, n int, err error) {
32 left := in[1:]
33 found := false
34
35loop:
36 for len(left) > 0 {
37 c := left[0]
38 left = left[1:]
39 switch c {
40 case '"':
41 found = true
42 break loop
43 case '\\':
44 // TODO - other escape sequences.
45 if len(left) == 0 {
46 return nil, 0, fmt.Errorf("query: missing char after \\")
47 }
48 c = left[0]
49 left = left[1:]
50
51 lit = append(lit, c)
52 default:
53 lit = append(lit, c)
54 }
55 }
56 if !found {
57 return nil, 0, fmt.Errorf("query: unterminated quoted string")
58 }
59 return lit, len(in) - len(left), nil
60}
61
62// orOperator is a placeholder intermediate so we can represent [A,
63// or, B] before we convert it to Or{A, B}
64type orOperator struct{}
65
66func (o *orOperator) String() string {
67 return "orOp"
68}
69
70func isSpace(c byte) bool {
71 return c == ' ' || c == '\t'
72}
73
74// Parse parses a string into a query.
75func Parse(qStr string) (Q, error) {
76 b := []byte(qStr)
77
78 qs, _, err := parseExprList(b)
79 if err != nil {
80 return nil, err
81 }
82
83 q, err := parseOperators(qs)
84 if err != nil {
85 return nil, err
86 }
87
88 return Simplify(q), nil
89}
90
91// parseExpr parses a single expression, returning the result, and the
92// number of bytes consumed.
93func parseExpr(in []byte) (Q, int, error) {
94 b := in[:]
95 var expr Q
96 for len(b) > 0 && isSpace(b[0]) {
97 b = b[1:]
98 }
99
100 tok, err := nextToken(b)
101 if err != nil {
102 return nil, 0, err
103 }
104 if tok == nil {
105 return nil, 0, nil
106 }
107 b = b[len(tok.Input):]
108
109 text := string(tok.Text)
110 switch tok.Type {
111 case tokCase:
112 switch text {
113 case "yes":
114 case "no":
115 case "auto":
116 default:
117 return nil, 0, fmt.Errorf("query: unknown case argument %q, want {yes,no,auto}", text)
118 }
119 expr = &caseQ{text}
120 case tokRepo:
121 r, err := regexp.Compile(text)
122 if err != nil {
123 return nil, 0, err
124 }
125
126 expr = &Repo{r}
127 case tokArchived:
128 switch text {
129 case "yes":
130 expr = RawConfig(RcOnlyArchived)
131 case "no":
132 expr = RawConfig(RcNoArchived)
133 default:
134 return nil, 0, fmt.Errorf("query: unknown archived argument %q, want {yes,no}", text)
135 }
136 case tokFork:
137 switch text {
138 case "yes":
139 expr = RawConfig(RcOnlyForks)
140 case "no":
141 expr = RawConfig(RcNoForks)
142 default:
143 return nil, 0, fmt.Errorf("query: unknown fork argument %q, want {yes,no}", text)
144 }
145 case tokPublic:
146 switch text {
147 case "yes":
148 expr = RawConfig(RcOnlyPublic)
149 case "no":
150 expr = RawConfig(RcOnlyPrivate)
151 default:
152 return nil, 0, fmt.Errorf("query: unknown public argument %q, want {yes,no}", text)
153 }
154 case tokBranch:
155 expr = &Branch{Pattern: text}
156 case tokText, tokRegex:
157 q, err := RegexpQuery(text, false, false)
158 if err != nil {
159 return nil, 0, err
160 }
161 expr = q
162 case tokFile:
163 q, err := RegexpQuery(text, false, true)
164 if err != nil {
165 return nil, 0, err
166 }
167 expr = q
168 case tokContent:
169 q, err := RegexpQuery(text, true, false)
170 if err != nil {
171 return nil, 0, err
172 }
173 expr = q
174 case tokLang:
175 canonical, ok := languages.GetLanguageByAlias(text)
176 if !ok {
177 expr = &Const{false}
178 } else {
179 expr = &Language{Language: canonical}
180 }
181
182 case tokSym:
183 if text == "" {
184 return nil, 0, fmt.Errorf("the sym: atom must have an argument")
185 }
186
187 q, err := RegexpQuery(text, false, false)
188 if err != nil {
189 return nil, 0, err
190 }
191
192 expr = &Symbol{q}
193 case tokParenClose:
194 // Caller must consume paren.
195 expr = nil
196
197 case tokParenOpen:
198 qs, n, err := parseExprList(b)
199 b = b[n:]
200 if err != nil {
201 return nil, 0, err
202 }
203
204 pTok, err := nextToken(b)
205 if err != nil {
206 return nil, 0, err
207 }
208 if pTok == nil || pTok.Type != tokParenClose {
209 return nil, 0, fmt.Errorf("query: missing close paren, got token %v", pTok)
210 }
211
212 b = b[len(pTok.Input):]
213 expr, err = parseOperators(qs)
214 if err != nil {
215 return nil, 0, err
216 }
217 case tokNegate:
218 subQ, n, err := parseExpr(b)
219 if err != nil {
220 return nil, 0, err
221 }
222 if subQ == nil {
223 return nil, 0, fmt.Errorf("query: '-' operator needs an argument")
224 }
225 b = b[n:]
226 expr = &Not{subQ}
227
228 case tokType:
229 var t uint8
230 switch text {
231 case "filematch":
232 t = TypeFileMatch
233 case "filename", "file":
234 t = TypeFileName
235 case "repo":
236 t = TypeRepo
237 default:
238 return nil, 0, fmt.Errorf("query: unknown type argument %q, want {filematch,filename,repo}", text)
239 }
240 // Later we will lift this into a root, like we do for caseQ
241 expr = &Type{Type: t, Child: nil}
242 }
243
244 return expr, len(in) - len(b), nil
245}
246
247const regexpFlags syntax.Flags = syntax.ClassNL | syntax.PerlX | syntax.UnicodeGroups
248
249// RegexpQuery parses an atom into either a regular expression, or a
250// simple substring atom.
251func RegexpQuery(text string, content, file bool) (Q, error) {
252 var expr Q
253
254 r, err := syntax.Parse(text, regexpFlags)
255 if err != nil {
256 return nil, err
257 }
258
259 r = OptimizeRegexp(r, regexpFlags)
260
261 if r.Op == syntax.OpLiteral {
262 expr = &Substring{
263 Pattern: string(r.Rune),
264 FileName: file,
265 Content: content,
266 }
267 } else {
268 expr = &Regexp{
269 Regexp: r,
270 FileName: file,
271 Content: content,
272 }
273 }
274
275 return expr, nil
276}
277
278// parseOperators interprets the orOperator in a list of queries.
279func parseOperators(in []Q) (Q, error) {
280 top := &Or{}
281 cur := &And{}
282
283 seenOr := false
284 for _, q := range in {
285 if _, ok := q.(*orOperator); ok {
286 seenOr = true
287 if len(cur.Children) == 0 {
288 return nil, fmt.Errorf("query: OR operator should have operand")
289 }
290 top.Children = append(top.Children, cur)
291 cur = &And{}
292 } else {
293 cur.Children = append(cur.Children, q)
294 }
295 }
296
297 if seenOr && len(cur.Children) == 0 {
298 return nil, fmt.Errorf("query: OR operator should have operand")
299 }
300 top.Children = append(top.Children, cur)
301 return top, nil
302}
303
304// parseExprList parses a list of query expressions. It is the
305// workhorse of the Parse function.
306func parseExprList(in []byte) ([]Q, int, error) {
307 b := in[:]
308 var qs []Q
309 for len(b) > 0 {
310 for len(b) > 0 && isSpace(b[0]) {
311 b = b[1:]
312 }
313 tok, _ := nextToken(b)
314 if tok != nil && tok.Type == tokParenClose {
315 break
316 } else if tok != nil && tok.Type == tokOr {
317 qs = append(qs, &orOperator{})
318 b = b[len(tok.Input):]
319 continue
320 }
321
322 q, n, err := parseExpr(b)
323 if err != nil {
324 return nil, 0, err
325 }
326
327 if q == nil {
328 // eof or a ')'
329 break
330 }
331 qs = append(qs, q)
332 b = b[n:]
333 }
334
335 setCase := "auto"
336 newQS := qs[:0]
337 typeT := uint8(100)
338 for _, q := range qs {
339 switch s := q.(type) {
340 case *caseQ:
341 setCase = s.Flavor
342 case *Type:
343 if s.Type < typeT {
344 typeT = s.Type
345 }
346 default:
347 newQS = append(newQS, q)
348 }
349 }
350 qs = mapQueryList(newQS, func(q Q) Q {
351 if sc, ok := q.(setCaser); ok {
352 sc.setCase(setCase)
353 }
354 return q
355 })
356 if typeT != 100 {
357 qs = []Q{&Type{Type: typeT, Child: NewAnd(qs...)}}
358 }
359 return qs, len(in) - len(b), nil
360}
361
362type token struct {
363 Type int
364 // The value of the token
365 Text []byte
366
367 // The input that we consumed to form the token.
368 Input []byte
369}
370
371func (t *token) String() string {
372 return fmt.Sprintf("%s:%q", tokNames[t.Type], t.Text)
373}
374
375// token types.
376const (
377 tokText = 0
378 tokFile = 1
379 tokRepo = 2
380 tokCase = 3
381 tokBranch = 4
382 tokParenOpen = 5
383 tokParenClose = 6
384 tokError = 7
385 tokNegate = 8
386 tokRegex = 9
387 tokOr = 10
388 tokContent = 11
389 tokLang = 12
390 tokSym = 13
391 tokType = 14
392 tokArchived = 15
393 tokPublic = 16
394 tokFork = 17
395)
396
397var tokNames = map[int]string{
398 tokArchived: "Archived",
399 tokBranch: "Branch",
400 tokCase: "Case",
401 tokError: "Error",
402 tokFile: "File",
403 tokFork: "Fork",
404 tokNegate: "Negate",
405 tokOr: "Or",
406 tokParenClose: "ParenClose",
407 tokParenOpen: "ParenOpen",
408 tokPublic: "Public",
409 tokRegex: "Regex",
410 tokRepo: "Repo",
411 tokText: "Text",
412 tokLang: "Language",
413 tokSym: "Symbol",
414 tokType: "Type",
415}
416
417var prefixes = map[string]int{
418 "archived:": tokArchived,
419 "b:": tokBranch,
420 "branch:": tokBranch,
421 "c:": tokContent,
422 "case:": tokCase,
423 "content:": tokContent,
424 "f:": tokFile,
425 "file:": tokFile,
426 "fork:": tokFork,
427 "public:": tokPublic,
428 "r:": tokRepo,
429 "regex:": tokRegex,
430 "repo:": tokRepo,
431 "lang:": tokLang,
432 "sym:": tokSym,
433 "t:": tokType,
434 "type:": tokType,
435}
436
437var reservedWords = map[string]int{
438 "or": tokOr,
439}
440
441func (t *token) setType() {
442 // After we consumed the input, we have to interpret some of the text,
443 // eg. to distinguish between ")" the text and ) the query grouping
444 // parenthesis.
445 if len(t.Text) == 1 && t.Text[0] == '(' {
446 t.Type = tokParenOpen
447 }
448 if len(t.Text) == 1 && t.Text[0] == ')' {
449 t.Type = tokParenClose
450 }
451
452 for w, typ := range reservedWords {
453 if string(t.Text) == w && string(t.Input) == w {
454 t.Type = typ
455 break
456 }
457 }
458
459 for pref, typ := range prefixes {
460 if !bytes.HasPrefix(t.Input, []byte(pref)) {
461 continue
462 }
463
464 t.Text = t.Text[len(pref):]
465 t.Type = typ
466 break
467 }
468}
469
470// nextToken returns the next token from the given input.
471func nextToken(in []byte) (*token, error) {
472 left := in[:]
473 parenCount := 0
474 var cur token
475 if len(left) == 0 {
476 return nil, nil
477 }
478
479 if left[0] == '-' {
480 return &token{
481 Type: tokNegate,
482 Text: []byte{'-'},
483 Input: in[:1],
484 }, nil
485 }
486
487 foundSpace := false
488
489loop:
490 for len(left) > 0 {
491 c := left[0]
492 switch c {
493 case '(':
494 parenCount++
495 cur.Text = append(cur.Text, c)
496 left = left[1:]
497 case ')':
498 if parenCount == 0 {
499 if len(cur.Text) == 0 {
500 cur.Text = []byte{')'}
501 left = left[1:]
502 }
503 break loop
504 }
505
506 cur.Text = append(cur.Text, c)
507 left = left[1:]
508 parenCount--
509
510 case '"':
511 t, n, err := parseStringLiteral(left)
512 if err != nil {
513 return nil, err
514 }
515 cur.Text = append(cur.Text, t...)
516 left = left[n:]
517 case '\\':
518 left = left[1:]
519 if len(left) == 0 {
520 return nil, fmt.Errorf("query: lone \\ at end")
521 }
522 c := left[0]
523 cur.Text = append(cur.Text, '\\', c)
524 left = left[1:]
525
526 case ' ', '\n', '\t':
527 if parenCount > 0 {
528 foundSpace = true
529 }
530 break loop
531 default:
532 cur.Text = append(cur.Text, c)
533 left = left[1:]
534 }
535 }
536
537 if len(cur.Text) == 0 {
538 return nil, nil
539 }
540
541 if foundSpace && cur.Text[0] == '(' {
542 cur.Text = cur.Text[:1]
543 cur.Input = in[:1]
544 } else {
545 cur.Input = in[:len(in)-len(left)]
546 }
547 cur.setType()
548 return &cur, nil
549}