fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

all: use a faster vendored regexp/syntax/Regexp.String (#753)

We replace all calls to Regexp.String with a vendored version which is
faster.

go1.22 introduced a commit which "minimizes" the string returned by
Regexp.String(). Part of what it does is enumerate the literal runes
in your string to calculate flags related to unicode and case
sensitivity. This can be quite slow, and is made worse by the fact that we
call it per shard per regexp in your query.Q to construct the matchtree.

Currently Regexp.String() represents 40% of CPU time on sourcegraph.com.
Before go1.22 it was ~0%.

Note: This is a temporary change to resolve the issue. I have a deeper
change to make this less clumsy.

Note: In one place we remove the use of string by relying on
Regexp.Equal instead.

Test Plan: go test

+715 -11
+58
internal/syntaxutil/README.md
··· 1 + # vendored std regexp/syntax 2 + 3 + This package contains a vendored copy of std regexp/syntax. However, it only 4 + contains the code for converting syntax.Regexp into a String. It is the 5 + version of the code at a recent go commit, but with a commit which introduces 6 + a significant performance regression reverted. 7 + 8 + At the time of writing regexp.String on go1.22 is taking 40% of CPU at 9 + Sourcegraph. This should return to ~0% with this vendored code. 10 + 11 + https://github.com/sourcegraph/sourcegraph/issues/61462 12 + 13 + ## Vendored commit 14 + 15 + ``` 16 + commit 2e1003e2f7e42efc5771812b9ee6ed264803796c 17 + Author: Daniel Martí <mvdan@mvdan.cc> 18 + Date: Tue Mar 26 22:59:41 2024 +0200 19 + 20 + cmd/go: replace reflect.DeepEqual with slices.Equal and maps.Equal 21 + 22 + All of these maps and slices are made up of comparable types, 23 + so we can avoid the overhead of reflection entirely. 24 + 25 + Change-Id: If77dbe648a336ba729c171e84c9ff3f7e160297d 26 + Reviewed-on: https://go-review.googlesource.com/c/go/+/574597 27 + Reviewed-by: Than McIntosh <thanm@google.com> 28 + LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> 29 + Reviewed-by: Ian Lance Taylor <iant@google.com> 30 + ``` 31 + 32 + ## Reverted commit 33 + 34 + ``` 35 + commit 98c9f271d67b501ecf2ce995539abd2cdc81d505 36 + Author: Russ Cox <rsc@golang.org> 37 + Date: Wed Jun 28 17:45:26 2023 -0400 38 + 39 + regexp/syntax: use more compact Regexp.String output 40 + 41 + Compact the Regexp.String output. It was only ever intended for debugging, 42 + but there are at least some uses in the wild where regexps are built up 43 + using regexp/syntax and then formatted using the String method. 44 + Compact the output to help that use case. Specifically: 45 + 46 + - Compact 2-element character class ranges: [a-b] -> [ab]. 47 + - Aggregate flags: (?i:A)(?i:B)*(?i:C)|(?i:D)?(?i:E) -> (?i:AB*C|D?E). 48 + 49 + Fixes #57950. 
50 + 51 + Change-Id: I1161d0e3aa6c3ae5a302677032bb7cd55caae5fb 52 + Reviewed-on: https://go-review.googlesource.com/c/go/+/507015 53 + TryBot-Result: Gopher Robot <gobot@golang.org> 54 + Reviewed-by: Than McIntosh <thanm@google.com> 55 + Run-TryBot: Russ Cox <rsc@golang.org> 56 + Reviewed-by: Rob Pike <r@golang.org> 57 + Auto-Submit: Russ Cox <rsc@golang.org> 58 + ```
+51
internal/syntaxutil/alias_test.go
// This is internal/syntaxutil/alias_test.go in package syntaxutil; it relies
// on the standard library package regexp/syntax.
//
// The declarations below re-export regexp/syntax identifiers under their
// unqualified names so that parse_test.go needs as few modifications as
// possible.

// Type aliases for the regexp/syntax types referenced by the tests.
type (
	Regexp = syntax.Regexp
	Op     = syntax.Op
	Flags  = syntax.Flags
)

// Operator constants, identical to their regexp/syntax counterparts.
const (
	OpNoMatch        = syntax.OpNoMatch
	OpEmptyMatch     = syntax.OpEmptyMatch
	OpLiteral        = syntax.OpLiteral
	OpCharClass      = syntax.OpCharClass
	OpAnyCharNotNL   = syntax.OpAnyCharNotNL
	OpAnyChar        = syntax.OpAnyChar
	OpBeginLine      = syntax.OpBeginLine
	OpEndLine        = syntax.OpEndLine
	OpBeginText      = syntax.OpBeginText
	OpEndText        = syntax.OpEndText
	OpWordBoundary   = syntax.OpWordBoundary
	OpNoWordBoundary = syntax.OpNoWordBoundary
	OpCapture        = syntax.OpCapture
	OpStar           = syntax.OpStar
	OpPlus           = syntax.OpPlus
	OpQuest          = syntax.OpQuest
	OpRepeat         = syntax.OpRepeat
	OpConcat         = syntax.OpConcat
	OpAlternate      = syntax.OpAlternate
)

// Parser flag constants, identical to their regexp/syntax counterparts.
const (
	FoldCase      = syntax.FoldCase
	Literal       = syntax.Literal
	ClassNL       = syntax.ClassNL
	DotNL         = syntax.DotNL
	OneLine       = syntax.OneLine
	NonGreedy     = syntax.NonGreedy
	PerlX         = syntax.PerlX
	UnicodeGroups = syntax.UnicodeGroups
	WasDollar     = syntax.WasDollar
	Simple        = syntax.Simple
	MatchNL       = syntax.MatchNL
	Perl          = syntax.Perl
	POSIX         = syntax.POSIX
)

// Parse is regexp/syntax.Parse under an unqualified name.
var Parse = syntax.Parse
+397
internal/syntaxutil/parse_test.go
··· 1 + // Copyright 2011 The Go Authors. All rights reserved. 2 + // Use of this source code is governed by a BSD-style 3 + // license that can be found in the LICENSE file. 4 + 5 + package syntaxutil 6 + 7 + import ( 8 + "fmt" 9 + "strings" 10 + "testing" 11 + "unicode" 12 + ) 13 + 14 + type parseTest struct { 15 + Regexp string 16 + Dump string 17 + } 18 + 19 + var parseTests = []parseTest{ 20 + // Base cases 21 + {`a`, `lit{a}`}, 22 + {`a.`, `cat{lit{a}dot{}}`}, 23 + {`a.b`, `cat{lit{a}dot{}lit{b}}`}, 24 + {`ab`, `str{ab}`}, 25 + {`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`}, 26 + {`abc`, `str{abc}`}, 27 + {`a|^`, `alt{lit{a}bol{}}`}, 28 + {`a|b`, `cc{0x61-0x62}`}, 29 + {`(a)`, `cap{lit{a}}`}, 30 + {`(a)|b`, `alt{cap{lit{a}}lit{b}}`}, 31 + {`a*`, `star{lit{a}}`}, 32 + {`a+`, `plus{lit{a}}`}, 33 + {`a?`, `que{lit{a}}`}, 34 + {`a{2}`, `rep{2,2 lit{a}}`}, 35 + {`a{2,3}`, `rep{2,3 lit{a}}`}, 36 + {`a{2,}`, `rep{2,-1 lit{a}}`}, 37 + {`a*?`, `nstar{lit{a}}`}, 38 + {`a+?`, `nplus{lit{a}}`}, 39 + {`a??`, `nque{lit{a}}`}, 40 + {`a{2}?`, `nrep{2,2 lit{a}}`}, 41 + {`a{2,3}?`, `nrep{2,3 lit{a}}`}, 42 + {`a{2,}?`, `nrep{2,-1 lit{a}}`}, 43 + // Malformed { } are treated as literals. 
44 + {`x{1001`, `str{x{1001}`}, 45 + {`x{9876543210`, `str{x{9876543210}`}, 46 + {`x{9876543210,`, `str{x{9876543210,}`}, 47 + {`x{2,1`, `str{x{2,1}`}, 48 + {`x{1,9876543210`, `str{x{1,9876543210}`}, 49 + {``, `emp{}`}, 50 + {`|`, `emp{}`}, // alt{emp{}emp{}} but got factored 51 + {`|x|`, `alt{emp{}lit{x}emp{}}`}, 52 + {`.`, `dot{}`}, 53 + {`^`, `bol{}`}, 54 + {`$`, `eol{}`}, 55 + {`\|`, `lit{|}`}, 56 + {`\(`, `lit{(}`}, 57 + {`\)`, `lit{)}`}, 58 + {`\*`, `lit{*}`}, 59 + {`\+`, `lit{+}`}, 60 + {`\?`, `lit{?}`}, 61 + {`{`, `lit{{}`}, 62 + {`}`, `lit{}}`}, 63 + {`\.`, `lit{.}`}, 64 + {`\^`, `lit{^}`}, 65 + {`\$`, `lit{$}`}, 66 + {`\\`, `lit{\}`}, 67 + {`[ace]`, `cc{0x61 0x63 0x65}`}, 68 + {`[abc]`, `cc{0x61-0x63}`}, 69 + {`[a-z]`, `cc{0x61-0x7a}`}, 70 + {`[a]`, `lit{a}`}, 71 + {`\-`, `lit{-}`}, 72 + {`-`, `lit{-}`}, 73 + {`\_`, `lit{_}`}, 74 + {`abc`, `str{abc}`}, 75 + {`abc|def`, `alt{str{abc}str{def}}`}, 76 + {`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`}, 77 + 78 + // Posix and Perl extensions 79 + {`[[:lower:]]`, `cc{0x61-0x7a}`}, 80 + {`[a-z]`, `cc{0x61-0x7a}`}, 81 + {`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, 82 + {`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, 83 + {`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, 84 + {`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, 85 + {`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, 86 + {`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, 87 + {`\d`, `cc{0x30-0x39}`}, 88 + {`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`}, 89 + {`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`}, 90 + {`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`}, 91 + {`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`}, 92 + {`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`}, 93 + {`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`}, 94 + {`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, 95 + {`[^\\]`, `cc{0x0-0x5b 
0x5d-0x10ffff}`}, 96 + // { `\C`, `byte{}` }, // probably never 97 + 98 + // Unicode, negatives, and a double negative. 99 + {`\p{Braille}`, `cc{0x2800-0x28ff}`}, 100 + {`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, 101 + {`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, 102 + {`\P{^Braille}`, `cc{0x2800-0x28ff}`}, 103 + {`\pZ`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, 104 + {`[\p{Braille}]`, `cc{0x2800-0x28ff}`}, 105 + {`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, 106 + {`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, 107 + {`[\P{^Braille}]`, `cc{0x2800-0x28ff}`}, 108 + {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, 109 + {`\p{Lu}`, mkCharClass(unicode.IsUpper)}, 110 + {`[\p{Lu}]`, mkCharClass(unicode.IsUpper)}, 111 + {`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)}, 112 + {`\p{Any}`, `dot{}`}, 113 + {`\p{^Any}`, `cc{}`}, 114 + 115 + // Hex, octal. 116 + {`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`}, 117 + {`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`}, 118 + 119 + // More interesting regular expressions. 120 + {`a{,2}`, `str{a{,2}}`}, 121 + {`\.\^\$\\`, `str{.^$\}`}, 122 + {`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`}, 123 + {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`}, 124 + {`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8 125 + {`a*{`, `cat{star{lit{a}}lit{{}}`}, 126 + 127 + // Test precedences 128 + {`(?:ab)*`, `star{str{ab}}`}, 129 + {`(ab)*`, `star{cap{str{ab}}}`}, 130 + {`ab|cd`, `alt{str{ab}str{cd}}`}, 131 + {`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`}, 132 + 133 + // Test flattening. 
134 + {`(?:a)`, `lit{a}`}, 135 + {`(?:ab)(?:cd)`, `str{abcd}`}, 136 + {`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, 137 + {`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, 138 + {`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`}, 139 + {`a|.`, `dot{}`}, 140 + {`.|a`, `dot{}`}, 141 + {`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`}, 142 + {`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`}, 143 + 144 + // Test Perl quoted literals 145 + {`\Q+|*?{[\E`, `str{+|*?{[}`}, 146 + {`\Q+\E+`, `plus{lit{+}}`}, 147 + {`\Qab\E+`, `cat{lit{a}plus{lit{b}}}`}, 148 + {`\Q\\E`, `lit{\}`}, 149 + {`\Q\\\E`, `str{\\}`}, 150 + 151 + // Test Perl \A and \z 152 + {`(?m)^`, `bol{}`}, 153 + {`(?m)$`, `eol{}`}, 154 + {`(?-m)^`, `bot{}`}, 155 + {`(?-m)$`, `eot{}`}, 156 + {`(?m)\A`, `bot{}`}, 157 + {`(?m)\z`, `eot{\z}`}, 158 + {`(?-m)\A`, `bot{}`}, 159 + {`(?-m)\z`, `eot{\z}`}, 160 + 161 + // Test named captures 162 + {`(?P<name>a)`, `cap{name:lit{a}}`}, 163 + {`(?<name>a)`, `cap{name:lit{a}}`}, 164 + 165 + // Case-folded literals 166 + {`[Aa]`, `litfold{A}`}, 167 + {`[\x{100}\x{101}]`, `litfold{Ā}`}, 168 + {`[Δδ]`, `litfold{Δ}`}, 169 + 170 + // Strings 171 + {`abcde`, `str{abcde}`}, 172 + {`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`}, 173 + 174 + // Factoring. 175 + {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`}, 176 + {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`}, 177 + 178 + // Bug fixes. 
179 + {`(?:.)`, `dot{}`}, 180 + {`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`}, 181 + {`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`}, 182 + {`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`}, 183 + {`(?:A|a)`, `litfold{A}`}, 184 + {`A|(?:A|a)`, `litfold{A}`}, 185 + {`(?s).`, `dot{}`}, 186 + {`(?-s).`, `dnl{}`}, 187 + {`(?:(?:^).)`, `cat{bol{}dot{}}`}, 188 + {`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`}, 189 + {`[\s\S]a`, `cat{cc{0x0-0x10ffff}lit{a}}`}, 190 + 191 + // RE2 prefix_tests 192 + {`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`}, 193 + {`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`}, 194 + {`abc|abd|aef|bcx|bcy`, 195 + `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` + 196 + `cat{str{bc}cc{0x78-0x79}}}`}, 197 + {`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`}, 198 + {`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`}, 199 + {`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`}, 200 + {`.c|.d`, `cat{dot{}cc{0x63-0x64}}`}, 201 + {`x{2}|x{2}[0-9]`, 202 + `cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`}, 203 + {`x{2}y|x{2}[0-9]y`, 204 + `cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`}, 205 + {`a.*?c|a.*?b`, 206 + `cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`}, 207 + 208 + // Valid repetitions. 209 + {`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``}, 210 + {`((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})`, ``}, 211 + 212 + // Valid nesting. 213 + {strings.Repeat("(", 999) + strings.Repeat(")", 999), ``}, 214 + {strings.Repeat("(?:", 999) + strings.Repeat(")*", 999), ``}, 215 + {"(" + strings.Repeat("|", 12345) + ")", ``}, // not nested at all 216 + } 217 + 218 + const testFlags = MatchNL | PerlX | UnicodeGroups 219 + 220 + // dump prints a string representation of the regexp showing 221 + // the structure explicitly. 
222 + func dump(re *Regexp) string { 223 + var b strings.Builder 224 + dumpRegexp(&b, re) 225 + return b.String() 226 + } 227 + 228 + var opNames = []string{ 229 + OpNoMatch: "no", 230 + OpEmptyMatch: "emp", 231 + OpLiteral: "lit", 232 + OpCharClass: "cc", 233 + OpAnyCharNotNL: "dnl", 234 + OpAnyChar: "dot", 235 + OpBeginLine: "bol", 236 + OpEndLine: "eol", 237 + OpBeginText: "bot", 238 + OpEndText: "eot", 239 + OpWordBoundary: "wb", 240 + OpNoWordBoundary: "nwb", 241 + OpCapture: "cap", 242 + OpStar: "star", 243 + OpPlus: "plus", 244 + OpQuest: "que", 245 + OpRepeat: "rep", 246 + OpConcat: "cat", 247 + OpAlternate: "alt", 248 + } 249 + 250 + // dumpRegexp writes an encoding of the syntax tree for the regexp re to b. 251 + // It is used during testing to distinguish between parses that might print 252 + // the same using re's String method. 253 + func dumpRegexp(b *strings.Builder, re *Regexp) { 254 + if int(re.Op) >= len(opNames) || opNames[re.Op] == "" { 255 + fmt.Fprintf(b, "op%d", re.Op) 256 + } else { 257 + switch re.Op { 258 + default: 259 + b.WriteString(opNames[re.Op]) 260 + case OpStar, OpPlus, OpQuest, OpRepeat: 261 + if re.Flags&NonGreedy != 0 { 262 + b.WriteByte('n') 263 + } 264 + b.WriteString(opNames[re.Op]) 265 + case OpLiteral: 266 + if len(re.Rune) > 1 { 267 + b.WriteString("str") 268 + } else { 269 + b.WriteString("lit") 270 + } 271 + if re.Flags&FoldCase != 0 { 272 + for _, r := range re.Rune { 273 + if unicode.SimpleFold(r) != r { 274 + b.WriteString("fold") 275 + break 276 + } 277 + } 278 + } 279 + } 280 + } 281 + b.WriteByte('{') 282 + switch re.Op { 283 + case OpEndText: 284 + if re.Flags&WasDollar == 0 { 285 + b.WriteString(`\z`) 286 + } 287 + case OpLiteral: 288 + for _, r := range re.Rune { 289 + b.WriteRune(r) 290 + } 291 + case OpConcat, OpAlternate: 292 + for _, sub := range re.Sub { 293 + dumpRegexp(b, sub) 294 + } 295 + case OpStar, OpPlus, OpQuest: 296 + dumpRegexp(b, re.Sub[0]) 297 + case OpRepeat: 298 + fmt.Fprintf(b, "%d,%d ", 
re.Min, re.Max) 299 + dumpRegexp(b, re.Sub[0]) 300 + case OpCapture: 301 + if re.Name != "" { 302 + b.WriteString(re.Name) 303 + b.WriteByte(':') 304 + } 305 + dumpRegexp(b, re.Sub[0]) 306 + case OpCharClass: 307 + sep := "" 308 + for i := 0; i < len(re.Rune); i += 2 { 309 + b.WriteString(sep) 310 + sep = " " 311 + lo, hi := re.Rune[i], re.Rune[i+1] 312 + if lo == hi { 313 + fmt.Fprintf(b, "%#x", lo) 314 + } else { 315 + fmt.Fprintf(b, "%#x-%#x", lo, hi) 316 + } 317 + } 318 + } 319 + b.WriteByte('}') 320 + } 321 + 322 + func mkCharClass(f func(rune) bool) string { 323 + re := &Regexp{Op: OpCharClass} 324 + lo := rune(-1) 325 + for i := rune(0); i <= unicode.MaxRune; i++ { 326 + if f(i) { 327 + if lo < 0 { 328 + lo = i 329 + } 330 + } else { 331 + if lo >= 0 { 332 + re.Rune = append(re.Rune, lo, i-1) 333 + lo = -1 334 + } 335 + } 336 + } 337 + if lo >= 0 { 338 + re.Rune = append(re.Rune, lo, unicode.MaxRune) 339 + } 340 + return dump(re) 341 + } 342 + 343 + func isUpperFold(r rune) bool { 344 + if unicode.IsUpper(r) { 345 + return true 346 + } 347 + c := unicode.SimpleFold(r) 348 + for c != r { 349 + if unicode.IsUpper(c) { 350 + return true 351 + } 352 + c = unicode.SimpleFold(c) 353 + } 354 + return false 355 + } 356 + 357 + func TestToStringEquivalentParse(t *testing.T) { 358 + for _, tt := range parseTests { 359 + re, err := Parse(tt.Regexp, testFlags) 360 + if err != nil { 361 + t.Errorf("Parse(%#q): %v", tt.Regexp, err) 362 + continue 363 + } 364 + if tt.Dump == "" { 365 + // It parsed. That's all we care about. 366 + continue 367 + } 368 + d := dump(re) 369 + if d != tt.Dump { 370 + t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump) 371 + continue 372 + } 373 + 374 + s := re.String() 375 + if s != tt.Regexp { 376 + // If ToString didn't return the original regexp, 377 + // it must have found one with fewer parens. 
378 + // Unfortunately we can't check the length here, because 379 + // ToString produces "\\{" for a literal brace, 380 + // but "{" is a shorter equivalent in some contexts. 381 + nre, err := Parse(s, testFlags) 382 + if err != nil { 383 + t.Errorf("Parse(%#q.String() = %#q): %v", tt.Regexp, s, err) 384 + continue 385 + } 386 + nd := dump(nre) 387 + if d != nd { 388 + t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tt.Regexp, s, d, nd) 389 + } 390 + 391 + ns := nre.String() 392 + if s != ns { 393 + t.Errorf("Parse(%#q) -> %#q -> %#q", tt.Regexp, s, ns) 394 + } 395 + } 396 + } 397 + }
+192
internal/syntaxutil/regexp.go
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This is internal/syntaxutil/regexp.go in package syntaxutil; it relies on
// the standard library packages regexp/syntax, strconv, strings and unicode.

// Note to implementers:
// In this package, re is always a *Regexp and r is always a rune.

// writeRegexp appends the Perl-syntax rendering of re to b, recursing into
// sub-expressions as needed.
func writeRegexp(b *strings.Builder, re *syntax.Regexp) {
	switch re.Op {
	case syntax.OpNoMatch:
		b.WriteString(`[^\x00-\x{10FFFF}]`)

	case syntax.OpEmptyMatch:
		b.WriteString(`(?:)`)

	case syntax.OpLiteral:
		// Case-insensitive literals are wrapped in a (?i:...) group.
		foldCase := re.Flags&syntax.FoldCase != 0
		if foldCase {
			b.WriteString(`(?i:`)
		}
		for _, r := range re.Rune {
			escape(b, r, false)
		}
		if foldCase {
			b.WriteString(`)`)
		}

	case syntax.OpCharClass:
		writeCharClass(b, re.Rune)

	case syntax.OpAnyCharNotNL:
		b.WriteString(`(?-s:.)`)

	case syntax.OpAnyChar:
		b.WriteString(`(?s:.)`)

	case syntax.OpBeginLine:
		b.WriteString(`(?m:^)`)

	case syntax.OpEndLine:
		b.WriteString(`(?m:$)`)

	case syntax.OpBeginText:
		b.WriteString(`\A`)

	case syntax.OpEndText:
		if re.Flags&syntax.WasDollar == 0 {
			b.WriteString(`\z`)
		} else {
			b.WriteString(`(?-m:$)`)
		}

	case syntax.OpWordBoundary:
		b.WriteString(`\b`)

	case syntax.OpNoWordBoundary:
		b.WriteString(`\B`)

	case syntax.OpCapture:
		if re.Name == "" {
			b.WriteRune('(')
		} else {
			b.WriteString(`(?P<`)
			b.WriteString(re.Name)
			b.WriteRune('>')
		}
		// An empty capture is printed as "()" rather than "((?:))".
		if inner := re.Sub[0]; inner.Op != syntax.OpEmptyMatch {
			writeRegexp(b, inner)
		}
		b.WriteRune(')')

	case syntax.OpStar, syntax.OpPlus, syntax.OpQuest, syntax.OpRepeat:
		// The operand needs a group when it binds more loosely than the
		// repetition operator, or is a literal of more than one rune.
		operand := re.Sub[0]
		multiRuneLiteral := operand.Op == syntax.OpLiteral && len(operand.Rune) > 1
		if operand.Op > syntax.OpCapture || multiRuneLiteral {
			writeNonCapturing(b, operand)
		} else {
			writeRegexp(b, operand)
		}

		switch re.Op {
		case syntax.OpStar:
			b.WriteRune('*')
		case syntax.OpPlus:
			b.WriteRune('+')
		case syntax.OpQuest:
			b.WriteRune('?')
		case syntax.OpRepeat:
			b.WriteRune('{')
			b.WriteString(strconv.Itoa(re.Min))
			if re.Max != re.Min {
				b.WriteRune(',')
				// A negative Max leaves the upper bound open: {n,}.
				if re.Max >= 0 {
					b.WriteString(strconv.Itoa(re.Max))
				}
			}
			b.WriteRune('}')
		}

		if re.Flags&syntax.NonGreedy != 0 {
			b.WriteRune('?')
		}

	case syntax.OpConcat:
		for _, part := range re.Sub {
			// Alternations inside a concatenation must be grouped.
			if part.Op == syntax.OpAlternate {
				writeNonCapturing(b, part)
				continue
			}
			writeRegexp(b, part)
		}

	case syntax.OpAlternate:
		for idx, branch := range re.Sub {
			if idx > 0 {
				b.WriteRune('|')
			}
			writeRegexp(b, branch)
		}

	default:
		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
	}
}

// writeNonCapturing writes re wrapped in a non-capturing group.
func writeNonCapturing(b *strings.Builder, re *syntax.Regexp) {
	b.WriteString(`(?:`)
	writeRegexp(b, re)
	b.WriteString(`)`)
}

// writeCharClass writes the bracket expression for a character class whose
// ranges are given as consecutive lo, hi pairs.
func writeCharClass(b *strings.Builder, ranges []rune) {
	n := len(ranges)
	if n%2 != 0 {
		b.WriteString(`[invalid char class]`)
		return
	}

	b.WriteRune('[')
	switch {
	case n == 0:
		b.WriteString(`^\x00-\x{10FFFF}`)
	case n > 2 && ranges[0] == 0 && ranges[n-1] == unicode.MaxRune:
		// Contains 0 and MaxRune. Probably a negated class.
		// Print the gaps.
		b.WriteRune('^')
		for i := 1; i < n-1; i += 2 {
			writeClassRange(b, ranges[i]+1, ranges[i+1]-1)
		}
	default:
		for i := 0; i < n; i += 2 {
			writeClassRange(b, ranges[i], ranges[i+1])
		}
	}
	b.WriteRune(']')
}

// writeClassRange writes a single class member: "lo" when lo == hi,
// otherwise "lo-hi". A literal '-' endpoint is always escaped.
func writeClassRange(b *strings.Builder, lo, hi rune) {
	escape(b, lo, lo == '-')
	if lo == hi {
		return
	}
	b.WriteRune('-')
	escape(b, hi, hi == '-')
}

// RegexpString returns the Perl-syntax string that writeRegexp produces for
// re. It stands in for (*syntax.Regexp).String.
func RegexpString(re *syntax.Regexp) string {
	var out strings.Builder
	writeRegexp(&out, re)
	return out.String()
}

// meta lists the printable characters that must be backslash-escaped.
const meta = `\.+*?()|[]{}^$`

// escape writes r to b in a form that is safe inside a regular expression.
// Printable runes are written as themselves, preceded by a backslash when
// they are meta characters or when force is set. Other runes use the short
// C-style escapes where one exists and \xNN or \x{N...} otherwise.
func escape(b *strings.Builder, r rune, force bool) {
	if unicode.IsPrint(r) {
		if force || strings.ContainsRune(meta, r) {
			b.WriteRune('\\')
		}
		b.WriteRune(r)
		return
	}

	var short string
	switch r {
	case '\a':
		short = `\a`
	case '\f':
		short = `\f`
	case '\n':
		short = `\n`
	case '\r':
		short = `\r`
	case '\t':
		short = `\t`
	case '\v':
		short = `\v`
	}
	if short != "" {
		b.WriteString(short)
		return
	}

	digits := strconv.FormatInt(int64(r), 16)
	if r >= 0x100 {
		b.WriteString(`\x{`)
		b.WriteString(digits)
		b.WriteString(`}`)
		return
	}
	b.WriteString(`\x`)
	if len(digits) == 1 {
		// Pad to two hex digits.
		b.WriteRune('0')
	}
	b.WriteString(digits)
}
+2 -1
matchtree.go
··· 24 24 25 25 "github.com/grafana/regexp" 26 26 27 + "github.com/sourcegraph/zoekt/internal/syntaxutil" 27 28 "github.com/sourcegraph/zoekt/query" 28 29 ) 29 30 ··· 204 205 } 205 206 206 207 return &regexpMatchTree{ 207 - regexp: regexp.MustCompile(prefix + s.Regexp.String()), 208 + regexp: regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)), 208 209 origRegexp: s.Regexp, 209 210 fileName: s.FileName, 210 211 }
+2 -2
matchtree_test.go
··· 238 238 regex string 239 239 regexAll bool 240 240 }{ 241 - {query: "sym:.*", regex: "(?i)(?-s:.*)", regexAll: true}, 241 + {query: "sym:.*", regex: "(?i)(?-s:.)*", regexAll: true}, 242 242 {query: "sym:(ab|cd)", regex: "(?i)ab|cd"}, 243 - {query: "sym:b.r", regex: "(?i)(?-s:b.r)"}, 243 + {query: "sym:b.r", regex: "(?i)b(?-s:.)r"}, 244 244 {query: "sym:horse", substr: "horse"}, 245 245 {query: `sym:\bthread\b case:yes`, regex: `\bthread\b`}, // check we disable word search opt 246 246 {query: `sym:\bthread\b case:no`, regex: `(?i)\bthread\b`},
+4 -3
query/query.go
··· 29 29 30 30 "github.com/RoaringBitmap/roaring" 31 31 "github.com/grafana/regexp" 32 + "github.com/sourcegraph/zoekt/internal/syntaxutil" 32 33 ) 33 34 34 35 var _ = log.Println ··· 99 100 if q.CaseSensitive { 100 101 pref = "case_" + pref 101 102 } 102 - return fmt.Sprintf("%sregex:%q", pref, q.Regexp.String()) 103 + return fmt.Sprintf("%sregex:%q", pref, syntaxutil.RegexpString(q.Regexp)) 103 104 } 104 105 105 106 // gobRegexp wraps Regexp to make it gob-encodable/decodable. Regexp contains syntax.Regexp, which ··· 112 113 113 114 // GobEncode implements gob.Encoder. 114 115 func (q Regexp) GobEncode() ([]byte, error) { 115 - gobq := gobRegexp{Regexp: q, RegexpString: q.Regexp.String()} 116 + gobq := gobRegexp{Regexp: q, RegexpString: syntaxutil.RegexpString(q.Regexp)} 116 117 gobq.Regexp.Regexp = nil // can't be gob-encoded/decoded 117 118 return json.Marshal(gobq) 118 119 } ··· 457 458 case "no": 458 459 q.CaseSensitive = false 459 460 case "auto": 460 - q.CaseSensitive = (q.Regexp.String() != LowerRegexp(q.Regexp).String()) 461 + q.CaseSensitive = !q.Regexp.Equal(LowerRegexp(q.Regexp)) 461 462 } 462 463 } 463 464
+4 -2
query/regexp.go
··· 17 17 import ( 18 18 "log" 19 19 "regexp/syntax" 20 + 21 + "github.com/sourcegraph/zoekt/internal/syntaxutil" 20 22 ) 21 23 22 24 var _ = log.Println ··· 56 58 } 57 59 58 60 // Make a copy so in unlikely event of an error the original can be used as a fallback 59 - r, err := syntax.Parse(re.String(), flags) 61 + r, err := syntax.Parse(syntaxutil.RegexpString(re), flags) 60 62 if err != nil { 61 63 log.Printf("failed to copy regexp `%s`: %v", re, err) 62 64 return re ··· 65 67 r = uncapture(r) 66 68 67 69 // Parse again for new structure to take effect 68 - r, err = syntax.Parse(r.String(), flags) 70 + r, err = syntax.Parse(syntaxutil.RegexpString(r), flags) 69 71 if err != nil { 70 72 log.Printf("failed to parse regexp after uncapture `%s`: %v", r, err) 71 73 return re
+5 -3
query/regexp_test.go
··· 18 18 "regexp/syntax" 19 19 "strings" 20 20 "testing" 21 + 22 + "github.com/sourcegraph/zoekt/internal/syntaxutil" 21 23 ) 22 24 23 25 var opnames = map[syntax.Op]string{ ··· 52 54 func TestLowerRegexp(t *testing.T) { 53 55 in := "[a-zA-Z]fooBAR" 54 56 re := mustParseRE(in) 55 - in = re.String() 57 + in = syntaxutil.RegexpString(re) 56 58 got := LowerRegexp(re) 57 59 want := "[a-za-z]foobar" 58 60 if got.String() != want { ··· 61 63 t.Errorf("got %s, want %s", got, want) 62 64 } 63 65 64 - if re.String() != in { 65 - t.Errorf("got mutated original %s want %s", re.String(), in) 66 + if orig := syntaxutil.RegexpString(re); orig != in { 67 + t.Errorf("got mutated original %s want %s", orig, in) 66 68 } 67 69 } 68 70