all: use a faster vendored regexp/syntax/Regexp.String (#753) · boltless.me/zoekt@c39011a

+58

internal/syntaxutil/README.md

··· 1 + # vendored std regexp/syntax 2 + 3 + This package contains a vendored copy of std regexp/syntax. However, it only 4 + contains the code for converting syntax.Regexp into a String. It is the 5 + version of the code at a recent go commit, but with a commit which introduces 6 + a significant performance regression reverted. 7 + 8 + At the time of writing regexp.String on go1.22 is taking 40% of CPU at 9 + Sourcegraph. This should return to ~0% with this vendored code. 10 + 11 + https://github.com/sourcegraph/sourcegraph/issues/61462 12 + 13 + ## Vendored commit 14 + 15 + ``` 16 + commit 2e1003e2f7e42efc5771812b9ee6ed264803796c 17 + Author: Daniel Martí <mvdan@mvdan.cc> 18 + Date: Tue Mar 26 22:59:41 2024 +0200 19 + 20 + cmd/go: replace reflect.DeepEqual with slices.Equal and maps.Equal 21 + 22 + All of these maps and slices are made up of comparable types, 23 + so we can avoid the overhead of reflection entirely. 24 + 25 + Change-Id: If77dbe648a336ba729c171e84c9ff3f7e160297d 26 + Reviewed-on: https://go-review.googlesource.com/c/go/+/574597 27 + Reviewed-by: Than McIntosh <thanm@google.com> 28 + LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> 29 + Reviewed-by: Ian Lance Taylor <iant@google.com> 30 + ``` 31 + 32 + ## Reverted commit 33 + 34 + ``` 35 + commit 98c9f271d67b501ecf2ce995539abd2cdc81d505 36 + Author: Russ Cox <rsc@golang.org> 37 + Date: Wed Jun 28 17:45:26 2023 -0400 38 + 39 + regexp/syntax: use more compact Regexp.String output 40 + 41 + Compact the Regexp.String output. It was only ever intended for debugging, 42 + but there are at least some uses in the wild where regexps are built up 43 + using regexp/syntax and then formatted using the String method. 44 + Compact the output to help that use case. Specifically: 45 + 46 + - Compact 2-element character class ranges: [a-b] -> [ab]. 47 + - Aggregate flags: (?i:A)(?i:B)*(?i:C)|(?i:D)?(?i:E) -> (?i:AB*C|D?E). 48 + 49 + Fixes #57950. 
50 + 51 + Change-Id: I1161d0e3aa6c3ae5a302677032bb7cd55caae5fb 52 + Reviewed-on: https://go-review.googlesource.com/c/go/+/507015 53 + TryBot-Result: Gopher Robot <gobot@golang.org> 54 + Reviewed-by: Than McIntosh <thanm@google.com> 55 + Run-TryBot: Russ Cox <rsc@golang.org> 56 + Reviewed-by: Rob Pike <r@golang.org> 57 + Auto-Submit: Russ Cox <rsc@golang.org> 58 + ```

+51

internal/syntaxutil/alias_test.go

// The declarations below re-export the parts of regexp/syntax that
// parse_test.go refers to by unqualified name, so that the vendored test
// file needs as few modifications as possible.

// Aliases for the syntax tree node type and its two enum types.
type (
	Regexp = syntax.Regexp
	Op     = syntax.Op
	Flags  = syntax.Flags
)

// Operator kinds, re-exported unchanged from regexp/syntax.
const (
	OpNoMatch        = syntax.OpNoMatch
	OpEmptyMatch     = syntax.OpEmptyMatch
	OpLiteral        = syntax.OpLiteral
	OpCharClass      = syntax.OpCharClass
	OpAnyCharNotNL   = syntax.OpAnyCharNotNL
	OpAnyChar        = syntax.OpAnyChar
	OpBeginLine      = syntax.OpBeginLine
	OpEndLine        = syntax.OpEndLine
	OpBeginText      = syntax.OpBeginText
	OpEndText        = syntax.OpEndText
	OpWordBoundary   = syntax.OpWordBoundary
	OpNoWordBoundary = syntax.OpNoWordBoundary
	OpCapture        = syntax.OpCapture
	OpStar           = syntax.OpStar
	OpPlus           = syntax.OpPlus
	OpQuest          = syntax.OpQuest
	OpRepeat         = syntax.OpRepeat
	OpConcat         = syntax.OpConcat
	OpAlternate      = syntax.OpAlternate
)

// Parser and node flags, re-exported unchanged from regexp/syntax.
const (
	FoldCase      = syntax.FoldCase
	Literal       = syntax.Literal
	ClassNL       = syntax.ClassNL
	DotNL         = syntax.DotNL
	OneLine       = syntax.OneLine
	NonGreedy     = syntax.NonGreedy
	PerlX         = syntax.PerlX
	UnicodeGroups = syntax.UnicodeGroups
	WasDollar     = syntax.WasDollar
	Simple        = syntax.Simple
	MatchNL       = syntax.MatchNL
	Perl          = syntax.Perl
	POSIX         = syntax.POSIX
)

// Parse is syntax.Parse, exposed under the name the vendored tests call.
var Parse = syntax.Parse

+397

internal/syntaxutil/parse_test.go

··· 1 + // Copyright 2011 The Go Authors. All rights reserved. 2 + // Use of this source code is governed by a BSD-style 3 + // license that can be found in the LICENSE file. 4 + 5 + package syntaxutil 6 + 7 + import ( 8 + "fmt" 9 + "strings" 10 + "testing" 11 + "unicode" 12 + ) 13 + 14 + type parseTest struct { 15 + Regexp string 16 + Dump string 17 + } 18 + 19 + var parseTests = []parseTest{ 20 + // Base cases 21 + {`a`, `lit{a}`}, 22 + {`a.`, `cat{lit{a}dot{}}`}, 23 + {`a.b`, `cat{lit{a}dot{}lit{b}}`}, 24 + {`ab`, `str{ab}`}, 25 + {`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`}, 26 + {`abc`, `str{abc}`}, 27 + {`a|^`, `alt{lit{a}bol{}}`}, 28 + {`a|b`, `cc{0x61-0x62}`}, 29 + {`(a)`, `cap{lit{a}}`}, 30 + {`(a)|b`, `alt{cap{lit{a}}lit{b}}`}, 31 + {`a*`, `star{lit{a}}`}, 32 + {`a+`, `plus{lit{a}}`}, 33 + {`a?`, `que{lit{a}}`}, 34 + {`a{2}`, `rep{2,2 lit{a}}`}, 35 + {`a{2,3}`, `rep{2,3 lit{a}}`}, 36 + {`a{2,}`, `rep{2,-1 lit{a}}`}, 37 + {`a*?`, `nstar{lit{a}}`}, 38 + {`a+?`, `nplus{lit{a}}`}, 39 + {`a??`, `nque{lit{a}}`}, 40 + {`a{2}?`, `nrep{2,2 lit{a}}`}, 41 + {`a{2,3}?`, `nrep{2,3 lit{a}}`}, 42 + {`a{2,}?`, `nrep{2,-1 lit{a}}`}, 43 + // Malformed { } are treated as literals. 
44 + {`x{1001`, `str{x{1001}`}, 45 + {`x{9876543210`, `str{x{9876543210}`}, 46 + {`x{9876543210,`, `str{x{9876543210,}`}, 47 + {`x{2,1`, `str{x{2,1}`}, 48 + {`x{1,9876543210`, `str{x{1,9876543210}`}, 49 + {``, `emp{}`}, 50 + {`|`, `emp{}`}, // alt{emp{}emp{}} but got factored 51 + {`|x|`, `alt{emp{}lit{x}emp{}}`}, 52 + {`.`, `dot{}`}, 53 + {`^`, `bol{}`}, 54 + {`$`, `eol{}`}, 55 + {`\|`, `lit{|}`}, 56 + {`$`, `lit{(}`}, 57 + {`$`, `lit{)}`}, 58 + {`\*`, `lit{*}`}, 59 + {`\+`, `lit{+}`}, 60 + {`\?`, `lit{?}`}, 61 + {`{`, `lit{{}`}, 62 + {`}`, `lit{}}`}, 63 + {`\.`, `lit{.}`}, 64 + {`\^`, `lit{^}`}, 65 + {`\$`, `lit{$}`}, 66 + {`\\`, `lit{\}`}, 67 + {`[ace]`, `cc{0x61 0x63 0x65}`}, 68 + {`[abc]`, `cc{0x61-0x63}`}, 69 + {`[a-z]`, `cc{0x61-0x7a}`}, 70 + {`[a]`, `lit{a}`}, 71 + {`\-`, `lit{-}`}, 72 + {`-`, `lit{-}`}, 73 + {`\_`, `lit{_}`}, 74 + {`abc`, `str{abc}`}, 75 + {`abc|def`, `alt{str{abc}str{def}}`}, 76 + {`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`}, 77 + 78 + // Posix and Perl extensions 79 + {`[[:lower:]]`, `cc{0x61-0x7a}`}, 80 + {`[a-z]`, `cc{0x61-0x7a}`}, 81 + {`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, 82 + {`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, 83 + {`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, 84 + {`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, 85 + {`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, 86 + {`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, 87 + {`\d`, `cc{0x30-0x39}`}, 88 + {`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`}, 89 + {`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`}, 90 + {`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`}, 91 + {`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`}, 92 + {`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`}, 93 + {`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`}, 94 + {`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, 95 + {`[^\\]`, `cc{0x0-0x5b 
0x5d-0x10ffff}`}, 96 + // { `\C`, `byte{}` }, // probably never 97 + 98 + // Unicode, negatives, and a double negative. 99 + {`\p{Braille}`, `cc{0x2800-0x28ff}`}, 100 + {`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, 101 + {`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, 102 + {`\P{^Braille}`, `cc{0x2800-0x28ff}`}, 103 + {`\pZ`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, 104 + {`[\p{Braille}]`, `cc{0x2800-0x28ff}`}, 105 + {`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, 106 + {`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, 107 + {`[\P{^Braille}]`, `cc{0x2800-0x28ff}`}, 108 + {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, 109 + {`\p{Lu}`, mkCharClass(unicode.IsUpper)}, 110 + {`[\p{Lu}]`, mkCharClass(unicode.IsUpper)}, 111 + {`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)}, 112 + {`\p{Any}`, `dot{}`}, 113 + {`\p{^Any}`, `cc{}`}, 114 + 115 + // Hex, octal. 116 + {`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`}, 117 + {`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`}, 118 + 119 + // More interesting regular expressions. 120 + {`a{,2}`, `str{a{,2}}`}, 121 + {`\.\^\$\\`, `str{.^$\}`}, 122 + {`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`}, 123 + {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`}, 124 + {`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8 125 + {`a*{`, `cat{star{lit{a}}lit{{}}`}, 126 + 127 + // Test precedences 128 + {`(?:ab)*`, `star{str{ab}}`}, 129 + {`(ab)*`, `star{cap{str{ab}}}`}, 130 + {`ab|cd`, `alt{str{ab}str{cd}}`}, 131 + {`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`}, 132 + 133 + // Test flattening. 
134 + {`(?:a)`, `lit{a}`}, 135 + {`(?:ab)(?:cd)`, `str{abcd}`}, 136 + {`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, 137 + {`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, 138 + {`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`}, 139 + {`a|.`, `dot{}`}, 140 + {`.|a`, `dot{}`}, 141 + {`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`}, 142 + {`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`}, 143 + 144 + // Test Perl quoted literals 145 + {`\Q+|*?{[\E`, `str{+|*?{[}`}, 146 + {`\Q+\E+`, `plus{lit{+}}`}, 147 + {`\Qab\E+`, `cat{lit{a}plus{lit{b}}}`}, 148 + {`\Q\\E`, `lit{\}`}, 149 + {`\Q\\\E`, `str{\\}`}, 150 + 151 + // Test Perl \A and \z 152 + {`(?m)^`, `bol{}`}, 153 + {`(?m)$`, `eol{}`}, 154 + {`(?-m)^`, `bot{}`}, 155 + {`(?-m)$`, `eot{}`}, 156 + {`(?m)\A`, `bot{}`}, 157 + {`(?m)\z`, `eot{\z}`}, 158 + {`(?-m)\A`, `bot{}`}, 159 + {`(?-m)\z`, `eot{\z}`}, 160 + 161 + // Test named captures 162 + {`(?P<name>a)`, `cap{name:lit{a}}`}, 163 + {`(?<name>a)`, `cap{name:lit{a}}`}, 164 + 165 + // Case-folded literals 166 + {`[Aa]`, `litfold{A}`}, 167 + {`[\x{100}\x{101}]`, `litfold{Ā}`}, 168 + {`[Δδ]`, `litfold{Δ}`}, 169 + 170 + // Strings 171 + {`abcde`, `str{abcde}`}, 172 + {`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`}, 173 + 174 + // Factoring. 175 + {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`}, 176 + {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`}, 177 + 178 + // Bug fixes. 
179 + {`(?:.)`, `dot{}`}, 180 + {`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`}, 181 + {`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`}, 182 + {`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`}, 183 + {`(?:A|a)`, `litfold{A}`}, 184 + {`A|(?:A|a)`, `litfold{A}`}, 185 + {`(?s).`, `dot{}`}, 186 + {`(?-s).`, `dnl{}`}, 187 + {`(?:(?:^).)`, `cat{bol{}dot{}}`}, 188 + {`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`}, 189 + {`[\s\S]a`, `cat{cc{0x0-0x10ffff}lit{a}}`}, 190 + 191 + // RE2 prefix_tests 192 + {`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`}, 193 + {`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`}, 194 + {`abc|abd|aef|bcx|bcy`, 195 + `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` + 196 + `cat{str{bc}cc{0x78-0x79}}}`}, 197 + {`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`}, 198 + {`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`}, 199 + {`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`}, 200 + {`.c|.d`, `cat{dot{}cc{0x63-0x64}}`}, 201 + {`x{2}|x{2}[0-9]`, 202 + `cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`}, 203 + {`x{2}y|x{2}[0-9]y`, 204 + `cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`}, 205 + {`a.*?c|a.*?b`, 206 + `cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`}, 207 + 208 + // Valid repetitions. 209 + {`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``}, 210 + {`((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})`, ``}, 211 + 212 + // Valid nesting. 213 + {strings.Repeat("(", 999) + strings.Repeat(")", 999), ``}, 214 + {strings.Repeat("(?:", 999) + strings.Repeat(")*", 999), ``}, 215 + {"(" + strings.Repeat("|", 12345) + ")", ``}, // not nested at all 216 + } 217 + 218 + const testFlags = MatchNL | PerlX | UnicodeGroups 219 + 220 + // dump prints a string representation of the regexp showing 221 + // the structure explicitly. 
222 + func dump(re *Regexp) string { 223 + var b strings.Builder 224 + dumpRegexp(&b, re) 225 + return b.String() 226 + } 227 + 228 + var opNames = []string{ 229 + OpNoMatch: "no", 230 + OpEmptyMatch: "emp", 231 + OpLiteral: "lit", 232 + OpCharClass: "cc", 233 + OpAnyCharNotNL: "dnl", 234 + OpAnyChar: "dot", 235 + OpBeginLine: "bol", 236 + OpEndLine: "eol", 237 + OpBeginText: "bot", 238 + OpEndText: "eot", 239 + OpWordBoundary: "wb", 240 + OpNoWordBoundary: "nwb", 241 + OpCapture: "cap", 242 + OpStar: "star", 243 + OpPlus: "plus", 244 + OpQuest: "que", 245 + OpRepeat: "rep", 246 + OpConcat: "cat", 247 + OpAlternate: "alt", 248 + } 249 + 250 + // dumpRegexp writes an encoding of the syntax tree for the regexp re to b. 251 + // It is used during testing to distinguish between parses that might print 252 + // the same using re's String method. 253 + func dumpRegexp(b *strings.Builder, re *Regexp) { 254 + if int(re.Op) >= len(opNames) || opNames[re.Op] == "" { 255 + fmt.Fprintf(b, "op%d", re.Op) 256 + } else { 257 + switch re.Op { 258 + default: 259 + b.WriteString(opNames[re.Op]) 260 + case OpStar, OpPlus, OpQuest, OpRepeat: 261 + if re.Flags&NonGreedy != 0 { 262 + b.WriteByte('n') 263 + } 264 + b.WriteString(opNames[re.Op]) 265 + case OpLiteral: 266 + if len(re.Rune) > 1 { 267 + b.WriteString("str") 268 + } else { 269 + b.WriteString("lit") 270 + } 271 + if re.Flags&FoldCase != 0 { 272 + for _, r := range re.Rune { 273 + if unicode.SimpleFold(r) != r { 274 + b.WriteString("fold") 275 + break 276 + } 277 + } 278 + } 279 + } 280 + } 281 + b.WriteByte('{') 282 + switch re.Op { 283 + case OpEndText: 284 + if re.Flags&WasDollar == 0 { 285 + b.WriteString(`\z`) 286 + } 287 + case OpLiteral: 288 + for _, r := range re.Rune { 289 + b.WriteRune(r) 290 + } 291 + case OpConcat, OpAlternate: 292 + for _, sub := range re.Sub { 293 + dumpRegexp(b, sub) 294 + } 295 + case OpStar, OpPlus, OpQuest: 296 + dumpRegexp(b, re.Sub[0]) 297 + case OpRepeat: 298 + fmt.Fprintf(b, "%d,%d ", 
re.Min, re.Max) 299 + dumpRegexp(b, re.Sub[0]) 300 + case OpCapture: 301 + if re.Name != "" { 302 + b.WriteString(re.Name) 303 + b.WriteByte(':') 304 + } 305 + dumpRegexp(b, re.Sub[0]) 306 + case OpCharClass: 307 + sep := "" 308 + for i := 0; i < len(re.Rune); i += 2 { 309 + b.WriteString(sep) 310 + sep = " " 311 + lo, hi := re.Rune[i], re.Rune[i+1] 312 + if lo == hi { 313 + fmt.Fprintf(b, "%#x", lo) 314 + } else { 315 + fmt.Fprintf(b, "%#x-%#x", lo, hi) 316 + } 317 + } 318 + } 319 + b.WriteByte('}') 320 + } 321 + 322 + func mkCharClass(f func(rune) bool) string { 323 + re := &Regexp{Op: OpCharClass} 324 + lo := rune(-1) 325 + for i := rune(0); i <= unicode.MaxRune; i++ { 326 + if f(i) { 327 + if lo < 0 { 328 + lo = i 329 + } 330 + } else { 331 + if lo >= 0 { 332 + re.Rune = append(re.Rune, lo, i-1) 333 + lo = -1 334 + } 335 + } 336 + } 337 + if lo >= 0 { 338 + re.Rune = append(re.Rune, lo, unicode.MaxRune) 339 + } 340 + return dump(re) 341 + } 342 + 343 + func isUpperFold(r rune) bool { 344 + if unicode.IsUpper(r) { 345 + return true 346 + } 347 + c := unicode.SimpleFold(r) 348 + for c != r { 349 + if unicode.IsUpper(c) { 350 + return true 351 + } 352 + c = unicode.SimpleFold(c) 353 + } 354 + return false 355 + } 356 + 357 + func TestToStringEquivalentParse(t *testing.T) { 358 + for _, tt := range parseTests { 359 + re, err := Parse(tt.Regexp, testFlags) 360 + if err != nil { 361 + t.Errorf("Parse(%#q): %v", tt.Regexp, err) 362 + continue 363 + } 364 + if tt.Dump == "" { 365 + // It parsed. That's all we care about. 366 + continue 367 + } 368 + d := dump(re) 369 + if d != tt.Dump { 370 + t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump) 371 + continue 372 + } 373 + 374 + s := re.String() 375 + if s != tt.Regexp { 376 + // If ToString didn't return the original regexp, 377 + // it must have found one with fewer parens. 
378 + // Unfortunately we can't check the length here, because 379 + // ToString produces "\\{" for a literal brace, 380 + // but "{" is a shorter equivalent in some contexts. 381 + nre, err := Parse(s, testFlags) 382 + if err != nil { 383 + t.Errorf("Parse(%#q.String() = %#q): %v", tt.Regexp, s, err) 384 + continue 385 + } 386 + nd := dump(nre) 387 + if d != nd { 388 + t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tt.Regexp, s, d, nd) 389 + } 390 + 391 + ns := nre.String() 392 + if s != ns { 393 + t.Errorf("Parse(%#q) -> %#q -> %#q", tt.Regexp, s, ns) 394 + } 395 + } 396 + } 397 + }

+192

internal/syntaxutil/regexp.go

··· 1 + // Copyright 2011 The Go Authors. All rights reserved. 2 + // Use of this source code is governed by a BSD-style 3 + // license that can be found in the LICENSE file. 4 + 5 + package syntaxutil 6 + 7 + import ( 8 + "regexp/syntax" 9 + "strconv" 10 + "strings" 11 + "unicode" 12 + ) 13 + 14 + // Note to implementers: 15 + // In this package, re is always a *Regexp and r is always a rune. 16 + 17 + // writeRegexp writes the Perl syntax for the regular expression re to b. 18 + func writeRegexp(b *strings.Builder, re *syntax.Regexp) { 19 + switch re.Op { 20 + default: 21 + b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") 22 + case syntax.OpNoMatch: 23 + b.WriteString(`[^\x00-\x{10FFFF}]`) 24 + case syntax.OpEmptyMatch: 25 + b.WriteString(`(?:)`) 26 + case syntax.OpLiteral: 27 + if re.Flags&syntax.FoldCase != 0 { 28 + b.WriteString(`(?i:`) 29 + } 30 + for _, r := range re.Rune { 31 + escape(b, r, false) 32 + } 33 + if re.Flags&syntax.FoldCase != 0 { 34 + b.WriteString(`)`) 35 + } 36 + case syntax.OpCharClass: 37 + if len(re.Rune)%2 != 0 { 38 + b.WriteString(`[invalid char class]`) 39 + break 40 + } 41 + b.WriteRune('[') 42 + if len(re.Rune) == 0 { 43 + b.WriteString(`^\x00-\x{10FFFF}`) 44 + } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 { 45 + // Contains 0 and MaxRune. Probably a negated class. 46 + // Print the gaps. 
47 + b.WriteRune('^') 48 + for i := 1; i < len(re.Rune)-1; i += 2 { 49 + lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 50 + escape(b, lo, lo == '-') 51 + if lo != hi { 52 + b.WriteRune('-') 53 + escape(b, hi, hi == '-') 54 + } 55 + } 56 + } else { 57 + for i := 0; i < len(re.Rune); i += 2 { 58 + lo, hi := re.Rune[i], re.Rune[i+1] 59 + escape(b, lo, lo == '-') 60 + if lo != hi { 61 + b.WriteRune('-') 62 + escape(b, hi, hi == '-') 63 + } 64 + } 65 + } 66 + b.WriteRune(']') 67 + case syntax.OpAnyCharNotNL: 68 + b.WriteString(`(?-s:.)`) 69 + case syntax.OpAnyChar: 70 + b.WriteString(`(?s:.)`) 71 + case syntax.OpBeginLine: 72 + b.WriteString(`(?m:^)`) 73 + case syntax.OpEndLine: 74 + b.WriteString(`(?m:$)`) 75 + case syntax.OpBeginText: 76 + b.WriteString(`\A`) 77 + case syntax.OpEndText: 78 + if re.Flags&syntax.WasDollar != 0 { 79 + b.WriteString(`(?-m:$)`) 80 + } else { 81 + b.WriteString(`\z`) 82 + } 83 + case syntax.OpWordBoundary: 84 + b.WriteString(`\b`) 85 + case syntax.OpNoWordBoundary: 86 + b.WriteString(`\B`) 87 + case syntax.OpCapture: 88 + if re.Name != "" { 89 + b.WriteString(`(?P<`) 90 + b.WriteString(re.Name) 91 + b.WriteRune('>') 92 + } else { 93 + b.WriteRune('(') 94 + } 95 + if re.Sub[0].Op != syntax.OpEmptyMatch { 96 + writeRegexp(b, re.Sub[0]) 97 + } 98 + b.WriteRune(')') 99 + case syntax.OpStar, syntax.OpPlus, syntax.OpQuest, syntax.OpRepeat: 100 + if sub := re.Sub[0]; sub.Op > syntax.OpCapture || sub.Op == syntax.OpLiteral && len(sub.Rune) > 1 { 101 + b.WriteString(`(?:`) 102 + writeRegexp(b, sub) 103 + b.WriteString(`)`) 104 + } else { 105 + writeRegexp(b, sub) 106 + } 107 + switch re.Op { 108 + case syntax.OpStar: 109 + b.WriteRune('*') 110 + case syntax.OpPlus: 111 + b.WriteRune('+') 112 + case syntax.OpQuest: 113 + b.WriteRune('?') 114 + case syntax.OpRepeat: 115 + b.WriteRune('{') 116 + b.WriteString(strconv.Itoa(re.Min)) 117 + if re.Max != re.Min { 118 + b.WriteRune(',') 119 + if re.Max >= 0 { 120 + b.WriteString(strconv.Itoa(re.Max)) 121 + } 122 + 
} 123 + b.WriteRune('}') 124 + } 125 + if re.Flags&syntax.NonGreedy != 0 { 126 + b.WriteRune('?') 127 + } 128 + case syntax.OpConcat: 129 + for _, sub := range re.Sub { 130 + if sub.Op == syntax.OpAlternate { 131 + b.WriteString(`(?:`) 132 + writeRegexp(b, sub) 133 + b.WriteString(`)`) 134 + } else { 135 + writeRegexp(b, sub) 136 + } 137 + } 138 + case syntax.OpAlternate: 139 + for i, sub := range re.Sub { 140 + if i > 0 { 141 + b.WriteRune('|') 142 + } 143 + writeRegexp(b, sub) 144 + } 145 + } 146 + } 147 + 148 + func RegexpString(re *syntax.Regexp) string { 149 + var b strings.Builder 150 + writeRegexp(&b, re) 151 + return b.String() 152 + } 153 + 154 + const meta = `\.+*?()|[]{}^$` 155 + 156 + func escape(b *strings.Builder, r rune, force bool) { 157 + if unicode.IsPrint(r) { 158 + if strings.ContainsRune(meta, r) || force { 159 + b.WriteRune('\\') 160 + } 161 + b.WriteRune(r) 162 + return 163 + } 164 + 165 + switch r { 166 + case '\a': 167 + b.WriteString(`\a`) 168 + case '\f': 169 + b.WriteString(`\f`) 170 + case '\n': 171 + b.WriteString(`\n`) 172 + case '\r': 173 + b.WriteString(`\r`) 174 + case '\t': 175 + b.WriteString(`\t`) 176 + case '\v': 177 + b.WriteString(`\v`) 178 + default: 179 + if r < 0x100 { 180 + b.WriteString(`\x`) 181 + s := strconv.FormatInt(int64(r), 16) 182 + if len(s) == 1 { 183 + b.WriteRune('0') 184 + } 185 + b.WriteString(s) 186 + break 187 + } 188 + b.WriteString(`\x{`) 189 + b.WriteString(strconv.FormatInt(int64(r), 16)) 190 + b.WriteString(`}`) 191 + } 192 + }

+2 -1

matchtree.go

··· 24 24 25 25 "github.com/grafana/regexp" 26 26 27 + "github.com/sourcegraph/zoekt/internal/syntaxutil" 27 28 "github.com/sourcegraph/zoekt/query" 28 29 ) 29 30 ··· 204 205 } 205 206 206 207 return &regexpMatchTree{ 207 - regexp: regexp.MustCompile(prefix + s.Regexp.String()), 208 + regexp: regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)), 208 209 origRegexp: s.Regexp, 209 210 fileName: s.FileName, 210 211 }

+2 -2

matchtree_test.go

··· 238 238 regex string 239 239 regexAll bool 240 240 }{ 241 - {query: "sym:.*", regex: "(?i)(?-s:.*)", regexAll: true}, 241 + {query: "sym:.*", regex: "(?i)(?-s:.)*", regexAll: true}, 242 242 {query: "sym:(ab|cd)", regex: "(?i)ab|cd"}, 243 - {query: "sym:b.r", regex: "(?i)(?-s:b.r)"}, 243 + {query: "sym:b.r", regex: "(?i)b(?-s:.)r"}, 244 244 {query: "sym:horse", substr: "horse"}, 245 245 {query: `sym:\bthread\b case:yes`, regex: `\bthread\b`}, // check we disable word search opt 246 246 {query: `sym:\bthread\b case:no`, regex: `(?i)\bthread\b`},

+4 -3

query/query.go

··· 29 29 30 30 "github.com/RoaringBitmap/roaring" 31 31 "github.com/grafana/regexp" 32 + "github.com/sourcegraph/zoekt/internal/syntaxutil" 32 33 ) 33 34 34 35 var _ = log.Println ··· 99 100 if q.CaseSensitive { 100 101 pref = "case_" + pref 101 102 } 102 - return fmt.Sprintf("%sregex:%q", pref, q.Regexp.String()) 103 + return fmt.Sprintf("%sregex:%q", pref, syntaxutil.RegexpString(q.Regexp)) 103 104 } 104 105 105 106 // gobRegexp wraps Regexp to make it gob-encodable/decodable. Regexp contains syntax.Regexp, which ··· 112 113 113 114 // GobEncode implements gob.Encoder. 114 115 func (q Regexp) GobEncode() ([]byte, error) { 115 - gobq := gobRegexp{Regexp: q, RegexpString: q.Regexp.String()} 116 + gobq := gobRegexp{Regexp: q, RegexpString: syntaxutil.RegexpString(q.Regexp)} 116 117 gobq.Regexp.Regexp = nil // can't be gob-encoded/decoded 117 118 return json.Marshal(gobq) 118 119 } ··· 457 458 case "no": 458 459 q.CaseSensitive = false 459 460 case "auto": 460 - q.CaseSensitive = (q.Regexp.String() != LowerRegexp(q.Regexp).String()) 461 + q.CaseSensitive = !q.Regexp.Equal(LowerRegexp(q.Regexp)) 461 462 } 462 463 } 463 464

+4 -2

query/regexp.go

··· 17 17 import ( 18 18 "log" 19 19 "regexp/syntax" 20 + 21 + "github.com/sourcegraph/zoekt/internal/syntaxutil" 20 22 ) 21 23 22 24 var _ = log.Println ··· 56 58 } 57 59 58 60 // Make a copy so in unlikely event of an error the original can be used as a fallback 59 - r, err := syntax.Parse(re.String(), flags) 61 + r, err := syntax.Parse(syntaxutil.RegexpString(re), flags) 60 62 if err != nil { 61 63 log.Printf("failed to copy regexp `%s`: %v", re, err) 62 64 return re ··· 65 67 r = uncapture(r) 66 68 67 69 // Parse again for new structure to take effect 68 - r, err = syntax.Parse(r.String(), flags) 70 + r, err = syntax.Parse(syntaxutil.RegexpString(r), flags) 69 71 if err != nil { 70 72 log.Printf("failed to parse regexp after uncapture `%s`: %v", r, err) 71 73 return re

+5 -3

query/regexp_test.go

··· 18 18 "regexp/syntax" 19 19 "strings" 20 20 "testing" 21 + 22 + "github.com/sourcegraph/zoekt/internal/syntaxutil" 21 23 ) 22 24 23 25 var opnames = map[syntax.Op]string{ ··· 52 54 func TestLowerRegexp(t *testing.T) { 53 55 in := "[a-zA-Z]fooBAR" 54 56 re := mustParseRE(in) 55 - in = re.String() 57 + in = syntaxutil.RegexpString(re) 56 58 got := LowerRegexp(re) 57 59 want := "[a-za-z]foobar" 58 60 if got.String() != want { ··· 61 63 t.Errorf("got %s, want %s", got, want) 62 64 } 63 65 64 - if re.String() != in { 65 - t.Errorf("got mutated original %s want %s", re.String(), in) 66 + if orig := syntaxutil.RegexpString(re); orig != in { 67 + t.Errorf("got mutated original %s want %s", orig, in) 66 68 } 67 69 } 68 70

Configure Feed

Configure Feed