fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

matchtree: do not depend on String for symbol all optimization (#746)

In go 1.22 the way go converts a query like .* into a String changed.
This meant that when we compiled zoekt in the sourcegraph codebase our
"regex matches all" optimization stopped working.

This commit moves us to a simple inspection of the syntax tree which
will be stable across versions and to be honest just feels better.

To do this we introduce a field "origRegexp" to the regexpMatchTree so
we can avoid needing to recompile the syntax tree. Note: Regexp does not
offer a way to get at its internal syntax.Regexp which would be nice.

Test Plan: go test and added a unit test

+92 -23
+56 -23
matchtree.go
··· 183 183 type regexpMatchTree struct { 184 184 regexp *regexp.Regexp 185 185 186 + // origRegexp is the original parsed regexp from the query structure. It 187 + // does not include mutations such as case sensitivity. 188 + origRegexp *syntax.Regexp 189 + 186 190 fileName bool 187 191 188 192 // mutable ··· 191 195 192 196 // nextDoc, prepare. 193 197 bruteForceMatchTree 198 + } 199 + 200 + func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree { 201 + prefix := "" 202 + if !s.CaseSensitive { 203 + prefix = "(?i)" 204 + } 205 + 206 + return &regexpMatchTree{ 207 + regexp: regexp.MustCompile(prefix + s.Regexp.String()), 208 + origRegexp: s.Regexp, 209 + fileName: s.FileName, 210 + } 194 211 } 195 212 196 213 // \bLITERAL\b ··· 972 989 // provide something faster. 973 990 tr = wmt 974 991 } else { 975 - prefix := "" 976 - if !s.CaseSensitive { 977 - prefix = "(?i)" 978 - } 979 - 980 - tr = &regexpMatchTree{ 981 - regexp: regexp.MustCompile(prefix + s.Regexp.String()), 982 - fileName: s.FileName, 983 - } 992 + tr = newRegexpMatchTree(s) 984 993 } 985 994 986 995 return &andMatchTree{ ··· 1103 1112 }, nil 1104 1113 } 1105 1114 1106 - var regexp *regexp.Regexp 1115 + var regexpMT *regexpMatchTree 1107 1116 visitMatchTree(subMT, func(mt matchTree) { 1108 1117 if t, ok := mt.(*regexpMatchTree); ok { 1109 - regexp = t.regexp 1118 + regexpMT = t 1110 1119 } 1111 1120 }) 1112 - if regexp == nil { 1121 + if regexpMT == nil { 1113 1122 return nil, fmt.Errorf("found %T inside query.Symbol", subMT) 1114 1123 } 1115 1124 1116 1125 return &symbolRegexpMatchTree{ 1117 - regexp: regexp, 1118 - all: regexp.String() == "(?i)(?-s:.*)", 1126 + regexp: regexpMT.regexp, 1127 + all: isRegexpAll(regexpMT.origRegexp), 1119 1128 matchTree: subMT, 1120 1129 }, nil 1121 1130 ··· 1230 1239 } 1231 1240 1232 1241 if utf8.RuneCountInString(s.Pattern) < ngramSize { 1233 - prefix := "" 1234 - if !s.CaseSensitive { 1235 - prefix = "(?i)" 1236 - } 1237 - t := &regexpMatchTree{ 1238 - regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)), 1239 - fileName: s.FileName, 1240 - } 1241 - return t, nil 1242 + return newRegexpMatchTree(&query.Regexp{ 1243 + Regexp: &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)}, 1244 + FileName: s.FileName, 1245 + Content: s.Content, 1246 + CaseSensitive: s.CaseSensitive, 1247 + }), nil 1242 1248 } 1243 1249 1244 1250 result, err := d.iterateNgrams(s) ··· 1375 1381 } 1376 1382 return mt, err 1377 1383 } 1384 + 1385 + // isRegexpAll returns true if the query matches all possible lines. 1386 + // 1387 + // Note: it is possible for a funky regex to actually match all but this 1388 + // returns false. This returns true for normal looking regexes like ".*" or 1389 + // "(?-s:.*)". 1390 + func isRegexpAll(r *syntax.Regexp) bool { 1391 + // Note: we don't care about any flags since we are looking for .* and we 1392 + // don't mind if . matches all or everything but newline. 1393 + 1394 + // for loop is for visiting children of OpCapture/OpConcat until we hit .* 1395 + for { 1396 + // Our main target: .* 1397 + if r.Op == syntax.OpStar && len(r.Sub) == 1 { // * 1398 + // (?s:.) or (?-s:.) 1399 + return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL 1400 + } 1401 + 1402 + // Strip away expressions being wrapped in paranthesis 1403 + if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 { 1404 + r = r.Sub[0] 1405 + continue 1406 + } 1407 + 1408 + return false 1409 + } 1410 + }
+36
matchtree_test.go
··· 16 16 17 17 import ( 18 18 "reflect" 19 + "regexp/syntax" 19 20 "testing" 20 21 21 22 "github.com/RoaringBitmap/roaring" ··· 385 386 t.Fatalf("expected %d document, but got at least 1 more", len(want)) 386 387 } 387 388 } 389 + 390 + func TestIsRegexpAll(t *testing.T) { 391 + valid := []string{ 392 + ".*", 393 + "(.*)", 394 + "(?-s:.*)", 395 + "(?s:.*)", 396 + "(?i)(?-s:.*)", 397 + } 398 + invalid := []string{ 399 + ".", 400 + "foo", 401 + "(foo.*)", 402 + } 403 + 404 + for _, s := range valid { 405 + r, err := syntax.Parse(s, syntax.Perl) 406 + if err != nil { 407 + t.Fatal(err) 408 + } 409 + if !isRegexpAll(r) { 410 + t.Errorf("expected %q to match all", s) 411 + } 412 + } 413 + 414 + for _, s := range invalid { 415 + r, err := syntax.Parse(s, syntax.Perl) 416 + if err != nil { 417 + t.Fatal(err) 418 + } 419 + if isRegexpAll(r) { 420 + t.Errorf("expected %q to not match all", s) 421 + } 422 + } 423 + }