fork of https://github.com/sourcegraph/zoekt
1// Copyright 2026 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package hybridre2
16
17import (
18 "fmt"
19 "testing"
20
21 grafanaregexp "github.com/grafana/regexp"
22)
23
24// withThreshold overrides the effective threshold for the duration of the test
25// and registers a t.Cleanup to restore it afterwards.
26//
27// NOT safe for concurrent use: do not call t.Parallel() after withThreshold,
28// and do not use it from TestMain or init().
29func withThreshold(tb testing.TB, thresh int64) {
30 tb.Helper()
31 old := threshold
32 threshold = func() int64 { return thresh }
33 tb.Cleanup(func() { threshold = old })
34}
35
36// ---- unit tests ----
37
38func TestCompileValid(t *testing.T) {
39 _, err := Compile(`foo.*bar`)
40 if err != nil {
41 t.Fatalf("unexpected error: %v", err)
42 }
43}
44
45func TestCompileInvalid(t *testing.T) {
46 _, err := Compile(`[invalid`)
47 if err == nil {
48 t.Fatal("expected error for invalid pattern, got nil")
49 }
50}
51
52func TestMustCompilePanics(t *testing.T) {
53 defer func() {
54 if r := recover(); r == nil {
55 t.Fatal("MustCompile should panic on invalid pattern")
56 }
57 }()
58 MustCompile(`[invalid`)
59}
60
61func TestString(t *testing.T) {
62 const pat = `foo.*bar`
63 re := MustCompile(pat)
64 if re.String() != pat {
65 t.Fatalf("String() = %q, want %q", re.String(), pat)
66 }
67}
68
69// TestFindAllIndexDisabled checks that with threshold=-1, we use grafana/regexp.
70func TestFindAllIndexDisabled(t *testing.T) {
71 corpus := []byte("func main() { fmt.Println(\"hello world\") }")
72 patterns := []string{`\w+`, `fmt\.\w+`, `(?i)MAIN`, `"[^"]*"`}
73
74 withThreshold(t, disabled)
75 for _, pat := range patterns {
76 hybrid := MustCompile(pat)
77 grafana := grafanaregexp.MustCompile(pat)
78 got := hybrid.FindAllIndex(corpus, -1)
79 want := grafana.FindAllIndex(corpus, -1)
80 if !equalIndexSlices(got, want) {
81 t.Errorf("disabled mode, pattern %q: hybrid=%v grafana=%v", pat, got, want)
82 }
83 }
84}
85
86// TestFindAllIndexForcedRE2 checks that with threshold=0, go-re2 is used and
87// produces identical results to grafana/regexp for standard patterns.
88func TestFindAllIndexForcedRE2(t *testing.T) {
89 corpus := []byte("func main() { fmt.Println(\"hello world\") }")
90 patterns := []string{`\w+`, `fmt\.\w+`, `(?i)MAIN`, `"[^"]*"`}
91
92 withThreshold(t, 0)
93 for _, pat := range patterns {
94 hybrid := MustCompile(pat)
95 grafana := grafanaregexp.MustCompile(pat)
96 got := hybrid.FindAllIndex(corpus, -1)
97 want := grafana.FindAllIndex(corpus, -1)
98 if !equalIndexSlices(got, want) {
99 t.Errorf("forced-re2 mode, pattern %q: hybrid=%v grafana=%v", pat, got, want)
100 }
101 }
102}
103
104// TestThresholdSwitching verifies the engine switches at the configured byte boundary.
105func TestThresholdSwitching(t *testing.T) {
106 const thresh = int64(512)
107 pattern := `func\s+\w+`
108 grafana := grafanaregexp.MustCompile(pattern)
109
110 smallCorpus := makeCorpus(300) // < 512
111 largeCorpus := makeCorpus(600) // >= 512
112
113 withThreshold(t, thresh)
114 hybrid := MustCompile(pattern)
115
116 for _, tc := range []struct {
117 name string
118 corpus []byte
119 }{
120 {"small(<threshold)", smallCorpus},
121 {"large(>=threshold)", largeCorpus},
122 } {
123 got := hybrid.FindAllIndex(tc.corpus, -1)
124 want := grafana.FindAllIndex(tc.corpus, -1)
125 if !equalIndexSlices(got, want) {
126 t.Errorf("%s: hybrid=%v grafana=%v", tc.name, got, want)
127 }
128 }
129}
130
131// TestFindAllIndexIdenticalResults is a comprehensive correctness sweep across
132// pattern types and input sizes, asserting identical match positions.
133func TestFindAllIndexIdenticalResults(t *testing.T) {
134 patterns := []struct {
135 name string
136 pattern string
137 }{
138 {"literal", `hello`},
139 {"case-insensitive", `(?i)Hello`},
140 {"word-boundary", `\bfunc\b`},
141 {"alternation", `foo|bar|baz`},
142 {"char-class", `[a-zA-Z_]\w*`},
143 {"complex", `(func|var|const)\s+[A-Z]\w*`},
144 {"dot-plus", `.+`},
145 {"anchored-line", `(?m)^package\s+\w+`},
146 {"no-match", `XYZZY_NEVER_MATCHES`},
147 }
148
149 sizes := []struct {
150 name string
151 size int
152 }{
153 {"64B", 64},
154 {"512B", 512},
155 {"4KB", 4 * 1024},
156 {"64KB", 64 * 1024},
157 {"256KB", 256 * 1024},
158 }
159
160 // Force re2 path to test its correctness across all sizes.
161 withThreshold(t, 0)
162 for _, sz := range sizes {
163 corpus := makeCorpus(sz.size)
164 for _, pat := range patterns {
165 name := sz.name + "/" + pat.name
166 t.Run(name, func(t *testing.T) {
167 hybrid := MustCompile(pat.pattern)
168 grafana := grafanaregexp.MustCompile(pat.pattern)
169
170 got := hybrid.FindAllIndex(corpus, -1)
171 want := grafana.FindAllIndex(corpus, -1)
172 if !equalIndexSlices(got, want) {
173 t.Errorf("pattern=%q size=%d: len(hybrid)=%d len(grafana)=%d",
174 pat.pattern, sz.size, len(got), len(want))
175 if len(got) > 0 && len(want) > 0 {
176 t.Errorf(" first hybrid=%v first grafana=%v", got[0], want[0])
177 }
178 }
179 })
180 }
181 }
182}
183
184// TestFindAllIndexLimitN verifies that the n parameter (match count limit) is
185// honoured identically by both engines.
186func TestFindAllIndexLimitN(t *testing.T) {
187 corpus := makeCorpus(64 * 1024) // large enough to have many matches
188 patterns := []string{`func\s+\w+`, `\bvar\b`, `[A-Z]\w*`}
189
190 withThreshold(t, 0) // force re2 path
191 for _, pat := range patterns {
192 hybrid := MustCompile(pat)
193 grafana := grafanaregexp.MustCompile(pat)
194
195 got := hybrid.FindAllIndex(corpus, 1)
196 want := grafana.FindAllIndex(corpus, 1)
197 if !equalIndexSlices(got, want) {
198 t.Errorf("n=1, pattern=%q: hybrid=%v grafana=%v", pat, got, want)
199 }
200 // Sanity: n=1 should return at most one match.
201 if len(got) > 1 {
202 t.Errorf("n=1, pattern=%q: got %d matches, want <= 1", pat, len(got))
203 }
204 }
205}
206
207// TestNoMatchReturnsEmpty verifies no-match returns nil/empty consistently.
208func TestNoMatchReturnsEmpty(t *testing.T) {
209 corpus := makeCorpus(1024)
210
211 for _, thresh := range []int64{disabled, 0} {
212 t.Run(fmt.Sprintf("thresh=%d", thresh), func(t *testing.T) {
213 withThreshold(t, thresh)
214 // MustCompile must be after withThreshold so that the lazy RE2
215 // compilation in Compile() sees the overridden threshold and
216 // actually initialises re.re2 when thresh=0.
217 re := MustCompile(`XYZZY_NEVER_MATCHES`)
218 if got := re.FindAllIndex(corpus, -1); len(got) != 0 {
219 t.Errorf("thresh=%d: expected empty, got %v", thresh, got)
220 }
221 })
222 }
223}
224
225// ---- benchmarks ----
226
227// BenchmarkEngines measures FindAllIndex performance for grafana/regexp vs
228// go-re2 across multiple input sizes and pattern complexities.
229//
230// Run with:
231//
232// go test -bench=BenchmarkEngines -benchmem -benchtime=3s ./internal/hybridre2/
233func BenchmarkEngines(b *testing.B) {
234 patterns := []struct {
235 name string
236 pattern string
237 }{
238 {"literal", `main`},
239 {"case-insensitive", `(?i)func`},
240 {"alternation-5", `func|var|const|type|import`},
241 {"complex", `(func|var)\s+[A-Z]\w*\s*\(`},
242 {"hard-no-match", `XYZZY_NEVER_MATCHES_AT_ALL`},
243 }
244
245 sizes := []struct {
246 name string
247 size int
248 }{
249 {"512B", 512},
250 {"4KB", 4 * 1024},
251 {"32KB", 32 * 1024},
252 {"128KB", 128 * 1024},
253 {"512KB", 512 * 1024},
254 }
255
256 // Pre-build all corpora outside the benchmark loop.
257 corpora := make(map[string][]byte, len(sizes))
258 for _, sz := range sizes {
259 corpora[sz.name] = makeCorpus(sz.size)
260 }
261
262 for _, pat := range patterns {
263 grafanaRe := grafanaregexp.MustCompile(pat.pattern)
264
265 for _, sz := range sizes {
266 corpus := corpora[sz.name]
267 name := pat.name + "/" + sz.name
268
269 b.Run("grafana/"+name, func(b *testing.B) {
270 b.SetBytes(int64(len(corpus)))
271 b.ResetTimer()
272 for i := 0; i < b.N; i++ {
273 _ = grafanaRe.FindAllIndex(corpus, -1)
274 }
275 })
276
277 b.Run("go-re2/"+name, func(b *testing.B) {
278 withThreshold(b, 0) // force re2 for all sizes
279 // MustCompile must be after withThreshold so that re.re2
280 // is initialised (lazy compilation checks threshold() at
281 // compile time, not match time).
282 hybridRe := MustCompile(pat.pattern)
283 b.SetBytes(int64(len(corpus)))
284 b.ResetTimer()
285 for i := 0; i < b.N; i++ {
286 _ = hybridRe.FindAllIndex(corpus, -1)
287 }
288 })
289 }
290 }
291}
292
293// ---- helpers ----
294
295// makeCorpus returns a realistic-looking Go source corpus of approximately
296// the requested size.
297func makeCorpus(size int) []byte {
298 const template = `package main
299
300import (
301 "fmt"
302 "strings"
303)
304
305// Foo is an exported function that transforms its input.
306func Foo(input string) string {
307 return strings.ToUpper(input)
308}
309
310// Bar demonstrates calling Foo.
311func Bar() {
312 result := Foo("hello world")
313 fmt.Println(result)
314}
315
316var globalVar = "some value"
317const MaxItems = 100
318
319type MyStruct struct {
320 Name string
321 Value int
322}
323
324func (m MyStruct) String() string {
325 return fmt.Sprintf("%s=%d", m.Name, m.Value)
326}
327
328`
329 buf := make([]byte, 0, size)
330 for len(buf) < size {
331 buf = append(buf, []byte(template)...)
332 }
333 return buf[:size]
334}
335
336func equalIndexSlices(a, b [][]int) bool {
337 if len(a) != len(b) {
338 return false
339 }
340 for i := range a {
341 if !equalIntSlice(a[i], b[i]) {
342 return false
343 }
344 }
345 return true
346}
347
348func equalIntSlice(a, b []int) bool {
349 if len(a) != len(b) {
350 return false
351 }
352 for i := range a {
353 if a[i] != b[i] {
354 return false
355 }
356 }
357 return true
358}