fork of https://github.com/sourcegraph/zoekt
1// Copyright 2026 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package hybridre2 provides a hybrid regex engine that switches between
16// grafana/regexp (an optimized fork of Go's stdlib regexp) and
17// wasilibs/go-re2 (RE2 via WebAssembly) based on input size.
18//
19// Motivation: Go's regexp engine lacks a lazy DFA, making it O(n·m) for
20// hard patterns. RE2's lazy DFA provides linear-time matching, which is
21// dramatically faster for large inputs (>32KB) or complex patterns. For
22// small inputs the WASM call overhead of go-re2 exceeds the savings,
23// so grafana/regexp remains the better choice there.
24//
25// The threshold is controlled by the ZOEKT_RE2_THRESHOLD_BYTES environment
26// variable, read once at program startup:
27//
28// - -1 (default): disabled, always use grafana/regexp
29// - 0: always use go-re2
30// - N > 0: use go-re2 when len(input) >= N bytes
31//
32// # Known tradeoffs
33//
34// Memory: each Regexp holds compiled state for both engines when RE2 is
35// enabled. Patterns are compiled per-search (not cached globally), so under
36// high concurrency with many unique patterns the WASM heap adds up. Monitor
37// RSS when first enabling the threshold in production.
38//
39// UTF-8 semantics: go-re2 stops at invalid UTF-8; grafana/regexp replaces
40// invalid bytes with U+FFFD and continues. Results may differ on binary
41// content that slips past content-type detection. See FindAllIndex for
42// details.
43//
44// RE2 compilation failure: if RE2 rejects a pattern that grafana/regexp
45// accepts (due to syntax differences between the two engines), Compile
46// returns an error rather than silently falling back to grafana/regexp.
47// This is intentional (fail-fast), but it means enabling the threshold
48// could surface errors for edge-case patterns that work today. Patterns
49// sourced from zoekt query parsing are validated before reaching this
50// package, so this is unlikely in practice.
51package hybridre2
52
53import (
54 "os"
55 "strconv"
56 "sync"
57
58 grafanaregexp "github.com/grafana/regexp"
59 re2regexp "github.com/wasilibs/go-re2"
60)
61
62const (
63 // envThreshold is the environment variable name controlling the size
64 // threshold (bytes) at which go-re2 is used instead of grafana/regexp.
65 // Set to -1 (default) to disable go-re2 entirely, 0 to always use it.
66 envThreshold = "ZOEKT_RE2_THRESHOLD_BYTES"
67
68 // disabled is the sentinel value meaning go-re2 is never used.
69 disabled = int64(-1)
70)
71
72// threshold returns the configured byte threshold, reading
73// ZOEKT_RE2_THRESHOLD_BYTES from the environment exactly once.
74// Negative means disabled; zero means always use RE2.
75//
76// Tests may reassign this variable to override the threshold.
77var threshold = sync.OnceValue(func() int64 {
78 if val, ok := os.LookupEnv(envThreshold); ok {
79 if n, err := strconv.ParseInt(val, 10, 64); err == nil {
80 return n
81 }
82 }
83 return disabled
84})
85
86// Regexp is a compiled regular expression that dispatches to either
87// grafana/regexp or go-re2 at match time, based on input size.
88type Regexp struct {
89 grafana *grafanaregexp.Regexp
90 re2 *re2regexp.Regexp // nil when threshold() < 0 (disabled)
91}
92
93// Compile returns a new Regexp. The grafana/regexp variant is always compiled.
94// The go-re2 variant is only compiled when ZOEKT_RE2_THRESHOLD_BYTES is set to
95// a non-negative value; when RE2 is disabled (the default), skipping WASM
96// compilation keeps the disabled path truly zero-cost.
97func Compile(pattern string) (*Regexp, error) {
98 g, err := grafanaregexp.Compile(pattern)
99 if err != nil {
100 return nil, err
101 }
102 result := &Regexp{grafana: g}
103 if threshold() >= 0 {
104 r, err := re2regexp.Compile(pattern)
105 if err != nil {
106 return nil, err
107 }
108 result.re2 = r
109 }
110 return result, nil
111}
112
113// MustCompile is like Compile but panics on error.
114func MustCompile(pattern string) *Regexp {
115 re, err := Compile(pattern)
116 if err != nil {
117 panic("hybridre2: Compile(" + pattern + "): " + err.Error())
118 }
119 return re
120}
121
122// useRE2 reports whether the RE2 engine should be used for an input of the
123// given length, based on the current threshold setting.
124func useRE2(inputLen int) bool {
125 t := threshold()
126 return t >= 0 && int64(inputLen) >= t
127}
128
129// FindAllIndex returns successive non-overlapping matches of the expression
130// in b. It uses go-re2 when len(b) >= threshold() (and RE2 is enabled),
131// and grafana/regexp otherwise. Match indices are relative to b.
132//
133// NOTE: go-re2 stops matching at invalid UTF-8 bytes, whereas grafana/regexp
134// replaces them with U+FFFD and continues. This means results may differ on
135// binary or non-UTF-8 content when RE2 is active. The default threshold of -1
136// (disabled) ensures zero behaviour change for existing deployments; operators
137// enabling the threshold should be aware of this distinction.
138func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
139 if re.re2 != nil && useRE2(len(b)) {
140 return re.re2.FindAllIndex(b, n)
141 }
142 return re.grafana.FindAllIndex(b, n)
143}
144
145// String returns the source text used to compile the regular expression.
146func (re *Regexp) String() string {
147 return re.grafana.String()
148}