fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2026 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// Package hybridre2 provides a hybrid regex engine that switches between 16// grafana/regexp (an optimized fork of Go's stdlib regexp) and 17// wasilibs/go-re2 (RE2 via WebAssembly) based on input size. 18// 19// Motivation: Go's regexp engine lacks a lazy DFA, making it O(n·m) for 20// hard patterns. RE2's lazy DFA provides linear-time matching, which is 21// dramatically faster for large inputs (>32KB) or complex patterns. For 22// small inputs the WASM call overhead of go-re2 exceeds the savings, 23// so grafana/regexp remains the better choice there. 24// 25// The threshold is controlled by the ZOEKT_RE2_THRESHOLD_BYTES environment 26// variable, read once at program startup: 27// 28// - -1 (default): disabled, always use grafana/regexp 29// - 0: always use go-re2 30// - N > 0: use go-re2 when len(input) >= N bytes 31// 32// # Known tradeoffs 33// 34// Memory: each Regexp holds compiled state for both engines when RE2 is 35// enabled. Patterns are compiled per-search (not cached globally), so under 36// high concurrency with many unique patterns the WASM heap adds up. Monitor 37// RSS when first enabling the threshold in production. 38// 39// UTF-8 semantics: go-re2 stops at invalid UTF-8; grafana/regexp replaces 40// invalid bytes with U+FFFD and continues. Results may differ on binary 41// content that slips past content-type detection. See FindAllIndex for 42// details. 43// 44// RE2 compilation failure: if RE2 rejects a pattern that grafana/regexp 45// accepts (due to syntax differences between the two engines), Compile 46// returns an error rather than silently falling back to grafana/regexp. 47// This is intentional (fail-fast), but it means enabling the threshold 48// could surface errors for edge-case patterns that work today. Patterns 49// sourced from zoekt query parsing are validated before reaching this 50// package, so this is unlikely in practice. 51package hybridre2 52 53import ( 54 "os" 55 "strconv" 56 "sync" 57 58 grafanaregexp "github.com/grafana/regexp" 59 re2regexp "github.com/wasilibs/go-re2" 60) 61 62const ( 63 // envThreshold is the environment variable name controlling the size 64 // threshold (bytes) at which go-re2 is used instead of grafana/regexp. 65 // Set to -1 (default) to disable go-re2 entirely, 0 to always use it. 66 envThreshold = "ZOEKT_RE2_THRESHOLD_BYTES" 67 68 // disabled is the sentinel value meaning go-re2 is never used. 69 disabled = int64(-1) 70) 71 72// threshold returns the configured byte threshold, reading 73// ZOEKT_RE2_THRESHOLD_BYTES from the environment exactly once. 74// Negative means disabled; zero means always use RE2. 75// 76// Tests may reassign this variable to override the threshold. 77var threshold = sync.OnceValue(func() int64 { 78 if val, ok := os.LookupEnv(envThreshold); ok { 79 if n, err := strconv.ParseInt(val, 10, 64); err == nil { 80 return n 81 } 82 } 83 return disabled 84}) 85 86// Regexp is a compiled regular expression that dispatches to either 87// grafana/regexp or go-re2 at match time, based on input size. 88type Regexp struct { 89 grafana *grafanaregexp.Regexp 90 re2 *re2regexp.Regexp // nil when threshold() < 0 (disabled) 91} 92 93// Compile returns a new Regexp. The grafana/regexp variant is always compiled. 94// The go-re2 variant is only compiled when ZOEKT_RE2_THRESHOLD_BYTES is set to 95// a non-negative value; when RE2 is disabled (the default), skipping WASM 96// compilation keeps the disabled path truly zero-cost. 97func Compile(pattern string) (*Regexp, error) { 98 g, err := grafanaregexp.Compile(pattern) 99 if err != nil { 100 return nil, err 101 } 102 result := &Regexp{grafana: g} 103 if threshold() >= 0 { 104 r, err := re2regexp.Compile(pattern) 105 if err != nil { 106 return nil, err 107 } 108 result.re2 = r 109 } 110 return result, nil 111} 112 113// MustCompile is like Compile but panics on error. 114func MustCompile(pattern string) *Regexp { 115 re, err := Compile(pattern) 116 if err != nil { 117 panic("hybridre2: Compile(" + pattern + "): " + err.Error()) 118 } 119 return re 120} 121 122// useRE2 reports whether the RE2 engine should be used for an input of the 123// given length, based on the current threshold setting. 124func useRE2(inputLen int) bool { 125 t := threshold() 126 return t >= 0 && int64(inputLen) >= t 127} 128 129// FindAllIndex returns successive non-overlapping matches of the expression 130// in b. It uses go-re2 when len(b) >= threshold() (and RE2 is enabled), 131// and grafana/regexp otherwise. Match indices are relative to b. 132// 133// NOTE: go-re2 stops matching at invalid UTF-8 bytes, whereas grafana/regexp 134// replaces them with U+FFFD and continues. This means results may differ on 135// binary or non-UTF-8 content when RE2 is active. The default threshold of -1 136// (disabled) ensures zero behaviour change for existing deployments; operators 137// enabling the threshold should be aware of this distinction. 138func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { 139 if re.re2 != nil && useRE2(len(b)) { 140 return re.re2.FindAllIndex(b, n) 141 } 142 return re.grafana.FindAllIndex(b, n) 143} 144 145// String returns the source text used to compile the regular expression. 146func (re *Regexp) String() string { 147 return re.grafana.String() 148}