···1818 "cmp"
1919 "encoding/binary"
2020 "math"
2121- "math/rand/v2"
2222- "slices"
2321 "sort"
2422 "unicode"
2523 "unicode/utf8"
···126124}
127125128126func splitNGrams(str []byte) []runeNgramOff {
129129- // len(str) >= the number of ngrams in str => no limit
130130- return splitNGramsLimit(str, len(str))
131131-}
132132-133133-func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff {
134127 var runeGram [3]rune
135128 var off [3]uint32
136129 var runeCount int
···158151 ngram: ng,
159152 index: len(result),
160153 })
161161- }
162162-163163- // We return a random subset of size maxNgrams. This is to prevent the start
164164- // of the string biasing ngram selection.
165165- if maxNgrams < len(result) {
166166- // Deterministic seed for tests. Additionally makes comparing repeated
167167- // queries performance easier.
168168- r := rand.New(rand.NewPCG(uint64(maxNgrams), 0))
169169-170170- // Pick random subset via a shuffle
171171- r.Shuffle(maxNgrams, func(i, j int) { result[i], result[j] = result[j], result[i] })
172172- result = result[:maxNgrams]
173173-174174- // Caller expects ngrams in order of appearance.
175175- slices.SortFunc(result, runeNgramOff.Compare)
176154 }
177155178156 return result
+1-29
indexdata.go
···2121 "hash/crc64"
2222 "log"
2323 "math/bits"
2424- "os"
2524 "slices"
2626- "strconv"
2725 "unicode/utf8"
28262927 "github.com/sourcegraph/zoekt/query"
···403401 return cs
404402}
405403406406-// experimentIterateNgramLookupLimit, when non-zero, will only look up this many
407407-// ngrams from a query string. Note that if the query is case-insensitive, this
408408-// only limits the input, so we will still look up the case foldings.
409409-//
410410-// This experiment is targeting looking up large snippets. If it is
411411-// successful, we will likely hardcode the value we use in production.
412412-//
413413-// Future note: if we find cases where this works badly, we can consider only
414414-// searching a random subset of the query string to avoid bad strings.
415415-var experimentIterateNgramLookupLimit = getEnvInt("SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT")
// getEnvInt reads the environment variable k and parses it as a base-10
// integer with strconv.Atoi.
//
// An unset or empty variable yields 0 without logging. A value that fails to
// parse is reported via log.Printf rather than being dropped silently; the
// value Atoi produced alongside the error is still returned, so callers see
// the same results as before. Any non-zero result is logged so the active
// setting is visible in the process output.
func getEnvInt(k string) int {
	raw := os.Getenv(k)
	if raw == "" {
		// Nothing configured: not worth a log line.
		return 0
	}

	v, err := strconv.Atoi(raw)
	if err != nil {
		// Surface misconfiguration instead of discarding the error.
		log.Printf("%s: invalid integer value %q: %v", k, raw, err)
	}
	if v != 0 {
		log.Printf("%s = %d\n", k, v)
	}
	return v
}
424424-425404func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
426405 str := query.Pattern
427406428407 // Find the 2 least common ngrams from the string.
429429- var ngramOffs []runeNgramOff
430430- if ngramLimit := experimentIterateNgramLookupLimit; ngramLimit > 0 {
431431- // Note: we can't just do str = str[:ngramLimit] due to utf-8 and str
432432- // length is asked later on for other optimizations.
433433- ngramOffs = splitNGramsLimit([]byte(str), ngramLimit)
434434- } else {
435435- ngramOffs = splitNGrams([]byte(str))
436436- }
408408+ ngramOffs := splitNGrams([]byte(str))
437409438410 // protect against accidental searching of empty strings
439411 if len(ngramOffs) == 0 {