remove SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT (#800) · boltless.me/zoekt@c01b6c7

+1 -51

2 changed files

Expand all

bits.go

indexdata.go

-22

bits.go

··· 18 18 "cmp" 19 19 "encoding/binary" 20 20 "math" 21 - "math/rand/v2" 22 - "slices" 23 21 "sort" 24 22 "unicode" 25 23 "unicode/utf8" ··· 126 124 } 127 125 128 126 func splitNGrams(str []byte) []runeNgramOff { 129 - // len(maxNgrams) >= the number of ngrams in str => no limit 130 - return splitNGramsLimit(str, len(str)) 131 - } 132 - 133 - func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff { 134 127 var runeGram [3]rune 135 128 var off [3]uint32 136 129 var runeCount int ··· 158 151 ngram: ng, 159 152 index: len(result), 160 153 }) 161 - } 162 - 163 - // We return a random subset of size maxNgrams. This is to prevent the start 164 - // of the string biasing ngram selection. 165 - if maxNgrams < len(result) { 166 - // Deterministic seed for tests. Additionally makes comparing repeated 167 - // queries performance easier. 168 - r := rand.New(rand.NewPCG(uint64(maxNgrams), 0)) 169 - 170 - // Pick random subset via a shuffle 171 - r.Shuffle(maxNgrams, func(i, j int) { result[i], result[j] = result[j], result[i] }) 172 - result = result[:maxNgrams] 173 - 174 - // Caller expects ngrams in order of appearance. 175 - slices.SortFunc(result, runeNgramOff.Compare) 176 154 } 177 155 178 156 return result

+1 -29

indexdata.go

··· 21 21 "hash/crc64" 22 22 "log" 23 23 "math/bits" 24 - "os" 25 24 "slices" 26 - "strconv" 27 25 "unicode/utf8" 28 26 29 27 "github.com/sourcegraph/zoekt/query" ··· 403 401 return cs 404 402 } 405 403 406 - // experimentIterateNgramLookupLimit when non-zero will only lookup this many 407 - // ngrams from a query string. Note: that if case-insensitive, this only 408 - // limits the input. So we will still lookup the case folding. 409 - // 410 - // This experiment is targetting looking up large snippets. If it is 411 - // successful, we will likely hardcode the value we use in production. 412 - // 413 - // Future note: if we find cases where this works badly, we can consider only 414 - // searching a random subset of the query string to avoid bad strings. 415 - var experimentIterateNgramLookupLimit = getEnvInt("SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT") 416 - 417 - func getEnvInt(k string) int { 418 - v, _ := strconv.Atoi(os.Getenv(k)) 419 - if v != 0 { 420 - log.Printf("%s = %d\n", k, v) 421 - } 422 - return v 423 - } 424 - 425 404 func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) { 426 405 str := query.Pattern 427 406 428 407 // Find the 2 least common ngrams from the string. 429 - var ngramOffs []runeNgramOff 430 - if ngramLimit := experimentIterateNgramLookupLimit; ngramLimit > 0 { 431 - // Note: we can't just do str = str[:ngramLimit] due to utf-8 and str 432 - // length is asked later on for other optimizations. 433 - ngramOffs = splitNGramsLimit([]byte(str), ngramLimit) 434 - } else { 435 - ngramOffs = splitNGrams([]byte(str)) 436 - } 408 + ngramOffs := splitNGrams([]byte(str)) 437 409 438 410 // protect against accidental searching of empty strings 439 411 if len(ngramOffs) == 0 {

Configure Feed

Configure Feed