fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

all: remove bloom filter code (#431)

We have had this disabled on the read path for many months. Additionally,
I noticed in profiles of zoekt-git-index that generation of the bloom
filter was taking 24% of the time on the sourcegraph repo.

Test Plan: Tested the matrix of old/new zoekt-webserver reading old/new
shards to ensure everything just kept working nicely.

+23 -922
+2 -2
api.go
··· 349 349 // Shards that we did not process because a query was canceled. 350 350 ShardsSkipped int 351 351 352 - // Shards that we did not process because the query was rejected 353 - // by the bloom or ngram filter indicating it had no matches. 352 + // Shards that we did not process because the query was rejected by the 353 + // ngram filter indicating it had no matches. 354 354 ShardsSkippedFilter int 355 355 356 356 // Number of non-overlapping matches
-314
bloom.go
··· 1 - // Copyright 2021 Google Inc. All rights reserved. 2 - // 3 - // Licensed under the Apache License, Version 2.0 (the "License"); 4 - // you may not use this file except in compliance with the License. 5 - // You may obtain a copy of the License at 6 - // 7 - // http://www.apache.org/licenses/LICENSE-2.0 8 - // 9 - // Unless required by applicable law or agreed to in writing, software 10 - // distributed under the License is distributed on an "AS IS" BASIS, 11 - // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 - // See the License for the specific language governing permissions and 13 - // limitations under the License. 14 - 15 - // Bloom implements a simple bloom filter over case-insensitive word fragments, 16 - // with the default hash function providing a blocked bloom filter: 17 - // https://algo2.iti.kit.edu/singler/publications/cacheefficientbloomfilters-wea2007.pdf 18 - // 19 - // Various permutations of hash functions, fragment sizes, and block sizes were 20 - // tested to determine the pareto frontier of false positive rate vs avg bloom filter size. 21 - // FPR = (false positives / (false positives + true negatives)) 22 - // 23 - // This determined the hash function in use: 24 - // CRC over word fragments of length 4-8, in a block size of 512 bits, and 3 probes. 25 - // 26 - // In particular: 27 - // * using a crypto hash like siphash provided no benefit, and is slower. 28 - // * having longer word fragments increases false positive rate, and 3-long fragments 29 - // are handled by the trigram index. 30 - // * a 1% FPR is near the optimal bits-per-precision tradeoff, with 2.5% FPR 31 - // only reducing bloom filter sizes by 25%. 
32 - 33 - package zoekt // import "github.com/sourcegraph/zoekt" 34 - 35 - import ( 36 - "bytes" 37 - "errors" 38 - "hash/crc32" 39 - "math" 40 - "math/bits" 41 - "reflect" 42 - "unicode" 43 - "unicode/utf8" 44 - ) 45 - 46 - type bloom struct { 47 - hasher bloomHash 48 - bits []uint8 49 - } 50 - 51 - type bloomHash func([]byte) []uint32 52 - 53 - // Least common multiple of of {1..18}. 54 - // This permits precise resizing for many different factors without 55 - // using excessive RAM during processing. Some shards will saturate 56 - // the bloom filter (have a load factor greater than the target), 57 - // but they tend to be edge cases with a huge number of distinct 58 - // ngrams, so we have to rely on the trigram index iteration to search. 59 - const bloomSizeBase = 12252240 60 - 61 - // A smaller base bloom filter size for faster tests. LCM(1..10) 62 - const bloomSizeTest = 2520 63 - 64 - // bloomDefaultHash and bloomDefaultLoad were empirically 65 - // determined to achieve 1% FPR with minimal space usage. 66 - var bloomDefaultHash = bloomHasherCRCBlocked64B8K3 67 - 68 - const bloomDefaultLoad = 0.42 69 - 70 - // Castagnoli CRCs have hardware instructions to compute them. 71 - var crcTab = crc32.MakeTable(crc32.Castagnoli) 72 - 73 - func makeBloomFilterEmpty() bloom { 74 - return bloom{bloomDefaultHash, make([]uint8, bloomSizeBase)} 75 - } 76 - 77 - func makeBloomFilterWithHasher(hash bloomHash) bloom { 78 - return bloom{hash, make([]uint8, bloomSizeBase)} 79 - } 80 - 81 - func (b *bloom) Len() int { 82 - return len(b.bits) * 8 83 - } 84 - 85 - func (b *bloom) add(xs []uint32) { 86 - for _, x := range xs { 87 - b.bits[int(x/8)%len(b.bits)] |= 1 << (x % 8) 88 - } 89 - } 90 - 91 - // addBytes splits the input into case-insensitive word fragments, hashes them, 92 - // and adds them all to the bloom filter. 93 - func (b *bloom) addBytes(data []byte) { 94 - b.add(b.hasher(data)) 95 - } 96 - 97 - // maybeHas returns whether all input hashes are in the bloom filter. 
98 - // False positives are possible, but false negatives are impossible. 99 - func (b *bloom) maybeHas(xs []uint32) bool { 100 - if len(b.bits) == 0 { 101 - return true 102 - } 103 - for _, x := range xs { 104 - if b.bits[int(x/8)%len(b.bits)]&(1<<(x%8)) == 0 { 105 - return false 106 - } 107 - } 108 - return true 109 - } 110 - 111 - // maybeHasBytes splits the input into case-insensitive word fragments, 112 - // hashes them, and tests if they're all in the bloom filter. 113 - func (b *bloom) maybeHasBytes(xs []byte) bool { 114 - if b.hasher == nil { 115 - return true 116 - } 117 - return b.maybeHas(b.hasher(xs)) 118 - } 119 - 120 - func (b *bloom) load() float64 { 121 - // TODO: this is 4x faster with unsafe 64-bit casting, or 122 - // constant time if add() tracks the load directly. 123 - total := 0 124 - for _, x := range b.bits { 125 - total += bits.OnesCount8(x) 126 - } 127 - return float64(total) / float64(len(b.bits)*8) 128 - } 129 - 130 - // shrinkToSize returns a resized bloom filter with a bit density close to target. 131 - // This exploits the fact that a test for a probe x in the bloom filter is actually 132 - // a test for bit x%len, and a bloom filter of size newlen that divides len is easily 133 - // derived by oring the bits together len/newlen times. This works because 134 - // x%newlen == x%len%newlen iff newlen divides len, so we can shrink the bloom filter 135 - // without having the original probes or keys! This functionality lets us construct 136 - // a bloom filter while only having an upper bound on cardinality, instead of having 137 - // to have a separate, expensive input-counting phase. 138 - func (b *bloom) shrinkToSize(target float64) bloom { 139 - if target <= 0.0 || target >= 1.0 { 140 - return *b 141 - } 142 - 143 - // shrinking sets each output bit to the OR-ed together 144 - // output of k=`factor` bits that are set with probability 145 - // x=`b.load()`. We want to achieve a target load `y`. 
146 - // 147 - // The probability that a bit is set is one minus the probability 148 - // that its inputs are all unset-- 1-(1-x)^k. To get k given y, 149 - // https://www.wolframalpha.com/input/?i=solve+for+k+in+y%3D1-%281-x%29%5Ek 150 - // => k=log(1-y)/log(1-x) 151 - factor := len(b.bits) 152 - divisor := math.Log(1 - b.load()) 153 - if divisor != 0 { // avoid divide by zero for empty filter (b.load() is 0) 154 - factor = int(math.Log(1-target) / divisor) 155 - } 156 - 157 - // We can only shrink the bloom filter to a size that is a factor of the 158 - // input size. This is made easier by bloomSizeBase being highly composite. 159 - for factor > 0 && len(b.bits)%factor != 0 { 160 - factor-- 161 - } 162 - 163 - if factor <= 1 { 164 - return *b 165 - } 166 - out := bloom{b.hasher, make([]uint8, len(b.bits)/factor)} 167 - j := 0 168 - for i := 0; i < len(b.bits); i++ { 169 - out.bits[j] |= b.bits[i] 170 - j++ 171 - if j >= len(out.bits) { 172 - j = 0 173 - } 174 - } 175 - 176 - return out 177 - } 178 - 179 - func (b bloom) write(w *writer) { 180 - // header: serialization version, hasher id 181 - w.Write([]byte{1, bloomHasherIds[reflect.ValueOf(b.hasher).Pointer()]}) 182 - w.Write(b.bits) 183 - } 184 - 185 - func makeBloomFilterFromEncoded(buf []byte) (bloom, error) { 186 - b := bloom{} 187 - if len(buf) < 2 || buf[0] != 1 { 188 - return b, errors.New("invalid bloom filter encoding (wrong size/version)") 189 - } 190 - if buf[1] <= 0 || int(buf[1]) > len(bloomHashers) { 191 - return b, errors.New("invalid bloom filter encoding (unknown hasher type)") 192 - } 193 - b.hasher = bloomHashers[buf[1]-1] 194 - b.bits = buf[2:] 195 - return b, nil 196 - } 197 - 198 - // bloomHasherIds maps from function pointers to hash numbers, to allow 199 - // backwards compatible hash function changes. 
200 - var bloomHasherIds = map[uintptr]byte{ 201 - reflect.ValueOf(bloomHasherCRCBlocked64B8K3).Pointer(): 1, 202 - } 203 - 204 - // bloomHashers maps from hash identifierss stored in encoded bloom filters to 205 - // hash functions, to allo backwards compatible hash function evolution. 206 - var bloomHashers = []bloomHash{ 207 - bloomHasherCRCBlocked64B8K3, 208 - } 209 - 210 - // The following functions and constants *must not* be changed unless you can prove 211 - // they have exactly identical behavior. Instead of changing these functions, 212 - // add a new hash function and a new entry in bloomHasherIds and bloomHashers, 213 - // then change the default hash function. 214 - // 215 - // This allows changing to a new hash function without invalidating all existing 216 - // files, and more importantly, without starting to return false negatives (!!!) 217 - // because the hash function changed unexpectedly. 218 - const bloomHashMinWordLength = 4 219 - 220 - // bloomWordTab uses a table to implement a matcher for the regex \w{4,} 221 - var bloomWordTab [128 / 64]uint64 = initBloomWordTab() 222 - 223 - func initBloomWordTab() [128 / 64]uint64 { 224 - var tab [128 / 64]uint64 225 - for x := byte(0); x < 128; x++ { 226 - if x == '_' || 'a' <= x && x <= 'z' || 'A' <= x && x <= 'Z' || '0' <= x && x <= '9' { 227 - tab[x/64] |= 1 << (x % 64) 228 - } 229 - } 230 - return tab 231 - } 232 - 233 - func findNextWord(i int, in []byte) (int, []byte) { 234 - // Dropping the unicode case-folding requirement would 235 - // improve performance here. 
There are *exactly* two Unicode 236 - // codepoints that map down to ASCII: 237 - // K: U+212A KELVIN SIGN 238 - // ſ: U+017F LATIN SMALL LETTER LONG S 239 - for i < len(in) { 240 - // skip non-word runes 241 - for i < len(in) { 242 - c, sz := utf8.DecodeRune(in[i:]) 243 - c = unicode.ToLower(c) 244 - if c < 128 && bloomWordTab[c/64]&(1<<(c%64)) != 0 { 245 - break 246 - } 247 - i += sz 248 - } 249 - // count length of word section 250 - wordStart := i 251 - runeLength := 0 252 - for i < len(in) { 253 - c, sz := utf8.DecodeRune(in[i:]) 254 - c = unicode.ToLower(c) 255 - if c >= 128 || bloomWordTab[c/64]&(1<<(c%64)) == 0 { 256 - break 257 - } 258 - runeLength++ 259 - i += sz 260 - } 261 - // Skip short words. 262 - if runeLength < bloomHashMinWordLength { 263 - continue 264 - } 265 - return i, bytes.ToLower(in[wordStart:i]) 266 - } 267 - return i, nil 268 - } 269 - 270 - func bloomHasherCRC(in []byte) []uint32 { 271 - out := []uint32{} 272 - for i := 0; i < len(in); { 273 - var s []byte 274 - i, s = findNextWord(i, in) 275 - // Add all substrings of length 4-10 to the bloom filter. 276 - // Not having a bound on the maximum length causes quadratic 277 - // probe counts on long "words"-- like a 241KB line of 278 - // DNA ("gtggcaccctgactgg...") 279 - for l := 10; l >= 4; l-- { 280 - for i := 0; i+l <= len(s); i++ { 281 - if '0' <= s[i] && s[i] <= '9' { 282 - // Long numeric/hex constants are generally unlikely 283 - // to be searched for, so don't include probes for 284 - // substrings that start with a number. 
285 - continue 286 - } 287 - out = append(out, crc32.Checksum(s[i:i+l], crcTab)) 288 - } 289 - } 290 - } 291 - return out 292 - } 293 - 294 - func bloomHasherCRCBlocked64B8K3(in []byte) []uint32 { 295 - out := []uint32{} 296 - for i := 0; i < len(in); { 297 - var s []byte 298 - i, s = findNextWord(i, in) 299 - for i := 0; i <= len(s)-4; i++ { 300 - if '0' <= s[i] && s[i] <= '9' { 301 - continue 302 - } 303 - base := crc32.Checksum(s[i:i+4], crcTab) * 512 304 - for j := i + 4; j < i+8 && j <= len(s); j++ { 305 - h := crc32.Checksum(s[i:j], crcTab) 306 - out = append(out, 307 - base|h%512, base|(h>>9)%512, 308 - base|(h>>18)%512, 309 - ) 310 - } 311 - } 312 - } 313 - return out 314 - }
-507
bloom_test.go
··· 1 - // Copyright 2021 Google Inc. All rights reserved. 2 - // 3 - // Licensed under the Apache License, Version 2.0 (the "License"); 4 - // you may not use this file except in compliance with the License. 5 - // You may obtain a copy of the License at 6 - // 7 - // http://www.apache.org/licenses/LICENSE-2.0 8 - // 9 - // Unless required by applicable law or agreed to in writing, software 10 - // distributed under the License is distributed on an "AS IS" BASIS, 11 - // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 - // See the License for the specific language governing permissions and 13 - // limitations under the License. 14 - 15 - package zoekt // import "github.com/sourcegraph/zoekt" 16 - 17 - import ( 18 - "bytes" 19 - "flag" 20 - "fmt" 21 - "io/fs" 22 - "math" 23 - "math/rand" 24 - "os" 25 - "path" 26 - "reflect" 27 - "runtime" 28 - "sort" 29 - "strconv" 30 - "strings" 31 - "sync" 32 - "testing" 33 - ) 34 - 35 - var ( 36 - ngramDataDir = flag.String("ngramdir", "", "directory containing testdata with files with one word per line") 37 - docCount = flag.Int("docs", 0, "number of docs to load, (default 0 for all)") 38 - hasherNum = flag.Int("hasher", 1, "index of the hasher to test") 39 - loadPerc = flag.String("load", "42", "space-separated lists of target load percentages") 40 - docFpr = flag.Bool("docfpr", false, "show per-document FPRs") 41 - trigramFpr = flag.Bool("tri", false, "compute FPR for trigram-based filtering") 42 - ) 43 - 44 - func TestFindNextWord(t *testing.T) { 45 - for _, tc := range []struct { 46 - input string 47 - want []string 48 - }{ 49 - { 50 - "aeiou and SOMETIMES y", 51 - []string{"aeiou", "sometimes"}, 52 - }, 53 - { 54 - "\n//_azAZ09[~]3456", 55 - []string{"_azaz09", "3456"}, 56 - }, 57 - { 58 - "nee\u212A aa\u212A", // kelvin degree symbol => 'k' 59 - []string{"neek"}, 60 - }, 61 - } { 62 - out := []string{} 63 - in := []byte(tc.input) 64 - for i := 0; i < len(in); { 65 - var s []byte 66 - i, s = 
findNextWord(i, in) 67 - if s != nil { 68 - out = append(out, string(s)) 69 - } 70 - } 71 - if !reflect.DeepEqual(tc.want, out) { 72 - t.Errorf("findNextWord(%q) got %q want %q", tc.input, out, tc.want) 73 - } 74 - } 75 - } 76 - 77 - func TestBloomHasher(t *testing.T) { 78 - b := makeBloomFilterEmpty() 79 - hashCount := len(b.hasher([]byte("testing"))) 80 - expected := 3 * len(strings.Split("test testi testin testing esti estin esting stin sting ting", " ")) 81 - if hashCount != expected { 82 - t.Errorf("hasher(\"testing\") produced %d hashes instead of %d", hashCount, expected) 83 - } 84 - 85 - inpA := []byte("some inputs to the bloom filter hashing") 86 - inpB := []byte("SOME inputs to the bloom filter hashing a b cd") 87 - if !reflect.DeepEqual(b.hasher(inpA), b.hasher(inpB)) { 88 - t.Errorf("hash(%v) => %v != hash(%v) => %v", inpA, b.hasher(inpA), inpB, b.hasher(inpB)) 89 - } 90 - } 91 - 92 - func TestBloomHasherStability(t *testing.T) { 93 - want := []uint32{ 94 - 0x41b0c462, 0x41b0c46c, 0x41b0c5a8, 0x79882c16, 0x79882c62, 0x79882d0f, 95 - 0x79882dbc, 0x79882d03, 0x79882d64, 0x79882cfd, 0x79882d90, 0x79882c74, 96 - 0x79882d79, 0x79882d75, 0x79882df3, 0xde692090, 0xde69219a, 0xde6920db, 97 - 0xde6920c0, 0xde6921ce, 0xde692132, 0xde6920a7, 0xde69207b, 0xde69201a, 98 - 0xde6920df, 0xde69214b, 0xde692183, 0x814351a8, 0x81435050, 0x81435090, 99 - 0x81435037, 0x814350db, 0x814350ce, 0x81435188, 0x8143509d, 0x81435113, 100 - 0x814351bc, 0x814351b6, 0x81435054, 0x88772190, 0x8877201d, 0x887720b1, 101 - 0x88772148, 0x8877208b, 0x887720b5, 0x88772154, 0x88772069, 0x887720aa, 102 - 0x8877215c, 0x8877213a, 0x887720b2, 0x3654361b, 0x36543795, 0x365436c6, 103 - 0x3654364e, 0x3654361a, 0x36543623, 0x365436ec, 0x3654365f, 0x3654364d, 104 - 0x3654368b, 0x365437a4, 0x3654375c, 0x2d64f078, 0x2d64f159, 0x2d64f105, 105 - 0x2d64f033, 0x2d64f145, 0x2d64f1ea, 0x2d64f130, 0x2d64f085, 0x2d64f029, 106 - 0x2d64f0ad, 0x2d64f188, 0x2d64f148, 0xc9ba3319, 0xc9ba326e, 0xc9ba32d9, 107 - 
0xc9ba3381, 0xc9ba3331, 0xc9ba32ff, 0xc9ba320f, 0xc9ba335d, 0xc9ba3345, 108 - 0xc9ba338a, 0xc9ba32aa, 0xc9ba3273, 0xc9cb6fb7, 0xc9cb6e72, 0xc9cb6fd9, 109 - 0xc9cb6ed0, 0xc9cb6e47, 0xc9cb6ee2, 0xc9cb6e31, 0xc9cb6f8b, 0xc9cb6f06, 110 - 0x07b383c1, 0x07b383ec, 0x07b38200, 0x07b3830a, 0x07b382ec, 0x07b3838d, 111 - 0x90a95aad, 0x90a95a2a, 0x90a95bf2} 112 - have := bloomHasherCRCBlocked64B8K3([]byte("nee\u212A STAbilizAtion??")) 113 - if !reflect.DeepEqual(have, want) { 114 - t.Error("Bloom hasher outputs have changed. This will break queries! Revert and add a new method.") 115 - t.Errorf("have=%#v want=%#v", have, want) 116 - } 117 - } 118 - 119 - func TestBloomZero(t *testing.T) { 120 - var b bloom 121 - if !b.maybeHasBytes([]byte("some example strings")) { 122 - t.Error("bloom{}.maybeHasBytes should always return true") 123 - } 124 - if !b.maybeHas([]uint32{123}) { 125 - t.Error("bloom{}.maybeHas should always return true") 126 - } 127 - } 128 - 129 - func TestBloomBasic(t *testing.T) { 130 - b := makeBloomFilterEmpty() 131 - 132 - // Edge case: empty bloom filter resizing 133 - b1 := b.shrinkToSize(0.9999) 134 - if b1.Len() != 8 { 135 - t.Error("Empty bloom filter didn't resize to 1B") 136 - } 137 - 138 - // Edge case: nearly empty bloom filter resizing 139 - b.addBytes([]byte("some")) 140 - b2 := b.shrinkToSize(0.999) 141 - if b2.Len() != 8 { 142 - t.Error("Nearly empty bloom filter didn't resize to 1B") 143 - } 144 - 145 - // these test strings are carefully selected to not collide 146 - // with the default hash functions. 
147 - inp := []byte(`some different test words that will definitely be present 148 - within the bloom filter`) 149 - missed := []byte("somehow another sequences falsified probabilisitically") 150 - 151 - b.addBytes(inp) 152 - 153 - for i := 0; i < 90; i += 5 { 154 - bi := b.shrinkToSize(float64(i) * .01) 155 - t.Logf("target %d%% load: shrink %d=>%d bytes, load factor %.07f%% => %.02f%%", 156 - i, len(b.bits), len(bi.bits), b.load()*100, bi.load()*100) 157 - 158 - for _, w := range bytes.Split(inp, []byte{' '}) { 159 - if !bi.maybeHasBytes(w) { 160 - t.Errorf("%d filter should contain %q but doesn't", i, string(w)) 161 - } 162 - } 163 - 164 - for _, w := range bytes.Split(missed, []byte{' '}) { 165 - if bi.maybeHasBytes(w) { 166 - t.Errorf("%d filter shouldn't contain %q but does", i, string(w)) 167 - } 168 - } 169 - } 170 - } 171 - 172 - func BenchmarkBloomFilterResize(b *testing.B) { 173 - f := makeBloomFilterEmpty() 174 - 175 - rng := rand.New(rand.NewSource(123)) 176 - for i := 0; i < 1e6; i++ { 177 - f.addBytes(randWord(4, 10, rng)) 178 - } 179 - 180 - b.SetBytes(int64(len(f.bits))) 181 - 182 - b.ResetTimer() 183 - for i := 0; i < b.N; i++ { 184 - f.shrinkToSize(bloomDefaultLoad) 185 - } 186 - } 187 - 188 - func randWord(min, max int, rng *rand.Rand) []byte { 189 - length := rng.Intn(max-min) + max 190 - out := make([]byte, length) 191 - for i := 0; i < length; i++ { 192 - out[i] = "abcdefghijklmnopqrstuvwxyz0123456789"[rng.Intn(36)] 193 - } 194 - return out 195 - } 196 - 197 - type customSort struct { 198 - len func() int 199 - less func(i, j int) bool 200 - swap func(i, j int) 201 - } 202 - 203 - func (c *customSort) Len() int { return c.len() } 204 - func (c *customSort) Less(i, j int) bool { return c.less(i, j) } 205 - func (c *customSort) Swap(i, j int) { c.swap(i, j) } 206 - 207 - func TestBloomFalsePositiveRate(t *testing.T) { 208 - rng := rand.New(rand.NewSource(123)) 209 - 210 - var wg sync.WaitGroup 211 - var lock sync.Mutex 212 - cpuCount := 
runtime.NumCPU() 213 - 214 - var hasher bloomHash 215 - var hname string 216 - 217 - switch *hasherNum { 218 - case 0: 219 - hasher = bloomHasherCRC 220 - hname = "crc" 221 - case 1: 222 - hasher = bloomHasherCRCBlocked64B8K3 223 - hname = "crcblock64k3_8" 224 - } 225 - t.Log("hasher:", hname) 226 - 227 - targetRate := []int{} 228 - for _, n := range strings.Split(*loadPerc, " ") { 229 - tr, err := strconv.Atoi(n) 230 - if err != nil { 231 - t.Fatal(err) 232 - } 233 - targetRate = append(targetRate, tr) 234 - } 235 - t.Log("load percentage targets:", targetRate) 236 - totsize := make([]int, len(targetRate)) 237 - docsize := 0 238 - 239 - docs := [][]byte{} 240 - docNames := []string{} 241 - blooms := [][]bloom{} 242 - 243 - addDoc := func(name string, doc []byte, parallel bool) ([]bloom, int) { 244 - b := makeBloomFilterWithHasher(hasher) 245 - b.addBytes(doc) 246 - bs := []bloom{} 247 - for _, r := range targetRate { 248 - bs = append(bs, b.shrinkToSize(float64(r)*0.01)) 249 - } 250 - 251 - if parallel { 252 - lock.Lock() 253 - } 254 - docNames = append(docNames, name) 255 - blooms = append(blooms, bs) 256 - i := len(blooms) 257 - docs = append(docs, doc) 258 - docsize += len(doc) 259 - for i, b := range bs { 260 - totsize[i] += b.Len() 261 - } 262 - if parallel { 263 - lock.Unlock() 264 - } 265 - return bs, i 266 - } 267 - 268 - if *ngramDataDir != "" { 269 - dirents, err := os.ReadDir(*ngramDataDir) 270 - if err != nil { 271 - t.Fatal(err) 272 - } 273 - sort.Slice(dirents, func(i, j int) bool { 274 - return dirents[i].Name() < dirents[j].Name() 275 - }) 276 - 277 - if *docCount > 0 && *docCount < len(dirents) { 278 - dirents = dirents[:*docCount] 279 - } 280 - 281 - work := make(chan fs.DirEntry) 282 - 283 - for i := 0; i < cpuCount; i++ { 284 - go func() { 285 - for dirent := range work { 286 - doc, err := os.ReadFile(path.Join(*ngramDataDir, dirent.Name())) 287 - if err != nil { 288 - t.Error(err) 289 - return 290 - } 291 - b, i := addDoc(dirent.Name(), doc, 
true) 292 - if i%100 == 0 { 293 - fmt.Println(i, bytes.Count(doc, []byte{'\n'}), b[0].Len(), b[0].load(), 294 - dirent.Name()) 295 - } 296 - wg.Done() 297 - } 298 - }() 299 - } 300 - for _, dirent := range dirents { 301 - if dirent.IsDir() { 302 - continue 303 - } 304 - wg.Add(1) 305 - work <- dirent 306 - } 307 - close(work) 308 - wg.Wait() 309 - } else { 310 - if *docCount == 0 { 311 - *docCount = 4 312 - } 313 - for i := 0; i < *docCount; i++ { 314 - wordCount := 100 + rng.Intn(100)*rng.Intn(100) 315 - doc := []byte{} 316 - for j := 0; j < wordCount; j++ { 317 - doc = append(doc, randWord(4, 7, rng)...) 318 - doc = append(doc, '\n') 319 - } 320 - b, l := addDoc(fmt.Sprintf("%04d", i), doc, false) 321 - t.Log(l, wordCount, b[0].Len(), b[0].load()) 322 - } 323 - } 324 - 325 - // sort docs by name for more deterministic output 326 - sort.Sort(&customSort{ 327 - len: func() int { return len(docs) }, 328 - less: func(i, j int) bool { return docNames[i] < docNames[j] }, 329 - swap: func(i, j int) { 330 - docNames[i], docNames[j] = docNames[j], docNames[i] 331 - docs[i], docs[j] = docs[j], docs[i] 332 - blooms[i], blooms[j] = blooms[j], blooms[i] 333 - }, 334 - }) 335 - 336 - t.Logf("loaded %d docs (%d MB / avg %d KB)", len(docs), docsize/1024/1024, docsize/len(docs)/1024) 337 - 338 - probes := [][]byte{} 339 - probeHashes := [][]uint32{} 340 - for _, doc := range docs { 341 - ws := bytes.Split(doc, []byte{'\n'}) 342 - n := 0 343 - for _, w := range ws { 344 - if len(w) == 0 || len(w) > 20 { 345 - continue 346 - } 347 - if w[0] < '0' || w[0] > '9' { 348 - ws[n] = w 349 - n++ 350 - } 351 - } 352 - ws = ws[:n] 353 - 354 - rng.Shuffle(len(ws), func(i, j int) { 355 - ws[i], ws[j] = ws[j], ws[i] 356 - }) 357 - if len(ws) > 100 { 358 - ws = ws[:100] 359 - } 360 - if len(docs) > 1000 && len(ws) > 10 { 361 - ws = ws[:10] 362 - } 363 - probes = append(probes, ws...) 
364 - for _, w := range ws { 365 - probeHashes = append(probeHashes, blooms[0][0].hasher(w)) 366 - } 367 - } 368 - t.Logf("created %d probes", len(probes)) 369 - 370 - fpCount := make([][]int, len(docs)) // false positive 371 - tpCount := make([][]int, len(docs)) // true positive 372 - tnCount := make([][]int, len(docs)) // true negative 373 - // false negative is impossible in bloom filters 374 - 375 - for i := 0; i < len(docs); i++ { 376 - fpCount[i] = make([]int, len(targetRate)+1) 377 - tpCount[i] = make([]int, len(targetRate)+1) 378 - tnCount[i] = make([]int, len(targetRate)+1) 379 - } 380 - 381 - work := make(chan int) 382 - 383 - for n := 0; n < cpuCount; n++ { 384 - wg.Add(1) 385 - go func() { 386 - for i := range work { 387 - // compute all ngrams that might be tested to reduce probing 388 - // time complexity from O(mn) to O(mlogn+nlogn) 389 - gram := make([]string, 0, len(docs[i])/8) 390 - // also compute trigrams for FPR baseling 391 - trigrams := map[string]bool{} 392 - for _, s := range bytes.Split(docs[i], []byte{'\n'}) { 393 - for i := 0; i <= len(s)-4; i++ { 394 - if '0' <= s[i] && s[i] <= '9' { 395 - continue 396 - } 397 - for j := i + 4; j < i+20 && j <= len(s); j++ { 398 - gram = append(gram, string(s[i:j])) 399 - } 400 - if *trigramFpr { 401 - trigrams[string(s[i:i+3])] = true 402 - trigrams[string(s[i+1:i+4])] = true 403 - } 404 - } 405 - } 406 - sort.Strings(gram) 407 - 408 - for j, w := range probes { 409 - gidx := -1 410 - trueValue := false 411 - 412 - if *trigramFpr { 413 - maybeTrigrams := true 414 - for wo := 0; wo < len(w)-3; wo++ { 415 - if !trigrams[string(w[wo:wo+3])] { 416 - maybeTrigrams = false 417 - break 418 - } 419 - } 420 - if maybeTrigrams { 421 - gidx = sort.SearchStrings(gram, string(w)) 422 - trueValue = gidx >= 0 && gidx < len(gram) && gram[gidx] == string(w) 423 - if trueValue { 424 - tpCount[i][len(targetRate)]++ 425 - } else { 426 - fpCount[i][len(targetRate)]++ 427 - } 428 - } else { 429 - 
tnCount[i][len(targetRate)]++ 430 - } 431 - } 432 - 433 - for bn, b := range blooms[i] { 434 - maybeHas := b.maybeHas(probeHashes[j]) 435 - if maybeHas { 436 - if gidx == -1 { 437 - gidx = sort.SearchStrings(gram, string(w)) 438 - trueValue = gidx >= 0 && gidx < len(gram) && gram[gidx] == string(w) 439 - } 440 - if trueValue { 441 - tpCount[i][bn]++ 442 - } else { 443 - fpCount[i][bn]++ 444 - } 445 - } else { 446 - tnCount[i][bn]++ 447 - } 448 - } 449 - } 450 - } 451 - wg.Done() 452 - }() 453 - } 454 - 455 - for i := 0; i < len(docs); i++ { 456 - work <- i 457 - } 458 - close(work) 459 - wg.Wait() 460 - 461 - summer := make([]kahanSummer, len(targetRate)+1) 462 - for i := 0; i < len(docs); i++ { 463 - fprs := []string{} 464 - for bn := 0; bn < len(targetRate); bn++ { 465 - fpr := float64(fpCount[i][bn]) / float64(fpCount[i][bn]+tnCount[i][bn]) 466 - if math.IsNaN(fpr) { 467 - fpr = 1 468 - } 469 - if fpr > 0.1 { 470 - t.Errorf("false positive rate %.04f > 0.01", fpr) 471 - } 472 - summer[bn].add(fpr) 473 - if *docFpr { 474 - fprs = append(fprs, fmt.Sprintf("%5.2f", 100*fpr)) 475 - } 476 - } 477 - if *docFpr { 478 - fmt.Printf("doc: %4d bits: %8d fprs: %v name: %s\n", i, blooms[i][0].Len(), fprs[:len(targetRate)], docNames[i]) 479 - } 480 - } 481 - t.Logf("hash=%s", hname) 482 - t.Log("load,fpr,avg size") 483 - for bn, rate := range targetRate { 484 - t.Logf("%d, %.03f, %d\n", rate, 100*summer[bn].avg(), totsize[bn]/8/len(docs)) 485 - } 486 - if *trigramFpr { 487 - t.Logf("trigram fpr: %.03f\n", 100*summer[len(targetRate)].avg()) 488 - } 489 - } 490 - 491 - type kahanSummer struct { // Kahan Summation 492 - sum float64 493 - c float64 494 - n int 495 - } 496 - 497 - func (k *kahanSummer) add(x float64) { 498 - y := x - k.c 499 - t := k.sum + y 500 - k.c = (t - k.sum) - y 501 - k.sum = t 502 - k.n++ 503 - } 504 - 505 - func (k *kahanSummer) avg() float64 { 506 - return k.sum / float64(k.n) 507 - }
-2
go.sum
··· 754 754 golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= 755 755 golang.org/x/net v0.0.0-20220325170049-de3da57026de/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= 756 756 golang.org/x/net v0.0.0-20220412020605-290c469a71a5/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= 757 - golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4 h1:HVyaeDAYux4pnY+D/SiwmLOR36ewZ4iGQIIrtnuCjFA= 758 - golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= 759 757 golang.org/x/net v0.0.0-20220927171203-f486391704dc h1:FxpXZdoBqT8RjqTy6i1E8nXHhW21wK7ptQ/EPIGxzPQ= 760 758 golang.org/x/net v0.0.0-20220927171203-f486391704dc/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= 761 759 golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
-27
index_test.go
··· 49 49 t.Helper() 50 50 51 51 b, err := NewIndexBuilder(repo) 52 - b.contentBloom.bits = b.contentBloom.bits[:bloomSizeTest] 53 - b.nameBloom.bits = b.nameBloom.bits[:bloomSizeTest] 54 52 if err != nil { 55 53 t.Fatalf("NewIndexBuilder: %v", err) 56 54 } ··· 69 67 70 68 b := newIndexBuilder() 71 69 b.indexFormatVersion = NextIndexFormatVersion 72 - b.contentBloom.bits = b.contentBloom.bits[:bloomSizeTest] 73 - b.nameBloom.bits = b.nameBloom.bits[:bloomSizeTest] 74 70 75 71 if len(repos) != len(docs) { 76 72 t.Fatalf("testIndexBuilderCompound: repos must be the same length as docs, got: len(repos)=%d len(docs)=%d", len(repos), len(docs)) ··· 123 119 124 120 if err := b.Add(doc); err == nil { 125 121 t.Errorf("doc sections beyond EOF should fail") 126 - } 127 - } 128 - 129 - func TestBloomSkip(t *testing.T) { 130 - for _, tc := range []struct { 131 - skip bool 132 - want int 133 - }{ 134 - {false, 1}, 135 - {true, 0}, 136 - } { 137 - if !tc.skip { 138 - os.Setenv("ZOEKT_ENABLE_BLOOM", "1") 139 - } 140 - b := testIndexBuilder(t, nil, 141 - Document{Name: "f1", Content: []byte("reader derre errea")}, 142 - ) 143 - res := searchForTest(t, b, &query.Substring{Pattern: "derrea"}) 144 - if res.Stats.ShardsSkippedFilter != tc.want { 145 - t.Errorf("bloom disabled=%v filtered out %v shards, want %v", 146 - tc.skip, res.Stats.ShardsSkippedFilter, tc.want) 147 - } 148 - os.Unsetenv("ZOEKT_ENABLE_BLOOM") 149 122 } 150 123 } 151 124
-7
indexbuilder.go
··· 179 179 contentPostings *postingsBuilder 180 180 namePostings *postingsBuilder 181 181 182 - contentBloom bloom 183 - nameBloom bloom 184 - 185 182 // root repositories 186 183 repoList []Repository 187 184 ··· 239 236 240 237 contentPostings: newPostingsBuilder(), 241 238 namePostings: newPostingsBuilder(), 242 - contentBloom: makeBloomFilterEmpty(), 243 - nameBloom: makeBloomFilterEmpty(), 244 239 fileEndSymbol: []uint32{0}, 245 240 symIndex: make(map[string]uint32), 246 241 symKindIndex: make(map[string]uint32), ··· 463 458 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath) 464 459 } 465 460 } 466 - b.contentBloom.addBytes(doc.Content) 467 - b.nameBloom.addBytes([]byte(doc.Name)) 468 461 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols) 469 462 if err != nil { 470 463 return err
-24
indexdata.go
··· 97 97 98 98 // rawConfigMasks contains the encoded RawConfig for each repository 99 99 rawConfigMasks []uint8 100 - 101 - // A bloom filter over file contents. 102 - bloomContents bloom 103 - 104 - // A bloom filter over filenames. 105 - bloomNames bloom 106 100 } 107 101 108 102 type symbolData struct { ··· 379 373 380 374 func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) { 381 375 str := query.Pattern 382 - 383 - if len(query.Pattern) >= bloomHashMinWordLength { 384 - // test against appropriate content or filename bloom filters 385 - pat := []byte(query.Pattern) 386 - var match bool 387 - if query.FileName { 388 - match = d.bloomNames.maybeHasBytes(pat) 389 - } else { 390 - match = d.bloomContents.maybeHasBytes(pat) 391 - } 392 - if !match { 393 - return &ngramIterationResults{ 394 - matchIterator: &noMatchTree{ 395 - Why: "bloomfilter", 396 - }, 397 - }, nil 398 - } 399 - } 400 376 401 377 // Find the 2 least common ngrams from the string. 402 378 ngramOffs := splitNGrams([]byte(query.Pattern))
+14 -2
merge_test.go
··· 108 108 109 109 // We could also use bytes.Equal, but the output of cmd.Diff is very helpful for 110 110 // differences in metadata. 111 - if d := cmp.Diff(b1, b2); d != "" { 112 - t.Fatalf("-%s\n+%s:\n%s", shard1, shard2, d) 111 + d := cmp.Diff(b1, b2) 112 + if d == "" { 113 + return 113 114 } 115 + 116 + if *update { 117 + t.Logf("updating %s", shard1) 118 + err := os.WriteFile(shard1, b2, 0600) 119 + if err != nil { 120 + t.Fatal(err) 121 + } 122 + return 123 + } 124 + 125 + t.Fatalf("-%s\n+%s:\n%s", shard1, shard2, d) 114 126 }
-24
read.go
··· 303 303 d.ngrams = ngramMap{offsetMap: offsetMap} 304 304 } 305 305 306 - if os.Getenv("ZOEKT_ENABLE_BLOOM") != "" { 307 - d.bloomContents, err = d.readBloom(toc.contentBloom) 308 - if err != nil { 309 - return nil, err 310 - } 311 - 312 - d.bloomNames, err = d.readBloom(toc.nameBloom) 313 - if err != nil { 314 - return nil, err 315 - } 316 - } 317 - 318 306 d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks) 319 307 if err != nil { 320 308 return nil, err ··· 581 569 } 582 570 583 571 return unmarshalDocSections(blob, buf), sec.sz, nil 584 - } 585 - 586 - func (d *indexData) readBloom(sec simpleSection) (bloom, error) { 587 - if sec.sz == 0 { 588 - // an empty bloom filter is fine 589 - return bloom{}, nil 590 - } 591 - data, err := d.readSectionBlob(sec) 592 - if err != nil { 593 - return bloom{}, err 594 - } 595 - return makeBloomFilterFromEncoded(data) 596 572 } 597 573 598 574 // NewSearcher creates a Searcher for a single index file. Search
testdata/shards/repo2_v16.00000.zoekt

This is a binary file and will not be displayed.

testdata/shards/repo_v16.00000.zoekt

This is a binary file and will not be displayed.

+7 -5
toc.go
··· 96 96 contentChecksums simpleSection 97 97 runeDocSections simpleSection 98 98 99 - contentBloom simpleSection 100 - nameBloom simpleSection 101 - 102 99 repos simpleSection 103 100 } 104 101 ··· 155 152 } 156 153 157 154 func (t *indexTOC) sectionsTaggedList() []taggedSection { 155 + var unusedSimple simpleSection 156 + 158 157 return []taggedSection{ 159 158 {"metaData", &t.metaData}, 160 159 {"repoMetaData", &t.repoMetaData}, ··· 180 179 {"languages", &t.languages}, 181 180 {"runeDocSections", &t.runeDocSections}, 182 181 {"repos", &t.repos}, 183 - {"nameBloom", &t.nameBloom}, 184 - {"contentBloom", &t.contentBloom}, 182 + 183 + // We no longer write bloom sections, but we still return them here to 184 + // avoid warnings about unknown sections. 185 + {"nameBloom", &unusedSimple}, 186 + {"contentBloom", &unusedSimple}, 185 187 } 186 188 } 187 189
-8
write.go
··· 139 139 } 140 140 toc.fileSections.end(w) 141 141 142 - toc.nameBloom.start(w) 143 - b.nameBloom.shrinkToSize(bloomDefaultLoad).write(w) 144 - toc.nameBloom.end(w) 145 - 146 - toc.contentBloom.start(w) 147 - b.contentBloom.shrinkToSize(bloomDefaultLoad).write(w) 148 - toc.contentBloom.end(w) 149 - 150 142 writePostings(w, b.contentPostings, &toc.ngramText, &toc.runeOffsets, &toc.postings, &toc.fileEndRunes) 151 143 152 144 // names.