remove ngram offset code (#616) · boltless.me/zoekt@3d0bdd5

+12

btree.go

··· 289 289 postingIndex simpleSection 290 290 } 291 291 292 + // SizeBytes returns how much memory this structure uses in the heap. 292 293 func (b btreeIndex) SizeBytes() (sz int) { 293 294 // btree 294 295 if b.bt != nil { ··· 401 402 return 402 403 } 403 404 405 + // DumpMap is a debug method which returns the btree as an in-memory 406 + // representation. This is how zoekt represents the ngram index in 407 + // google/zoekt. 404 408 func (b btreeIndex) DumpMap() map[ngram]simpleSection { 405 409 if b.bt == nil { 406 410 return nil ··· 427 431 428 432 return m 429 433 } 434 + 435 + // GetBlob returns the raw encoded offset list for ng. 436 + // 437 + // Note: the returned byte slice is mmap backed normally. 438 + func (b btreeIndex) GetBlob(ng ngram) ([]byte, error) { 439 + sec := b.Get(ng) 440 + return b.file.Read(sec.off, sec.sz) 441 + }

-4

hititer.go

··· 110 110 } 111 111 112 112 func (d *indexData) trigramHitIterator(ng ngram, caseSensitive, fileName bool) (hitIterator, error) { 113 - if d.ngrams == nil { 114 - return nil, fmt.Errorf("trigramHitIterator: ngrams=nil") 115 - } 116 - 117 113 variants := []ngram{ng} 118 114 if !caseSensitive { 119 115 variants = generateCaseNgrams(ng)

+4 -11

indexdata.go

··· 33 33 34 34 file IndexFile 35 35 36 - ngrams ngramIndex 36 + ngrams btreeIndex 37 37 38 38 newlinesStart uint32 39 39 newlinesIndex []uint32 ··· 56 56 57 57 fileNameContent []byte 58 58 fileNameIndex []uint32 59 - fileNameNgrams fileNameNgrams 59 + fileNameNgrams btreeIndex 60 60 61 61 // fileEndSymbol[i] is the index of the first symbol for document i. 62 62 fileEndSymbol []uint32 ··· 314 314 } 315 315 sz += 8 * len(d.runeDocSections) 316 316 sz += 8 * len(d.fileBranchMasks) 317 - if d.ngrams != nil { 318 - sz += d.ngrams.SizeBytes() 319 - } 317 + sz += d.ngrams.SizeBytes() 320 318 sz += d.fileNameNgrams.SizeBytes() 321 319 return sz 322 320 } ··· 349 347 350 348 func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 { 351 349 if filename { 352 - return data.fileNameNgrams.Frequency(ng) 350 + return data.fileNameNgrams.Get(ng).sz 353 351 } 354 - 355 - if data.ngrams == nil { 356 - return 0 357 - } 358 - 359 352 return data.ngrams.Get(ng).sz 360 353 } 361 354

-401

ngramoffset.go

··· 1 - // Copyright 2021 Google Inc. All rights reserved. 2 - // 3 - // Licensed under the Apache License, Version 2.0 (the "License"); 4 - // you may not use this file except in compliance with the License. 5 - // You may obtain a copy of the License at 6 - // 7 - // http://www.apache.org/licenses/LICENSE-2.0 8 - // 9 - // Unless required by applicable law or agreed to in writing, software 10 - // distributed under the License is distributed on an "AS IS" BASIS, 11 - // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 - // See the License for the specific language governing permissions and 13 - // limitations under the License. 14 - 15 - package zoekt 16 - 17 - import ( 18 - "sort" 19 - ) 20 - 21 - // shrinkUint32Slice copies slices with excess capacity to precisely sized ones 22 - // to avoid wasting memory. It should be used on slices with long static durations. 23 - func shrinkUint32Slice(a []uint32) []uint32 { 24 - if cap(a)-len(a) < 32 { 25 - return a 26 - } 27 - out := make([]uint32, len(a)) 28 - copy(out, a) 29 - return out 30 - } 31 - 32 - type topOffset struct { 33 - top, off uint32 34 - } 35 - 36 - // arrayNgramOffset splits ngrams into two 32-bit parts and uses binary search 37 - // to satisfy requests. A three-level trie (over the runes of an ngram) uses 20% 38 - // more memory than this simple two-level split. 39 - type arrayNgramOffset struct { 40 - // tops specify where the bottom halves of ngrams with the 32-bit top half begin. 41 - // The offset of the next value is used to find where the bottom section ends. 42 - tops []topOffset 43 - 44 - // bots are bottom halves of an ngram, referenced by tops 45 - bots []uint32 46 - 47 - // offsets is values from simpleSection.off, simpleSection.sz is computed by subtracting 48 - // adjacent offsets. 49 - offsets []uint32 50 - } 51 - 52 - func makeArrayNgramOffset(ngrams []ngram, offsets []uint32) arrayNgramOffset { 53 - arr := arrayNgramOffset{ 54 - bots: make([]uint32, 0, len(ngrams)), 55 - } 56 - arr.offsets = shrinkUint32Slice(offsets) 57 - 58 - lastTop := uint32(0xffffffff) 59 - lastStart := uint32(0) 60 - for i, v := range ngrams { 61 - curTop := uint32(v >> 32) 62 - if curTop != lastTop { 63 - if lastTop != 0xffffffff { 64 - arr.tops = append(arr.tops, topOffset{lastTop, lastStart}) 65 - } 66 - lastTop = curTop 67 - lastStart = uint32(i) 68 - } 69 - arr.bots = append(arr.bots, uint32(v)) 70 - } 71 - // add a sentinel value to make it simple to compute sizes 72 - arr.tops = append(arr.tops, topOffset{lastTop, lastStart}, topOffset{0xffffffff, uint32(len(arr.bots))}) 73 - 74 - // shrink arr.tops to minimal size 75 - tops := make([]topOffset, len(arr.tops)) 76 - copy(tops, arr.tops) 77 - arr.tops = tops 78 - 79 - return arr 80 - } 81 - 82 - func (a *arrayNgramOffset) Get(gram ngram) simpleSection { 83 - if a.tops == nil { 84 - return simpleSection{} 85 - } 86 - 87 - top, bot := uint32(uint64(gram)>>32), uint32(gram) 88 - 89 - topIdx := sort.Search(len(a.tops)-1, func(i int) bool { return a.tops[i].top >= top }) 90 - if topIdx == len(a.tops)-1 || a.tops[topIdx].top != top { 91 - return simpleSection{} 92 - } 93 - 94 - botsSec := a.bots[a.tops[topIdx].off:a.tops[topIdx+1].off] 95 - botIdx := sort.Search(len(botsSec), func(i int) bool { return botsSec[i] >= bot }) 96 - if botIdx == len(botsSec) || botsSec[botIdx] != bot { 97 - return simpleSection{} 98 - } 99 - idx := botIdx + int(a.tops[topIdx].off) 100 - return simpleSection{ 101 - off: a.offsets[idx], 102 - sz: a.offsets[idx+1] - a.offsets[idx], 103 - } 104 - } 105 - 106 - func (a *arrayNgramOffset) DumpMap() map[ngram]simpleSection { 107 - m := make(map[ngram]simpleSection, len(a.offsets)-1) 108 - for i := 0; i < len(a.tops)-1; i++ { 109 - top, botStart, botEnd := a.tops[i].top, a.tops[i].off, a.tops[i+1].off 110 - botSec := a.bots[botStart:botEnd] 111 - for j, bot := range botSec { 112 - idx := int(botStart) + j 113 - m[ngram(uint64(top)<<32|uint64(bot))] = simpleSection{ 114 - off: a.offsets[idx], 115 - sz: a.offsets[idx+1] - a.offsets[idx], 116 - } 117 - } 118 - } 119 - return m 120 - } 121 - 122 - func (a *arrayNgramOffset) SizeBytes() int { 123 - return 8*len(a.tops) + 4*len(a.bots) + 4*len(a.offsets) 124 - } 125 - 126 - // combinedNgramOffset combines an ascii ngram mapping with a unicode ngram mapping, 127 - // falling back on unicode for unicode ngrams or ascii ngrams with excessive lengths. 128 - type combinedNgramOffset struct { 129 - asc *asciiNgramOffset 130 - uni *arrayNgramOffset 131 - } 132 - 133 - func makeCombinedNgramOffset(ngrams []ngram, offsets []uint32) combinedNgramOffset { 134 - // split ngrams & offsets into ascii ngrams and unicode ngrams, 135 - // since ascii ngrams can be represented much more compactly (21b instead of 63b) 136 - 137 - // allocate these arrays based off of rough measurements of what their typical 138 - // sizes are-- source code is mostly ASCII with a little bit of Unicode. 139 - // Allocating 101% of the total number of ngrams gives a little space for the 140 - // duplicate entries used to mark section ends. 141 - ngramsAscii := make([]ngramAscii, 0, len(ngrams)*101/100) 142 - offsetsAscii := make([]uint32, 0, len(ngrams)*101/100) 143 - 144 - ngramsUnicode := make([]ngram, 0, len(ngrams)*11/100) 145 - offsetsUnicode := make([]uint32, 0, len(ngrams)*11/100) 146 - 147 - for i, ng := range ngrams { 148 - if ng&ngramAsciiMask == ng { // is ngram ascii-only? 149 - ngp := ngramAsciiToPacked(ng) 150 - if i == len(ngrams)-1 || ngrams[i+1]&ngramAsciiMask != ngrams[i+1] { 151 - // at the end of a section we insert an extra offset with the same ngram, 152 - // so the size of the segment can be calculated properly 153 - ngramsAscii = append(ngramsAscii, ngp, ngp) 154 - offsetsAscii = append(offsetsAscii, offsets[i], offsets[i+1]) 155 - } else { 156 - ngramsAscii = append(ngramsAscii, ngp) 157 - offsetsAscii = append(offsetsAscii, offsets[i]) 158 - } 159 - // note: len(offsets) == len(ngrams) + 1 160 - if offsets[i+1]-offsets[i] >= ngramAsciiMaxSectionLength { 161 - // max-length ascii sections can't be represented properly in the ascii mapping, 162 - // and are duplicated in the normal unicode entries. 163 - ngramsUnicode = append(ngramsUnicode, ng, ng) 164 - offsetsUnicode = append(offsetsUnicode, offsets[i], offsets[i+1]) 165 - } 166 - } else { 167 - if i == len(ngrams)-1 || ngrams[i+1]&ngramAsciiMask == ngrams[i+1] { 168 - ngramsUnicode = append(ngramsUnicode, ng, ng) 169 - offsetsUnicode = append(offsetsUnicode, offsets[i], offsets[i+1]) 170 - } else { 171 - ngramsUnicode = append(ngramsUnicode, ng) 172 - offsetsUnicode = append(offsetsUnicode, offsets[i]) 173 - } 174 - } 175 - } 176 - 177 - // The last segment always has an extra trailing ngram entry that we don't need, and 178 - // is only present for spacing and alignment. Trim it. 179 - if len(ngramsAscii) > 0 { 180 - ngramsAscii = ngramsAscii[:len(ngramsAscii)-1] 181 - } 182 - if len(ngramsUnicode) > 0 { 183 - ngramsUnicode = ngramsUnicode[:len(ngramsUnicode)-1] 184 - } 185 - 186 - asc := makeAsciiNgramOffset(ngramsAscii, offsetsAscii) 187 - uni := makeArrayNgramOffset(ngramsUnicode, offsetsUnicode) 188 - 189 - return combinedNgramOffset{asc, &uni} 190 - } 191 - 192 - // Get returns a simpleSection with sz=0 if no entry, otherwise the appropriate 193 - // offset based on the underlying ASCII or Unicode offset index. 194 - func (a combinedNgramOffset) Get(gram ngram) simpleSection { 195 - if a.asc == nil { 196 - return simpleSection{} 197 - } 198 - 199 - var sec simpleSection 200 - if gram&ngramAsciiMask == gram { 201 - sec = a.asc.Get(gram) 202 - if sec.sz == ngramAsciiMaxSectionLength { 203 - // Fallback: this section's length was too long to store in the 204 - // ASCII map, find it in the Unicode map. 205 - sec = a.uni.Get(gram) 206 - } 207 - } else { 208 - sec = a.uni.Get(gram) 209 - } 210 - 211 - return sec 212 - } 213 - 214 - func (a combinedNgramOffset) DumpMap() map[ngram]simpleSection { 215 - m := a.asc.DumpMap() 216 - for k, v := range a.uni.DumpMap() { 217 - m[k] = v 218 - } 219 - return m 220 - } 221 - 222 - func (a combinedNgramOffset) SizeBytes() int { 223 - return a.asc.SizeBytes() + a.uni.SizeBytes() 224 - } 225 - 226 - const ngramAsciiMask = 127 | 127<<21 | 127<<42 227 - 228 - // Ascii mapping packs 3*7b chars and 11 bits of lengths, with this as the set maximum. 229 - // We could save another ~3% of total RAM / 5% of combinedNgramOffset RAM by switching to 230 - // a 40b packing with 19-bit lengths, but the code would be significantly uglier so it doesn't 231 - // seem worth it. 232 - const ngramAsciiMaxSectionLength = (1 << 11) - 1 233 - 234 - type ngramAscii uint32 235 - 236 - func ngramAsciiToPacked(ng ngram) ngramAscii { 237 - return ngramAscii(uint32(ng&127) | uint32((ng>>(21-7))&(127<<7)) | uint32((ng>>(42-14))&(127<<14))) 238 - } 239 - 240 - func ngramAsciiPackedToNgram(ng ngramAscii) ngram { 241 - return ngram(ng&127) | ngram(ng&(127<<7))<<(21-7) | ngram(ng&(127<<14))<<(42-14) 242 - } 243 - 244 - // asciiNgramOffset stores ascii trigrams packed together with short lengths, 245 - // using offsets for a chunk of entries to limit the number of lengths that must 246 - // be summed to compute a section's offset. 247 - type asciiNgramOffset struct { 248 - entries []uint32 // (chara << 25 | charb << 18 | charc << 11 | length) 249 - chunkOffsets []uint32 // offset for entries[i*asciiNgramOffsetChunkLength] 250 - } 251 - 252 - // asciiNgramOffsetChunkLength specifies how many entries share one initial offset. 253 - // It must be a power of 2, and was chosen empirically by measuring RAM usage: 254 - // 8: 4132MB, 16: 4047MB, 32: 4006MB, 64: 3992MB, 128: 3990MB 255 - const asciiNgramOffsetChunkLength = 32 256 - 257 - func makeAsciiNgramOffset(ngrams []ngramAscii, offsets []uint32) *asciiNgramOffset { 258 - ao := &asciiNgramOffset{ 259 - entries: make([]uint32, 0, len(ngrams)), 260 - chunkOffsets: make([]uint32, 0, len(ngrams)/asciiNgramOffsetChunkLength), 261 - } 262 - 263 - for i, ng := range ngrams { 264 - if len(ao.entries)%asciiNgramOffsetChunkLength == 0 { 265 - ao.chunkOffsets = append(ao.chunkOffsets, offsets[i]) 266 - } 267 - length := offsets[i+1] - offsets[i] 268 - 269 - for { 270 - if length < ngramAsciiMaxSectionLength { 271 - ao.entries = append(ao.entries, uint32(ng)<<11|length) 272 - break 273 - } else { 274 - // entries with lengths that are too long can't be represented fully in this 275 - // map, but we repeatedly insert offsets to make the next entry's offset computable 276 - // by summing the offsets in the preceding entries in the chunk, including 277 - // this invalid one. 278 - ao.entries = append(ao.entries, uint32(ng)<<11|ngramAsciiMaxSectionLength) 279 - length -= ngramAsciiMaxSectionLength 280 - if len(ao.entries)%asciiNgramOffsetChunkLength == 0 { 281 - // We reached the end of the chunk, so there's no need to reach the 282 - // offset for the next entry. 283 - break 284 - } 285 - } 286 - } 287 - } 288 - 289 - ao.entries = shrinkUint32Slice(ao.entries) 290 - ao.chunkOffsets = shrinkUint32Slice(ao.chunkOffsets) 291 - 292 - return ao 293 - } 294 - 295 - // Get returns a simpleSection with sz=0 if no entry, or sz=ngramAsciiMaxSectionLength 296 - // if the length of the ngram is too large for this type and it should cascade to the next entry. 297 - func (a *asciiNgramOffset) Get(gram ngram) simpleSection { 298 - if gram&ngramAsciiMask != gram { 299 - return simpleSection{} 300 - } 301 - g := uint32(ngramAsciiToPacked(gram) << 11) 302 - 303 - idx := sort.Search(len(a.entries), func(i int) bool { 304 - return a.entries[i] >= g 305 - }) 306 - 307 - if idx == len(a.entries) || a.entries[idx]>>11 != g>>11 { 308 - return simpleSection{} 309 - } 310 - 311 - length := a.entries[idx] & ngramAsciiMaxSectionLength 312 - if length == ngramAsciiMaxSectionLength { 313 - // this ascii ngram's section length is too large to be represented; 314 - // repeate the Get() on the unicode map to get the correct result. 315 - return simpleSection{ 316 - off: 0, 317 - sz: ngramAsciiMaxSectionLength, 318 - } 319 - } 320 - 321 - chunkNum := idx / asciiNgramOffsetChunkLength 322 - chunkBase := chunkNum * asciiNgramOffsetChunkLength 323 - offset := a.chunkOffsets[chunkNum] 324 - for i := chunkBase; i < idx; i++ { 325 - offset += a.entries[i] & ngramAsciiMaxSectionLength 326 - } 327 - 328 - return simpleSection{ 329 - off: offset, 330 - sz: length, 331 - } 332 - } 333 - 334 - func (a *asciiNgramOffset) DumpMap() map[ngram]simpleSection { 335 - m := make(map[ngram]simpleSection, len(a.entries)) 336 - off := uint32(0) 337 - for i, ent := range a.entries { 338 - if i%asciiNgramOffsetChunkLength == 0 { 339 - off = a.chunkOffsets[i/asciiNgramOffsetChunkLength] 340 - } 341 - length := ent & ngramAsciiMaxSectionLength 342 - if length == ngramAsciiMaxSectionLength { 343 - // This entry is an ascii gram with a section too long 344 - // to be represented, so skip the entry. 345 - continue 346 - } 347 - m[ngramAsciiPackedToNgram(ngramAscii(ent>>11))] = simpleSection{ 348 - off: off, 349 - sz: length, 350 - } 351 - off += length 352 - } 353 - return m 354 - } 355 - 356 - func (a *asciiNgramOffset) SizeBytes() int { 357 - return 4*len(a.entries) + 4*len(a.chunkOffsets) 358 - } 359 - 360 - type ngramIndex interface { 361 - Get(gram ngram) simpleSection 362 - DumpMap() map[ngram]simpleSection 363 - SizeBytes() int 364 - } 365 - 366 - // This is a temporary type to wrap two very different implementations of the 367 - // inverted index for the purpose of feature-flagging. We will remove this after 368 - // we enable the b-tree permanently. 369 - // 370 - // Alternatively we could have adapted/extended the interface "ngramIndex". 371 - // However, adapting the existing implementations and their tests to match the 372 - // access pattern of map[ngram][]byte seems more cumbersome than this makeshift 373 - // wrapper. In the end, both ngramIndex and this wrapper will be replaced by a 374 - // concrete type. 375 - type fileNameNgrams struct { 376 - m map[ngram][]byte 377 - bt btreeIndex 378 - } 379 - 380 - func (n fileNameNgrams) GetBlob(ng ngram) ([]byte, error) { 381 - if n.m != nil { 382 - return n.m[ng], nil 383 - } 384 - sec := n.bt.Get(ng) 385 - return n.bt.file.Read(sec.off, sec.sz) 386 - } 387 - 388 - func (n fileNameNgrams) Frequency(ng ngram) uint32 { 389 - if n.m != nil { 390 - return uint32(len(n.m[ng])) 391 - } 392 - return n.bt.Get(ng).sz 393 - } 394 - 395 - func (n fileNameNgrams) SizeBytes() int { 396 - if n.m != nil { 397 - // these slices reference mmap-ed memory 398 - return 12 * len(n.m) 399 - } 400 - return n.bt.SizeBytes() 401 - }

-166

ngramoffset_test.go

··· 1 - // Copyright 2021 Google Inc. All rights reserved. 2 - // 3 - // Licensed under the Apache License, Version 2.0 (the "License"); 4 - // you may not use this file except in compliance with the License. 5 - // You may obtain a copy of the License at 6 - // 7 - // http://www.apache.org/licenses/LICENSE-2.0 8 - // 9 - // Unless required by applicable law or agreed to in writing, software 10 - // distributed under the License is distributed on an "AS IS" BASIS, 11 - // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 - // See the License for the specific language governing permissions and 13 - // limitations under the License. 14 - 15 - package zoekt 16 - 17 - import ( 18 - "fmt" 19 - "math/rand" 20 - "sort" 21 - "testing" 22 - ) 23 - 24 - func TestMakeArrayNgramOffset(t *testing.T) { 25 - for n, tc := range []struct { 26 - ngrams []string 27 - offsets []uint32 28 - }{ 29 - {nil, nil}, 30 - {[]string{"ant", "any", "awl", "big", "bin", "bit", "can", "con"}, []uint32{0, 2, 5, 8, 10, 14, 18, 25, 30}}, 31 - } { 32 - ngrams := []ngram{} 33 - for _, s := range tc.ngrams { 34 - ngrams = append(ngrams, stringToNGram(s)) 35 - } 36 - m := makeArrayNgramOffset(ngrams, tc.offsets) 37 - t.Logf("makeNgramOffset(%v, %v) => %s", tc.ngrams, tc.offsets, &m) 38 - failn := stringToNGram("foo") 39 - if getFail := m.Get(failn); getFail != (simpleSection{}) { 40 - t.Errorf("#%d: Get(%q) got %v, want zero", n, failn, getFail) 41 - } 42 - for i := 0; i < len(tc.offsets)-1; i++ { 43 - want := simpleSection{tc.offsets[i], tc.offsets[i+1] - tc.offsets[i]} 44 - got := m.Get(ngrams[i]) 45 - if want != got { 46 - t.Errorf("#%d.%d: Get(%q) got %v, want %v", n, i, tc.ngrams[i], got, want) 47 - } 48 - failn := ngram(uint64(ngrams[i] - 1)) 49 - if getFail := m.Get(failn); getFail != (simpleSection{}) { 50 - t.Errorf("#%d.%d: Get(%q) got %v, want zero", n, i, failn, getFail) 51 - } 52 - failn = ngram(uint64(ngrams[i] + 1)) 53 - if getFail := m.Get(failn); getFail != (simpleSection{}) { 54 - t.Errorf("#%d.%d: Get(%q) got %v, want zero", n, i, failn, getFail) 55 - } 56 - } 57 - } 58 - } 59 - 60 - func TestMakeCombinedNgramOffset(t *testing.T) { 61 - // The ascii / unicode ngram offset splitting is significantly 62 - // more complicated. Exercise it with a more comprehensive test! 63 - unicodeProbability := 0.2 64 - ngramCount := 1000 65 - ngramMap := map[ngram]bool{} 66 - 67 - rng := rand.New(rand.NewSource(42)) 68 - 69 - randRune := func() rune { 70 - if rng.Float64() < unicodeProbability { 71 - return rune(0x100 + rand.Intn(0x80)) // Emoji 72 - } 73 - return rune('A' + rng.Intn('Z'-'A')) // A letter 74 - } 75 - 76 - for len(ngramMap) < ngramCount { 77 - ngramMap[runesToNGram([3]rune{randRune(), randRune(), randRune()})] = true 78 - } 79 - 80 - ngrams := []ngram{} 81 - for ng := range ngramMap { 82 - ngrams = append(ngrams, ng) 83 - } 84 - sort.Slice(ngrams, func(i, j int) bool { return ngrams[i] < ngrams[j] }) 85 - 86 - offset := uint32(0) 87 - offsets := []uint32{0} 88 - 89 - for i := 0; i < len(ngrams); i++ { 90 - // vary 91 - offset += uint32(ngramAsciiMaxSectionLength/2 + rand.Intn(ngramAsciiMaxSectionLength)) 92 - offsets = append(offsets, offset) 93 - } 94 - 95 - m := makeCombinedNgramOffset(ngrams, offsets) 96 - 97 - for i, ng := range ngrams { 98 - want := simpleSection{offsets[i], offsets[i+1] - offsets[i]} 99 - got := m.Get(ng) 100 - if want != got { 101 - t.Errorf("#%d: Get(%q) got %v, want %v", i, ng, got, want) 102 - } 103 - failn := ngram(uint64(ng - 1)) 104 - if getFail := m.Get(failn); !ngramMap[failn] && getFail != (simpleSection{}) { 105 - t.Errorf("#%d: Get(%q) got %v, want zero", i, failn, getFail) 106 - } 107 - failn = ngram(uint64(ng + 1)) 108 - if getFail := m.Get(failn); !ngramMap[failn] && getFail != (simpleSection{}) { 109 - t.Errorf("#%d: Get(%q) got %v, want zero", i, failn, getFail) 110 - } 111 - } 112 - 113 - if t.Failed() || true { 114 - t.Log(ngrams) 115 - t.Log(offsets) 116 - t.Log(m) 117 - } 118 - } 119 - 120 - func (a combinedNgramOffset) String() string { 121 - return fmt.Sprintf("combinedNgramOffset{\n asc: %s,\n uni: %s,\n}", a.asc, a.uni) 122 - } 123 - 124 - func (a *arrayNgramOffset) String() string { 125 - o := "arrayNgramOffset{tops:{" 126 - for i, p := range a.tops { 127 - if i > 0 { 128 - o += ", " 129 - } 130 - if p.top&1023 == 0 { 131 - // only one rune is represented here 132 - o += fmt.Sprintf("%s: %d", string(rune(p.top>>10)), p.off) 133 - } else { 134 - o += fmt.Sprintf("0x%x: %d", p.top>>10, p.off) 135 - } 136 - } 137 - o += "}, bots: {" 138 - for i, p := range a.bots { 139 - if i > 0 { 140 - o += ", " 141 - } 142 - if p < (256 << 21) { 143 - // two ascii-ish runes (probably) 144 - o += fmt.Sprintf("%s%s", string(rune(p>>21)), string(rune(p&runeMask))) 145 - } else { 146 - o += fmt.Sprintf("0x%x", p) 147 - } 148 - } 149 - o += fmt.Sprintf("}, offsets: %v}", a.offsets) 150 - return o 151 - } 152 - 153 - func (a *asciiNgramOffset) String() string { 154 - o := "asciiNgramOffset{entries:{" 155 - for i, e := range a.entries { 156 - ng := ngramAsciiPackedToNgram(ngramAscii(uint32(e) >> 11)) 157 - length := e & ngramAsciiMaxSectionLength 158 - if i > 0 { 159 - o += ", " 160 - } 161 - o += fmt.Sprintf("%s: %d", ng, length) 162 - } 163 - o += fmt.Sprintf("}, chunkOffsets: %v}", a.chunkOffsets) 164 - return o 165 - 166 - }

+4 -65

read.go

··· 288 288 return nil, err 289 289 } 290 290 291 - if os.Getenv("ZOEKT_DISABLE_BTREE") != "" { 292 - offsetMap, err := d.readNgrams(toc) 293 - if err != nil { 294 - return nil, err 295 - } 296 - d.ngrams = offsetMap 297 - } else { 298 - bt, err := d.newBtreeIndex(toc.ngramText, toc.postings) 299 - if err != nil { 300 - return nil, err 301 - } 302 - d.ngrams = bt 291 + d.ngrams, err = d.newBtreeIndex(toc.ngramText, toc.postings) 292 + if err != nil { 293 + return nil, err 303 294 } 304 295 305 296 d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks) ··· 314 305 315 306 d.fileNameIndex = toc.fileNames.relativeIndex() 316 307 317 - if os.Getenv("ZOEKT_DISABLE_BTREE") != "" { 318 - d.fileNameNgrams.m, err = d.readFileNameNgrams(toc) 319 - } else { 320 - d.fileNameNgrams.bt, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings) 321 - } 308 + d.fileNameNgrams, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings) 322 309 if err != nil { 323 310 return nil, err 324 311 } ··· 460 447 461 448 const ngramEncoding = 8 462 449 463 - func (d *indexData) readNgrams(toc *indexTOC) (combinedNgramOffset, error) { 464 - textContent, err := d.readSectionBlob(toc.ngramText) 465 - if err != nil { 466 - return combinedNgramOffset{}, err 467 - } 468 - postingsIndex := toc.postings.relativeIndex() 469 - 470 - for i := 0; i < len(postingsIndex); i++ { 471 - postingsIndex[i] += toc.postings.data.off 472 - } 473 - 474 - ngrams := make([]ngram, 0, len(textContent)/ngramEncoding) 475 - for i := 0; i < len(textContent); i += ngramEncoding { 476 - ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding])) 477 - ngrams = append(ngrams, ng) 478 - } 479 - 480 - return makeCombinedNgramOffset(ngrams, postingsIndex), nil 481 - } 482 - 483 450 func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSection) (btreeIndex, error) { 484 451 bi := btreeIndex{file: d.file} 485 452 ··· 505 472 bi.postingIndex = postings.index 506 473 507 474 return bi, nil 508 - } 509 - 510 - func (d *indexData) readFileNameNgrams(toc *indexTOC) (map[ngram][]byte, error) { 511 - nameNgramText, err := d.readSectionBlob(toc.nameNgramText) 512 - if err != nil { 513 - return nil, err 514 - } 515 - 516 - fileNamePostingsData, err := d.readSectionBlob(toc.namePostings.data) 517 - if err != nil { 518 - return nil, err 519 - } 520 - 521 - fileNamePostingsIndex := toc.namePostings.relativeIndex() 522 - 523 - fileNameNgrams := make(map[ngram][]byte, len(nameNgramText)/ngramEncoding) 524 - for i := 0; i < len(nameNgramText); i += ngramEncoding { 525 - j := i / ngramEncoding 526 - off := fileNamePostingsIndex[j] 527 - end := fileNamePostingsIndex[j+1] 528 - ng := ngram(binary.BigEndian.Uint64(nameNgramText[i : i+ngramEncoding])) 529 - fileNameNgrams[ng] = fileNamePostingsData[off:end] 530 - } 531 - 532 - return fileNameNgrams, nil 533 475 } 534 476 535 477 func (d *indexData) verify() error { ··· 713 655 return err 714 656 } 715 657 716 - if id.ngrams == nil { 717 - return fmt.Errorf("PrintNgramStats: ngrams=nil") 718 - } 719 658 var rNgram [3]rune 720 659 for ngram, ss := range id.ngrams.DumpMap() { 721 660 rNgram = ngramToRunes(ngram)

Configure Feed

Configure Feed