ngramoffset: add a dense asciiNgramOffset mapper and a combiner. (#90) · boltless.me/zoekt@6e5e87c

+1 -1

indexdata.go

··· 33 33 34 34 file IndexFile 35 35 36 - ngrams arrayNgramOffset 36 + ngrams combinedNgramOffset 37 37 38 38 newlinesStart uint32 39 39 newlinesIndex []uint32

+247 -3

ngramoffset.go

··· 18 18 "sort" 19 19 ) 20 20 21 + // shrinkUint32Slice copies slices with excess capacity to precisely seized ones 22 + // to avoid wasting memory. It should be used on slices with long static durations. 23 + func shrinkUint32Slice(a []uint32) []uint32 { 24 + if cap(a)-len(a) < 32 { 25 + return a 26 + } 27 + out := make([]uint32, len(a)) 28 + copy(out, a) 29 + return out 30 + } 31 + 21 32 type topOffset struct { 22 33 top, off uint32 23 34 } ··· 40 51 41 52 func makeArrayNgramOffset(ngrams []ngram, offsets []uint32) arrayNgramOffset { 42 53 arr := arrayNgramOffset{ 43 - bots: make([]uint32, 0, len(ngrams)), 44 - offsets: make([]uint32, len(offsets)), 54 + bots: make([]uint32, 0, len(ngrams)), 45 55 } 46 - copy(arr.offsets, offsets) // copy to ensure offsets is minimally sized 56 + arr.offsets = shrinkUint32Slice(offsets) 47 57 48 58 lastTop := uint32(0xffffffff) 49 59 lastStart := uint32(0) ··· 112 122 func (a *arrayNgramOffset) SizeBytes() int { 113 123 return 8*len(a.tops) + 4*len(a.bots) + 4*len(a.offsets) 114 124 } 125 + 126 + // combinedNgramOffset combines an ascii ngram mapping with a unicode ngram mapping, 127 + // falling back on unicode for unicode ngrams or ascii ngrams with excessive lengths. 128 + type combinedNgramOffset struct { 129 + asc *asciiNgramOffset 130 + uni *arrayNgramOffset 131 + } 132 + 133 + func makeCombinedNgramOffset(ngrams []ngram, offsets []uint32) combinedNgramOffset { 134 + // split ngrams & offsets into ascii ngrams and unicode ngrams, 135 + // since ascii ngrams can be represented much more compactly (21b instead of 63b) 136 + 137 + // allocate these arrays based off of rough measurements of what their typical 138 + // sizes are-- source code is mostly ASCII with a little bit of Unicode. 139 + // Allocating 101% of the total number of ngrams gives a little space for the 140 + // duplicate entries used to mark section ends. 141 + ngramsAscii := make([]ngramAscii, 0, len(ngrams)*101/100) 142 + offsetsAscii := make([]uint32, 0, len(ngrams)*101/100) 143 + 144 + ngramsUnicode := make([]ngram, 0, len(ngrams)*11/100) 145 + offsetsUnicode := make([]uint32, 0, len(ngrams)*11/100) 146 + 147 + for i, ng := range ngrams { 148 + if ng&ngramAsciiMask == ng { // is ngram ascii-only? 149 + ngp := ngramAsciiToPacked(ng) 150 + if i == len(ngrams)-1 || ngrams[i+1]&ngramAsciiMask != ngrams[i+1] { 151 + // at the end of a section we insert an extra offset with the same ngram, 152 + // so the size of the segment can be calculated properly 153 + ngramsAscii = append(ngramsAscii, ngp, ngp) 154 + offsetsAscii = append(offsetsAscii, offsets[i], offsets[i+1]) 155 + } else { 156 + ngramsAscii = append(ngramsAscii, ngp) 157 + offsetsAscii = append(offsetsAscii, offsets[i]) 158 + } 159 + // note: len(offsets) == len(ngrams) + 1 160 + if offsets[i+1]-offsets[i] >= ngramAsciiMaxSectionLength { 161 + // max-length ascii sections can't be represented properly in the ascii mapping, 162 + // and are duplicated in the normal unicode entries. 163 + ngramsUnicode = append(ngramsUnicode, ng, ng) 164 + offsetsUnicode = append(offsetsUnicode, offsets[i], offsets[i+1]) 165 + } 166 + } else { 167 + if i == len(ngrams)-1 || ngrams[i+1]&ngramAsciiMask == ngrams[i+1] { 168 + ngramsUnicode = append(ngramsUnicode, ng, ng) 169 + offsetsUnicode = append(offsetsUnicode, offsets[i], offsets[i+1]) 170 + } else { 171 + ngramsUnicode = append(ngramsUnicode, ng) 172 + offsetsUnicode = append(offsetsUnicode, offsets[i]) 173 + } 174 + } 175 + } 176 + 177 + // The last segment always has an extra trailing ngram entry that we don't need, and 178 + // is only present for spacing and alignment. Trim it. 179 + if len(ngramsAscii) > 0 { 180 + ngramsAscii = ngramsAscii[:len(ngramsAscii)-1] 181 + } 182 + if len(ngramsUnicode) > 0 { 183 + ngramsUnicode = ngramsUnicode[:len(ngramsUnicode)-1] 184 + } 185 + 186 + asc := makeAsciiNgramOffset(ngramsAscii, offsetsAscii) 187 + uni := makeArrayNgramOffset(ngramsUnicode, offsetsUnicode) 188 + 189 + return combinedNgramOffset{asc, &uni} 190 + } 191 + 192 + // Get returns a simpleSection with sz=0 if no entry, otherwise the appropriate 193 + // offset based on the underlying ASCII or Unicode offset index. 194 + func (a combinedNgramOffset) Get(gram ngram) simpleSection { 195 + if a.asc == nil { 196 + return simpleSection{} 197 + } 198 + 199 + var sec simpleSection 200 + if gram&ngramAsciiMask == gram { 201 + sec = a.asc.Get(gram) 202 + if sec.sz == ngramAsciiMaxSectionLength { 203 + // Fallback: this section's length was too long to store in the 204 + // ASCII map, find it in the Unicode map. 205 + sec = a.uni.Get(gram) 206 + } 207 + } else { 208 + sec = a.uni.Get(gram) 209 + } 210 + 211 + return sec 212 + } 213 + 214 + func (a combinedNgramOffset) DumpMap() map[ngram]simpleSection { 215 + m := a.asc.DumpMap() 216 + for k, v := range a.uni.DumpMap() { 217 + m[k] = v 218 + } 219 + return m 220 + } 221 + 222 + func (a combinedNgramOffset) SizeBytes() int { 223 + return a.asc.SizeBytes() + a.uni.SizeBytes() 224 + } 225 + 226 + const ngramAsciiMask = 127 | 127<<21 | 127<<42 227 + 228 + // Ascii mapping packs 3*7b chars and 11 bits of lengths, with this as the set maximum. 229 + // We could save another ~3% of total RAM / 5% of combinedNgramOffset RAM by switching to 230 + // a 40b packing with 19-bit lengths, but the code would be significantly uglier so it doesn't 231 + // seem worth it. 232 + const ngramAsciiMaxSectionLength = (1 << 11) - 1 233 + 234 + type ngramAscii uint32 235 + 236 + func ngramAsciiToPacked(ng ngram) ngramAscii { 237 + return ngramAscii(uint32(ng&127) | uint32((ng>>(21-7))&(127<<7)) | uint32((ng>>(42-14))&(127<<14))) 238 + } 239 + 240 + func ngramAsciiPackedToNgram(ng ngramAscii) ngram { 241 + return ngram(ng&127) | ngram(ng&(127<<7))<<(21-7) | ngram(ng&(127<<14))<<(42-14) 242 + } 243 + 244 + // asciiNgramOffset stores ascii trigrams packed together with short lengths, 245 + // using offsets for a chunk of entries to limit the number of lengths that must 246 + // be summed to compute a section's offset. 247 + type asciiNgramOffset struct { 248 + entries []uint32 // (chara << 25 | charb << 18 | charc << 11 | length) 249 + chunkOffsets []uint32 // offset for entries[i*asciiNgramOffsetChunkLength] 250 + } 251 + 252 + // asciiNgramOffsetChunkLength specifies how many entries share one initial offset. 253 + // It must be a power of 2, and was chosen empirically by measuring RAM usage: 254 + // 8: 4132MB, 16: 4047MB, 32: 4006MB, 64: 3992MB, 128: 3990MB 255 + const asciiNgramOffsetChunkLength = 32 256 + 257 + func makeAsciiNgramOffset(ngrams []ngramAscii, offsets []uint32) *asciiNgramOffset { 258 + ao := &asciiNgramOffset{ 259 + entries: make([]uint32, 0, len(ngrams)), 260 + chunkOffsets: make([]uint32, 0, len(ngrams)/asciiNgramOffsetChunkLength), 261 + } 262 + 263 + for i, ng := range ngrams { 264 + if len(ao.entries)%asciiNgramOffsetChunkLength == 0 { 265 + ao.chunkOffsets = append(ao.chunkOffsets, offsets[i]) 266 + } 267 + length := offsets[i+1] - offsets[i] 268 + 269 + for { 270 + if length < ngramAsciiMaxSectionLength { 271 + ao.entries = append(ao.entries, uint32(ng)<<11|length) 272 + break 273 + } else { 274 + // entries with lengths that are too long can't be represented fully in this 275 + // map, but we repeatedly insert offsets to make the next entry's offset computable 276 + // by summing the offsets in the preceding entries in the chunk, including 277 + // this invalid one. 278 + ao.entries = append(ao.entries, uint32(ng)<<11|ngramAsciiMaxSectionLength) 279 + length -= ngramAsciiMaxSectionLength 280 + if len(ao.entries)%asciiNgramOffsetChunkLength == 0 { 281 + // We reached the end of the chunk, so there's no need to reach the 282 + // offset for the next entry. 283 + break 284 + } 285 + } 286 + } 287 + } 288 + 289 + ao.entries = shrinkUint32Slice(ao.entries) 290 + ao.chunkOffsets = shrinkUint32Slice(ao.chunkOffsets) 291 + 292 + return ao 293 + } 294 + 295 + // Get returns a simpleSection with sz=0 if no entry, or sz=ngramAsciiMaxSectionLength 296 + // if the length of the ngram is too large for this type and it should cascade to the next entry. 297 + func (a *asciiNgramOffset) Get(gram ngram) simpleSection { 298 + if gram&ngramAsciiMask != gram { 299 + return simpleSection{} 300 + } 301 + g := uint32(ngramAsciiToPacked(gram) << 11) 302 + 303 + idx := sort.Search(len(a.entries), func(i int) bool { 304 + return a.entries[i] >= g 305 + }) 306 + 307 + if idx == len(a.entries) || a.entries[idx]>>11 != g>>11 { 308 + return simpleSection{} 309 + } 310 + 311 + length := a.entries[idx] & ngramAsciiMaxSectionLength 312 + if length == ngramAsciiMaxSectionLength { 313 + // this ascii ngram's section length is too large to be represented; 314 + // repeate the Get() on the unicode map to get the correct result. 315 + return simpleSection{ 316 + off: 0, 317 + sz: ngramAsciiMaxSectionLength, 318 + } 319 + } 320 + 321 + chunkNum := idx / asciiNgramOffsetChunkLength 322 + chunkBase := chunkNum * asciiNgramOffsetChunkLength 323 + offset := a.chunkOffsets[chunkNum] 324 + for i := chunkBase; i < idx; i++ { 325 + offset += a.entries[i] & ngramAsciiMaxSectionLength 326 + } 327 + 328 + return simpleSection{ 329 + off: offset, 330 + sz: length, 331 + } 332 + } 333 + 334 + func (a *asciiNgramOffset) DumpMap() map[ngram]simpleSection { 335 + m := make(map[ngram]simpleSection, len(a.entries)) 336 + off := uint32(0) 337 + for i, ent := range a.entries { 338 + if i%asciiNgramOffsetChunkLength == 0 { 339 + off = a.chunkOffsets[i/asciiNgramOffsetChunkLength] 340 + } 341 + length := ent & ngramAsciiMaxSectionLength 342 + if length == ngramAsciiMaxSectionLength { 343 + // This entry is an ascii gram with a section too long 344 + // to be represented, so skip the entry. 345 + continue 346 + } 347 + m[ngramAsciiPackedToNgram(ngramAscii(ent>>11))] = simpleSection{ 348 + off: off, 349 + sz: length, 350 + } 351 + off += length 352 + } 353 + return m 354 + } 355 + 356 + func (a *asciiNgramOffset) SizeBytes() int { 357 + return 4*len(a.entries) + 4*len(a.chunkOffsets) 358 + }

+83 -2

ngramoffset_test.go

··· 16 16 17 17 import ( 18 18 "fmt" 19 + "math/rand" 20 + "sort" 19 21 "testing" 20 22 ) 21 23 ··· 55 57 } 56 58 } 57 59 60 + func TestMakeCombinedNgramOffset(t *testing.T) { 61 + // The ascii / unicode ngram offset splitting is significantly 62 + // more complicated. Exercise it with a more comprehensive test! 63 + unicodeProbability := 0.2 64 + ngramCount := 1000 65 + ngramMap := map[ngram]bool{} 66 + 67 + rng := rand.New(rand.NewSource(42)) 68 + 69 + randRune := func() rune { 70 + if rng.Float64() < unicodeProbability { 71 + return rune(0x100 + rand.Intn(0x80)) // Emoji 72 + } 73 + return rune('A' + rng.Intn('Z'-'A')) // A letter 74 + } 75 + 76 + for len(ngramMap) < ngramCount { 77 + ngramMap[runesToNGram([3]rune{randRune(), randRune(), randRune()})] = true 78 + } 79 + 80 + ngrams := []ngram{} 81 + for ng := range ngramMap { 82 + ngrams = append(ngrams, ng) 83 + } 84 + sort.Slice(ngrams, func(i, j int) bool { return ngrams[i] < ngrams[j] }) 85 + 86 + offset := uint32(0) 87 + offsets := []uint32{0} 88 + 89 + for i := 0; i < len(ngrams); i++ { 90 + // vary 91 + offset += uint32(ngramAsciiMaxSectionLength/2 + rand.Intn(ngramAsciiMaxSectionLength)) 92 + offsets = append(offsets, offset) 93 + } 94 + 95 + m := makeCombinedNgramOffset(ngrams, offsets) 96 + 97 + for i, ng := range ngrams { 98 + want := simpleSection{offsets[i], offsets[i+1] - offsets[i]} 99 + got := m.Get(ng) 100 + if want != got { 101 + t.Errorf("#%d: Get(%q) got %v, want %v", i, ng, got, want) 102 + } 103 + failn := ngram(uint64(ng - 1)) 104 + if getFail := m.Get(failn); !ngramMap[failn] && getFail != (simpleSection{}) { 105 + t.Errorf("#%d: Get(%q) got %v, want zero", i, failn, getFail) 106 + } 107 + failn = ngram(uint64(ng + 1)) 108 + if getFail := m.Get(failn); !ngramMap[failn] && getFail != (simpleSection{}) { 109 + t.Errorf("#%d: Get(%q) got %v, want zero", i, failn, getFail) 110 + } 111 + } 112 + 113 + if t.Failed() || true { 114 + t.Log(ngrams) 115 + t.Log(offsets) 116 + t.Log(m) 117 + } 118 + } 119 + 120 + func (a combinedNgramOffset) String() string { 121 + return fmt.Sprintf("combinedNgramOffset{\n asc: %s,\n uni: %s,\n}", a.asc, a.uni) 122 + } 123 + 58 124 func (a *arrayNgramOffset) String() string { 59 125 o := "arrayNgramOffset{tops:{" 60 126 for i, p := range a.tops { ··· 65 131 // only one rune is represented here 66 132 o += fmt.Sprintf("%s: %d", string(rune(p.top>>10)), p.off) 67 133 } else { 68 - o += fmt.Sprintf("%x: %d", p.top>>10, p.off) 134 + o += fmt.Sprintf("0x%x: %d", p.top>>10, p.off) 69 135 } 70 136 } 71 137 o += "}, bots: {" ··· 77 143 // two ascii-ish runes (probably) 78 144 o += fmt.Sprintf("%s%s", string(rune(p>>21)), string(rune(p&runeMask))) 79 145 } else { 80 - o += fmt.Sprintf("%x", p) 146 + o += fmt.Sprintf("0x%x", p) 81 147 } 82 148 } 83 149 o += fmt.Sprintf("}, offsets: %v}", a.offsets) 84 150 return o 85 151 } 152 + 153 + func (a *asciiNgramOffset) String() string { 154 + o := "asciiNgramOffset{entries:{" 155 + for i, e := range a.entries { 156 + ng := ngramAsciiPackedToNgram(ngramAscii(uint32(e) >> 11)) 157 + length := e & ngramAsciiMaxSectionLength 158 + if i > 0 { 159 + o += ", " 160 + } 161 + o += fmt.Sprintf("%s: %d", ng, length) 162 + } 163 + o += fmt.Sprintf("}, chunkOffsets: %v}", a.chunkOffsets) 164 + return o 165 + 166 + }

+3 -3

read.go

··· 318 318 319 319 const ngramEncoding = 8 320 320 321 - func (d *indexData) readNgrams(toc *indexTOC) (arrayNgramOffset, error) { 321 + func (d *indexData) readNgrams(toc *indexTOC) (combinedNgramOffset, error) { 322 322 textContent, err := d.readSectionBlob(toc.ngramText) 323 323 if err != nil { 324 - return arrayNgramOffset{}, err 324 + return combinedNgramOffset{}, err 325 325 } 326 326 postingsIndex := toc.postings.relativeIndex() 327 327 ··· 335 335 ngrams = append(ngrams, ng) 336 336 } 337 337 338 - return makeArrayNgramOffset(ngrams, postingsIndex), nil 338 + return makeCombinedNgramOffset(ngrams, postingsIndex), nil 339 339 } 340 340 341 341 func (d *indexData) readFileNameNgrams(toc *indexTOC) (map[ngram][]byte, error) {

Configure Feed

Configure Feed