fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2021 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "sort" 19) 20 21// shrinkUint32Slice copies slices with excess capacity to precisely sized ones 22// to avoid wasting memory. It should be used on slices with long static durations. 23func shrinkUint32Slice(a []uint32) []uint32 { 24 if cap(a)-len(a) < 32 { 25 return a 26 } 27 out := make([]uint32, len(a)) 28 copy(out, a) 29 return out 30} 31 32type topOffset struct { 33 top, off uint32 34} 35 36// arrayNgramOffset splits ngrams into two 32-bit parts and uses binary search 37// to satisfy requests. A three-level trie (over the runes of an ngram) uses 20% 38// more memory than this simple two-level split. 39type arrayNgramOffset struct { 40 // tops specify where the bottom halves of ngrams with the 32-bit top half begin. 41 // The offset of the next value is used to find where the bottom section ends. 42 tops []topOffset 43 44 // bots are bottom halves of an ngram, referenced by tops 45 bots []uint32 46 47 // offsets is values from simpleSection.off, simpleSection.sz is computed by subtracting 48 // adjacent offsets. 49 offsets []uint32 50} 51 52func makeArrayNgramOffset(ngrams []ngram, offsets []uint32) arrayNgramOffset { 53 arr := arrayNgramOffset{ 54 bots: make([]uint32, 0, len(ngrams)), 55 } 56 arr.offsets = shrinkUint32Slice(offsets) 57 58 lastTop := uint32(0xffffffff) 59 lastStart := uint32(0) 60 for i, v := range ngrams { 61 curTop := uint32(v >> 32) 62 if curTop != lastTop { 63 if lastTop != 0xffffffff { 64 arr.tops = append(arr.tops, topOffset{lastTop, lastStart}) 65 } 66 lastTop = curTop 67 lastStart = uint32(i) 68 } 69 arr.bots = append(arr.bots, uint32(v)) 70 } 71 // add a sentinel value to make it simple to compute sizes 72 arr.tops = append(arr.tops, topOffset{lastTop, lastStart}, topOffset{0xffffffff, uint32(len(arr.bots))}) 73 74 // shrink arr.tops to minimal size 75 tops := make([]topOffset, len(arr.tops)) 76 copy(tops, arr.tops) 77 arr.tops = tops 78 79 return arr 80} 81 82func (a *arrayNgramOffset) Get(gram ngram) simpleSection { 83 if a.tops == nil { 84 return simpleSection{} 85 } 86 87 top, bot := uint32(uint64(gram)>>32), uint32(gram) 88 89 topIdx := sort.Search(len(a.tops)-1, func(i int) bool { return a.tops[i].top >= top }) 90 if topIdx == len(a.tops)-1 || a.tops[topIdx].top != top { 91 return simpleSection{} 92 } 93 94 botsSec := a.bots[a.tops[topIdx].off:a.tops[topIdx+1].off] 95 botIdx := sort.Search(len(botsSec), func(i int) bool { return botsSec[i] >= bot }) 96 if botIdx == len(botsSec) || botsSec[botIdx] != bot { 97 return simpleSection{} 98 } 99 idx := botIdx + int(a.tops[topIdx].off) 100 return simpleSection{ 101 off: a.offsets[idx], 102 sz: a.offsets[idx+1] - a.offsets[idx], 103 } 104} 105 106func (a *arrayNgramOffset) DumpMap() map[ngram]simpleSection { 107 m := make(map[ngram]simpleSection, len(a.offsets)-1) 108 for i := 0; i < len(a.tops)-1; i++ { 109 top, botStart, botEnd := a.tops[i].top, a.tops[i].off, a.tops[i+1].off 110 botSec := a.bots[botStart:botEnd] 111 for j, bot := range botSec { 112 idx := int(botStart) + j 113 m[ngram(uint64(top)<<32|uint64(bot))] = simpleSection{ 114 off: a.offsets[idx], 115 sz: a.offsets[idx+1] - a.offsets[idx], 116 } 117 } 118 } 119 return m 120} 121 122func (a *arrayNgramOffset) SizeBytes() int { 123 return 8*len(a.tops) + 4*len(a.bots) + 4*len(a.offsets) 124} 125 126// combinedNgramOffset combines an ascii ngram mapping with a unicode ngram mapping, 127// falling back on unicode for unicode ngrams or ascii ngrams with excessive lengths. 128type combinedNgramOffset struct { 129 asc *asciiNgramOffset 130 uni *arrayNgramOffset 131} 132 133func makeCombinedNgramOffset(ngrams []ngram, offsets []uint32) combinedNgramOffset { 134 // split ngrams & offsets into ascii ngrams and unicode ngrams, 135 // since ascii ngrams can be represented much more compactly (21b instead of 63b) 136 137 // allocate these arrays based off of rough measurements of what their typical 138 // sizes are-- source code is mostly ASCII with a little bit of Unicode. 139 // Allocating 101% of the total number of ngrams gives a little space for the 140 // duplicate entries used to mark section ends. 141 ngramsAscii := make([]ngramAscii, 0, len(ngrams)*101/100) 142 offsetsAscii := make([]uint32, 0, len(ngrams)*101/100) 143 144 ngramsUnicode := make([]ngram, 0, len(ngrams)*11/100) 145 offsetsUnicode := make([]uint32, 0, len(ngrams)*11/100) 146 147 for i, ng := range ngrams { 148 if ng&ngramAsciiMask == ng { // is ngram ascii-only? 149 ngp := ngramAsciiToPacked(ng) 150 if i == len(ngrams)-1 || ngrams[i+1]&ngramAsciiMask != ngrams[i+1] { 151 // at the end of a section we insert an extra offset with the same ngram, 152 // so the size of the segment can be calculated properly 153 ngramsAscii = append(ngramsAscii, ngp, ngp) 154 offsetsAscii = append(offsetsAscii, offsets[i], offsets[i+1]) 155 } else { 156 ngramsAscii = append(ngramsAscii, ngp) 157 offsetsAscii = append(offsetsAscii, offsets[i]) 158 } 159 // note: len(offsets) == len(ngrams) + 1 160 if offsets[i+1]-offsets[i] >= ngramAsciiMaxSectionLength { 161 // max-length ascii sections can't be represented properly in the ascii mapping, 162 // and are duplicated in the normal unicode entries. 163 ngramsUnicode = append(ngramsUnicode, ng, ng) 164 offsetsUnicode = append(offsetsUnicode, offsets[i], offsets[i+1]) 165 } 166 } else { 167 if i == len(ngrams)-1 || ngrams[i+1]&ngramAsciiMask == ngrams[i+1] { 168 ngramsUnicode = append(ngramsUnicode, ng, ng) 169 offsetsUnicode = append(offsetsUnicode, offsets[i], offsets[i+1]) 170 } else { 171 ngramsUnicode = append(ngramsUnicode, ng) 172 offsetsUnicode = append(offsetsUnicode, offsets[i]) 173 } 174 } 175 } 176 177 // The last segment always has an extra trailing ngram entry that we don't need, and 178 // is only present for spacing and alignment. Trim it. 179 if len(ngramsAscii) > 0 { 180 ngramsAscii = ngramsAscii[:len(ngramsAscii)-1] 181 } 182 if len(ngramsUnicode) > 0 { 183 ngramsUnicode = ngramsUnicode[:len(ngramsUnicode)-1] 184 } 185 186 asc := makeAsciiNgramOffset(ngramsAscii, offsetsAscii) 187 uni := makeArrayNgramOffset(ngramsUnicode, offsetsUnicode) 188 189 return combinedNgramOffset{asc, &uni} 190} 191 192// Get returns a simpleSection with sz=0 if no entry, otherwise the appropriate 193// offset based on the underlying ASCII or Unicode offset index. 194func (a combinedNgramOffset) Get(gram ngram) simpleSection { 195 if a.asc == nil { 196 return simpleSection{} 197 } 198 199 var sec simpleSection 200 if gram&ngramAsciiMask == gram { 201 sec = a.asc.Get(gram) 202 if sec.sz == ngramAsciiMaxSectionLength { 203 // Fallback: this section's length was too long to store in the 204 // ASCII map, find it in the Unicode map. 205 sec = a.uni.Get(gram) 206 } 207 } else { 208 sec = a.uni.Get(gram) 209 } 210 211 return sec 212} 213 214func (a combinedNgramOffset) DumpMap() map[ngram]simpleSection { 215 m := a.asc.DumpMap() 216 for k, v := range a.uni.DumpMap() { 217 m[k] = v 218 } 219 return m 220} 221 222func (a combinedNgramOffset) SizeBytes() int { 223 return a.asc.SizeBytes() + a.uni.SizeBytes() 224} 225 226const ngramAsciiMask = 127 | 127<<21 | 127<<42 227 228// Ascii mapping packs 3*7b chars and 11 bits of lengths, with this as the set maximum. 229// We could save another ~3% of total RAM / 5% of combinedNgramOffset RAM by switching to 230// a 40b packing with 19-bit lengths, but the code would be significantly uglier so it doesn't 231// seem worth it. 232const ngramAsciiMaxSectionLength = (1 << 11) - 1 233 234type ngramAscii uint32 235 236func ngramAsciiToPacked(ng ngram) ngramAscii { 237 return ngramAscii(uint32(ng&127) | uint32((ng>>(21-7))&(127<<7)) | uint32((ng>>(42-14))&(127<<14))) 238} 239 240func ngramAsciiPackedToNgram(ng ngramAscii) ngram { 241 return ngram(ng&127) | ngram(ng&(127<<7))<<(21-7) | ngram(ng&(127<<14))<<(42-14) 242} 243 244// asciiNgramOffset stores ascii trigrams packed together with short lengths, 245// using offsets for a chunk of entries to limit the number of lengths that must 246// be summed to compute a section's offset. 247type asciiNgramOffset struct { 248 entries []uint32 // (chara << 25 | charb << 18 | charc << 11 | length) 249 chunkOffsets []uint32 // offset for entries[i*asciiNgramOffsetChunkLength] 250} 251 252// asciiNgramOffsetChunkLength specifies how many entries share one initial offset. 253// It must be a power of 2, and was chosen empirically by measuring RAM usage: 254// 8: 4132MB, 16: 4047MB, 32: 4006MB, 64: 3992MB, 128: 3990MB 255const asciiNgramOffsetChunkLength = 32 256 257func makeAsciiNgramOffset(ngrams []ngramAscii, offsets []uint32) *asciiNgramOffset { 258 ao := &asciiNgramOffset{ 259 entries: make([]uint32, 0, len(ngrams)), 260 chunkOffsets: make([]uint32, 0, len(ngrams)/asciiNgramOffsetChunkLength), 261 } 262 263 for i, ng := range ngrams { 264 if len(ao.entries)%asciiNgramOffsetChunkLength == 0 { 265 ao.chunkOffsets = append(ao.chunkOffsets, offsets[i]) 266 } 267 length := offsets[i+1] - offsets[i] 268 269 for { 270 if length < ngramAsciiMaxSectionLength { 271 ao.entries = append(ao.entries, uint32(ng)<<11|length) 272 break 273 } else { 274 // entries with lengths that are too long can't be represented fully in this 275 // map, but we repeatedly insert offsets to make the next entry's offset computable 276 // by summing the offsets in the preceding entries in the chunk, including 277 // this invalid one. 278 ao.entries = append(ao.entries, uint32(ng)<<11|ngramAsciiMaxSectionLength) 279 length -= ngramAsciiMaxSectionLength 280 if len(ao.entries)%asciiNgramOffsetChunkLength == 0 { 281 // We reached the end of the chunk, so there's no need to reach the 282 // offset for the next entry. 283 break 284 } 285 } 286 } 287 } 288 289 ao.entries = shrinkUint32Slice(ao.entries) 290 ao.chunkOffsets = shrinkUint32Slice(ao.chunkOffsets) 291 292 return ao 293} 294 295// Get returns a simpleSection with sz=0 if no entry, or sz=ngramAsciiMaxSectionLength 296// if the length of the ngram is too large for this type and it should cascade to the next entry. 297func (a *asciiNgramOffset) Get(gram ngram) simpleSection { 298 if gram&ngramAsciiMask != gram { 299 return simpleSection{} 300 } 301 g := uint32(ngramAsciiToPacked(gram) << 11) 302 303 idx := sort.Search(len(a.entries), func(i int) bool { 304 return a.entries[i] >= g 305 }) 306 307 if idx == len(a.entries) || a.entries[idx]>>11 != g>>11 { 308 return simpleSection{} 309 } 310 311 length := a.entries[idx] & ngramAsciiMaxSectionLength 312 if length == ngramAsciiMaxSectionLength { 313 // this ascii ngram's section length is too large to be represented; 314 // repeate the Get() on the unicode map to get the correct result. 315 return simpleSection{ 316 off: 0, 317 sz: ngramAsciiMaxSectionLength, 318 } 319 } 320 321 chunkNum := idx / asciiNgramOffsetChunkLength 322 chunkBase := chunkNum * asciiNgramOffsetChunkLength 323 offset := a.chunkOffsets[chunkNum] 324 for i := chunkBase; i < idx; i++ { 325 offset += a.entries[i] & ngramAsciiMaxSectionLength 326 } 327 328 return simpleSection{ 329 off: offset, 330 sz: length, 331 } 332} 333 334func (a *asciiNgramOffset) DumpMap() map[ngram]simpleSection { 335 m := make(map[ngram]simpleSection, len(a.entries)) 336 off := uint32(0) 337 for i, ent := range a.entries { 338 if i%asciiNgramOffsetChunkLength == 0 { 339 off = a.chunkOffsets[i/asciiNgramOffsetChunkLength] 340 } 341 length := ent & ngramAsciiMaxSectionLength 342 if length == ngramAsciiMaxSectionLength { 343 // This entry is an ascii gram with a section too long 344 // to be represented, so skip the entry. 345 continue 346 } 347 m[ngramAsciiPackedToNgram(ngramAscii(ent>>11))] = simpleSection{ 348 off: off, 349 sz: length, 350 } 351 off += length 352 } 353 return m 354} 355 356func (a *asciiNgramOffset) SizeBytes() int { 357 return 4*len(a.entries) + 4*len(a.chunkOffsets) 358} 359 360type ngramIndex interface { 361 Get(gram ngram) simpleSection 362 DumpMap() map[ngram]simpleSection 363 SizeBytes() int 364} 365 366// This is a temporary type to wrap two very different implementations of the 367// inverted index for the purpose of feature-flagging. We will remove this after 368// we enable the b-tree permanently. 369// 370// Alternatively we could have adapted/extended the interface "ngramIndex". 371// However, adapting the existing implementations and their tests to match the 372// access pattern of map[ngram][]byte seems more cumbersome than this makeshift 373// wrapper. In the end, both ngramIndex and this wrapper will be replaced by a 374// concrete type. 375type fileNameNgrams struct { 376 m map[ngram][]byte 377 bt btreeIndex 378} 379 380func (n fileNameNgrams) GetBlob(ng ngram) ([]byte, error) { 381 if n.m != nil { 382 return n.m[ng], nil 383 } 384 sec := n.bt.Get(ng) 385 return n.bt.file.Read(sec.off, sec.sz) 386} 387 388func (n fileNameNgrams) Frequency(ng ngram) uint32 { 389 if n.m != nil { 390 return uint32(len(n.m[ng])) 391 } 392 return n.bt.Get(ng).sz 393} 394 395func (n fileNameNgrams) SizeBytes() int { 396 if n.m != nil { 397 // these slices reference mmap-ed memory 398 return 12 * len(n.m) 399 } 400 return n.bt.SizeBytes() 401}