fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "encoding/binary"
19 "encoding/json"
20 "fmt"
21 "hash/crc64"
22 "log"
23 "os"
24 "slices"
25 "sort"
26
27 "github.com/RoaringBitmap/roaring"
28 "github.com/prometheus/client_golang/prometheus"
29 "github.com/prometheus/client_golang/prometheus/promauto"
30 "github.com/rs/xid"
31
32 "github.com/sourcegraph/zoekt"
33)
34
35// IndexFile is a file suitable for concurrent read access. For performance
36// reasons, it allows a mmap'd implementation.
37type IndexFile interface {
38 Read(off uint32, sz uint32) ([]byte, error)
39 Size() (uint32, error)
40 Close()
41 Name() string
42}
43
44// reader is a stateful file
45type reader struct {
46 r IndexFile
47 off uint32
48}
49
50func (r *reader) seek(off uint32) {
51 r.off = off
52}
53
54func (r *reader) U32() (uint32, error) {
55 b, err := r.r.Read(r.off, 4)
56 r.off += 4
57 if err != nil {
58 return 0, err
59 }
60 return binary.BigEndian.Uint32(b), nil
61}
62
63func (r *reader) U64() (uint64, error) {
64 b, err := r.r.Read(r.off, 8)
65 r.off += 8
66 if err != nil {
67 return 0, err
68 }
69 return binary.BigEndian.Uint64(b), nil
70}
71
72func (r *reader) ReadByte() (byte, error) {
73 b, err := r.r.Read(r.off, 1)
74 r.off += 1
75 if err != nil {
76 return 0, err
77 }
78 return b[0], nil
79}
80
81func (r *reader) Varint() (uint64, error) {
82 v, err := binary.ReadUvarint(r)
83 if err != nil {
84 return 0, err
85 }
86 return v, nil
87}
88
89func (r *reader) Str() (string, error) {
90 slen, err := r.Varint()
91 if err != nil {
92 return "", err
93 }
94 b, err := r.r.Read(r.off, uint32(slen))
95 if err != nil {
96 return "", err
97 }
98 r.off += uint32(slen)
99 return string(b), nil
100}
101
102func (r *reader) readTOC(toc *indexTOC) error {
103 return r.readTOCSections(toc, []string{})
104}
105
106// readTOCSections reads the table of contents of the index file.
107//
108// If the tags parameter is non-empty, it reads only those tagged sections for efficiency
109// and does not populate the other sections.
110func (r *reader) readTOCSections(toc *indexTOC, tags []string) error {
111 tocSection, sectionCount, err := r.readHeader()
112 if err != nil {
113 return err
114 }
115
116 if sectionCount == 0 {
117 // tagged sections are indicated by a 0 sectionCount,
118 // and then a list of string-tagged type-indicated sections.
119 secs := toc.sectionsTagged()
120 for r.off < tocSection.off+tocSection.sz {
121 tag, err := r.Str()
122 if err != nil {
123 return err
124 }
125 kind, err := r.Varint()
126 if err != nil {
127 return err
128 }
129
130 skipSection := len(tags) > 0 && !slices.Contains(tags, tag)
131 sec := secs[tag]
132 if sec == nil || sec.kind() != sectionKind(kind) {
133 // If we don't recognize the section, we may be reading a newer index than the current version. Use
134 // a "dummy section" struct to skip over it.
135 skipSection = true
136 log.Printf("encountered unrecognized index section (%s), skipping over it", tag)
137
138 switch sectionKind(kind) {
139 case sectionKindSimple:
140 sec = &simpleSection{}
141 case sectionKindCompound:
142 sec = &compoundSection{}
143 case sectionKindCompoundLazy:
144 sec = &lazyCompoundSection{}
145 default:
146 return fmt.Errorf("unknown section kind %d", kind)
147 }
148 }
149
150 if skipSection {
151 if err := sec.skip(r); err != nil {
152 return err
153 }
154 } else {
155 if err := sec.read(r); err != nil {
156 return err
157 }
158 }
159 }
160 } else {
161 // TODO: Remove this branch when ReaderMinFeatureVersion >= 10
162
163 secs := toc.sections()
164
165 if len(secs) != int(sectionCount) {
166 secs = toc.sectionsNext()
167 }
168
169 if len(secs) != int(sectionCount) {
170 return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs))
171 }
172
173 for _, s := range secs {
174 if err := s.read(r); err != nil {
175 return err
176 }
177 }
178 }
179 return nil
180}
181
182func (r *reader) readHeader() (simpleSection, uint32, error) {
183 sz, err := r.r.Size()
184 if err != nil {
185 return simpleSection{}, 0, err
186 }
187 r.off = sz - 8
188
189 var tocSection simpleSection
190 if err := tocSection.read(r); err != nil {
191 return simpleSection{}, 0, err
192 }
193
194 r.seek(tocSection.off)
195
196 sectionCount, err := r.U32()
197 if err != nil {
198 return simpleSection{}, 0, err
199 }
200 return tocSection, sectionCount, nil
201}
202
203func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) {
204 return r.file.Read(sec.off, sec.sz)
205}
206
207func readSectionU32(f IndexFile, sec simpleSection) ([]uint32, error) {
208 if sec.sz%4 != 0 {
209 return nil, fmt.Errorf("barf: section size %% 4 != 0: sz %d ", sec.sz)
210 }
211 blob, err := f.Read(sec.off, sec.sz)
212 if err != nil {
213 return nil, err
214 }
215 arr := make([]uint32, 0, len(blob)/4)
216 for len(blob) > 0 {
217 arr = append(arr, binary.BigEndian.Uint32(blob))
218 blob = blob[4:]
219 }
220 return arr, nil
221}
222
223func readSectionU64(f IndexFile, sec simpleSection) ([]uint64, error) {
224 if sec.sz%8 != 0 {
225 return nil, fmt.Errorf("barf: section size %% 8 != 0: sz %d ", sec.sz)
226 }
227 blob, err := f.Read(sec.off, sec.sz)
228 if err != nil {
229 return nil, err
230 }
231 arr := make([]uint64, 0, len(blob)/8)
232 for len(blob) > 0 {
233 arr = append(arr, binary.BigEndian.Uint64(blob))
234 blob = blob[8:]
235 }
236 return arr, nil
237}
238
239func (r *reader) readJSON(data any, sec simpleSection) error {
240 blob, err := r.r.Read(sec.off, sec.sz)
241 if err != nil {
242 return err
243 }
244
245 return json.Unmarshal(blob, data)
246}
247
248// canReadVersion returns checks if zoekt can read in md. If it can't a
249// non-nil error is returned.
250func canReadVersion(md *zoekt.IndexMetadata) bool {
251 // Backwards compatible with v16
252 return md.IndexFormatVersion == IndexFormatVersion || md.IndexFormatVersion == NextIndexFormatVersion
253}
254
255func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
256 d := indexData{
257 file: r.r,
258 branchIDs: []map[string]uint{},
259 branchNames: []map[uint]string{},
260
261 // docMatchTreeCache is disabled by default.
262 // The number of max entries can be set with environment variable ZOEKT_DOCMATCHTREE_CACHE
263 docMatchTreeCache: newDocMatchTreeCache(0),
264 }
265
266 repos, md, err := r.parseMetadata(toc.metaData, toc.repoMetaData)
267 if md != nil && !canReadVersion(md) {
268 return nil, fmt.Errorf("file is v%d, want v%d", md.IndexFormatVersion, IndexFormatVersion)
269 } else if err != nil {
270 return nil, err
271 }
272
273 d.metaData = *md
274 d.repoMetaData = make([]zoekt.Repository, 0, len(repos))
275 for _, r := range repos {
276 d.repoMetaData = append(d.repoMetaData, *r)
277 }
278
279 if d.metaData.IndexFeatureVersion < ReadMinFeatureVersion {
280 return nil, fmt.Errorf("file is feature version %d, want feature version >= %d", d.metaData.IndexFeatureVersion, ReadMinFeatureVersion)
281 }
282
283 if d.metaData.IndexMinReaderVersion > FeatureVersion {
284 return nil, fmt.Errorf("file needs read feature version >= %d, have read feature version %d", d.metaData.IndexMinReaderVersion, FeatureVersion)
285 }
286
287 d.boundariesStart = toc.fileContents.data.off
288 d.boundaries = toc.fileContents.relativeIndex()
289 d.newlinesStart = toc.newlines.data.off
290 d.newlinesIndex = toc.newlines.relativeIndex()
291 d.docSectionsStart = toc.fileSections.data.off
292 d.docSectionsIndex = toc.fileSections.relativeIndex()
293
294 d.symbols.symKindIndex = toc.symbolKindMap.relativeIndex()
295 d.fileEndSymbol, err = readSectionU32(d.file, toc.fileEndSymbol)
296 if err != nil {
297 return nil, err
298 }
299
300 // Call readSectionBlob on each section key, and store the result in
301 // the blob value.
302 for sect, blob := range map[simpleSection]*[]byte{
303 toc.symbolMap.index: &d.symbols.symIndex,
304 toc.symbolMap.data: &d.symbols.symContent,
305 toc.symbolKindMap.data: &d.symbols.symKindContent,
306 toc.symbolMetaData: &d.symbols.symMetaData,
307 } {
308 if *blob, err = d.readSectionBlob(sect); err != nil {
309 return nil, err
310 }
311 }
312
313 d.checksums, err = d.readSectionBlob(toc.contentChecksums)
314 if err != nil {
315 return nil, err
316 }
317
318 d.languages, err = d.readSectionBlob(toc.languages)
319 if err != nil {
320 return nil, err
321 }
322
323 d.categories, err = d.readSectionBlob(toc.categories)
324 if err != nil {
325 return nil, err
326 }
327
328 d.contentNgrams, err = d.newBtreeIndex(toc.ngramText, toc.postings)
329 if err != nil {
330 return nil, err
331 }
332
333 d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks)
334 if err != nil {
335 return nil, err
336 }
337
338 d.fileNameContent, err = d.readSectionBlob(toc.fileNames.data)
339 if err != nil {
340 return nil, err
341 }
342
343 d.fileNameIndex = toc.fileNames.relativeIndex()
344
345 d.fileNameNgrams, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings)
346 if err != nil {
347 return nil, err
348 }
349
350 for _, md := range d.repoMetaData {
351 repoBranchIDs := make(map[string]uint, len(md.Branches))
352 repoBranchNames := make(map[uint]string, len(md.Branches))
353 for j, br := range md.Branches {
354 id := uint(1) << uint(j)
355 repoBranchIDs[br.Name] = id
356 repoBranchNames[id] = br.Name
357 }
358 d.branchIDs = append(d.branchIDs, repoBranchIDs)
359 d.branchNames = append(d.branchNames, repoBranchNames)
360 d.rawConfigMasks = append(d.rawConfigMasks, encodeRawConfig(md.RawConfig))
361 }
362
363 blob, err := d.readSectionBlob(toc.runeDocSections)
364 if err != nil {
365 return nil, err
366 }
367
368 d.runeDocSections = unmarshalDocSections(blob, nil)
369
370 var runeOffsets, fileNameRuneOffsets []uint32
371
372 for sect, dest := range map[simpleSection]*[]uint32{
373 toc.subRepos: &d.subRepos,
374 toc.runeOffsets: &runeOffsets,
375 toc.nameRuneOffsets: &fileNameRuneOffsets,
376 toc.nameEndRunes: &d.fileNameEndRunes,
377 toc.fileEndRunes: &d.fileEndRunes,
378 } {
379 if blob, err := d.readSectionBlob(sect); err != nil {
380 return nil, err
381 } else {
382 *dest = fromSizedDeltas(blob, nil)
383 }
384 }
385
386 d.runeOffsets = makeRuneOffsetMap(runeOffsets)
387 d.fileNameRuneOffsets = makeRuneOffsetMap(fileNameRuneOffsets)
388
389 d.subRepoPaths = make([][]string, 0, len(d.repoMetaData))
390 for i := range d.repoMetaData {
391 keys := make([]string, 0, len(d.repoMetaData[i].SubRepoMap)+1)
392 keys = append(keys, "")
393 for k := range d.repoMetaData[i].SubRepoMap {
394 if k != "" {
395 keys = append(keys, k)
396 }
397 }
398 sort.Strings(keys)
399 d.subRepoPaths = append(d.subRepoPaths, keys)
400 }
401
402 d.languageMap = map[uint16]string{}
403 for k, v := range d.metaData.LanguageMap {
404 d.languageMap[v] = k
405 }
406
407 if err := d.verify(); err != nil {
408 return nil, err
409 }
410
411 if d.metaData.IndexFormatVersion >= 17 {
412 blob, err := d.readSectionBlob(toc.repos)
413 if err != nil {
414 return nil, err
415 }
416 d.repos = fromSizedDeltas16(blob, nil)
417 } else {
418 // every document is for repo index 0 (default value of uint16)
419 d.repos = make([]uint16, len(d.fileBranchMasks))
420 }
421
422 if err := d.calculateStats(); err != nil {
423 return nil, err
424 }
425
426 return &d, nil
427}
428
429func (r *reader) parseMetadata(metaData simpleSection, repoMetaData simpleSection) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) {
430 var md zoekt.IndexMetadata
431 if err := r.readJSON(&md, metaData); err != nil {
432 return nil, nil, err
433 }
434
435 // Sourcegraph specific: we support mutating metadata via an additional
436 // ".meta" file. This is to support tombstoning. An additional benefit is we
437 // can update metadata (such as Rank and Name) without re-indexing content.
438 blob, err := os.ReadFile(r.r.Name() + ".meta")
439 if err != nil && !os.IsNotExist(err) {
440 return nil, &md, fmt.Errorf("failed to read meta file: %w", err)
441 }
442
443 if len(blob) == 0 {
444 blob, err = r.r.Read(repoMetaData.off, repoMetaData.sz)
445 if err != nil {
446 return nil, &md, err
447 }
448 }
449
450 var repos []*zoekt.Repository
451 if md.IndexFormatVersion >= 17 {
452 if err := json.Unmarshal(blob, &repos); err != nil {
453 return nil, &md, err
454 }
455 } else {
456 repos = make([]*zoekt.Repository, 1)
457 if err := json.Unmarshal(blob, &repos[0]); err != nil {
458 return nil, &md, err
459 }
460 }
461
462 if md.ID == "" {
463 if len(repos) == 0 {
464 return nil, nil, fmt.Errorf("len(repos)=0. Cannot backfill ID")
465 }
466 md.ID = backfillID(repos[0].Name)
467 }
468
469 return repos, &md, nil
470}
471
472const ngramEncoding = 8
473
474func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSection) (btreeIndex, error) {
475 bi := btreeIndex{file: d.file}
476
477 textContent, err := d.readSectionBlob(ngramSec)
478 if err != nil {
479 return btreeIndex{}, err
480 }
481
482 // For 500k trigams we can expect approx 1000 leaf nodes (500k divided by
483 // half the bucketSize) and 20 nodes on level 2 (all but the rightmost
484 // inner nodes will have exactly v=50 children) plus a root node.
485 bt := newBtree(btreeOpts{bucketSize: btreeBucketSize, v: 50})
486 for i := 0; i < len(textContent); i += ngramEncoding {
487 ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding]))
488 bt.insert(ng)
489 }
490 bt.freeze()
491
492 bi.bt = bt
493
494 // hold on to simple sections (8 bytes each)
495 bi.ngramSec = ngramSec
496 bi.postingIndex = postings.index
497
498 return bi, nil
499}
500
501func (d *indexData) verify() error {
502 // This is not an exhaustive check: the postings can easily
503 // generate OOB acccesses, and are expensive to check, but this lets us rule out
504 // other sources of OOB access.
505 n := len(d.fileNameIndex)
506 if n == 0 {
507 return nil
508 }
509
510 n--
511 for what, got := range map[string]int{
512 "boundaries": len(d.boundaries) - 1,
513 "branch masks": len(d.fileBranchMasks),
514 "doc section index": len(d.docSectionsIndex) - 1,
515 "newlines index": len(d.newlinesIndex) - 1,
516 } {
517 if got != n {
518 return fmt.Errorf("got %s %d, want %d", what, got, n)
519 }
520 }
521 return nil
522}
523
524func (d *indexData) readContents(i uint32) ([]byte, error) {
525 return d.readSectionBlob(simpleSection{
526 off: d.boundariesStart + d.boundaries[i],
527 sz: d.boundaries[i+1] - d.boundaries[i],
528 })
529}
530
531func (d *indexData) readContentSlice(off uint32, sz uint32) ([]byte, error) {
532 // TODO(hanwen): cap result if it is at the end of the content
533 // section.
534 return d.readSectionBlob(simpleSection{
535 off: d.boundariesStart + off,
536 sz: sz,
537 })
538}
539
540func (d *indexData) readNewlines(i uint32, buf []uint32) ([]uint32, uint32, error) {
541 sec := simpleSection{
542 off: d.newlinesStart + d.newlinesIndex[i],
543 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i],
544 }
545 blob, err := d.readSectionBlob(sec)
546 if err != nil {
547 return nil, 0, err
548 }
549
550 nl := fromSizedDeltas(blob, buf)
551
552 // can be nil if buf is nil and there are no doc sections. However, we rely
553 // on it being non-nil to cache the read.
554 if nl == nil {
555 nl = make([]uint32, 0)
556 }
557 return nl, sec.sz, nil
558}
559
560func (d *indexData) readDocSections(i uint32, buf []DocumentSection) ([]DocumentSection, uint32, error) {
561 sec := simpleSection{
562 off: d.docSectionsStart + d.docSectionsIndex[i],
563 sz: d.docSectionsIndex[i+1] - d.docSectionsIndex[i],
564 }
565 blob, err := d.readSectionBlob(sec)
566 if err != nil {
567 return nil, 0, err
568 }
569
570 ds := unmarshalDocSections(blob, buf)
571
572 // can be nil if buf is nil and there are no doc sections. However, we rely
573 // on it being non-nil to cache the read.
574 if ds == nil {
575 ds = make([]DocumentSection, 0)
576 }
577
578 return ds, sec.sz, nil
579}
580
581// NewSearcher creates a Searcher for a single index file. Search
582// results coming from this searcher are valid only for the lifetime
583// of the Searcher itself, ie. []byte members should be copied into
584// fresh buffers if the result is to survive closing the shard.
585func NewSearcher(r IndexFile) (zoekt.Searcher, error) {
586 rd := &reader{r: r}
587
588 var toc indexTOC
589 if err := rd.readTOC(&toc); err != nil {
590 return nil, err
591 }
592 indexData, err := rd.readIndexData(&toc)
593 if err != nil {
594 return nil, err
595 }
596 indexData.file = r
597 return indexData, nil
598}
599
600// ReadMetadata returns the metadata of index shard without reading
601// the index data. The IndexFile is not closed.
602func ReadMetadata(inf IndexFile) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) {
603 rd := &reader{r: inf}
604 var toc indexTOC
605 err := rd.readTOCSections(&toc, []string{"metaData", "repoMetaData"})
606 if err != nil {
607 return nil, nil, err
608 }
609 return rd.parseMetadata(toc.metaData, toc.repoMetaData)
610}
611
612// ReadMetadataPathAlive is like ReadMetadataPath except that it only returns
613// alive repositories.
614func ReadMetadataPathAlive(p string) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) {
615 repos, id, err := ReadMetadataPath(p)
616 if err != nil {
617 return nil, nil, err
618 }
619 alive := repos[:0]
620 for _, repo := range repos {
621 if !repo.Tombstone {
622 alive = append(alive, repo)
623 }
624 }
625 return alive, id, nil
626}
627
628// ReadMetadataPath returns the metadata of index shard at p without reading
629// the index data. ReadMetadataPath is a helper for ReadMetadata which opens
630// the IndexFile at p.
631func ReadMetadataPath(p string) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) {
632 f, err := os.Open(p)
633 if err != nil {
634 return nil, nil, err
635 }
636 defer f.Close()
637
638 iFile, err := NewIndexFile(f)
639 if err != nil {
640 return nil, nil, err
641 }
642 defer iFile.Close()
643
644 return ReadMetadata(iFile)
645}
646
647// IndexFilePaths returns all paths for the IndexFile at filepath p that
648// exist. Note: if no files exist this will return an empty slice and nil
649// error.
650//
651// This is p and the ".meta" file for p.
652func IndexFilePaths(p string) ([]string, error) {
653 paths := []string{p, p + ".meta"}
654 exist := paths[:0]
655 for _, p := range paths {
656 if _, err := os.Stat(p); err == nil {
657 exist = append(exist, p)
658 } else if !os.IsNotExist(err) {
659 return nil, err
660 }
661 }
662 return exist, nil
663}
664
665// maybeContainsRepo is a performance optimization mainly intended to be used by
666// containsRepo to avoid unmarshalling large metadata files for compound shards.
667// It is best-effort, so if it encounters any error returns true (ie indicating
668// you need to do more checks).
669func maybeContainsRepo(inf IndexFile, repoID uint32) bool {
670 rd := &reader{r: inf}
671 var toc indexTOC
672 err := rd.readTOCSections(&toc, []string{"reposIDsBitmap"})
673 if err != nil {
674 return true
675 }
676
677 // shard does not yet contain reposIDsBitmap so we can't tell if it contains
678 // repo.
679 if toc.reposIDsBitmap.sz == 0 {
680 return true
681 }
682
683 blob, err := inf.Read(toc.reposIDsBitmap.off, toc.reposIDsBitmap.sz)
684 if err != nil {
685 return true
686 }
687
688 var rb roaring.Bitmap
689 _, err = rb.FromUnsafeBytes(blob)
690 if err != nil {
691 return true
692 }
693
694 return rb.Contains(repoID)
695}
696
697var metricCompoundShardLookups = promauto.NewCounterVec(prometheus.CounterOpts{
698 Name: "zoekt_compound_shard_lookups",
699 Help: "Number of compound shard lookups and how much work was done.",
700}, []string{"state"})
701
702// containsRepo returns true if the shard at path contains a repo with id. The
703// function returns false if the shard does not contain the repo or if it
704// encounters an error.
705func containsRepo(p string, id uint32) bool {
706 var err error
707 earlyReturn := false
708
709 defer func() {
710 if err != nil {
711 metricCompoundShardLookups.WithLabelValues("error").Inc()
712 return
713 }
714 if earlyReturn {
715 metricCompoundShardLookups.WithLabelValues("skipped").Inc()
716 return
717 }
718 metricCompoundShardLookups.WithLabelValues("full_lookup").Inc()
719 }()
720
721 f, err := os.Open(p)
722 if err != nil {
723 return false
724 }
725 defer f.Close()
726
727 inf, err := NewIndexFile(f)
728 if err != nil {
729 return false
730 }
731 defer inf.Close()
732
733 // PERF: Looping over repos can be relatively slow on instances with thousands
734 // of tiny repos in compound shards. This is a much faster check to see if we
735 // need to do more work.
736 //
737 // If we are still seeing performance issues, we should consider adding
738 // some sort of global oracle here to avoid filepath.Glob and checking
739 // each compound shard.
740 if !maybeContainsRepo(inf, id) {
741 earlyReturn = true
742 return false
743 }
744
745 repos, _, err := ReadMetadata(inf)
746 if err != nil {
747 return false
748 }
749 for _, repo := range repos {
750 if repo.Tombstone {
751 continue
752 }
753 if repo.ID == id {
754 return true
755 }
756 }
757
758 return false
759}
760
761func loadIndexData(r IndexFile) (*indexData, error) {
762 rd := &reader{r: r}
763
764 var toc indexTOC
765 if err := rd.readTOC(&toc); err != nil {
766 return nil, err
767 }
768 return rd.readIndexData(&toc)
769}
770
771// PrintNgramStats outputs a list of the form
772//
773// n_1 trigram_1
774// n_2 trigram_2
775// ...
776//
777// where n_i is the length of the postings list of trigram_i stored in r.
778func PrintNgramStats(r IndexFile) error {
779 id, err := loadIndexData(r)
780 if err != nil {
781 return err
782 }
783
784 var rNgram [3]rune
785 for ngram, ss := range id.contentNgrams.DumpMap() {
786 rNgram = ngramToRunes(ngram)
787 fmt.Printf("%d\t%q\n", ss.sz, string(rNgram[:]))
788 }
789 return nil
790}
791
792var crc64Table = crc64.MakeTable(crc64.ECMA)
793
794// backfillID returns a 20 char long sortable ID. The ID only depends on s. It
795// should only be used to set the ID of simple v16 shards on read.
796func backfillID(s string) string {
797 var id xid.ID
798
799 // Our timestamps are based on Unix time. Shards without IDs are assigned IDs
800 // based on the 0 epoch.
801 binary.BigEndian.PutUint32(id[:], 0)
802 binary.BigEndian.PutUint64(id[4:], crc64.Checksum([]byte(s), crc64Table))
803 return id.String()
804}