index/read.go at 6d2e296f2a289c3477c0d3f9f5806354c13626a1 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / read.go
at 6d2e296f2a289c3477c0d3f9f5806354c13626a1 21 kB View raw
Keegan Carruthers-Smith all: run modernize across codebase (#919) 1y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package index
 16
 17import (
 18	"encoding/binary"
 19	"encoding/json"
 20	"fmt"
 21	"hash/crc64"
 22	"log"
 23	"os"
 24	"slices"
 25	"sort"
 26
 27	"github.com/RoaringBitmap/roaring"
 28	"github.com/prometheus/client_golang/prometheus"
 29	"github.com/prometheus/client_golang/prometheus/promauto"
 30	"github.com/rs/xid"
 31
 32	"github.com/sourcegraph/zoekt"
 33)
 34
 35// IndexFile is a file suitable for concurrent read access. For performance
 36// reasons, it allows a mmap'd implementation.
 37type IndexFile interface {
 38	Read(off uint32, sz uint32) ([]byte, error)
 39	Size() (uint32, error)
 40	Close()
 41	Name() string
 42}
 43
 44// reader is a stateful file
 45type reader struct {
 46	r   IndexFile
 47	off uint32
 48}
 49
 50func (r *reader) seek(off uint32) {
 51	r.off = off
 52}
 53
 54func (r *reader) U32() (uint32, error) {
 55	b, err := r.r.Read(r.off, 4)
 56	r.off += 4
 57	if err != nil {
 58		return 0, err
 59	}
 60	return binary.BigEndian.Uint32(b), nil
 61}
 62
 63func (r *reader) U64() (uint64, error) {
 64	b, err := r.r.Read(r.off, 8)
 65	r.off += 8
 66	if err != nil {
 67		return 0, err
 68	}
 69	return binary.BigEndian.Uint64(b), nil
 70}
 71
 72func (r *reader) ReadByte() (byte, error) {
 73	b, err := r.r.Read(r.off, 1)
 74	r.off += 1
 75	if err != nil {
 76		return 0, err
 77	}
 78	return b[0], nil
 79}
 80
 81func (r *reader) Varint() (uint64, error) {
 82	v, err := binary.ReadUvarint(r)
 83	if err != nil {
 84		return 0, err
 85	}
 86	return v, nil
 87}
 88
 89func (r *reader) Str() (string, error) {
 90	slen, err := r.Varint()
 91	if err != nil {
 92		return "", err
 93	}
 94	b, err := r.r.Read(r.off, uint32(slen))
 95	if err != nil {
 96		return "", err
 97	}
 98	r.off += uint32(slen)
 99	return string(b), nil
100}
101
102func (r *reader) readTOC(toc *indexTOC) error {
103	return r.readTOCSections(toc, []string{})
104}
105
106// readTOCSections reads the table of contents of the index file.
107//
108// If the tags parameter is non-empty, it reads only those tagged sections for efficiency
109// and does not populate the other sections.
110func (r *reader) readTOCSections(toc *indexTOC, tags []string) error {
111	tocSection, sectionCount, err := r.readHeader()
112	if err != nil {
113		return err
114	}
115
116	if sectionCount == 0 {
117		// tagged sections are indicated by a 0 sectionCount,
118		// and then a list of string-tagged type-indicated sections.
119		secs := toc.sectionsTagged()
120		for r.off < tocSection.off+tocSection.sz {
121			tag, err := r.Str()
122			if err != nil {
123				return err
124			}
125			kind, err := r.Varint()
126			if err != nil {
127				return err
128			}
129
130			skipSection := len(tags) > 0 && !slices.Contains(tags, tag)
131			sec := secs[tag]
132			if sec == nil || sec.kind() != sectionKind(kind) {
133				// If we don't recognize the section, we may be reading a newer index than the current version. Use
134				// a "dummy section" struct to skip over it.
135				skipSection = true
136				log.Printf("encountered unrecognized index section (%s), skipping over it", tag)
137
138				switch sectionKind(kind) {
139				case sectionKindSimple:
140					sec = &simpleSection{}
141				case sectionKindCompound:
142					sec = &compoundSection{}
143				case sectionKindCompoundLazy:
144					sec = &lazyCompoundSection{}
145				default:
146					return fmt.Errorf("unknown section kind %d", kind)
147				}
148			}
149
150			if skipSection {
151				if err := sec.skip(r); err != nil {
152					return err
153				}
154			} else {
155				if err := sec.read(r); err != nil {
156					return err
157				}
158			}
159		}
160	} else {
161		// TODO: Remove this branch when ReaderMinFeatureVersion >= 10
162
163		secs := toc.sections()
164
165		if len(secs) != int(sectionCount) {
166			secs = toc.sectionsNext()
167		}
168
169		if len(secs) != int(sectionCount) {
170			return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs))
171		}
172
173		for _, s := range secs {
174			if err := s.read(r); err != nil {
175				return err
176			}
177		}
178	}
179	return nil
180}
181
182func (r *reader) readHeader() (simpleSection, uint32, error) {
183	sz, err := r.r.Size()
184	if err != nil {
185		return simpleSection{}, 0, err
186	}
187	r.off = sz - 8
188
189	var tocSection simpleSection
190	if err := tocSection.read(r); err != nil {
191		return simpleSection{}, 0, err
192	}
193
194	r.seek(tocSection.off)
195
196	sectionCount, err := r.U32()
197	if err != nil {
198		return simpleSection{}, 0, err
199	}
200	return tocSection, sectionCount, nil
201}
202
203func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) {
204	return r.file.Read(sec.off, sec.sz)
205}
206
207func readSectionU32(f IndexFile, sec simpleSection) ([]uint32, error) {
208	if sec.sz%4 != 0 {
209		return nil, fmt.Errorf("barf: section size %% 4 != 0: sz %d ", sec.sz)
210	}
211	blob, err := f.Read(sec.off, sec.sz)
212	if err != nil {
213		return nil, err
214	}
215	arr := make([]uint32, 0, len(blob)/4)
216	for len(blob) > 0 {
217		arr = append(arr, binary.BigEndian.Uint32(blob))
218		blob = blob[4:]
219	}
220	return arr, nil
221}
222
223func readSectionU64(f IndexFile, sec simpleSection) ([]uint64, error) {
224	if sec.sz%8 != 0 {
225		return nil, fmt.Errorf("barf: section size %% 8 != 0: sz %d ", sec.sz)
226	}
227	blob, err := f.Read(sec.off, sec.sz)
228	if err != nil {
229		return nil, err
230	}
231	arr := make([]uint64, 0, len(blob)/8)
232	for len(blob) > 0 {
233		arr = append(arr, binary.BigEndian.Uint64(blob))
234		blob = blob[8:]
235	}
236	return arr, nil
237}
238
239func (r *reader) readJSON(data any, sec simpleSection) error {
240	blob, err := r.r.Read(sec.off, sec.sz)
241	if err != nil {
242		return err
243	}
244
245	return json.Unmarshal(blob, data)
246}
247
248// canReadVersion returns checks if zoekt can read in md. If it can't a
249// non-nil error is returned.
250func canReadVersion(md *zoekt.IndexMetadata) bool {
251	// Backwards compatible with v16
252	return md.IndexFormatVersion == IndexFormatVersion || md.IndexFormatVersion == NextIndexFormatVersion
253}
254
// readIndexData builds an indexData from the sections described by toc,
// reading from the reader's underlying file.
//
// It returns an error when the shard's format version or feature versions are
// not supported, when any section cannot be read, or when the consistency
// check (verify) or the stats computation (calculateStats) fails.
func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
	d := indexData{
		file:        r.r,
		branchIDs:   []map[string]uint{},
		branchNames: []map[uint]string{},
	}

	// parseMetadata hands back md even when decoding the repository metadata
	// failed, so an unsupported format version is reported in preference to
	// whatever error that decoding produced.
	repos, md, err := r.parseMetadata(toc.metaData, toc.repoMetaData)
	if md != nil && !canReadVersion(md) {
		return nil, fmt.Errorf("file is v%d, want v%d", md.IndexFormatVersion, IndexFormatVersion)
	} else if err != nil {
		return nil, err
	}

	d.metaData = *md
	d.repoMetaData = make([]zoekt.Repository, 0, len(repos))
	for _, r := range repos {
		d.repoMetaData = append(d.repoMetaData, *r)
	}

	// The shard must be new enough for this reader ...
	if d.metaData.IndexFeatureVersion < ReadMinFeatureVersion {
		return nil, fmt.Errorf("file is feature version %d, want feature version >= %d", d.metaData.IndexFeatureVersion, ReadMinFeatureVersion)
	}

	// ... and this reader must be new enough for the shard.
	if d.metaData.IndexMinReaderVersion > FeatureVersion {
		return nil, fmt.Errorf("file needs read feature version >= %d, have read feature version %d", d.metaData.IndexMinReaderVersion, FeatureVersion)
	}

	// For the per-document compound sections only the start offset of the
	// data and the relative index are kept here; readContents, readNewlines
	// and readDocSections fetch the data itself on demand.
	d.boundariesStart = toc.fileContents.data.off
	d.boundaries = toc.fileContents.relativeIndex()
	d.newlinesStart = toc.newlines.data.off
	d.newlinesIndex = toc.newlines.relativeIndex()
	d.docSectionsStart = toc.fileSections.data.off
	d.docSectionsIndex = toc.fileSections.relativeIndex()

	d.symbols.symKindIndex = toc.symbolKindMap.relativeIndex()
	d.fileEndSymbol, err = readSectionU32(d.file, toc.fileEndSymbol)
	if err != nil {
		return nil, err
	}

	// Call readSectionBlob on each section key, and store the result in
	// the blob value.
	for sect, blob := range map[simpleSection]*[]byte{
		toc.symbolMap.index:    &d.symbols.symIndex,
		toc.symbolMap.data:     &d.symbols.symContent,
		toc.symbolKindMap.data: &d.symbols.symKindContent,
		toc.symbolMetaData:     &d.symbols.symMetaData,
	} {
		if *blob, err = d.readSectionBlob(sect); err != nil {
			return nil, err
		}
	}

	d.checksums, err = d.readSectionBlob(toc.contentChecksums)
	if err != nil {
		return nil, err
	}

	d.languages, err = d.readSectionBlob(toc.languages)
	if err != nil {
		return nil, err
	}

	d.categories, err = d.readSectionBlob(toc.categories)
	if err != nil {
		return nil, err
	}

	d.contentNgrams, err = d.newBtreeIndex(toc.ngramText, toc.postings)
	if err != nil {
		return nil, err
	}

	d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks)
	if err != nil {
		return nil, err
	}

	d.fileNameContent, err = d.readSectionBlob(toc.fileNames.data)
	if err != nil {
		return nil, err
	}

	d.fileNameIndex = toc.fileNames.relativeIndex()

	d.fileNameNgrams, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings)
	if err != nil {
		return nil, err
	}

	// Per repository, build the branch name <-> ID tables. The branch at
	// position j in md.Branches gets the single-bit ID 1<<j.
	for _, md := range d.repoMetaData {
		repoBranchIDs := make(map[string]uint, len(md.Branches))
		repoBranchNames := make(map[uint]string, len(md.Branches))
		for j, br := range md.Branches {
			id := uint(1) << uint(j)
			repoBranchIDs[br.Name] = id
			repoBranchNames[id] = br.Name
		}
		d.branchIDs = append(d.branchIDs, repoBranchIDs)
		d.branchNames = append(d.branchNames, repoBranchNames)
		d.rawConfigMasks = append(d.rawConfigMasks, encodeRawConfig(md.RawConfig))
	}

	blob, err := d.readSectionBlob(toc.runeDocSections)
	if err != nil {
		return nil, err
	}

	d.runeDocSections = unmarshalDocSections(blob, nil)

	var runeOffsets, fileNameRuneOffsets []uint32

	// Read each of these sections as a blob and decode it with
	// fromSizedDeltas into the destination slice.
	for sect, dest := range map[simpleSection]*[]uint32{
		toc.subRepos:        &d.subRepos,
		toc.runeOffsets:     &runeOffsets,
		toc.nameRuneOffsets: &fileNameRuneOffsets,
		toc.nameEndRunes:    &d.fileNameEndRunes,
		toc.fileEndRunes:    &d.fileEndRunes,
	} {
		if blob, err := d.readSectionBlob(sect); err != nil {
			return nil, err
		} else {
			*dest = fromSizedDeltas(blob, nil)
		}
	}

	d.runeOffsets = makeRuneOffsetMap(runeOffsets)
	d.fileNameRuneOffsets = makeRuneOffsetMap(fileNameRuneOffsets)

	// For each repository, collect the sorted sub-repository paths. The empty
	// path is always present, whether or not SubRepoMap contains it.
	d.subRepoPaths = make([][]string, 0, len(d.repoMetaData))
	for i := range d.repoMetaData {
		keys := make([]string, 0, len(d.repoMetaData[i].SubRepoMap)+1)
		keys = append(keys, "")
		for k := range d.repoMetaData[i].SubRepoMap {
			if k != "" {
				keys = append(keys, k)
			}
		}
		sort.Strings(keys)
		d.subRepoPaths = append(d.subRepoPaths, keys)
	}

	// Invert metaData.LanguageMap so the value stored there maps back to its
	// key.
	d.languageMap = map[uint16]string{}
	for k, v := range d.metaData.LanguageMap {
		d.languageMap[v] = k
	}

	if err := d.verify(); err != nil {
		return nil, err
	}

	// Only format version 17 and later store which repository each document
	// belongs to.
	if d.metaData.IndexFormatVersion >= 17 {
		blob, err := d.readSectionBlob(toc.repos)
		if err != nil {
			return nil, err
		}
		d.repos = fromSizedDeltas16(blob, nil)
	} else {
		// every document is for repo index 0 (default value of uint16)
		d.repos = make([]uint16, len(d.fileBranchMasks))
	}

	if err := d.calculateStats(); err != nil {
		return nil, err
	}

	return &d, nil
}
424
425func (r *reader) parseMetadata(metaData simpleSection, repoMetaData simpleSection) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) {
426	var md zoekt.IndexMetadata
427	if err := r.readJSON(&md, metaData); err != nil {
428		return nil, nil, err
429	}
430
431	// Sourcegraph specific: we support mutating metadata via an additional
432	// ".meta" file. This is to support tombstoning. An additional benefit is we
433	// can update metadata (such as Rank and Name) without re-indexing content.
434	blob, err := os.ReadFile(r.r.Name() + ".meta")
435	if err != nil && !os.IsNotExist(err) {
436		return nil, &md, fmt.Errorf("failed to read meta file: %w", err)
437	}
438
439	if len(blob) == 0 {
440		blob, err = r.r.Read(repoMetaData.off, repoMetaData.sz)
441		if err != nil {
442			return nil, &md, err
443		}
444	}
445
446	var repos []*zoekt.Repository
447	if md.IndexFormatVersion >= 17 {
448		if err := json.Unmarshal(blob, &repos); err != nil {
449			return nil, &md, err
450		}
451	} else {
452		repos = make([]*zoekt.Repository, 1)
453		if err := json.Unmarshal(blob, &repos[0]); err != nil {
454			return nil, &md, err
455		}
456	}
457
458	if md.ID == "" {
459		if len(repos) == 0 {
460			return nil, nil, fmt.Errorf("len(repos)=0. Cannot backfill ID")
461		}
462		md.ID = backfillID(repos[0].Name)
463	}
464
465	return repos, &md, nil
466}
467
468const ngramEncoding = 8
469
470func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSection) (btreeIndex, error) {
471	bi := btreeIndex{file: d.file}
472
473	textContent, err := d.readSectionBlob(ngramSec)
474	if err != nil {
475		return btreeIndex{}, err
476	}
477
478	// For 500k trigams we can expect approx 1000 leaf nodes (500k divided by
479	// half the bucketSize) and 20 nodes on level 2 (all but the rightmost
480	// inner nodes will have exactly v=50 children) plus a root node.
481	bt := newBtree(btreeOpts{bucketSize: btreeBucketSize, v: 50})
482	for i := 0; i < len(textContent); i += ngramEncoding {
483		ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding]))
484		bt.insert(ng)
485	}
486	bt.freeze()
487
488	bi.bt = bt
489
490	// hold on to simple sections (8 bytes each)
491	bi.ngramSec = ngramSec
492	bi.postingIndex = postings.index
493
494	return bi, nil
495}
496
497func (d *indexData) verify() error {
498	// This is not an exhaustive check: the postings can easily
499	// generate OOB acccesses, and are expensive to check, but this lets us rule out
500	// other sources of OOB access.
501	n := len(d.fileNameIndex)
502	if n == 0 {
503		return nil
504	}
505
506	n--
507	for what, got := range map[string]int{
508		"boundaries":        len(d.boundaries) - 1,
509		"branch masks":      len(d.fileBranchMasks),
510		"doc section index": len(d.docSectionsIndex) - 1,
511		"newlines index":    len(d.newlinesIndex) - 1,
512	} {
513		if got != n {
514			return fmt.Errorf("got %s %d, want %d", what, got, n)
515		}
516	}
517	return nil
518}
519
520func (d *indexData) readContents(i uint32) ([]byte, error) {
521	return d.readSectionBlob(simpleSection{
522		off: d.boundariesStart + d.boundaries[i],
523		sz:  d.boundaries[i+1] - d.boundaries[i],
524	})
525}
526
527func (d *indexData) readContentSlice(off uint32, sz uint32) ([]byte, error) {
528	// TODO(hanwen): cap result if it is at the end of the content
529	// section.
530	return d.readSectionBlob(simpleSection{
531		off: d.boundariesStart + off,
532		sz:  sz,
533	})
534}
535
536func (d *indexData) readNewlines(i uint32, buf []uint32) ([]uint32, uint32, error) {
537	sec := simpleSection{
538		off: d.newlinesStart + d.newlinesIndex[i],
539		sz:  d.newlinesIndex[i+1] - d.newlinesIndex[i],
540	}
541	blob, err := d.readSectionBlob(sec)
542	if err != nil {
543		return nil, 0, err
544	}
545
546	nl := fromSizedDeltas(blob, buf)
547
548	// can be nil if buf is nil and there are no doc sections. However, we rely
549	// on it being non-nil to cache the read.
550	if nl == nil {
551		nl = make([]uint32, 0)
552	}
553	return nl, sec.sz, nil
554}
555
556func (d *indexData) readDocSections(i uint32, buf []DocumentSection) ([]DocumentSection, uint32, error) {
557	sec := simpleSection{
558		off: d.docSectionsStart + d.docSectionsIndex[i],
559		sz:  d.docSectionsIndex[i+1] - d.docSectionsIndex[i],
560	}
561	blob, err := d.readSectionBlob(sec)
562	if err != nil {
563		return nil, 0, err
564	}
565
566	ds := unmarshalDocSections(blob, buf)
567
568	// can be nil if buf is nil and there are no doc sections. However, we rely
569	// on it being non-nil to cache the read.
570	if ds == nil {
571		ds = make([]DocumentSection, 0)
572	}
573
574	return ds, sec.sz, nil
575}
576
577// NewSearcher creates a Searcher for a single index file.  Search
578// results coming from this searcher are valid only for the lifetime
579// of the Searcher itself, ie. []byte members should be copied into
580// fresh buffers if the result is to survive closing the shard.
581func NewSearcher(r IndexFile) (zoekt.Searcher, error) {
582	rd := &reader{r: r}
583
584	var toc indexTOC
585	if err := rd.readTOC(&toc); err != nil {
586		return nil, err
587	}
588	indexData, err := rd.readIndexData(&toc)
589	if err != nil {
590		return nil, err
591	}
592	indexData.file = r
593	return indexData, nil
594}
595
596// ReadMetadata returns the metadata of index shard without reading
597// the index data. The IndexFile is not closed.
598func ReadMetadata(inf IndexFile) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) {
599	rd := &reader{r: inf}
600	var toc indexTOC
601	err := rd.readTOCSections(&toc, []string{"metaData", "repoMetaData"})
602	if err != nil {
603		return nil, nil, err
604	}
605	return rd.parseMetadata(toc.metaData, toc.repoMetaData)
606}
607
608// ReadMetadataPathAlive is like ReadMetadataPath except that it only returns
609// alive repositories.
610func ReadMetadataPathAlive(p string) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) {
611	repos, id, err := ReadMetadataPath(p)
612	if err != nil {
613		return nil, nil, err
614	}
615	alive := repos[:0]
616	for _, repo := range repos {
617		if !repo.Tombstone {
618			alive = append(alive, repo)
619		}
620	}
621	return alive, id, nil
622}
623
624// ReadMetadataPath returns the metadata of index shard at p without reading
625// the index data. ReadMetadataPath is a helper for ReadMetadata which opens
626// the IndexFile at p.
627func ReadMetadataPath(p string) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) {
628	f, err := os.Open(p)
629	if err != nil {
630		return nil, nil, err
631	}
632	defer f.Close()
633
634	iFile, err := NewIndexFile(f)
635	if err != nil {
636		return nil, nil, err
637	}
638	defer iFile.Close()
639
640	return ReadMetadata(iFile)
641}
642
// IndexFilePaths returns those paths belonging to the IndexFile at filepath p
// that exist on disk. The candidates are p itself and the ".meta" file for p,
// reported in that order.
//
// Note: when neither exists the result is an empty slice and a nil error. Any
// stat failure other than "does not exist" is returned as an error.
func IndexFilePaths(p string) ([]string, error) {
	candidates := [...]string{p, p + ".meta"}

	found := make([]string, 0, len(candidates))
	for _, candidate := range candidates {
		_, err := os.Stat(candidate)
		switch {
		case err == nil:
			found = append(found, candidate)
		case !os.IsNotExist(err):
			return nil, err
		}
	}
	return found, nil
}
660
661// maybeContainsRepo is a performance optimization mainly intended to be used by
662// containsRepo to avoid unmarshalling large metadata files for compound shards.
663// It is best-effort, so if it encounters any error returns true (ie indicating
664// you need to do more checks).
665func maybeContainsRepo(inf IndexFile, repoID uint32) bool {
666	rd := &reader{r: inf}
667	var toc indexTOC
668	err := rd.readTOCSections(&toc, []string{"reposIDsBitmap"})
669	if err != nil {
670		return true
671	}
672
673	// shard does not yet contain reposIDsBitmap so we can't tell if it contains
674	// repo.
675	if toc.reposIDsBitmap.sz == 0 {
676		return true
677	}
678
679	blob, err := inf.Read(toc.reposIDsBitmap.off, toc.reposIDsBitmap.sz)
680	if err != nil {
681		return true
682	}
683
684	var rb roaring.Bitmap
685	_, err = rb.FromUnsafeBytes(blob)
686	if err != nil {
687		return true
688	}
689
690	return rb.Contains(repoID)
691}
692
// metricCompoundShardLookups counts the calls to containsRepo, labelled by
// how each call ended: "error" when a step failed, "skipped" when the cheap
// maybeContainsRepo check ruled the shard out, and "full_lookup" when the
// shard's repository metadata was read.
var metricCompoundShardLookups = promauto.NewCounterVec(prometheus.CounterOpts{
	Name: "zoekt_compound_shard_lookups",
	Help: "Number of compound shard lookups and how much work was done.",
}, []string{"state"})
697
698// containsRepo returns true if the shard at path contains a repo with id. The
699// function returns false if the shard does not contain the repo or if it
700// encounters an error.
701func containsRepo(p string, id uint32) bool {
702	var err error
703	earlyReturn := false
704
705	defer func() {
706		if err != nil {
707			metricCompoundShardLookups.WithLabelValues("error").Inc()
708			return
709		}
710		if earlyReturn {
711			metricCompoundShardLookups.WithLabelValues("skipped").Inc()
712			return
713		}
714		metricCompoundShardLookups.WithLabelValues("full_lookup").Inc()
715	}()
716
717	f, err := os.Open(p)
718	if err != nil {
719		return false
720	}
721	defer f.Close()
722
723	inf, err := NewIndexFile(f)
724	if err != nil {
725		return false
726	}
727	defer inf.Close()
728
729	// PERF: Looping over repos can be relatively slow on instances with thousands
730	// of tiny repos in compound shards. This is a much faster check to see if we
731	// need to do more work.
732	//
733	// If we are still seeing performance issues, we should consider adding
734	// some sort of global oracle here to avoid filepath.Glob and checking
735	// each compound shard.
736	if !maybeContainsRepo(inf, id) {
737		earlyReturn = true
738		return false
739	}
740
741	repos, _, err := ReadMetadata(inf)
742	if err != nil {
743		return false
744	}
745	for _, repo := range repos {
746		if repo.Tombstone {
747			continue
748		}
749		if repo.ID == id {
750			return true
751		}
752	}
753
754	return false
755}
756
757func loadIndexData(r IndexFile) (*indexData, error) {
758	rd := &reader{r: r}
759
760	var toc indexTOC
761	if err := rd.readTOC(&toc); err != nil {
762		return nil, err
763	}
764	return rd.readIndexData(&toc)
765}
766
767// PrintNgramStats outputs a list of the form
768//
769//	n_1 trigram_1
770//	n_2 trigram_2
771//	...
772//
773// where n_i is the length of the postings list of trigram_i stored in r.
774func PrintNgramStats(r IndexFile) error {
775	id, err := loadIndexData(r)
776	if err != nil {
777		return err
778	}
779
780	var rNgram [3]rune
781	for ngram, ss := range id.contentNgrams.DumpMap() {
782		rNgram = ngramToRunes(ngram)
783		fmt.Printf("%d\t%q\n", ss.sz, string(rNgram[:]))
784	}
785	return nil
786}
787
788var crc64Table = crc64.MakeTable(crc64.ECMA)
789
790// backfillID returns a 20 char long sortable ID. The ID only depends on s. It
791// should only be used to set the ID of simple v16 shards on read.
792func backfillID(s string) string {
793	var id xid.ID
794
795	// Our timestamps are based on Unix time. Shards without IDs are assigned IDs
796	// based on the 0 epoch.
797	binary.BigEndian.PutUint32(id[:], 0)
798	binary.BigEndian.PutUint64(id[4:], crc64.Checksum([]byte(s), crc64Table))
799	return id.String()
800}
Configure Feed

Configure Feed