read.go at a8d7c8b2978fccce99acef24d893297007a01f8f · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / read.go
at a8d7c8b2978fccce99acef24d893297007a01f8f 18 kB View raw
Julie Tibshirani Skip other sections when reading metadata (#826) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"encoding/binary"
 19	"encoding/json"
 20	"fmt"
 21	"hash/crc64"
 22	"log"
 23	"os"
 24	"slices"
 25	"sort"
 26
 27	"github.com/rs/xid"
 28)
 29
 30// IndexFile is a file suitable for concurrent read access. For performance
 31// reasons, it allows a mmap'd implementation.
 32type IndexFile interface {
 33	Read(off uint32, sz uint32) ([]byte, error)
 34	Size() (uint32, error)
 35	Close()
 36	Name() string
 37}
 38
 39// reader is a stateful file
 40type reader struct {
 41	r   IndexFile
 42	off uint32
 43}
 44
 45func (r *reader) seek(off uint32) {
 46	r.off = off
 47}
 48
 49func (r *reader) U32() (uint32, error) {
 50	b, err := r.r.Read(r.off, 4)
 51	r.off += 4
 52	if err != nil {
 53		return 0, err
 54	}
 55	return binary.BigEndian.Uint32(b), nil
 56}
 57
 58func (r *reader) U64() (uint64, error) {
 59	b, err := r.r.Read(r.off, 8)
 60	r.off += 8
 61	if err != nil {
 62		return 0, err
 63	}
 64	return binary.BigEndian.Uint64(b), nil
 65}
 66
 67func (r *reader) ReadByte() (byte, error) {
 68	b, err := r.r.Read(r.off, 1)
 69	r.off += 1
 70	if err != nil {
 71		return 0, err
 72	}
 73	return b[0], nil
 74}
 75
 76func (r *reader) Varint() (uint64, error) {
 77	v, err := binary.ReadUvarint(r)
 78	if err != nil {
 79		return 0, err
 80	}
 81	return v, nil
 82}
 83
 84func (r *reader) Str() (string, error) {
 85	slen, err := r.Varint()
 86	if err != nil {
 87		return "", err
 88	}
 89	b, err := r.r.Read(r.off, uint32(slen))
 90	if err != nil {
 91		return "", err
 92	}
 93	r.off += uint32(slen)
 94	return string(b), nil
 95}
 96
 97func (r *reader) readTOC(toc *indexTOC) error {
 98	return r.readTOCSections(toc, []string{})
 99}
100
101// readTOCSections reads the table of contents of the index file.
102//
103// If the tags parameter is non-empty, it reads only those tagged sections for efficiency
104// and does not populate the other sections.
105func (r *reader) readTOCSections(toc *indexTOC, tags []string) error {
106	tocSection, sectionCount, err := r.readHeader()
107	if err != nil {
108		return err
109	}
110
111	if sectionCount == 0 {
112		// tagged sections are indicated by a 0 sectionCount,
113		// and then a list of string-tagged type-indicated sections.
114		secs := toc.sectionsTagged()
115		for r.off < tocSection.off+tocSection.sz {
116			tag, err := r.Str()
117			if err != nil {
118				return err
119			}
120			kind, err := r.Varint()
121			if err != nil {
122				return err
123			}
124
125			skipSection := len(tags) > 0 && !slices.Contains(tags, tag)
126			sec := secs[tag]
127			if sec == nil || sec.kind() != sectionKind(kind) {
128				// If we don't recognize the section, we may be reading a newer index than the current version. Use
129				// a "dummy section" struct to skip over it.
130				skipSection = true
131				log.Printf("encountered malformed index section (%s), skipping over it", tag)
132
133				switch sectionKind(kind) {
134				case sectionKindSimple:
135					sec = &simpleSection{}
136				case sectionKindCompound:
137					sec = &lazyCompoundSection{}
138				case sectionKindCompoundLazy:
139					sec = &lazyCompoundSection{}
140				default:
141					return fmt.Errorf("unknown section kind %d", kind)
142				}
143			}
144
145			if skipSection {
146				if err := sec.skip(r); err != nil {
147					return err
148				}
149			} else {
150				if err := sec.read(r); err != nil {
151					return err
152				}
153			}
154		}
155	} else {
156		// TODO: Remove this branch when ReaderMinFeatureVersion >= 10
157
158		secs := toc.sections()
159
160		if len(secs) != int(sectionCount) {
161			secs = toc.sectionsNext()
162		}
163
164		if len(secs) != int(sectionCount) {
165			return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs))
166		}
167
168		for _, s := range secs {
169			if err := s.read(r); err != nil {
170				return err
171			}
172		}
173	}
174	return nil
175}
176
177func (r *reader) readHeader() (simpleSection, uint32, error) {
178	sz, err := r.r.Size()
179	if err != nil {
180		return simpleSection{}, 0, err
181	}
182	r.off = sz - 8
183
184	var tocSection simpleSection
185	if err := tocSection.read(r); err != nil {
186		return simpleSection{}, 0, err
187	}
188
189	r.seek(tocSection.off)
190
191	sectionCount, err := r.U32()
192	if err != nil {
193		return simpleSection{}, 0, err
194	}
195	return tocSection, sectionCount, nil
196}
197
198func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) {
199	return r.file.Read(sec.off, sec.sz)
200}
201
202func readSectionU32(f IndexFile, sec simpleSection) ([]uint32, error) {
203	if sec.sz%4 != 0 {
204		return nil, fmt.Errorf("barf: section size %% 4 != 0: sz %d ", sec.sz)
205	}
206	blob, err := f.Read(sec.off, sec.sz)
207	if err != nil {
208		return nil, err
209	}
210	arr := make([]uint32, 0, len(blob)/4)
211	for len(blob) > 0 {
212		arr = append(arr, binary.BigEndian.Uint32(blob))
213		blob = blob[4:]
214	}
215	return arr, nil
216}
217
218func readSectionU64(f IndexFile, sec simpleSection) ([]uint64, error) {
219	if sec.sz%8 != 0 {
220		return nil, fmt.Errorf("barf: section size %% 8 != 0: sz %d ", sec.sz)
221	}
222	blob, err := f.Read(sec.off, sec.sz)
223	if err != nil {
224		return nil, err
225	}
226	arr := make([]uint64, 0, len(blob)/8)
227	for len(blob) > 0 {
228		arr = append(arr, binary.BigEndian.Uint64(blob))
229		blob = blob[8:]
230	}
231	return arr, nil
232}
233
234func (r *reader) readJSON(data interface{}, sec simpleSection) error {
235	blob, err := r.r.Read(sec.off, sec.sz)
236	if err != nil {
237		return err
238	}
239
240	return json.Unmarshal(blob, data)
241}
242
243// canReadVersion returns checks if zoekt can read in md. If it can't a
244// non-nil error is returned.
245func canReadVersion(md *IndexMetadata) bool {
246	// Backwards compatible with v16
247	return md.IndexFormatVersion == IndexFormatVersion || md.IndexFormatVersion == NextIndexFormatVersion
248}
249
250func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
251	d := indexData{
252		file:        r.r,
253		branchIDs:   []map[string]uint{},
254		branchNames: []map[uint]string{},
255	}
256
257	repos, md, err := r.parseMetadata(toc.metaData, toc.repoMetaData)
258	if md != nil && !canReadVersion(md) {
259		return nil, fmt.Errorf("file is v%d, want v%d", md.IndexFormatVersion, IndexFormatVersion)
260	} else if err != nil {
261		return nil, err
262	}
263
264	d.metaData = *md
265	d.repoMetaData = make([]Repository, 0, len(repos))
266	for _, r := range repos {
267		d.repoMetaData = append(d.repoMetaData, *r)
268	}
269
270	if d.metaData.IndexFeatureVersion < ReadMinFeatureVersion {
271		return nil, fmt.Errorf("file is feature version %d, want feature version >= %d", d.metaData.IndexFeatureVersion, ReadMinFeatureVersion)
272	}
273
274	if d.metaData.IndexMinReaderVersion > FeatureVersion {
275		return nil, fmt.Errorf("file needs read feature version >= %d, have read feature version %d", d.metaData.IndexMinReaderVersion, FeatureVersion)
276	}
277
278	d.boundariesStart = toc.fileContents.data.off
279	d.boundaries = toc.fileContents.relativeIndex()
280	d.newlinesStart = toc.newlines.data.off
281	d.newlinesIndex = toc.newlines.relativeIndex()
282	d.docSectionsStart = toc.fileSections.data.off
283	d.docSectionsIndex = toc.fileSections.relativeIndex()
284
285	d.symbols.symKindIndex = toc.symbolKindMap.relativeIndex()
286	d.fileEndSymbol, err = readSectionU32(d.file, toc.fileEndSymbol)
287	if err != nil {
288		return nil, err
289	}
290
291	// Call readSectionBlob on each section key, and store the result in
292	// the blob value.
293	for sect, blob := range map[simpleSection]*[]byte{
294		toc.symbolMap.index:    &d.symbols.symIndex,
295		toc.symbolMap.data:     &d.symbols.symContent,
296		toc.symbolKindMap.data: &d.symbols.symKindContent,
297		toc.symbolMetaData:     &d.symbols.symMetaData,
298	} {
299		if *blob, err = d.readSectionBlob(sect); err != nil {
300			return nil, err
301		}
302	}
303
304	d.checksums, err = d.readSectionBlob(toc.contentChecksums)
305	if err != nil {
306		return nil, err
307	}
308
309	d.languages, err = d.readSectionBlob(toc.languages)
310	if err != nil {
311		return nil, err
312	}
313
314	d.contentNgrams, err = d.newBtreeIndex(toc.ngramText, toc.postings)
315	if err != nil {
316		return nil, err
317	}
318
319	d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks)
320	if err != nil {
321		return nil, err
322	}
323
324	d.fileNameContent, err = d.readSectionBlob(toc.fileNames.data)
325	if err != nil {
326		return nil, err
327	}
328
329	d.fileNameIndex = toc.fileNames.relativeIndex()
330
331	d.fileNameNgrams, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings)
332	if err != nil {
333		return nil, err
334	}
335
336	for _, md := range d.repoMetaData {
337		repoBranchIDs := make(map[string]uint, len(md.Branches))
338		repoBranchNames := make(map[uint]string, len(md.Branches))
339		for j, br := range md.Branches {
340			id := uint(1) << uint(j)
341			repoBranchIDs[br.Name] = id
342			repoBranchNames[id] = br.Name
343		}
344		d.branchIDs = append(d.branchIDs, repoBranchIDs)
345		d.branchNames = append(d.branchNames, repoBranchNames)
346		d.rawConfigMasks = append(d.rawConfigMasks, encodeRawConfig(md.RawConfig))
347	}
348
349	blob, err := d.readSectionBlob(toc.runeDocSections)
350	if err != nil {
351		return nil, err
352	}
353
354	d.runeDocSections = unmarshalDocSections(blob, nil)
355
356	var runeOffsets, fileNameRuneOffsets []uint32
357
358	for sect, dest := range map[simpleSection]*[]uint32{
359		toc.subRepos:        &d.subRepos,
360		toc.runeOffsets:     &runeOffsets,
361		toc.nameRuneOffsets: &fileNameRuneOffsets,
362		toc.nameEndRunes:    &d.fileNameEndRunes,
363		toc.fileEndRunes:    &d.fileEndRunes,
364	} {
365		if blob, err := d.readSectionBlob(sect); err != nil {
366			return nil, err
367		} else {
368			*dest = fromSizedDeltas(blob, nil)
369		}
370	}
371
372	d.runeOffsets = makeRuneOffsetMap(runeOffsets)
373	d.fileNameRuneOffsets = makeRuneOffsetMap(fileNameRuneOffsets)
374
375	d.subRepoPaths = make([][]string, 0, len(d.repoMetaData))
376	for i := 0; i < len(d.repoMetaData); i++ {
377		keys := make([]string, 0, len(d.repoMetaData[i].SubRepoMap)+1)
378		keys = append(keys, "")
379		for k := range d.repoMetaData[i].SubRepoMap {
380			if k != "" {
381				keys = append(keys, k)
382			}
383		}
384		sort.Strings(keys)
385		d.subRepoPaths = append(d.subRepoPaths, keys)
386	}
387
388	d.languageMap = map[uint16]string{}
389	for k, v := range d.metaData.LanguageMap {
390		d.languageMap[v] = k
391	}
392
393	if err := d.verify(); err != nil {
394		return nil, err
395	}
396
397	// roc.ranks.sz = 0 indicates that we are reading a shard without ranks, in
398	// which case we skip reading the section and leave d.ranks = nil
399	if toc.ranks.sz > 0 {
400		err = d.readRanks(toc)
401		if err != nil {
402			return nil, err
403		}
404	}
405
406	if d.metaData.IndexFormatVersion >= 17 {
407		blob, err := d.readSectionBlob(toc.repos)
408		if err != nil {
409			return nil, err
410		}
411		d.repos = fromSizedDeltas16(blob, nil)
412	} else {
413		// every document is for repo index 0 (default value of uint16)
414		d.repos = make([]uint16, len(d.fileBranchMasks))
415	}
416
417	if err := d.calculateStats(); err != nil {
418		return nil, err
419	}
420
421	return &d, nil
422}
423
424func (r *reader) parseMetadata(metaData simpleSection, repoMetaData simpleSection) ([]*Repository, *IndexMetadata, error) {
425	var md IndexMetadata
426	if err := r.readJSON(&md, metaData); err != nil {
427		return nil, nil, err
428	}
429
430	// Sourcegraph specific: we support mutating metadata via an additional
431	// ".meta" file. This is to support tombstoning. An additional benefit is we
432	// can update metadata (such as Rank and Name) without re-indexing content.
433	blob, err := os.ReadFile(r.r.Name() + ".meta")
434	if err != nil && !os.IsNotExist(err) {
435		return nil, &md, fmt.Errorf("failed to read meta file: %w", err)
436	}
437
438	if len(blob) == 0 {
439		blob, err = r.r.Read(repoMetaData.off, repoMetaData.sz)
440		if err != nil {
441			return nil, &md, err
442		}
443	}
444
445	var repos []*Repository
446	if md.IndexFormatVersion >= 17 {
447		if err := json.Unmarshal(blob, &repos); err != nil {
448			return nil, &md, err
449		}
450	} else {
451		repos = make([]*Repository, 1)
452		if err := json.Unmarshal(blob, &repos[0]); err != nil {
453			return nil, &md, err
454		}
455	}
456
457	if md.ID == "" {
458		if len(repos) == 0 {
459			return nil, nil, fmt.Errorf("len(repos)=0. Cannot backfill ID")
460		}
461		md.ID = backfillID(repos[0].Name)
462	}
463
464	return repos, &md, nil
465}
466
467const ngramEncoding = 8
468
469func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSection) (btreeIndex, error) {
470	bi := btreeIndex{file: d.file}
471
472	textContent, err := d.readSectionBlob(ngramSec)
473	if err != nil {
474		return btreeIndex{}, err
475	}
476
477	// For 500k trigams we can expect approx 1000 leaf nodes (500k divided by
478	// half the bucketSize) and 20 nodes on level 2 (all but the rightmost
479	// inner nodes will have exactly v=50 children) plus a root node.
480	bt := newBtree(btreeOpts{bucketSize: btreeBucketSize, v: 50})
481	for i := 0; i < len(textContent); i += ngramEncoding {
482		ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding]))
483		bt.insert(ng)
484	}
485	bt.freeze()
486
487	bi.bt = bt
488
489	// hold on to simple sections (8 bytes each)
490	bi.ngramSec = ngramSec
491	bi.postingIndex = postings.index
492
493	return bi, nil
494}
495
496func (d *indexData) verify() error {
497	// This is not an exhaustive check: the postings can easily
498	// generate OOB acccesses, and are expensive to check, but this lets us rule out
499	// other sources of OOB access.
500	n := len(d.fileNameIndex)
501	if n == 0 {
502		return nil
503	}
504
505	n--
506	for what, got := range map[string]int{
507		"boundaries":        len(d.boundaries) - 1,
508		"branch masks":      len(d.fileBranchMasks),
509		"doc section index": len(d.docSectionsIndex) - 1,
510		"newlines index":    len(d.newlinesIndex) - 1,
511	} {
512		if got != n {
513			return fmt.Errorf("got %s %d, want %d", what, got, n)
514		}
515	}
516	return nil
517}
518
519func (d *indexData) readContents(i uint32) ([]byte, error) {
520	return d.readSectionBlob(simpleSection{
521		off: d.boundariesStart + d.boundaries[i],
522		sz:  d.boundaries[i+1] - d.boundaries[i],
523	})
524}
525
526func (d *indexData) readContentSlice(off uint32, sz uint32) ([]byte, error) {
527	// TODO(hanwen): cap result if it is at the end of the content
528	// section.
529	return d.readSectionBlob(simpleSection{
530		off: d.boundariesStart + off,
531		sz:  sz,
532	})
533}
534
535func (d *indexData) readNewlines(i uint32, buf []uint32) ([]uint32, uint32, error) {
536	sec := simpleSection{
537		off: d.newlinesStart + d.newlinesIndex[i],
538		sz:  d.newlinesIndex[i+1] - d.newlinesIndex[i],
539	}
540	blob, err := d.readSectionBlob(sec)
541	if err != nil {
542		return nil, 0, err
543	}
544
545	return fromSizedDeltas(blob, buf), sec.sz, nil
546}
547
548func (d *indexData) readDocSections(i uint32, buf []DocumentSection) ([]DocumentSection, uint32, error) {
549	sec := simpleSection{
550		off: d.docSectionsStart + d.docSectionsIndex[i],
551		sz:  d.docSectionsIndex[i+1] - d.docSectionsIndex[i],
552	}
553	blob, err := d.readSectionBlob(sec)
554	if err != nil {
555		return nil, 0, err
556	}
557
558	ds := unmarshalDocSections(blob, buf)
559
560	// can be nil if buf is nil and there are no doc sections. However, we rely
561	// on it being non-nil to cache the read.
562	if ds == nil {
563		ds = make([]DocumentSection, 0)
564	}
565
566	return ds, sec.sz, nil
567}
568
569func (d *indexData) readRanks(toc *indexTOC) error {
570	blob, err := d.readSectionBlob(toc.ranks)
571	if err != nil {
572		return err
573	}
574
575	return decodeRanks(blob, &d.ranks)
576}
577
578// NewSearcher creates a Searcher for a single index file.  Search
579// results coming from this searcher are valid only for the lifetime
580// of the Searcher itself, ie. []byte members should be copied into
581// fresh buffers if the result is to survive closing the shard.
582func NewSearcher(r IndexFile) (Searcher, error) {
583	rd := &reader{r: r}
584
585	var toc indexTOC
586	if err := rd.readTOC(&toc); err != nil {
587		return nil, err
588	}
589	indexData, err := rd.readIndexData(&toc)
590	if err != nil {
591		return nil, err
592	}
593	indexData.file = r
594	return indexData, nil
595}
596
597// ReadMetadata returns the metadata of index shard without reading
598// the index data. The IndexFile is not closed.
599func ReadMetadata(inf IndexFile) ([]*Repository, *IndexMetadata, error) {
600	rd := &reader{r: inf}
601	var toc indexTOC
602	err := rd.readTOCSections(&toc, []string{"metaData", "repoMetaData"})
603	if err != nil {
604		return nil, nil, err
605	}
606	return rd.parseMetadata(toc.metaData, toc.repoMetaData)
607}
608
609// ReadMetadataPathAlive is like ReadMetadataPath except that it only returns
610// alive repositories.
611func ReadMetadataPathAlive(p string) ([]*Repository, *IndexMetadata, error) {
612	repos, id, err := ReadMetadataPath(p)
613	if err != nil {
614		return nil, nil, err
615	}
616	alive := repos[:0]
617	for _, repo := range repos {
618		if !repo.Tombstone {
619			alive = append(alive, repo)
620		}
621	}
622	return alive, id, nil
623}
624
625// ReadMetadataPath returns the metadata of index shard at p without reading
626// the index data. ReadMetadataPath is a helper for ReadMetadata which opens
627// the IndexFile at p.
628func ReadMetadataPath(p string) ([]*Repository, *IndexMetadata, error) {
629	f, err := os.Open(p)
630	if err != nil {
631		return nil, nil, err
632	}
633	defer f.Close()
634
635	iFile, err := NewIndexFile(f)
636	if err != nil {
637		return nil, nil, err
638	}
639	defer iFile.Close()
640
641	return ReadMetadata(iFile)
642}
643
644// IndexFilePaths returns all paths for the IndexFile at filepath p that
645// exist. Note: if no files exist this will return an empty slice and nil
646// error.
647//
648// This is p and the ".meta" file for p.
649func IndexFilePaths(p string) ([]string, error) {
650	paths := []string{p, p + ".meta"}
651	exist := paths[:0]
652	for _, p := range paths {
653		if _, err := os.Stat(p); err == nil {
654			exist = append(exist, p)
655		} else if !os.IsNotExist(err) {
656			return nil, err
657		}
658	}
659	return exist, nil
660}
661
662func loadIndexData(r IndexFile) (*indexData, error) {
663	rd := &reader{r: r}
664
665	var toc indexTOC
666	if err := rd.readTOC(&toc); err != nil {
667		return nil, err
668	}
669	return rd.readIndexData(&toc)
670}
671
672// PrintNgramStats outputs a list of the form
673//
674//	n_1 trigram_1
675//	n_2 trigram_2
676//	...
677//
678// where n_i is the length of the postings list of trigram_i stored in r.
679func PrintNgramStats(r IndexFile) error {
680	id, err := loadIndexData(r)
681	if err != nil {
682		return err
683	}
684
685	var rNgram [3]rune
686	for ngram, ss := range id.contentNgrams.DumpMap() {
687		rNgram = ngramToRunes(ngram)
688		fmt.Printf("%d\t%q\n", ss.sz, string(rNgram[:]))
689	}
690	return nil
691}
692
693var crc64Table = crc64.MakeTable(crc64.ECMA)
694
695// backfillID returns a 20 char long sortable ID. The ID only depends on s. It
696// should only be used to set the ID of simple v16 shards on read.
697func backfillID(s string) string {
698	var id xid.ID
699
700	// Our timestamps are based on Unix time. Shards without IDs are assigned IDs
701	// based on the 0 epoch.
702	binary.BigEndian.PutUint32(id[:], 0)
703	binary.BigEndian.PutUint64(id[4:], crc64.Checksum([]byte(s), crc64Table))
704	return id.String()
705}
Configure Feed

Configure Feed