index/read.go at 388cba654f9ede9d576ba3591f9df10820215df2 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / read.go
at 388cba654f9ede9d576ba3591f9df10820215df2 18 kB View raw
Julie Tibshirani Move root-level index code to index package (#902) 1y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package index
 16
 17import (
 18	"encoding/binary"
 19	"encoding/json"
 20	"fmt"
 21	"hash/crc64"
 22	"log"
 23	"os"
 24	"slices"
 25	"sort"
 26
 27	"github.com/rs/xid"
 28	"github.com/sourcegraph/zoekt"
 29)
 30
 31// IndexFile is a file suitable for concurrent read access. For performance
 32// reasons, it allows a mmap'd implementation.
 33type IndexFile interface {
 34	Read(off uint32, sz uint32) ([]byte, error)
 35	Size() (uint32, error)
 36	Close()
 37	Name() string
 38}
 39
 40// reader is a stateful file
 41type reader struct {
 42	r   IndexFile
 43	off uint32
 44}
 45
 46func (r *reader) seek(off uint32) {
 47	r.off = off
 48}
 49
 50func (r *reader) U32() (uint32, error) {
 51	b, err := r.r.Read(r.off, 4)
 52	r.off += 4
 53	if err != nil {
 54		return 0, err
 55	}
 56	return binary.BigEndian.Uint32(b), nil
 57}
 58
 59func (r *reader) U64() (uint64, error) {
 60	b, err := r.r.Read(r.off, 8)
 61	r.off += 8
 62	if err != nil {
 63		return 0, err
 64	}
 65	return binary.BigEndian.Uint64(b), nil
 66}
 67
 68func (r *reader) ReadByte() (byte, error) {
 69	b, err := r.r.Read(r.off, 1)
 70	r.off += 1
 71	if err != nil {
 72		return 0, err
 73	}
 74	return b[0], nil
 75}
 76
 77func (r *reader) Varint() (uint64, error) {
 78	v, err := binary.ReadUvarint(r)
 79	if err != nil {
 80		return 0, err
 81	}
 82	return v, nil
 83}
 84
 85func (r *reader) Str() (string, error) {
 86	slen, err := r.Varint()
 87	if err != nil {
 88		return "", err
 89	}
 90	b, err := r.r.Read(r.off, uint32(slen))
 91	if err != nil {
 92		return "", err
 93	}
 94	r.off += uint32(slen)
 95	return string(b), nil
 96}
 97
 98func (r *reader) readTOC(toc *indexTOC) error {
 99	return r.readTOCSections(toc, []string{})
100}
101
102// readTOCSections reads the table of contents of the index file.
103//
104// If the tags parameter is non-empty, it reads only those tagged sections for efficiency
105// and does not populate the other sections.
106func (r *reader) readTOCSections(toc *indexTOC, tags []string) error {
107	tocSection, sectionCount, err := r.readHeader()
108	if err != nil {
109		return err
110	}
111
112	if sectionCount == 0 {
113		// tagged sections are indicated by a 0 sectionCount,
114		// and then a list of string-tagged type-indicated sections.
115		secs := toc.sectionsTagged()
116		for r.off < tocSection.off+tocSection.sz {
117			tag, err := r.Str()
118			if err != nil {
119				return err
120			}
121			kind, err := r.Varint()
122			if err != nil {
123				return err
124			}
125
126			skipSection := len(tags) > 0 && !slices.Contains(tags, tag)
127			sec := secs[tag]
128			if sec == nil || sec.kind() != sectionKind(kind) {
129				// If we don't recognize the section, we may be reading a newer index than the current version. Use
130				// a "dummy section" struct to skip over it.
131				skipSection = true
132				log.Printf("encountered unrecognized index section (%s), skipping over it", tag)
133
134				switch sectionKind(kind) {
135				case sectionKindSimple:
136					sec = &simpleSection{}
137				case sectionKindCompound:
138					sec = &compoundSection{}
139				case sectionKindCompoundLazy:
140					sec = &lazyCompoundSection{}
141				default:
142					return fmt.Errorf("unknown section kind %d", kind)
143				}
144			}
145
146			if skipSection {
147				if err := sec.skip(r); err != nil {
148					return err
149				}
150			} else {
151				if err := sec.read(r); err != nil {
152					return err
153				}
154			}
155		}
156	} else {
157		// TODO: Remove this branch when ReaderMinFeatureVersion >= 10
158
159		secs := toc.sections()
160
161		if len(secs) != int(sectionCount) {
162			secs = toc.sectionsNext()
163		}
164
165		if len(secs) != int(sectionCount) {
166			return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs))
167		}
168
169		for _, s := range secs {
170			if err := s.read(r); err != nil {
171				return err
172			}
173		}
174	}
175	return nil
176}
177
178func (r *reader) readHeader() (simpleSection, uint32, error) {
179	sz, err := r.r.Size()
180	if err != nil {
181		return simpleSection{}, 0, err
182	}
183	r.off = sz - 8
184
185	var tocSection simpleSection
186	if err := tocSection.read(r); err != nil {
187		return simpleSection{}, 0, err
188	}
189
190	r.seek(tocSection.off)
191
192	sectionCount, err := r.U32()
193	if err != nil {
194		return simpleSection{}, 0, err
195	}
196	return tocSection, sectionCount, nil
197}
198
199func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) {
200	return r.file.Read(sec.off, sec.sz)
201}
202
203func readSectionU32(f IndexFile, sec simpleSection) ([]uint32, error) {
204	if sec.sz%4 != 0 {
205		return nil, fmt.Errorf("barf: section size %% 4 != 0: sz %d ", sec.sz)
206	}
207	blob, err := f.Read(sec.off, sec.sz)
208	if err != nil {
209		return nil, err
210	}
211	arr := make([]uint32, 0, len(blob)/4)
212	for len(blob) > 0 {
213		arr = append(arr, binary.BigEndian.Uint32(blob))
214		blob = blob[4:]
215	}
216	return arr, nil
217}
218
219func readSectionU64(f IndexFile, sec simpleSection) ([]uint64, error) {
220	if sec.sz%8 != 0 {
221		return nil, fmt.Errorf("barf: section size %% 8 != 0: sz %d ", sec.sz)
222	}
223	blob, err := f.Read(sec.off, sec.sz)
224	if err != nil {
225		return nil, err
226	}
227	arr := make([]uint64, 0, len(blob)/8)
228	for len(blob) > 0 {
229		arr = append(arr, binary.BigEndian.Uint64(blob))
230		blob = blob[8:]
231	}
232	return arr, nil
233}
234
235func (r *reader) readJSON(data interface{}, sec simpleSection) error {
236	blob, err := r.r.Read(sec.off, sec.sz)
237	if err != nil {
238		return err
239	}
240
241	return json.Unmarshal(blob, data)
242}
243
244// canReadVersion returns checks if zoekt can read in md. If it can't a
245// non-nil error is returned.
246func canReadVersion(md *zoekt.IndexMetadata) bool {
247	// Backwards compatible with v16
248	return md.IndexFormatVersion == IndexFormatVersion || md.IndexFormatVersion == NextIndexFormatVersion
249}
250
251func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
252	d := indexData{
253		file:        r.r,
254		branchIDs:   []map[string]uint{},
255		branchNames: []map[uint]string{},
256	}
257
258	repos, md, err := r.parseMetadata(toc.metaData, toc.repoMetaData)
259	if md != nil && !canReadVersion(md) {
260		return nil, fmt.Errorf("file is v%d, want v%d", md.IndexFormatVersion, IndexFormatVersion)
261	} else if err != nil {
262		return nil, err
263	}
264
265	d.metaData = *md
266	d.repoMetaData = make([]zoekt.Repository, 0, len(repos))
267	for _, r := range repos {
268		d.repoMetaData = append(d.repoMetaData, *r)
269	}
270
271	if d.metaData.IndexFeatureVersion < ReadMinFeatureVersion {
272		return nil, fmt.Errorf("file is feature version %d, want feature version >= %d", d.metaData.IndexFeatureVersion, ReadMinFeatureVersion)
273	}
274
275	if d.metaData.IndexMinReaderVersion > FeatureVersion {
276		return nil, fmt.Errorf("file needs read feature version >= %d, have read feature version %d", d.metaData.IndexMinReaderVersion, FeatureVersion)
277	}
278
279	d.boundariesStart = toc.fileContents.data.off
280	d.boundaries = toc.fileContents.relativeIndex()
281	d.newlinesStart = toc.newlines.data.off
282	d.newlinesIndex = toc.newlines.relativeIndex()
283	d.docSectionsStart = toc.fileSections.data.off
284	d.docSectionsIndex = toc.fileSections.relativeIndex()
285
286	d.symbols.symKindIndex = toc.symbolKindMap.relativeIndex()
287	d.fileEndSymbol, err = readSectionU32(d.file, toc.fileEndSymbol)
288	if err != nil {
289		return nil, err
290	}
291
292	// Call readSectionBlob on each section key, and store the result in
293	// the blob value.
294	for sect, blob := range map[simpleSection]*[]byte{
295		toc.symbolMap.index:    &d.symbols.symIndex,
296		toc.symbolMap.data:     &d.symbols.symContent,
297		toc.symbolKindMap.data: &d.symbols.symKindContent,
298		toc.symbolMetaData:     &d.symbols.symMetaData,
299	} {
300		if *blob, err = d.readSectionBlob(sect); err != nil {
301			return nil, err
302		}
303	}
304
305	d.checksums, err = d.readSectionBlob(toc.contentChecksums)
306	if err != nil {
307		return nil, err
308	}
309
310	d.languages, err = d.readSectionBlob(toc.languages)
311	if err != nil {
312		return nil, err
313	}
314
315	d.contentNgrams, err = d.newBtreeIndex(toc.ngramText, toc.postings)
316	if err != nil {
317		return nil, err
318	}
319
320	d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks)
321	if err != nil {
322		return nil, err
323	}
324
325	d.fileNameContent, err = d.readSectionBlob(toc.fileNames.data)
326	if err != nil {
327		return nil, err
328	}
329
330	d.fileNameIndex = toc.fileNames.relativeIndex()
331
332	d.fileNameNgrams, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings)
333	if err != nil {
334		return nil, err
335	}
336
337	for _, md := range d.repoMetaData {
338		repoBranchIDs := make(map[string]uint, len(md.Branches))
339		repoBranchNames := make(map[uint]string, len(md.Branches))
340		for j, br := range md.Branches {
341			id := uint(1) << uint(j)
342			repoBranchIDs[br.Name] = id
343			repoBranchNames[id] = br.Name
344		}
345		d.branchIDs = append(d.branchIDs, repoBranchIDs)
346		d.branchNames = append(d.branchNames, repoBranchNames)
347		d.rawConfigMasks = append(d.rawConfigMasks, encodeRawConfig(md.RawConfig))
348	}
349
350	blob, err := d.readSectionBlob(toc.runeDocSections)
351	if err != nil {
352		return nil, err
353	}
354
355	d.runeDocSections = unmarshalDocSections(blob, nil)
356
357	var runeOffsets, fileNameRuneOffsets []uint32
358
359	for sect, dest := range map[simpleSection]*[]uint32{
360		toc.subRepos:        &d.subRepos,
361		toc.runeOffsets:     &runeOffsets,
362		toc.nameRuneOffsets: &fileNameRuneOffsets,
363		toc.nameEndRunes:    &d.fileNameEndRunes,
364		toc.fileEndRunes:    &d.fileEndRunes,
365	} {
366		if blob, err := d.readSectionBlob(sect); err != nil {
367			return nil, err
368		} else {
369			*dest = fromSizedDeltas(blob, nil)
370		}
371	}
372
373	d.runeOffsets = makeRuneOffsetMap(runeOffsets)
374	d.fileNameRuneOffsets = makeRuneOffsetMap(fileNameRuneOffsets)
375
376	d.subRepoPaths = make([][]string, 0, len(d.repoMetaData))
377	for i := 0; i < len(d.repoMetaData); i++ {
378		keys := make([]string, 0, len(d.repoMetaData[i].SubRepoMap)+1)
379		keys = append(keys, "")
380		for k := range d.repoMetaData[i].SubRepoMap {
381			if k != "" {
382				keys = append(keys, k)
383			}
384		}
385		sort.Strings(keys)
386		d.subRepoPaths = append(d.subRepoPaths, keys)
387	}
388
389	d.languageMap = map[uint16]string{}
390	for k, v := range d.metaData.LanguageMap {
391		d.languageMap[v] = k
392	}
393
394	if err := d.verify(); err != nil {
395		return nil, err
396	}
397
398	if d.metaData.IndexFormatVersion >= 17 {
399		blob, err := d.readSectionBlob(toc.repos)
400		if err != nil {
401			return nil, err
402		}
403		d.repos = fromSizedDeltas16(blob, nil)
404	} else {
405		// every document is for repo index 0 (default value of uint16)
406		d.repos = make([]uint16, len(d.fileBranchMasks))
407	}
408
409	if err := d.calculateStats(); err != nil {
410		return nil, err
411	}
412
413	return &d, nil
414}
415
416func (r *reader) parseMetadata(metaData simpleSection, repoMetaData simpleSection) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) {
417	var md zoekt.IndexMetadata
418	if err := r.readJSON(&md, metaData); err != nil {
419		return nil, nil, err
420	}
421
422	// Sourcegraph specific: we support mutating metadata via an additional
423	// ".meta" file. This is to support tombstoning. An additional benefit is we
424	// can update metadata (such as Rank and Name) without re-indexing content.
425	blob, err := os.ReadFile(r.r.Name() + ".meta")
426	if err != nil && !os.IsNotExist(err) {
427		return nil, &md, fmt.Errorf("failed to read meta file: %w", err)
428	}
429
430	if len(blob) == 0 {
431		blob, err = r.r.Read(repoMetaData.off, repoMetaData.sz)
432		if err != nil {
433			return nil, &md, err
434		}
435	}
436
437	var repos []*zoekt.Repository
438	if md.IndexFormatVersion >= 17 {
439		if err := json.Unmarshal(blob, &repos); err != nil {
440			return nil, &md, err
441		}
442	} else {
443		repos = make([]*zoekt.Repository, 1)
444		if err := json.Unmarshal(blob, &repos[0]); err != nil {
445			return nil, &md, err
446		}
447	}
448
449	if md.ID == "" {
450		if len(repos) == 0 {
451			return nil, nil, fmt.Errorf("len(repos)=0. Cannot backfill ID")
452		}
453		md.ID = backfillID(repos[0].Name)
454	}
455
456	return repos, &md, nil
457}
458
459const ngramEncoding = 8
460
461func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSection) (btreeIndex, error) {
462	bi := btreeIndex{file: d.file}
463
464	textContent, err := d.readSectionBlob(ngramSec)
465	if err != nil {
466		return btreeIndex{}, err
467	}
468
469	// For 500k trigams we can expect approx 1000 leaf nodes (500k divided by
470	// half the bucketSize) and 20 nodes on level 2 (all but the rightmost
471	// inner nodes will have exactly v=50 children) plus a root node.
472	bt := newBtree(btreeOpts{bucketSize: btreeBucketSize, v: 50})
473	for i := 0; i < len(textContent); i += ngramEncoding {
474		ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding]))
475		bt.insert(ng)
476	}
477	bt.freeze()
478
479	bi.bt = bt
480
481	// hold on to simple sections (8 bytes each)
482	bi.ngramSec = ngramSec
483	bi.postingIndex = postings.index
484
485	return bi, nil
486}
487
488func (d *indexData) verify() error {
489	// This is not an exhaustive check: the postings can easily
490	// generate OOB acccesses, and are expensive to check, but this lets us rule out
491	// other sources of OOB access.
492	n := len(d.fileNameIndex)
493	if n == 0 {
494		return nil
495	}
496
497	n--
498	for what, got := range map[string]int{
499		"boundaries":        len(d.boundaries) - 1,
500		"branch masks":      len(d.fileBranchMasks),
501		"doc section index": len(d.docSectionsIndex) - 1,
502		"newlines index":    len(d.newlinesIndex) - 1,
503	} {
504		if got != n {
505			return fmt.Errorf("got %s %d, want %d", what, got, n)
506		}
507	}
508	return nil
509}
510
511func (d *indexData) readContents(i uint32) ([]byte, error) {
512	return d.readSectionBlob(simpleSection{
513		off: d.boundariesStart + d.boundaries[i],
514		sz:  d.boundaries[i+1] - d.boundaries[i],
515	})
516}
517
518func (d *indexData) readContentSlice(off uint32, sz uint32) ([]byte, error) {
519	// TODO(hanwen): cap result if it is at the end of the content
520	// section.
521	return d.readSectionBlob(simpleSection{
522		off: d.boundariesStart + off,
523		sz:  sz,
524	})
525}
526
527func (d *indexData) readNewlines(i uint32, buf []uint32) ([]uint32, uint32, error) {
528	sec := simpleSection{
529		off: d.newlinesStart + d.newlinesIndex[i],
530		sz:  d.newlinesIndex[i+1] - d.newlinesIndex[i],
531	}
532	blob, err := d.readSectionBlob(sec)
533	if err != nil {
534		return nil, 0, err
535	}
536
537	nl := fromSizedDeltas(blob, buf)
538
539	// can be nil if buf is nil and there are no doc sections. However, we rely
540	// on it being non-nil to cache the read.
541	if nl == nil {
542		nl = make([]uint32, 0)
543	}
544	return nl, sec.sz, nil
545}
546
547func (d *indexData) readDocSections(i uint32, buf []DocumentSection) ([]DocumentSection, uint32, error) {
548	sec := simpleSection{
549		off: d.docSectionsStart + d.docSectionsIndex[i],
550		sz:  d.docSectionsIndex[i+1] - d.docSectionsIndex[i],
551	}
552	blob, err := d.readSectionBlob(sec)
553	if err != nil {
554		return nil, 0, err
555	}
556
557	ds := unmarshalDocSections(blob, buf)
558
559	// can be nil if buf is nil and there are no doc sections. However, we rely
560	// on it being non-nil to cache the read.
561	if ds == nil {
562		ds = make([]DocumentSection, 0)
563	}
564
565	return ds, sec.sz, nil
566}
567
568// NewSearcher creates a Searcher for a single index file.  Search
569// results coming from this searcher are valid only for the lifetime
570// of the Searcher itself, ie. []byte members should be copied into
571// fresh buffers if the result is to survive closing the shard.
572func NewSearcher(r IndexFile) (zoekt.Searcher, error) {
573	rd := &reader{r: r}
574
575	var toc indexTOC
576	if err := rd.readTOC(&toc); err != nil {
577		return nil, err
578	}
579	indexData, err := rd.readIndexData(&toc)
580	if err != nil {
581		return nil, err
582	}
583	indexData.file = r
584	return indexData, nil
585}
586
587// ReadMetadata returns the metadata of index shard without reading
588// the index data. The IndexFile is not closed.
589func ReadMetadata(inf IndexFile) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) {
590	rd := &reader{r: inf}
591	var toc indexTOC
592	err := rd.readTOCSections(&toc, []string{"metaData", "repoMetaData"})
593	if err != nil {
594		return nil, nil, err
595	}
596	return rd.parseMetadata(toc.metaData, toc.repoMetaData)
597}
598
599// ReadMetadataPathAlive is like ReadMetadataPath except that it only returns
600// alive repositories.
601func ReadMetadataPathAlive(p string) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) {
602	repos, id, err := ReadMetadataPath(p)
603	if err != nil {
604		return nil, nil, err
605	}
606	alive := repos[:0]
607	for _, repo := range repos {
608		if !repo.Tombstone {
609			alive = append(alive, repo)
610		}
611	}
612	return alive, id, nil
613}
614
615// ReadMetadataPath returns the metadata of index shard at p without reading
616// the index data. ReadMetadataPath is a helper for ReadMetadata which opens
617// the IndexFile at p.
618func ReadMetadataPath(p string) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) {
619	f, err := os.Open(p)
620	if err != nil {
621		return nil, nil, err
622	}
623	defer f.Close()
624
625	iFile, err := NewIndexFile(f)
626	if err != nil {
627		return nil, nil, err
628	}
629	defer iFile.Close()
630
631	return ReadMetadata(iFile)
632}
633
634// IndexFilePaths returns all paths for the IndexFile at filepath p that
635// exist. Note: if no files exist this will return an empty slice and nil
636// error.
637//
638// This is p and the ".meta" file for p.
639func IndexFilePaths(p string) ([]string, error) {
640	paths := []string{p, p + ".meta"}
641	exist := paths[:0]
642	for _, p := range paths {
643		if _, err := os.Stat(p); err == nil {
644			exist = append(exist, p)
645		} else if !os.IsNotExist(err) {
646			return nil, err
647		}
648	}
649	return exist, nil
650}
651
652func loadIndexData(r IndexFile) (*indexData, error) {
653	rd := &reader{r: r}
654
655	var toc indexTOC
656	if err := rd.readTOC(&toc); err != nil {
657		return nil, err
658	}
659	return rd.readIndexData(&toc)
660}
661
662// PrintNgramStats outputs a list of the form
663//
664//	n_1 trigram_1
665//	n_2 trigram_2
666//	...
667//
668// where n_i is the length of the postings list of trigram_i stored in r.
669func PrintNgramStats(r IndexFile) error {
670	id, err := loadIndexData(r)
671	if err != nil {
672		return err
673	}
674
675	var rNgram [3]rune
676	for ngram, ss := range id.contentNgrams.DumpMap() {
677		rNgram = ngramToRunes(ngram)
678		fmt.Printf("%d\t%q\n", ss.sz, string(rNgram[:]))
679	}
680	return nil
681}
682
683var crc64Table = crc64.MakeTable(crc64.ECMA)
684
685// backfillID returns a 20 char long sortable ID. The ID only depends on s. It
686// should only be used to set the ID of simple v16 shards on read.
687func backfillID(s string) string {
688	var id xid.ID
689
690	// Our timestamps are based on Unix time. Shards without IDs are assigned IDs
691	// based on the 0 epoch.
692	binary.BigEndian.PutUint32(id[:], 0)
693	binary.BigEndian.PutUint64(id[4:], crc64.Checksum([]byte(s), crc64Table))
694	return id.String()
695}
Configure Feed

Configure Feed