read.go at 20a30090ee38bb8d57dd98a47b9f6184fe7dba3f · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / read.go
at 20a30090ee38bb8d57dd98a47b9f6184fe7dba3f 18 kB View raw
Stefan Hengl indexdata: add interface for inverted index (#520) 3y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"encoding/binary"
 19	"encoding/json"
 20	"fmt"
 21	"hash/crc64"
 22	"log"
 23	"os"
 24	"sort"
 25
 26	"github.com/rs/xid"
 27)
 28
 29// IndexFile is a file suitable for concurrent read access. For performance
 30// reasons, it allows a mmap'd implementation.
 31type IndexFile interface {
 32	Read(off uint32, sz uint32) ([]byte, error)
 33	Size() (uint32, error)
 34	Close()
 35	Name() string
 36}
 37
 38// reader is a stateful file
 39type reader struct {
 40	r   IndexFile
 41	off uint32
 42}
 43
 44func (r *reader) seek(off uint32) {
 45	r.off = off
 46}
 47
 48func (r *reader) U32() (uint32, error) {
 49	b, err := r.r.Read(r.off, 4)
 50	r.off += 4
 51	if err != nil {
 52		return 0, err
 53	}
 54	return binary.BigEndian.Uint32(b), nil
 55}
 56
 57func (r *reader) U64() (uint64, error) {
 58	b, err := r.r.Read(r.off, 8)
 59	r.off += 8
 60	if err != nil {
 61		return 0, err
 62	}
 63	return binary.BigEndian.Uint64(b), nil
 64}
 65
 66func (r *reader) ReadByte() (byte, error) {
 67	b, err := r.r.Read(r.off, 1)
 68	r.off += 1
 69	if err != nil {
 70		return 0, err
 71	}
 72	return b[0], nil
 73}
 74
 75func (r *reader) Varint() (uint64, error) {
 76	v, err := binary.ReadUvarint(r)
 77	if err != nil {
 78		return 0, err
 79	}
 80	return v, nil
 81}
 82
 83func (r *reader) Str() (string, error) {
 84	slen, err := r.Varint()
 85	if err != nil {
 86		return "", err
 87	}
 88	b, err := r.r.Read(r.off, uint32(slen))
 89	if err != nil {
 90		return "", err
 91	}
 92	r.off += uint32(slen)
 93	return string(b), nil
 94}
 95
 96func (r *reader) readTOC(toc *indexTOC) error {
 97	sz, err := r.r.Size()
 98	if err != nil {
 99		return err
100	}
101	r.off = sz - 8
102
103	var tocSection simpleSection
104	if err := tocSection.read(r); err != nil {
105		return err
106	}
107
108	r.seek(tocSection.off)
109
110	sectionCount, err := r.U32()
111	if err != nil {
112		return err
113	}
114
115	if sectionCount == 0 {
116		// tagged sections are indicated by a 0 sectionCount,
117		// and then a list of string-tagged type-indicated sections.
118		secs := toc.sectionsTagged()
119		for r.off < tocSection.off+tocSection.sz {
120			tag, err := r.Str()
121			if err != nil {
122				return err
123			}
124			kind, err := r.Varint()
125			if err != nil {
126				return err
127			}
128			sec := secs[tag]
129			if sec != nil && sec.kind() == sectionKind(kind) {
130				// happy path
131				if err := sec.read(r); err != nil {
132					return err
133				}
134				continue
135			}
136			// error case: skip over unknown section
137			if sec == nil {
138				log.Printf("file %s TOC has unknown section %q", r.r.Name(), tag)
139			} else {
140				return fmt.Errorf("file %s TOC section %q expects kind %d, got kind %d", r.r.Name(), tag,
141					kind, sec.kind())
142			}
143			if kind == 0 {
144				if err := (&simpleSection{}).read(r); err != nil {
145					return err
146				}
147			} else if kind == 1 {
148				if err := (&compoundSection{}).read(r); err != nil {
149					return err
150				}
151			}
152		}
153	} else {
154		// TODO: Remove this branch when ReaderMinFeatureVersion >= 10
155
156		secs := toc.sections()
157
158		if len(secs) != int(sectionCount) {
159			secs = toc.sectionsNext()
160		}
161
162		if len(secs) != int(sectionCount) {
163			return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs))
164		}
165
166		for _, s := range secs {
167			if err := s.read(r); err != nil {
168				return err
169			}
170		}
171	}
172	return nil
173}
174
175func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) {
176	return r.file.Read(sec.off, sec.sz)
177}
178
179func readSectionU32(f IndexFile, sec simpleSection) ([]uint32, error) {
180	if sec.sz%4 != 0 {
181		return nil, fmt.Errorf("barf: section size %% 4 != 0: sz %d ", sec.sz)
182	}
183	blob, err := f.Read(sec.off, sec.sz)
184	if err != nil {
185		return nil, err
186	}
187	arr := make([]uint32, 0, len(blob)/4)
188	for len(blob) > 0 {
189		arr = append(arr, binary.BigEndian.Uint32(blob))
190		blob = blob[4:]
191	}
192	return arr, nil
193}
194
195func readSectionU64(f IndexFile, sec simpleSection) ([]uint64, error) {
196	if sec.sz%8 != 0 {
197		return nil, fmt.Errorf("barf: section size %% 8 != 0: sz %d ", sec.sz)
198	}
199	blob, err := f.Read(sec.off, sec.sz)
200	if err != nil {
201		return nil, err
202	}
203	arr := make([]uint64, 0, len(blob)/8)
204	for len(blob) > 0 {
205		arr = append(arr, binary.BigEndian.Uint64(blob))
206		blob = blob[8:]
207	}
208	return arr, nil
209}
210
211func (r *reader) readJSON(data interface{}, sec *simpleSection) error {
212	blob, err := r.r.Read(sec.off, sec.sz)
213	if err != nil {
214		return err
215	}
216
217	return json.Unmarshal(blob, data)
218}
219
220// canReadVersion returns checks if zoekt can read in md. If it can't a
221// non-nil error is returned.
222func canReadVersion(md *IndexMetadata) bool {
223	// Backwards compatible with v16
224	return md.IndexFormatVersion == IndexFormatVersion || md.IndexFormatVersion == NextIndexFormatVersion
225}
226
227func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
228	d := indexData{
229		file:           r.r,
230		fileNameNgrams: map[ngram][]byte{},
231		branchIDs:      []map[string]uint{},
232		branchNames:    []map[uint]string{},
233	}
234
235	repos, md, err := r.readMetadata(toc)
236	if md != nil && !canReadVersion(md) {
237		return nil, fmt.Errorf("file is v%d, want v%d", md.IndexFormatVersion, IndexFormatVersion)
238	} else if err != nil {
239		return nil, err
240	}
241
242	d.metaData = *md
243	d.repoMetaData = make([]Repository, 0, len(repos))
244	for _, r := range repos {
245		d.repoMetaData = append(d.repoMetaData, *r)
246	}
247
248	if d.metaData.IndexFeatureVersion < ReadMinFeatureVersion {
249		return nil, fmt.Errorf("file is feature version %d, want feature version >= %d", d.metaData.IndexFeatureVersion, ReadMinFeatureVersion)
250	}
251
252	if d.metaData.IndexMinReaderVersion > FeatureVersion {
253		return nil, fmt.Errorf("file needs read feature version >= %d, have read feature version %d", d.metaData.IndexMinReaderVersion, FeatureVersion)
254	}
255
256	d.boundariesStart = toc.fileContents.data.off
257	d.boundaries = toc.fileContents.relativeIndex()
258	d.newlinesStart = toc.newlines.data.off
259	d.newlinesIndex = toc.newlines.relativeIndex()
260	d.docSectionsStart = toc.fileSections.data.off
261	d.docSectionsIndex = toc.fileSections.relativeIndex()
262
263	d.symbols.symKindIndex = toc.symbolKindMap.relativeIndex()
264	d.fileEndSymbol, err = readSectionU32(d.file, toc.fileEndSymbol)
265	if err != nil {
266		return nil, err
267	}
268
269	// Call readSectionBlob on each section key, and store the result in
270	// the blob value.
271	for sect, blob := range map[simpleSection]*[]byte{
272		toc.symbolMap.index:    &d.symbols.symIndex,
273		toc.symbolMap.data:     &d.symbols.symContent,
274		toc.symbolKindMap.data: &d.symbols.symKindContent,
275		toc.symbolMetaData:     &d.symbols.symMetaData,
276	} {
277		if *blob, err = d.readSectionBlob(sect); err != nil {
278			return nil, err
279		}
280	}
281
282	d.checksums, err = d.readSectionBlob(toc.contentChecksums)
283	if err != nil {
284		return nil, err
285	}
286
287	d.languages, err = d.readSectionBlob(toc.languages)
288	if err != nil {
289		return nil, err
290	}
291
292	if os.Getenv("ZOEKT_ENABLE_NGRAM_BS") != "" {
293		bsMap, err := d.readBinarySearchNgrams(toc)
294		if err != nil {
295			return nil, err
296		}
297		d.ngrams = bsMap
298	} else {
299		offsetMap, err := d.readNgrams(toc)
300		if err != nil {
301			return nil, err
302		}
303		d.ngrams = offsetMap
304	}
305
306	d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks)
307	if err != nil {
308		return nil, err
309	}
310
311	d.fileNameContent, err = d.readSectionBlob(toc.fileNames.data)
312	if err != nil {
313		return nil, err
314	}
315
316	d.fileNameIndex = toc.fileNames.relativeIndex()
317
318	d.fileNameNgrams, err = d.readFileNameNgrams(toc)
319	if err != nil {
320		return nil, err
321	}
322
323	for _, md := range d.repoMetaData {
324		repoBranchIDs := make(map[string]uint, len(md.Branches))
325		repoBranchNames := make(map[uint]string, len(md.Branches))
326		for j, br := range md.Branches {
327			id := uint(1) << uint(j)
328			repoBranchIDs[br.Name] = id
329			repoBranchNames[id] = br.Name
330		}
331		d.branchIDs = append(d.branchIDs, repoBranchIDs)
332		d.branchNames = append(d.branchNames, repoBranchNames)
333		d.rawConfigMasks = append(d.rawConfigMasks, encodeRawConfig(md.RawConfig))
334	}
335
336	blob, err := d.readSectionBlob(toc.runeDocSections)
337	if err != nil {
338		return nil, err
339	}
340
341	if os.Getenv("ZOEKT_ENABLE_LAZY_DOC_SECTIONS") != "" {
342		d.runeDocSectionsRaw = blob
343	} else {
344		d.runeDocSections = unmarshalDocSections(blob, nil)
345	}
346
347	var runeOffsets, fileNameRuneOffsets []uint32
348
349	for sect, dest := range map[simpleSection]*[]uint32{
350		toc.subRepos:        &d.subRepos,
351		toc.runeOffsets:     &runeOffsets,
352		toc.nameRuneOffsets: &fileNameRuneOffsets,
353		toc.nameEndRunes:    &d.fileNameEndRunes,
354		toc.fileEndRunes:    &d.fileEndRunes,
355	} {
356		if blob, err := d.readSectionBlob(sect); err != nil {
357			return nil, err
358		} else {
359			*dest = fromSizedDeltas(blob, nil)
360		}
361	}
362
363	d.runeOffsets = makeRuneOffsetMap(runeOffsets)
364	d.fileNameRuneOffsets = makeRuneOffsetMap(fileNameRuneOffsets)
365
366	d.subRepoPaths = make([][]string, 0, len(d.repoMetaData))
367	for i := 0; i < len(d.repoMetaData); i++ {
368		keys := make([]string, 0, len(d.repoMetaData[i].SubRepoMap)+1)
369		keys = append(keys, "")
370		for k := range d.repoMetaData[i].SubRepoMap {
371			if k != "" {
372				keys = append(keys, k)
373			}
374		}
375		sort.Strings(keys)
376		d.subRepoPaths = append(d.subRepoPaths, keys)
377	}
378
379	d.languageMap = map[uint16]string{}
380	for k, v := range d.metaData.LanguageMap {
381		d.languageMap[v] = k
382	}
383
384	if err := d.verify(); err != nil {
385		return nil, err
386	}
387
388	// roc.ranks.sz = 0 indicates that we are reading a shard without ranks, in
389	// which case we skip reading the section and leave d.ranks = nil
390	if toc.ranks.sz > 0 {
391		err = d.readRanks(toc)
392		if err != nil {
393			return nil, err
394		}
395	}
396
397	if d.metaData.IndexFormatVersion >= 17 {
398		blob, err := d.readSectionBlob(toc.repos)
399		if err != nil {
400			return nil, err
401		}
402		d.repos = fromSizedDeltas16(blob, nil)
403	} else {
404		// every document is for repo index 0 (default value of uint16)
405		d.repos = make([]uint16, len(d.fileBranchMasks))
406	}
407
408	if err := d.calculateStats(); err != nil {
409		return nil, err
410	}
411
412	return &d, nil
413}
414
415func (r *reader) readMetadata(toc *indexTOC) ([]*Repository, *IndexMetadata, error) {
416	var md IndexMetadata
417	if err := r.readJSON(&md, &toc.metaData); err != nil {
418		return nil, nil, err
419	}
420
421	// Sourcegraph specific: we support mutating metadata via an additional
422	// ".meta" file. This is to support tombstoning. An additional benefit is we
423	// can update metadata (such as Rank and Name) without re-indexing content.
424	blob, err := os.ReadFile(r.r.Name() + ".meta")
425	if err != nil && !os.IsNotExist(err) {
426		return nil, &md, fmt.Errorf("failed to read meta file: %w", err)
427	}
428
429	if len(blob) == 0 {
430		blob, err = r.r.Read(toc.repoMetaData.off, toc.repoMetaData.sz)
431		if err != nil {
432			return nil, &md, err
433		}
434	}
435
436	var repos []*Repository
437	if md.IndexFormatVersion >= 17 {
438		if err := json.Unmarshal(blob, &repos); err != nil {
439			return nil, &md, err
440		}
441	} else {
442		repos = make([]*Repository, 1)
443		if err := json.Unmarshal(blob, &repos[0]); err != nil {
444			return nil, &md, err
445		}
446	}
447
448	if md.ID == "" {
449		if len(repos) == 0 {
450			return nil, nil, fmt.Errorf("len(repos)=0. Cannot backfill ID")
451		}
452		md.ID = backfillID(repos[0].Name)
453	}
454
455	return repos, &md, nil
456}
457
458const ngramEncoding = 8
459
460func (d *indexData) readNgrams(toc *indexTOC) (combinedNgramOffset, error) {
461	textContent, err := d.readSectionBlob(toc.ngramText)
462	if err != nil {
463		return combinedNgramOffset{}, err
464	}
465	postingsIndex := toc.postings.relativeIndex()
466
467	for i := 0; i < len(postingsIndex); i++ {
468		postingsIndex[i] += toc.postings.data.off
469	}
470
471	ngrams := make([]ngram, 0, len(textContent)/ngramEncoding)
472	for i := 0; i < len(textContent); i += ngramEncoding {
473		ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding]))
474		ngrams = append(ngrams, ng)
475	}
476
477	return makeCombinedNgramOffset(ngrams, postingsIndex), nil
478}
479
480func (d *indexData) readBinarySearchNgrams(toc *indexTOC) (binarySearchNgram, error) {
481	ngramText, err := d.readSectionBlob(toc.ngramText)
482	if err != nil {
483		return binarySearchNgram{}, err
484	}
485
486	return binarySearchNgram{
487		ngramText:                 ngramText,
488		postingOffsets:            toc.postings.offsets,
489		postingDataSentinelOffset: toc.postings.data.off + toc.postings.data.sz,
490	}, nil
491}
492
493func (d *indexData) readFileNameNgrams(toc *indexTOC) (map[ngram][]byte, error) {
494	nameNgramText, err := d.readSectionBlob(toc.nameNgramText)
495	if err != nil {
496		return nil, err
497	}
498
499	fileNamePostingsData, err := d.readSectionBlob(toc.namePostings.data)
500	if err != nil {
501		return nil, err
502	}
503
504	fileNamePostingsIndex := toc.namePostings.relativeIndex()
505
506	fileNameNgrams := make(map[ngram][]byte, len(nameNgramText)/ngramEncoding)
507	for i := 0; i < len(nameNgramText); i += ngramEncoding {
508		j := i / ngramEncoding
509		off := fileNamePostingsIndex[j]
510		end := fileNamePostingsIndex[j+1]
511		ng := ngram(binary.BigEndian.Uint64(nameNgramText[i : i+ngramEncoding]))
512		fileNameNgrams[ng] = fileNamePostingsData[off:end]
513	}
514
515	return fileNameNgrams, nil
516}
517
518func (d *indexData) verify() error {
519	// This is not an exhaustive check: the postings can easily
520	// generate OOB acccesses, and are expensive to check, but this lets us rule out
521	// other sources of OOB access.
522	n := len(d.fileNameIndex)
523	if n == 0 {
524		return nil
525	}
526
527	n--
528	for what, got := range map[string]int{
529		"boundaries":        len(d.boundaries) - 1,
530		"branch masks":      len(d.fileBranchMasks),
531		"doc section index": len(d.docSectionsIndex) - 1,
532		"newlines index":    len(d.newlinesIndex) - 1,
533	} {
534		if got != n {
535			return fmt.Errorf("got %s %d, want %d", what, got, n)
536		}
537	}
538	return nil
539}
540
541func (d *indexData) readContents(i uint32) ([]byte, error) {
542	return d.readSectionBlob(simpleSection{
543		off: d.boundariesStart + d.boundaries[i],
544		sz:  d.boundaries[i+1] - d.boundaries[i],
545	})
546}
547
548func (d *indexData) readContentSlice(off uint32, sz uint32) ([]byte, error) {
549	// TODO(hanwen): cap result if it is at the end of the content
550	// section.
551	return d.readSectionBlob(simpleSection{
552		off: d.boundariesStart + off,
553		sz:  sz,
554	})
555}
556
557func (d *indexData) readNewlines(i uint32, buf []uint32) ([]uint32, uint32, error) {
558	sec := simpleSection{
559		off: d.newlinesStart + d.newlinesIndex[i],
560		sz:  d.newlinesIndex[i+1] - d.newlinesIndex[i],
561	}
562	blob, err := d.readSectionBlob(sec)
563	if err != nil {
564		return nil, 0, err
565	}
566
567	return fromSizedDeltas(blob, buf), sec.sz, nil
568}
569
570func (d *indexData) readDocSections(i uint32, buf []DocumentSection) ([]DocumentSection, uint32, error) {
571	sec := simpleSection{
572		off: d.docSectionsStart + d.docSectionsIndex[i],
573		sz:  d.docSectionsIndex[i+1] - d.docSectionsIndex[i],
574	}
575	blob, err := d.readSectionBlob(sec)
576	if err != nil {
577		return nil, 0, err
578	}
579
580	return unmarshalDocSections(blob, buf), sec.sz, nil
581}
582
583func (d *indexData) readRanks(toc *indexTOC) error {
584	blob, err := d.readSectionBlob(toc.ranks)
585	if err != nil {
586		return err
587	}
588
589	return decodeRanks(blob, &d.ranks)
590}
591
592// NewSearcher creates a Searcher for a single index file.  Search
593// results coming from this searcher are valid only for the lifetime
594// of the Searcher itself, ie. []byte members should be copied into
595// fresh buffers if the result is to survive closing the shard.
596func NewSearcher(r IndexFile) (Searcher, error) {
597	rd := &reader{r: r}
598
599	var toc indexTOC
600	if err := rd.readTOC(&toc); err != nil {
601		return nil, err
602	}
603	indexData, err := rd.readIndexData(&toc)
604	if err != nil {
605		return nil, err
606	}
607	indexData.file = r
608	return indexData, nil
609}
610
611// ReadMetadata returns the metadata of index shard without reading
612// the index data. The IndexFile is not closed.
613func ReadMetadata(inf IndexFile) ([]*Repository, *IndexMetadata, error) {
614	rd := &reader{r: inf}
615	var toc indexTOC
616	if err := rd.readTOC(&toc); err != nil {
617		return nil, nil, err
618	}
619
620	return rd.readMetadata(&toc)
621}
622
623// ReadMetadataPathAlive is like ReadMetadataPath except that it only returns
624// alive repositories.
625func ReadMetadataPathAlive(p string) ([]*Repository, *IndexMetadata, error) {
626	repos, id, err := ReadMetadataPath(p)
627	if err != nil {
628		return nil, nil, err
629	}
630	alive := repos[:0]
631	for _, repo := range repos {
632		if !repo.Tombstone {
633			alive = append(alive, repo)
634		}
635	}
636	return alive, id, nil
637}
638
639// ReadMetadataPath returns the metadata of index shard at p without reading
640// the index data. ReadMetadataPath is a helper for ReadMetadata which opens
641// the IndexFile at p.
642func ReadMetadataPath(p string) ([]*Repository, *IndexMetadata, error) {
643	f, err := os.Open(p)
644	if err != nil {
645		return nil, nil, err
646	}
647	defer f.Close()
648
649	iFile, err := NewIndexFile(f)
650	if err != nil {
651		return nil, nil, err
652	}
653	defer iFile.Close()
654
655	return ReadMetadata(iFile)
656}
657
658// IndexFilePaths returns all paths for the IndexFile at filepath p that
659// exist. Note: if no files exist this will return an empty slice and nil
660// error.
661//
662// This is p and the ".meta" file for p.
663func IndexFilePaths(p string) ([]string, error) {
664	paths := []string{p, p + ".meta"}
665	exist := paths[:0]
666	for _, p := range paths {
667		if _, err := os.Stat(p); err == nil {
668			exist = append(exist, p)
669		} else if !os.IsNotExist(err) {
670			return nil, err
671		}
672	}
673	return exist, nil
674}
675
676func loadIndexData(r IndexFile) (*indexData, error) {
677	rd := &reader{r: r}
678
679	var toc indexTOC
680	if err := rd.readTOC(&toc); err != nil {
681		return nil, err
682	}
683	return rd.readIndexData(&toc)
684}
685
686// PrintNgramStats outputs a list of the form
687//
688//	n_1 trigram_1
689//	n_2 trigram_2
690//	...
691//
692// where n_i is the length of the postings list of trigram_i stored in r.
693func PrintNgramStats(r IndexFile) error {
694	id, err := loadIndexData(r)
695	if err != nil {
696		return err
697	}
698
699	if id.ngrams == nil {
700		return fmt.Errorf("PrintNgramStats: ngrams=nil")
701	}
702	var rNgram [3]rune
703	for ngram, ss := range id.ngrams.DumpMap() {
704		rNgram = ngramToRunes(ngram)
705		fmt.Printf("%d\t%q\n", ss.sz, string(rNgram[:]))
706	}
707	return nil
708}
709
710var crc64Table = crc64.MakeTable(crc64.ECMA)
711
712// backfillID returns a 20 char long sortable ID. The ID only depends on s. It
713// should only be used to set the ID of simple v16 shards on read.
714func backfillID(s string) string {
715	var id xid.ID
716
717	// Our timestamps are based on Unix time. Shards without IDs are assigned IDs
718	// based on the 0 epoch.
719	binary.BigEndian.PutUint32(id[:], 0)
720	binary.BigEndian.PutUint64(id[4:], crc64.Checksum([]byte(s), crc64Table))
721	return id.String()
722}
Configure Feed

Configure Feed