fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "encoding/binary"
19 "encoding/json"
20 "fmt"
21 "hash/crc64"
22 "log"
23 "os"
24 "sort"
25
26 "github.com/rs/xid"
27)
28
29// IndexFile is a file suitable for concurrent read access. For performance
30// reasons, it allows a mmap'd implementation.
31type IndexFile interface {
32 Read(off uint32, sz uint32) ([]byte, error)
33 Size() (uint32, error)
34 Close()
35 Name() string
36}
37
38// reader is a stateful file
39type reader struct {
40 r IndexFile
41 off uint32
42}
43
44func (r *reader) seek(off uint32) {
45 r.off = off
46}
47
48func (r *reader) U32() (uint32, error) {
49 b, err := r.r.Read(r.off, 4)
50 r.off += 4
51 if err != nil {
52 return 0, err
53 }
54 return binary.BigEndian.Uint32(b), nil
55}
56
57func (r *reader) U64() (uint64, error) {
58 b, err := r.r.Read(r.off, 8)
59 r.off += 8
60 if err != nil {
61 return 0, err
62 }
63 return binary.BigEndian.Uint64(b), nil
64}
65
66func (r *reader) ReadByte() (byte, error) {
67 b, err := r.r.Read(r.off, 1)
68 r.off += 1
69 if err != nil {
70 return 0, err
71 }
72 return b[0], nil
73}
74
75func (r *reader) Varint() (uint64, error) {
76 v, err := binary.ReadUvarint(r)
77 if err != nil {
78 return 0, err
79 }
80 return v, nil
81}
82
83func (r *reader) Str() (string, error) {
84 slen, err := r.Varint()
85 if err != nil {
86 return "", err
87 }
88 b, err := r.r.Read(r.off, uint32(slen))
89 if err != nil {
90 return "", err
91 }
92 r.off += uint32(slen)
93 return string(b), nil
94}
95
96func (r *reader) readTOC(toc *indexTOC) error {
97 sz, err := r.r.Size()
98 if err != nil {
99 return err
100 }
101 r.off = sz - 8
102
103 var tocSection simpleSection
104 if err := tocSection.read(r); err != nil {
105 return err
106 }
107
108 r.seek(tocSection.off)
109
110 sectionCount, err := r.U32()
111 if err != nil {
112 return err
113 }
114
115 if sectionCount == 0 {
116 // tagged sections are indicated by a 0 sectionCount,
117 // and then a list of string-tagged type-indicated sections.
118 secs := toc.sectionsTagged()
119 for r.off < tocSection.off+tocSection.sz {
120 tag, err := r.Str()
121 if err != nil {
122 return err
123 }
124 kind, err := r.Varint()
125 if err != nil {
126 return err
127 }
128 sec := secs[tag]
129 if sec != nil && sec.kind() == sectionKind(kind) {
130 // happy path
131 if err := sec.read(r); err != nil {
132 return err
133 }
134 continue
135 }
136 // error case: skip over unknown section
137 if sec == nil {
138 log.Printf("file %s TOC has unknown section %q", r.r.Name(), tag)
139 } else {
140 return fmt.Errorf("file %s TOC section %q expects kind %d, got kind %d", r.r.Name(), tag,
141 kind, sec.kind())
142 }
143 if kind == 0 {
144 if err := (&simpleSection{}).read(r); err != nil {
145 return err
146 }
147 } else if kind == 1 {
148 if err := (&compoundSection{}).read(r); err != nil {
149 return err
150 }
151 }
152 }
153 } else {
154 // TODO: Remove this branch when ReaderMinFeatureVersion >= 10
155
156 secs := toc.sections()
157
158 if len(secs) != int(sectionCount) {
159 secs = toc.sectionsNext()
160 }
161
162 if len(secs) != int(sectionCount) {
163 return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs))
164 }
165
166 for _, s := range secs {
167 if err := s.read(r); err != nil {
168 return err
169 }
170 }
171 }
172 return nil
173}
174
175func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) {
176 return r.file.Read(sec.off, sec.sz)
177}
178
179func readSectionU32(f IndexFile, sec simpleSection) ([]uint32, error) {
180 if sec.sz%4 != 0 {
181 return nil, fmt.Errorf("barf: section size %% 4 != 0: sz %d ", sec.sz)
182 }
183 blob, err := f.Read(sec.off, sec.sz)
184 if err != nil {
185 return nil, err
186 }
187 arr := make([]uint32, 0, len(blob)/4)
188 for len(blob) > 0 {
189 arr = append(arr, binary.BigEndian.Uint32(blob))
190 blob = blob[4:]
191 }
192 return arr, nil
193}
194
195func readSectionU64(f IndexFile, sec simpleSection) ([]uint64, error) {
196 if sec.sz%8 != 0 {
197 return nil, fmt.Errorf("barf: section size %% 8 != 0: sz %d ", sec.sz)
198 }
199 blob, err := f.Read(sec.off, sec.sz)
200 if err != nil {
201 return nil, err
202 }
203 arr := make([]uint64, 0, len(blob)/8)
204 for len(blob) > 0 {
205 arr = append(arr, binary.BigEndian.Uint64(blob))
206 blob = blob[8:]
207 }
208 return arr, nil
209}
210
211func (r *reader) readJSON(data interface{}, sec *simpleSection) error {
212 blob, err := r.r.Read(sec.off, sec.sz)
213 if err != nil {
214 return err
215 }
216
217 return json.Unmarshal(blob, data)
218}
219
220// canReadVersion returns checks if zoekt can read in md. If it can't a
221// non-nil error is returned.
222func canReadVersion(md *IndexMetadata) bool {
223 // Backwards compatible with v16
224 return md.IndexFormatVersion == IndexFormatVersion || md.IndexFormatVersion == NextIndexFormatVersion
225}
226
227func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
228 d := indexData{
229 file: r.r,
230 fileNameNgrams: map[ngram][]byte{},
231 branchIDs: []map[string]uint{},
232 branchNames: []map[uint]string{},
233 }
234
235 repos, md, err := r.readMetadata(toc)
236 if md != nil && !canReadVersion(md) {
237 return nil, fmt.Errorf("file is v%d, want v%d", md.IndexFormatVersion, IndexFormatVersion)
238 } else if err != nil {
239 return nil, err
240 }
241
242 d.metaData = *md
243 d.repoMetaData = make([]Repository, 0, len(repos))
244 for _, r := range repos {
245 d.repoMetaData = append(d.repoMetaData, *r)
246 }
247
248 if d.metaData.IndexFeatureVersion < ReadMinFeatureVersion {
249 return nil, fmt.Errorf("file is feature version %d, want feature version >= %d", d.metaData.IndexFeatureVersion, ReadMinFeatureVersion)
250 }
251
252 if d.metaData.IndexMinReaderVersion > FeatureVersion {
253 return nil, fmt.Errorf("file needs read feature version >= %d, have read feature version %d", d.metaData.IndexMinReaderVersion, FeatureVersion)
254 }
255
256 d.boundariesStart = toc.fileContents.data.off
257 d.boundaries = toc.fileContents.relativeIndex()
258 d.newlinesStart = toc.newlines.data.off
259 d.newlinesIndex = toc.newlines.relativeIndex()
260 d.docSectionsStart = toc.fileSections.data.off
261 d.docSectionsIndex = toc.fileSections.relativeIndex()
262
263 d.symbols.symKindIndex = toc.symbolKindMap.relativeIndex()
264 d.fileEndSymbol, err = readSectionU32(d.file, toc.fileEndSymbol)
265 if err != nil {
266 return nil, err
267 }
268
269 // Call readSectionBlob on each section key, and store the result in
270 // the blob value.
271 for sect, blob := range map[simpleSection]*[]byte{
272 toc.symbolMap.index: &d.symbols.symIndex,
273 toc.symbolMap.data: &d.symbols.symContent,
274 toc.symbolKindMap.data: &d.symbols.symKindContent,
275 toc.symbolMetaData: &d.symbols.symMetaData,
276 } {
277 if *blob, err = d.readSectionBlob(sect); err != nil {
278 return nil, err
279 }
280 }
281
282 d.checksums, err = d.readSectionBlob(toc.contentChecksums)
283 if err != nil {
284 return nil, err
285 }
286
287 d.languages, err = d.readSectionBlob(toc.languages)
288 if err != nil {
289 return nil, err
290 }
291
292 if os.Getenv("ZOEKT_ENABLE_NGRAM_BS") != "" {
293 bsMap, err := d.readBinarySearchNgrams(toc)
294 if err != nil {
295 return nil, err
296 }
297 d.ngrams = bsMap
298 } else {
299 offsetMap, err := d.readNgrams(toc)
300 if err != nil {
301 return nil, err
302 }
303 d.ngrams = offsetMap
304 }
305
306 d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks)
307 if err != nil {
308 return nil, err
309 }
310
311 d.fileNameContent, err = d.readSectionBlob(toc.fileNames.data)
312 if err != nil {
313 return nil, err
314 }
315
316 d.fileNameIndex = toc.fileNames.relativeIndex()
317
318 d.fileNameNgrams, err = d.readFileNameNgrams(toc)
319 if err != nil {
320 return nil, err
321 }
322
323 for _, md := range d.repoMetaData {
324 repoBranchIDs := make(map[string]uint, len(md.Branches))
325 repoBranchNames := make(map[uint]string, len(md.Branches))
326 for j, br := range md.Branches {
327 id := uint(1) << uint(j)
328 repoBranchIDs[br.Name] = id
329 repoBranchNames[id] = br.Name
330 }
331 d.branchIDs = append(d.branchIDs, repoBranchIDs)
332 d.branchNames = append(d.branchNames, repoBranchNames)
333 d.rawConfigMasks = append(d.rawConfigMasks, encodeRawConfig(md.RawConfig))
334 }
335
336 blob, err := d.readSectionBlob(toc.runeDocSections)
337 if err != nil {
338 return nil, err
339 }
340
341 if os.Getenv("ZOEKT_ENABLE_LAZY_DOC_SECTIONS") != "" {
342 d.runeDocSectionsRaw = blob
343 } else {
344 d.runeDocSections = unmarshalDocSections(blob, nil)
345 }
346
347 var runeOffsets, fileNameRuneOffsets []uint32
348
349 for sect, dest := range map[simpleSection]*[]uint32{
350 toc.subRepos: &d.subRepos,
351 toc.runeOffsets: &runeOffsets,
352 toc.nameRuneOffsets: &fileNameRuneOffsets,
353 toc.nameEndRunes: &d.fileNameEndRunes,
354 toc.fileEndRunes: &d.fileEndRunes,
355 } {
356 if blob, err := d.readSectionBlob(sect); err != nil {
357 return nil, err
358 } else {
359 *dest = fromSizedDeltas(blob, nil)
360 }
361 }
362
363 d.runeOffsets = makeRuneOffsetMap(runeOffsets)
364 d.fileNameRuneOffsets = makeRuneOffsetMap(fileNameRuneOffsets)
365
366 d.subRepoPaths = make([][]string, 0, len(d.repoMetaData))
367 for i := 0; i < len(d.repoMetaData); i++ {
368 keys := make([]string, 0, len(d.repoMetaData[i].SubRepoMap)+1)
369 keys = append(keys, "")
370 for k := range d.repoMetaData[i].SubRepoMap {
371 if k != "" {
372 keys = append(keys, k)
373 }
374 }
375 sort.Strings(keys)
376 d.subRepoPaths = append(d.subRepoPaths, keys)
377 }
378
379 d.languageMap = map[uint16]string{}
380 for k, v := range d.metaData.LanguageMap {
381 d.languageMap[v] = k
382 }
383
384 if err := d.verify(); err != nil {
385 return nil, err
386 }
387
388 // roc.ranks.sz = 0 indicates that we are reading a shard without ranks, in
389 // which case we skip reading the section and leave d.ranks = nil
390 if toc.ranks.sz > 0 {
391 err = d.readRanks(toc)
392 if err != nil {
393 return nil, err
394 }
395 }
396
397 if d.metaData.IndexFormatVersion >= 17 {
398 blob, err := d.readSectionBlob(toc.repos)
399 if err != nil {
400 return nil, err
401 }
402 d.repos = fromSizedDeltas16(blob, nil)
403 } else {
404 // every document is for repo index 0 (default value of uint16)
405 d.repos = make([]uint16, len(d.fileBranchMasks))
406 }
407
408 if err := d.calculateStats(); err != nil {
409 return nil, err
410 }
411
412 return &d, nil
413}
414
415func (r *reader) readMetadata(toc *indexTOC) ([]*Repository, *IndexMetadata, error) {
416 var md IndexMetadata
417 if err := r.readJSON(&md, &toc.metaData); err != nil {
418 return nil, nil, err
419 }
420
421 // Sourcegraph specific: we support mutating metadata via an additional
422 // ".meta" file. This is to support tombstoning. An additional benefit is we
423 // can update metadata (such as Rank and Name) without re-indexing content.
424 blob, err := os.ReadFile(r.r.Name() + ".meta")
425 if err != nil && !os.IsNotExist(err) {
426 return nil, &md, fmt.Errorf("failed to read meta file: %w", err)
427 }
428
429 if len(blob) == 0 {
430 blob, err = r.r.Read(toc.repoMetaData.off, toc.repoMetaData.sz)
431 if err != nil {
432 return nil, &md, err
433 }
434 }
435
436 var repos []*Repository
437 if md.IndexFormatVersion >= 17 {
438 if err := json.Unmarshal(blob, &repos); err != nil {
439 return nil, &md, err
440 }
441 } else {
442 repos = make([]*Repository, 1)
443 if err := json.Unmarshal(blob, &repos[0]); err != nil {
444 return nil, &md, err
445 }
446 }
447
448 if md.ID == "" {
449 if len(repos) == 0 {
450 return nil, nil, fmt.Errorf("len(repos)=0. Cannot backfill ID")
451 }
452 md.ID = backfillID(repos[0].Name)
453 }
454
455 return repos, &md, nil
456}
457
458const ngramEncoding = 8
459
460func (d *indexData) readNgrams(toc *indexTOC) (combinedNgramOffset, error) {
461 textContent, err := d.readSectionBlob(toc.ngramText)
462 if err != nil {
463 return combinedNgramOffset{}, err
464 }
465 postingsIndex := toc.postings.relativeIndex()
466
467 for i := 0; i < len(postingsIndex); i++ {
468 postingsIndex[i] += toc.postings.data.off
469 }
470
471 ngrams := make([]ngram, 0, len(textContent)/ngramEncoding)
472 for i := 0; i < len(textContent); i += ngramEncoding {
473 ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding]))
474 ngrams = append(ngrams, ng)
475 }
476
477 return makeCombinedNgramOffset(ngrams, postingsIndex), nil
478}
479
480func (d *indexData) readBinarySearchNgrams(toc *indexTOC) (binarySearchNgram, error) {
481 ngramText, err := d.readSectionBlob(toc.ngramText)
482 if err != nil {
483 return binarySearchNgram{}, err
484 }
485
486 return binarySearchNgram{
487 ngramText: ngramText,
488 postingOffsets: toc.postings.offsets,
489 postingDataSentinelOffset: toc.postings.data.off + toc.postings.data.sz,
490 }, nil
491}
492
493func (d *indexData) readFileNameNgrams(toc *indexTOC) (map[ngram][]byte, error) {
494 nameNgramText, err := d.readSectionBlob(toc.nameNgramText)
495 if err != nil {
496 return nil, err
497 }
498
499 fileNamePostingsData, err := d.readSectionBlob(toc.namePostings.data)
500 if err != nil {
501 return nil, err
502 }
503
504 fileNamePostingsIndex := toc.namePostings.relativeIndex()
505
506 fileNameNgrams := make(map[ngram][]byte, len(nameNgramText)/ngramEncoding)
507 for i := 0; i < len(nameNgramText); i += ngramEncoding {
508 j := i / ngramEncoding
509 off := fileNamePostingsIndex[j]
510 end := fileNamePostingsIndex[j+1]
511 ng := ngram(binary.BigEndian.Uint64(nameNgramText[i : i+ngramEncoding]))
512 fileNameNgrams[ng] = fileNamePostingsData[off:end]
513 }
514
515 return fileNameNgrams, nil
516}
517
518func (d *indexData) verify() error {
519 // This is not an exhaustive check: the postings can easily
520 // generate OOB acccesses, and are expensive to check, but this lets us rule out
521 // other sources of OOB access.
522 n := len(d.fileNameIndex)
523 if n == 0 {
524 return nil
525 }
526
527 n--
528 for what, got := range map[string]int{
529 "boundaries": len(d.boundaries) - 1,
530 "branch masks": len(d.fileBranchMasks),
531 "doc section index": len(d.docSectionsIndex) - 1,
532 "newlines index": len(d.newlinesIndex) - 1,
533 } {
534 if got != n {
535 return fmt.Errorf("got %s %d, want %d", what, got, n)
536 }
537 }
538 return nil
539}
540
541func (d *indexData) readContents(i uint32) ([]byte, error) {
542 return d.readSectionBlob(simpleSection{
543 off: d.boundariesStart + d.boundaries[i],
544 sz: d.boundaries[i+1] - d.boundaries[i],
545 })
546}
547
548func (d *indexData) readContentSlice(off uint32, sz uint32) ([]byte, error) {
549 // TODO(hanwen): cap result if it is at the end of the content
550 // section.
551 return d.readSectionBlob(simpleSection{
552 off: d.boundariesStart + off,
553 sz: sz,
554 })
555}
556
557func (d *indexData) readNewlines(i uint32, buf []uint32) ([]uint32, uint32, error) {
558 sec := simpleSection{
559 off: d.newlinesStart + d.newlinesIndex[i],
560 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i],
561 }
562 blob, err := d.readSectionBlob(sec)
563 if err != nil {
564 return nil, 0, err
565 }
566
567 return fromSizedDeltas(blob, buf), sec.sz, nil
568}
569
570func (d *indexData) readDocSections(i uint32, buf []DocumentSection) ([]DocumentSection, uint32, error) {
571 sec := simpleSection{
572 off: d.docSectionsStart + d.docSectionsIndex[i],
573 sz: d.docSectionsIndex[i+1] - d.docSectionsIndex[i],
574 }
575 blob, err := d.readSectionBlob(sec)
576 if err != nil {
577 return nil, 0, err
578 }
579
580 return unmarshalDocSections(blob, buf), sec.sz, nil
581}
582
583func (d *indexData) readRanks(toc *indexTOC) error {
584 blob, err := d.readSectionBlob(toc.ranks)
585 if err != nil {
586 return err
587 }
588
589 return decodeRanks(blob, &d.ranks)
590}
591
592// NewSearcher creates a Searcher for a single index file. Search
593// results coming from this searcher are valid only for the lifetime
594// of the Searcher itself, ie. []byte members should be copied into
595// fresh buffers if the result is to survive closing the shard.
596func NewSearcher(r IndexFile) (Searcher, error) {
597 rd := &reader{r: r}
598
599 var toc indexTOC
600 if err := rd.readTOC(&toc); err != nil {
601 return nil, err
602 }
603 indexData, err := rd.readIndexData(&toc)
604 if err != nil {
605 return nil, err
606 }
607 indexData.file = r
608 return indexData, nil
609}
610
611// ReadMetadata returns the metadata of index shard without reading
612// the index data. The IndexFile is not closed.
613func ReadMetadata(inf IndexFile) ([]*Repository, *IndexMetadata, error) {
614 rd := &reader{r: inf}
615 var toc indexTOC
616 if err := rd.readTOC(&toc); err != nil {
617 return nil, nil, err
618 }
619
620 return rd.readMetadata(&toc)
621}
622
623// ReadMetadataPathAlive is like ReadMetadataPath except that it only returns
624// alive repositories.
625func ReadMetadataPathAlive(p string) ([]*Repository, *IndexMetadata, error) {
626 repos, id, err := ReadMetadataPath(p)
627 if err != nil {
628 return nil, nil, err
629 }
630 alive := repos[:0]
631 for _, repo := range repos {
632 if !repo.Tombstone {
633 alive = append(alive, repo)
634 }
635 }
636 return alive, id, nil
637}
638
639// ReadMetadataPath returns the metadata of index shard at p without reading
640// the index data. ReadMetadataPath is a helper for ReadMetadata which opens
641// the IndexFile at p.
642func ReadMetadataPath(p string) ([]*Repository, *IndexMetadata, error) {
643 f, err := os.Open(p)
644 if err != nil {
645 return nil, nil, err
646 }
647 defer f.Close()
648
649 iFile, err := NewIndexFile(f)
650 if err != nil {
651 return nil, nil, err
652 }
653 defer iFile.Close()
654
655 return ReadMetadata(iFile)
656}
657
658// IndexFilePaths returns all paths for the IndexFile at filepath p that
659// exist. Note: if no files exist this will return an empty slice and nil
660// error.
661//
662// This is p and the ".meta" file for p.
663func IndexFilePaths(p string) ([]string, error) {
664 paths := []string{p, p + ".meta"}
665 exist := paths[:0]
666 for _, p := range paths {
667 if _, err := os.Stat(p); err == nil {
668 exist = append(exist, p)
669 } else if !os.IsNotExist(err) {
670 return nil, err
671 }
672 }
673 return exist, nil
674}
675
676func loadIndexData(r IndexFile) (*indexData, error) {
677 rd := &reader{r: r}
678
679 var toc indexTOC
680 if err := rd.readTOC(&toc); err != nil {
681 return nil, err
682 }
683 return rd.readIndexData(&toc)
684}
685
686// PrintNgramStats outputs a list of the form
687//
688// n_1 trigram_1
689// n_2 trigram_2
690// ...
691//
692// where n_i is the length of the postings list of trigram_i stored in r.
693func PrintNgramStats(r IndexFile) error {
694 id, err := loadIndexData(r)
695 if err != nil {
696 return err
697 }
698
699 if id.ngrams == nil {
700 return fmt.Errorf("PrintNgramStats: ngrams=nil")
701 }
702 var rNgram [3]rune
703 for ngram, ss := range id.ngrams.DumpMap() {
704 rNgram = ngramToRunes(ngram)
705 fmt.Printf("%d\t%q\n", ss.sz, string(rNgram[:]))
706 }
707 return nil
708}
709
710var crc64Table = crc64.MakeTable(crc64.ECMA)
711
712// backfillID returns a 20 char long sortable ID. The ID only depends on s. It
713// should only be used to set the ID of simple v16 shards on read.
714func backfillID(s string) string {
715 var id xid.ID
716
717 // Our timestamps are based on Unix time. Shards without IDs are assigned IDs
718 // based on the 0 epoch.
719 binary.BigEndian.PutUint32(id[:], 0)
720 binary.BigEndian.PutUint64(id[4:], crc64.Checksum([]byte(s), crc64Table))
721 return id.String()
722}