fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "encoding/binary"
19 "encoding/json"
20 "fmt"
21 "hash/crc64"
22 "log"
23 "os"
24 "slices"
25 "sort"
26
27 "github.com/rs/xid"
28)
29
30// IndexFile is a file suitable for concurrent read access. For performance
31// reasons, it allows a mmap'd implementation.
32type IndexFile interface {
33 Read(off uint32, sz uint32) ([]byte, error)
34 Size() (uint32, error)
35 Close()
36 Name() string
37}
38
39// reader is a stateful file
40type reader struct {
41 r IndexFile
42 off uint32
43}
44
45func (r *reader) seek(off uint32) {
46 r.off = off
47}
48
49func (r *reader) U32() (uint32, error) {
50 b, err := r.r.Read(r.off, 4)
51 r.off += 4
52 if err != nil {
53 return 0, err
54 }
55 return binary.BigEndian.Uint32(b), nil
56}
57
58func (r *reader) U64() (uint64, error) {
59 b, err := r.r.Read(r.off, 8)
60 r.off += 8
61 if err != nil {
62 return 0, err
63 }
64 return binary.BigEndian.Uint64(b), nil
65}
66
67func (r *reader) ReadByte() (byte, error) {
68 b, err := r.r.Read(r.off, 1)
69 r.off += 1
70 if err != nil {
71 return 0, err
72 }
73 return b[0], nil
74}
75
76func (r *reader) Varint() (uint64, error) {
77 v, err := binary.ReadUvarint(r)
78 if err != nil {
79 return 0, err
80 }
81 return v, nil
82}
83
84func (r *reader) Str() (string, error) {
85 slen, err := r.Varint()
86 if err != nil {
87 return "", err
88 }
89 b, err := r.r.Read(r.off, uint32(slen))
90 if err != nil {
91 return "", err
92 }
93 r.off += uint32(slen)
94 return string(b), nil
95}
96
97func (r *reader) readTOC(toc *indexTOC) error {
98 return r.readTOCSections(toc, []string{})
99}
100
101// readTOCSections reads the table of contents of the index file.
102//
103// If the tags parameter is non-empty, it reads only those tagged sections for efficiency
104// and does not populate the other sections.
105func (r *reader) readTOCSections(toc *indexTOC, tags []string) error {
106 tocSection, sectionCount, err := r.readHeader()
107 if err != nil {
108 return err
109 }
110
111 if sectionCount == 0 {
112 // tagged sections are indicated by a 0 sectionCount,
113 // and then a list of string-tagged type-indicated sections.
114 secs := toc.sectionsTagged()
115 for r.off < tocSection.off+tocSection.sz {
116 tag, err := r.Str()
117 if err != nil {
118 return err
119 }
120 kind, err := r.Varint()
121 if err != nil {
122 return err
123 }
124
125 skipSection := len(tags) > 0 && !slices.Contains(tags, tag)
126 sec := secs[tag]
127 if sec == nil || sec.kind() != sectionKind(kind) {
128 // If we don't recognize the section, we may be reading a newer index than the current version. Use
129 // a "dummy section" struct to skip over it.
130 skipSection = true
131 log.Printf("encountered malformed index section (%s), skipping over it", tag)
132
133 switch sectionKind(kind) {
134 case sectionKindSimple:
135 sec = &simpleSection{}
136 case sectionKindCompound:
137 sec = &lazyCompoundSection{}
138 case sectionKindCompoundLazy:
139 sec = &lazyCompoundSection{}
140 default:
141 return fmt.Errorf("unknown section kind %d", kind)
142 }
143 }
144
145 if skipSection {
146 if err := sec.skip(r); err != nil {
147 return err
148 }
149 } else {
150 if err := sec.read(r); err != nil {
151 return err
152 }
153 }
154 }
155 } else {
156 // TODO: Remove this branch when ReaderMinFeatureVersion >= 10
157
158 secs := toc.sections()
159
160 if len(secs) != int(sectionCount) {
161 secs = toc.sectionsNext()
162 }
163
164 if len(secs) != int(sectionCount) {
165 return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs))
166 }
167
168 for _, s := range secs {
169 if err := s.read(r); err != nil {
170 return err
171 }
172 }
173 }
174 return nil
175}
176
177func (r *reader) readHeader() (simpleSection, uint32, error) {
178 sz, err := r.r.Size()
179 if err != nil {
180 return simpleSection{}, 0, err
181 }
182 r.off = sz - 8
183
184 var tocSection simpleSection
185 if err := tocSection.read(r); err != nil {
186 return simpleSection{}, 0, err
187 }
188
189 r.seek(tocSection.off)
190
191 sectionCount, err := r.U32()
192 if err != nil {
193 return simpleSection{}, 0, err
194 }
195 return tocSection, sectionCount, nil
196}
197
198func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) {
199 return r.file.Read(sec.off, sec.sz)
200}
201
202func readSectionU32(f IndexFile, sec simpleSection) ([]uint32, error) {
203 if sec.sz%4 != 0 {
204 return nil, fmt.Errorf("barf: section size %% 4 != 0: sz %d ", sec.sz)
205 }
206 blob, err := f.Read(sec.off, sec.sz)
207 if err != nil {
208 return nil, err
209 }
210 arr := make([]uint32, 0, len(blob)/4)
211 for len(blob) > 0 {
212 arr = append(arr, binary.BigEndian.Uint32(blob))
213 blob = blob[4:]
214 }
215 return arr, nil
216}
217
218func readSectionU64(f IndexFile, sec simpleSection) ([]uint64, error) {
219 if sec.sz%8 != 0 {
220 return nil, fmt.Errorf("barf: section size %% 8 != 0: sz %d ", sec.sz)
221 }
222 blob, err := f.Read(sec.off, sec.sz)
223 if err != nil {
224 return nil, err
225 }
226 arr := make([]uint64, 0, len(blob)/8)
227 for len(blob) > 0 {
228 arr = append(arr, binary.BigEndian.Uint64(blob))
229 blob = blob[8:]
230 }
231 return arr, nil
232}
233
234func (r *reader) readJSON(data interface{}, sec simpleSection) error {
235 blob, err := r.r.Read(sec.off, sec.sz)
236 if err != nil {
237 return err
238 }
239
240 return json.Unmarshal(blob, data)
241}
242
243// canReadVersion returns checks if zoekt can read in md. If it can't a
244// non-nil error is returned.
245func canReadVersion(md *IndexMetadata) bool {
246 // Backwards compatible with v16
247 return md.IndexFormatVersion == IndexFormatVersion || md.IndexFormatVersion == NextIndexFormatVersion
248}
249
250func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
251 d := indexData{
252 file: r.r,
253 branchIDs: []map[string]uint{},
254 branchNames: []map[uint]string{},
255 }
256
257 repos, md, err := r.parseMetadata(toc.metaData, toc.repoMetaData)
258 if md != nil && !canReadVersion(md) {
259 return nil, fmt.Errorf("file is v%d, want v%d", md.IndexFormatVersion, IndexFormatVersion)
260 } else if err != nil {
261 return nil, err
262 }
263
264 d.metaData = *md
265 d.repoMetaData = make([]Repository, 0, len(repos))
266 for _, r := range repos {
267 d.repoMetaData = append(d.repoMetaData, *r)
268 }
269
270 if d.metaData.IndexFeatureVersion < ReadMinFeatureVersion {
271 return nil, fmt.Errorf("file is feature version %d, want feature version >= %d", d.metaData.IndexFeatureVersion, ReadMinFeatureVersion)
272 }
273
274 if d.metaData.IndexMinReaderVersion > FeatureVersion {
275 return nil, fmt.Errorf("file needs read feature version >= %d, have read feature version %d", d.metaData.IndexMinReaderVersion, FeatureVersion)
276 }
277
278 d.boundariesStart = toc.fileContents.data.off
279 d.boundaries = toc.fileContents.relativeIndex()
280 d.newlinesStart = toc.newlines.data.off
281 d.newlinesIndex = toc.newlines.relativeIndex()
282 d.docSectionsStart = toc.fileSections.data.off
283 d.docSectionsIndex = toc.fileSections.relativeIndex()
284
285 d.symbols.symKindIndex = toc.symbolKindMap.relativeIndex()
286 d.fileEndSymbol, err = readSectionU32(d.file, toc.fileEndSymbol)
287 if err != nil {
288 return nil, err
289 }
290
291 // Call readSectionBlob on each section key, and store the result in
292 // the blob value.
293 for sect, blob := range map[simpleSection]*[]byte{
294 toc.symbolMap.index: &d.symbols.symIndex,
295 toc.symbolMap.data: &d.symbols.symContent,
296 toc.symbolKindMap.data: &d.symbols.symKindContent,
297 toc.symbolMetaData: &d.symbols.symMetaData,
298 } {
299 if *blob, err = d.readSectionBlob(sect); err != nil {
300 return nil, err
301 }
302 }
303
304 d.checksums, err = d.readSectionBlob(toc.contentChecksums)
305 if err != nil {
306 return nil, err
307 }
308
309 d.languages, err = d.readSectionBlob(toc.languages)
310 if err != nil {
311 return nil, err
312 }
313
314 d.contentNgrams, err = d.newBtreeIndex(toc.ngramText, toc.postings)
315 if err != nil {
316 return nil, err
317 }
318
319 d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks)
320 if err != nil {
321 return nil, err
322 }
323
324 d.fileNameContent, err = d.readSectionBlob(toc.fileNames.data)
325 if err != nil {
326 return nil, err
327 }
328
329 d.fileNameIndex = toc.fileNames.relativeIndex()
330
331 d.fileNameNgrams, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings)
332 if err != nil {
333 return nil, err
334 }
335
336 for _, md := range d.repoMetaData {
337 repoBranchIDs := make(map[string]uint, len(md.Branches))
338 repoBranchNames := make(map[uint]string, len(md.Branches))
339 for j, br := range md.Branches {
340 id := uint(1) << uint(j)
341 repoBranchIDs[br.Name] = id
342 repoBranchNames[id] = br.Name
343 }
344 d.branchIDs = append(d.branchIDs, repoBranchIDs)
345 d.branchNames = append(d.branchNames, repoBranchNames)
346 d.rawConfigMasks = append(d.rawConfigMasks, encodeRawConfig(md.RawConfig))
347 }
348
349 blob, err := d.readSectionBlob(toc.runeDocSections)
350 if err != nil {
351 return nil, err
352 }
353
354 d.runeDocSections = unmarshalDocSections(blob, nil)
355
356 var runeOffsets, fileNameRuneOffsets []uint32
357
358 for sect, dest := range map[simpleSection]*[]uint32{
359 toc.subRepos: &d.subRepos,
360 toc.runeOffsets: &runeOffsets,
361 toc.nameRuneOffsets: &fileNameRuneOffsets,
362 toc.nameEndRunes: &d.fileNameEndRunes,
363 toc.fileEndRunes: &d.fileEndRunes,
364 } {
365 if blob, err := d.readSectionBlob(sect); err != nil {
366 return nil, err
367 } else {
368 *dest = fromSizedDeltas(blob, nil)
369 }
370 }
371
372 d.runeOffsets = makeRuneOffsetMap(runeOffsets)
373 d.fileNameRuneOffsets = makeRuneOffsetMap(fileNameRuneOffsets)
374
375 d.subRepoPaths = make([][]string, 0, len(d.repoMetaData))
376 for i := 0; i < len(d.repoMetaData); i++ {
377 keys := make([]string, 0, len(d.repoMetaData[i].SubRepoMap)+1)
378 keys = append(keys, "")
379 for k := range d.repoMetaData[i].SubRepoMap {
380 if k != "" {
381 keys = append(keys, k)
382 }
383 }
384 sort.Strings(keys)
385 d.subRepoPaths = append(d.subRepoPaths, keys)
386 }
387
388 d.languageMap = map[uint16]string{}
389 for k, v := range d.metaData.LanguageMap {
390 d.languageMap[v] = k
391 }
392
393 if err := d.verify(); err != nil {
394 return nil, err
395 }
396
397 // roc.ranks.sz = 0 indicates that we are reading a shard without ranks, in
398 // which case we skip reading the section and leave d.ranks = nil
399 if toc.ranks.sz > 0 {
400 err = d.readRanks(toc)
401 if err != nil {
402 return nil, err
403 }
404 }
405
406 if d.metaData.IndexFormatVersion >= 17 {
407 blob, err := d.readSectionBlob(toc.repos)
408 if err != nil {
409 return nil, err
410 }
411 d.repos = fromSizedDeltas16(blob, nil)
412 } else {
413 // every document is for repo index 0 (default value of uint16)
414 d.repos = make([]uint16, len(d.fileBranchMasks))
415 }
416
417 if err := d.calculateStats(); err != nil {
418 return nil, err
419 }
420
421 return &d, nil
422}
423
424func (r *reader) parseMetadata(metaData simpleSection, repoMetaData simpleSection) ([]*Repository, *IndexMetadata, error) {
425 var md IndexMetadata
426 if err := r.readJSON(&md, metaData); err != nil {
427 return nil, nil, err
428 }
429
430 // Sourcegraph specific: we support mutating metadata via an additional
431 // ".meta" file. This is to support tombstoning. An additional benefit is we
432 // can update metadata (such as Rank and Name) without re-indexing content.
433 blob, err := os.ReadFile(r.r.Name() + ".meta")
434 if err != nil && !os.IsNotExist(err) {
435 return nil, &md, fmt.Errorf("failed to read meta file: %w", err)
436 }
437
438 if len(blob) == 0 {
439 blob, err = r.r.Read(repoMetaData.off, repoMetaData.sz)
440 if err != nil {
441 return nil, &md, err
442 }
443 }
444
445 var repos []*Repository
446 if md.IndexFormatVersion >= 17 {
447 if err := json.Unmarshal(blob, &repos); err != nil {
448 return nil, &md, err
449 }
450 } else {
451 repos = make([]*Repository, 1)
452 if err := json.Unmarshal(blob, &repos[0]); err != nil {
453 return nil, &md, err
454 }
455 }
456
457 if md.ID == "" {
458 if len(repos) == 0 {
459 return nil, nil, fmt.Errorf("len(repos)=0. Cannot backfill ID")
460 }
461 md.ID = backfillID(repos[0].Name)
462 }
463
464 return repos, &md, nil
465}
466
467const ngramEncoding = 8
468
469func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSection) (btreeIndex, error) {
470 bi := btreeIndex{file: d.file}
471
472 textContent, err := d.readSectionBlob(ngramSec)
473 if err != nil {
474 return btreeIndex{}, err
475 }
476
477 // For 500k trigams we can expect approx 1000 leaf nodes (500k divided by
478 // half the bucketSize) and 20 nodes on level 2 (all but the rightmost
479 // inner nodes will have exactly v=50 children) plus a root node.
480 bt := newBtree(btreeOpts{bucketSize: btreeBucketSize, v: 50})
481 for i := 0; i < len(textContent); i += ngramEncoding {
482 ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding]))
483 bt.insert(ng)
484 }
485 bt.freeze()
486
487 bi.bt = bt
488
489 // hold on to simple sections (8 bytes each)
490 bi.ngramSec = ngramSec
491 bi.postingIndex = postings.index
492
493 return bi, nil
494}
495
496func (d *indexData) verify() error {
497 // This is not an exhaustive check: the postings can easily
498 // generate OOB acccesses, and are expensive to check, but this lets us rule out
499 // other sources of OOB access.
500 n := len(d.fileNameIndex)
501 if n == 0 {
502 return nil
503 }
504
505 n--
506 for what, got := range map[string]int{
507 "boundaries": len(d.boundaries) - 1,
508 "branch masks": len(d.fileBranchMasks),
509 "doc section index": len(d.docSectionsIndex) - 1,
510 "newlines index": len(d.newlinesIndex) - 1,
511 } {
512 if got != n {
513 return fmt.Errorf("got %s %d, want %d", what, got, n)
514 }
515 }
516 return nil
517}
518
519func (d *indexData) readContents(i uint32) ([]byte, error) {
520 return d.readSectionBlob(simpleSection{
521 off: d.boundariesStart + d.boundaries[i],
522 sz: d.boundaries[i+1] - d.boundaries[i],
523 })
524}
525
526func (d *indexData) readContentSlice(off uint32, sz uint32) ([]byte, error) {
527 // TODO(hanwen): cap result if it is at the end of the content
528 // section.
529 return d.readSectionBlob(simpleSection{
530 off: d.boundariesStart + off,
531 sz: sz,
532 })
533}
534
535func (d *indexData) readNewlines(i uint32, buf []uint32) ([]uint32, uint32, error) {
536 sec := simpleSection{
537 off: d.newlinesStart + d.newlinesIndex[i],
538 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i],
539 }
540 blob, err := d.readSectionBlob(sec)
541 if err != nil {
542 return nil, 0, err
543 }
544
545 return fromSizedDeltas(blob, buf), sec.sz, nil
546}
547
548func (d *indexData) readDocSections(i uint32, buf []DocumentSection) ([]DocumentSection, uint32, error) {
549 sec := simpleSection{
550 off: d.docSectionsStart + d.docSectionsIndex[i],
551 sz: d.docSectionsIndex[i+1] - d.docSectionsIndex[i],
552 }
553 blob, err := d.readSectionBlob(sec)
554 if err != nil {
555 return nil, 0, err
556 }
557
558 ds := unmarshalDocSections(blob, buf)
559
560 // can be nil if buf is nil and there are no doc sections. However, we rely
561 // on it being non-nil to cache the read.
562 if ds == nil {
563 ds = make([]DocumentSection, 0)
564 }
565
566 return ds, sec.sz, nil
567}
568
569func (d *indexData) readRanks(toc *indexTOC) error {
570 blob, err := d.readSectionBlob(toc.ranks)
571 if err != nil {
572 return err
573 }
574
575 return decodeRanks(blob, &d.ranks)
576}
577
578// NewSearcher creates a Searcher for a single index file. Search
579// results coming from this searcher are valid only for the lifetime
580// of the Searcher itself, ie. []byte members should be copied into
581// fresh buffers if the result is to survive closing the shard.
582func NewSearcher(r IndexFile) (Searcher, error) {
583 rd := &reader{r: r}
584
585 var toc indexTOC
586 if err := rd.readTOC(&toc); err != nil {
587 return nil, err
588 }
589 indexData, err := rd.readIndexData(&toc)
590 if err != nil {
591 return nil, err
592 }
593 indexData.file = r
594 return indexData, nil
595}
596
597// ReadMetadata returns the metadata of index shard without reading
598// the index data. The IndexFile is not closed.
599func ReadMetadata(inf IndexFile) ([]*Repository, *IndexMetadata, error) {
600 rd := &reader{r: inf}
601 var toc indexTOC
602 err := rd.readTOCSections(&toc, []string{"metaData", "repoMetaData"})
603 if err != nil {
604 return nil, nil, err
605 }
606 return rd.parseMetadata(toc.metaData, toc.repoMetaData)
607}
608
609// ReadMetadataPathAlive is like ReadMetadataPath except that it only returns
610// alive repositories.
611func ReadMetadataPathAlive(p string) ([]*Repository, *IndexMetadata, error) {
612 repos, id, err := ReadMetadataPath(p)
613 if err != nil {
614 return nil, nil, err
615 }
616 alive := repos[:0]
617 for _, repo := range repos {
618 if !repo.Tombstone {
619 alive = append(alive, repo)
620 }
621 }
622 return alive, id, nil
623}
624
625// ReadMetadataPath returns the metadata of index shard at p without reading
626// the index data. ReadMetadataPath is a helper for ReadMetadata which opens
627// the IndexFile at p.
628func ReadMetadataPath(p string) ([]*Repository, *IndexMetadata, error) {
629 f, err := os.Open(p)
630 if err != nil {
631 return nil, nil, err
632 }
633 defer f.Close()
634
635 iFile, err := NewIndexFile(f)
636 if err != nil {
637 return nil, nil, err
638 }
639 defer iFile.Close()
640
641 return ReadMetadata(iFile)
642}
643
644// IndexFilePaths returns all paths for the IndexFile at filepath p that
645// exist. Note: if no files exist this will return an empty slice and nil
646// error.
647//
648// This is p and the ".meta" file for p.
649func IndexFilePaths(p string) ([]string, error) {
650 paths := []string{p, p + ".meta"}
651 exist := paths[:0]
652 for _, p := range paths {
653 if _, err := os.Stat(p); err == nil {
654 exist = append(exist, p)
655 } else if !os.IsNotExist(err) {
656 return nil, err
657 }
658 }
659 return exist, nil
660}
661
662func loadIndexData(r IndexFile) (*indexData, error) {
663 rd := &reader{r: r}
664
665 var toc indexTOC
666 if err := rd.readTOC(&toc); err != nil {
667 return nil, err
668 }
669 return rd.readIndexData(&toc)
670}
671
672// PrintNgramStats outputs a list of the form
673//
674// n_1 trigram_1
675// n_2 trigram_2
676// ...
677//
678// where n_i is the length of the postings list of trigram_i stored in r.
679func PrintNgramStats(r IndexFile) error {
680 id, err := loadIndexData(r)
681 if err != nil {
682 return err
683 }
684
685 var rNgram [3]rune
686 for ngram, ss := range id.contentNgrams.DumpMap() {
687 rNgram = ngramToRunes(ngram)
688 fmt.Printf("%d\t%q\n", ss.sz, string(rNgram[:]))
689 }
690 return nil
691}
692
693var crc64Table = crc64.MakeTable(crc64.ECMA)
694
695// backfillID returns a 20 char long sortable ID. The ID only depends on s. It
696// should only be used to set the ID of simple v16 shards on read.
697func backfillID(s string) string {
698 var id xid.ID
699
700 // Our timestamps are based on Unix time. Shards without IDs are assigned IDs
701 // based on the 0 epoch.
702 binary.BigEndian.PutUint32(id[:], 0)
703 binary.BigEndian.PutUint64(id[4:], crc64.Checksum([]byte(s), crc64Table))
704 return id.String()
705}