fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "encoding/binary"
19 "encoding/json"
20 "fmt"
21 "hash/crc64"
22 "log"
23 "os"
24 "slices"
25 "sort"
26
27 "github.com/rs/xid"
28 "github.com/sourcegraph/zoekt"
29)
30
31// IndexFile is a file suitable for concurrent read access. For performance
32// reasons, it allows a mmap'd implementation.
33type IndexFile interface {
34 Read(off uint32, sz uint32) ([]byte, error)
35 Size() (uint32, error)
36 Close()
37 Name() string
38}
39
40// reader is a stateful file
41type reader struct {
42 r IndexFile
43 off uint32
44}
45
46func (r *reader) seek(off uint32) {
47 r.off = off
48}
49
50func (r *reader) U32() (uint32, error) {
51 b, err := r.r.Read(r.off, 4)
52 r.off += 4
53 if err != nil {
54 return 0, err
55 }
56 return binary.BigEndian.Uint32(b), nil
57}
58
59func (r *reader) U64() (uint64, error) {
60 b, err := r.r.Read(r.off, 8)
61 r.off += 8
62 if err != nil {
63 return 0, err
64 }
65 return binary.BigEndian.Uint64(b), nil
66}
67
68func (r *reader) ReadByte() (byte, error) {
69 b, err := r.r.Read(r.off, 1)
70 r.off += 1
71 if err != nil {
72 return 0, err
73 }
74 return b[0], nil
75}
76
77func (r *reader) Varint() (uint64, error) {
78 v, err := binary.ReadUvarint(r)
79 if err != nil {
80 return 0, err
81 }
82 return v, nil
83}
84
85func (r *reader) Str() (string, error) {
86 slen, err := r.Varint()
87 if err != nil {
88 return "", err
89 }
90 b, err := r.r.Read(r.off, uint32(slen))
91 if err != nil {
92 return "", err
93 }
94 r.off += uint32(slen)
95 return string(b), nil
96}
97
98func (r *reader) readTOC(toc *indexTOC) error {
99 return r.readTOCSections(toc, []string{})
100}
101
102// readTOCSections reads the table of contents of the index file.
103//
104// If the tags parameter is non-empty, it reads only those tagged sections for efficiency
105// and does not populate the other sections.
106func (r *reader) readTOCSections(toc *indexTOC, tags []string) error {
107 tocSection, sectionCount, err := r.readHeader()
108 if err != nil {
109 return err
110 }
111
112 if sectionCount == 0 {
113 // tagged sections are indicated by a 0 sectionCount,
114 // and then a list of string-tagged type-indicated sections.
115 secs := toc.sectionsTagged()
116 for r.off < tocSection.off+tocSection.sz {
117 tag, err := r.Str()
118 if err != nil {
119 return err
120 }
121 kind, err := r.Varint()
122 if err != nil {
123 return err
124 }
125
126 skipSection := len(tags) > 0 && !slices.Contains(tags, tag)
127 sec := secs[tag]
128 if sec == nil || sec.kind() != sectionKind(kind) {
129 // If we don't recognize the section, we may be reading a newer index than the current version. Use
130 // a "dummy section" struct to skip over it.
131 skipSection = true
132 log.Printf("encountered unrecognized index section (%s), skipping over it", tag)
133
134 switch sectionKind(kind) {
135 case sectionKindSimple:
136 sec = &simpleSection{}
137 case sectionKindCompound:
138 sec = &compoundSection{}
139 case sectionKindCompoundLazy:
140 sec = &lazyCompoundSection{}
141 default:
142 return fmt.Errorf("unknown section kind %d", kind)
143 }
144 }
145
146 if skipSection {
147 if err := sec.skip(r); err != nil {
148 return err
149 }
150 } else {
151 if err := sec.read(r); err != nil {
152 return err
153 }
154 }
155 }
156 } else {
157 // TODO: Remove this branch when ReaderMinFeatureVersion >= 10
158
159 secs := toc.sections()
160
161 if len(secs) != int(sectionCount) {
162 secs = toc.sectionsNext()
163 }
164
165 if len(secs) != int(sectionCount) {
166 return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs))
167 }
168
169 for _, s := range secs {
170 if err := s.read(r); err != nil {
171 return err
172 }
173 }
174 }
175 return nil
176}
177
178func (r *reader) readHeader() (simpleSection, uint32, error) {
179 sz, err := r.r.Size()
180 if err != nil {
181 return simpleSection{}, 0, err
182 }
183 r.off = sz - 8
184
185 var tocSection simpleSection
186 if err := tocSection.read(r); err != nil {
187 return simpleSection{}, 0, err
188 }
189
190 r.seek(tocSection.off)
191
192 sectionCount, err := r.U32()
193 if err != nil {
194 return simpleSection{}, 0, err
195 }
196 return tocSection, sectionCount, nil
197}
198
199func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) {
200 return r.file.Read(sec.off, sec.sz)
201}
202
203func readSectionU32(f IndexFile, sec simpleSection) ([]uint32, error) {
204 if sec.sz%4 != 0 {
205 return nil, fmt.Errorf("barf: section size %% 4 != 0: sz %d ", sec.sz)
206 }
207 blob, err := f.Read(sec.off, sec.sz)
208 if err != nil {
209 return nil, err
210 }
211 arr := make([]uint32, 0, len(blob)/4)
212 for len(blob) > 0 {
213 arr = append(arr, binary.BigEndian.Uint32(blob))
214 blob = blob[4:]
215 }
216 return arr, nil
217}
218
219func readSectionU64(f IndexFile, sec simpleSection) ([]uint64, error) {
220 if sec.sz%8 != 0 {
221 return nil, fmt.Errorf("barf: section size %% 8 != 0: sz %d ", sec.sz)
222 }
223 blob, err := f.Read(sec.off, sec.sz)
224 if err != nil {
225 return nil, err
226 }
227 arr := make([]uint64, 0, len(blob)/8)
228 for len(blob) > 0 {
229 arr = append(arr, binary.BigEndian.Uint64(blob))
230 blob = blob[8:]
231 }
232 return arr, nil
233}
234
235func (r *reader) readJSON(data interface{}, sec simpleSection) error {
236 blob, err := r.r.Read(sec.off, sec.sz)
237 if err != nil {
238 return err
239 }
240
241 return json.Unmarshal(blob, data)
242}
243
244// canReadVersion returns checks if zoekt can read in md. If it can't a
245// non-nil error is returned.
246func canReadVersion(md *zoekt.IndexMetadata) bool {
247 // Backwards compatible with v16
248 return md.IndexFormatVersion == IndexFormatVersion || md.IndexFormatVersion == NextIndexFormatVersion
249}
250
251func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
252 d := indexData{
253 file: r.r,
254 branchIDs: []map[string]uint{},
255 branchNames: []map[uint]string{},
256 }
257
258 repos, md, err := r.parseMetadata(toc.metaData, toc.repoMetaData)
259 if md != nil && !canReadVersion(md) {
260 return nil, fmt.Errorf("file is v%d, want v%d", md.IndexFormatVersion, IndexFormatVersion)
261 } else if err != nil {
262 return nil, err
263 }
264
265 d.metaData = *md
266 d.repoMetaData = make([]zoekt.Repository, 0, len(repos))
267 for _, r := range repos {
268 d.repoMetaData = append(d.repoMetaData, *r)
269 }
270
271 if d.metaData.IndexFeatureVersion < ReadMinFeatureVersion {
272 return nil, fmt.Errorf("file is feature version %d, want feature version >= %d", d.metaData.IndexFeatureVersion, ReadMinFeatureVersion)
273 }
274
275 if d.metaData.IndexMinReaderVersion > FeatureVersion {
276 return nil, fmt.Errorf("file needs read feature version >= %d, have read feature version %d", d.metaData.IndexMinReaderVersion, FeatureVersion)
277 }
278
279 d.boundariesStart = toc.fileContents.data.off
280 d.boundaries = toc.fileContents.relativeIndex()
281 d.newlinesStart = toc.newlines.data.off
282 d.newlinesIndex = toc.newlines.relativeIndex()
283 d.docSectionsStart = toc.fileSections.data.off
284 d.docSectionsIndex = toc.fileSections.relativeIndex()
285
286 d.symbols.symKindIndex = toc.symbolKindMap.relativeIndex()
287 d.fileEndSymbol, err = readSectionU32(d.file, toc.fileEndSymbol)
288 if err != nil {
289 return nil, err
290 }
291
292 // Call readSectionBlob on each section key, and store the result in
293 // the blob value.
294 for sect, blob := range map[simpleSection]*[]byte{
295 toc.symbolMap.index: &d.symbols.symIndex,
296 toc.symbolMap.data: &d.symbols.symContent,
297 toc.symbolKindMap.data: &d.symbols.symKindContent,
298 toc.symbolMetaData: &d.symbols.symMetaData,
299 } {
300 if *blob, err = d.readSectionBlob(sect); err != nil {
301 return nil, err
302 }
303 }
304
305 d.checksums, err = d.readSectionBlob(toc.contentChecksums)
306 if err != nil {
307 return nil, err
308 }
309
310 d.languages, err = d.readSectionBlob(toc.languages)
311 if err != nil {
312 return nil, err
313 }
314
315 d.contentNgrams, err = d.newBtreeIndex(toc.ngramText, toc.postings)
316 if err != nil {
317 return nil, err
318 }
319
320 d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks)
321 if err != nil {
322 return nil, err
323 }
324
325 d.fileNameContent, err = d.readSectionBlob(toc.fileNames.data)
326 if err != nil {
327 return nil, err
328 }
329
330 d.fileNameIndex = toc.fileNames.relativeIndex()
331
332 d.fileNameNgrams, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings)
333 if err != nil {
334 return nil, err
335 }
336
337 for _, md := range d.repoMetaData {
338 repoBranchIDs := make(map[string]uint, len(md.Branches))
339 repoBranchNames := make(map[uint]string, len(md.Branches))
340 for j, br := range md.Branches {
341 id := uint(1) << uint(j)
342 repoBranchIDs[br.Name] = id
343 repoBranchNames[id] = br.Name
344 }
345 d.branchIDs = append(d.branchIDs, repoBranchIDs)
346 d.branchNames = append(d.branchNames, repoBranchNames)
347 d.rawConfigMasks = append(d.rawConfigMasks, encodeRawConfig(md.RawConfig))
348 }
349
350 blob, err := d.readSectionBlob(toc.runeDocSections)
351 if err != nil {
352 return nil, err
353 }
354
355 d.runeDocSections = unmarshalDocSections(blob, nil)
356
357 var runeOffsets, fileNameRuneOffsets []uint32
358
359 for sect, dest := range map[simpleSection]*[]uint32{
360 toc.subRepos: &d.subRepos,
361 toc.runeOffsets: &runeOffsets,
362 toc.nameRuneOffsets: &fileNameRuneOffsets,
363 toc.nameEndRunes: &d.fileNameEndRunes,
364 toc.fileEndRunes: &d.fileEndRunes,
365 } {
366 if blob, err := d.readSectionBlob(sect); err != nil {
367 return nil, err
368 } else {
369 *dest = fromSizedDeltas(blob, nil)
370 }
371 }
372
373 d.runeOffsets = makeRuneOffsetMap(runeOffsets)
374 d.fileNameRuneOffsets = makeRuneOffsetMap(fileNameRuneOffsets)
375
376 d.subRepoPaths = make([][]string, 0, len(d.repoMetaData))
377 for i := 0; i < len(d.repoMetaData); i++ {
378 keys := make([]string, 0, len(d.repoMetaData[i].SubRepoMap)+1)
379 keys = append(keys, "")
380 for k := range d.repoMetaData[i].SubRepoMap {
381 if k != "" {
382 keys = append(keys, k)
383 }
384 }
385 sort.Strings(keys)
386 d.subRepoPaths = append(d.subRepoPaths, keys)
387 }
388
389 d.languageMap = map[uint16]string{}
390 for k, v := range d.metaData.LanguageMap {
391 d.languageMap[v] = k
392 }
393
394 if err := d.verify(); err != nil {
395 return nil, err
396 }
397
398 if d.metaData.IndexFormatVersion >= 17 {
399 blob, err := d.readSectionBlob(toc.repos)
400 if err != nil {
401 return nil, err
402 }
403 d.repos = fromSizedDeltas16(blob, nil)
404 } else {
405 // every document is for repo index 0 (default value of uint16)
406 d.repos = make([]uint16, len(d.fileBranchMasks))
407 }
408
409 if err := d.calculateStats(); err != nil {
410 return nil, err
411 }
412
413 return &d, nil
414}
415
416func (r *reader) parseMetadata(metaData simpleSection, repoMetaData simpleSection) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) {
417 var md zoekt.IndexMetadata
418 if err := r.readJSON(&md, metaData); err != nil {
419 return nil, nil, err
420 }
421
422 // Sourcegraph specific: we support mutating metadata via an additional
423 // ".meta" file. This is to support tombstoning. An additional benefit is we
424 // can update metadata (such as Rank and Name) without re-indexing content.
425 blob, err := os.ReadFile(r.r.Name() + ".meta")
426 if err != nil && !os.IsNotExist(err) {
427 return nil, &md, fmt.Errorf("failed to read meta file: %w", err)
428 }
429
430 if len(blob) == 0 {
431 blob, err = r.r.Read(repoMetaData.off, repoMetaData.sz)
432 if err != nil {
433 return nil, &md, err
434 }
435 }
436
437 var repos []*zoekt.Repository
438 if md.IndexFormatVersion >= 17 {
439 if err := json.Unmarshal(blob, &repos); err != nil {
440 return nil, &md, err
441 }
442 } else {
443 repos = make([]*zoekt.Repository, 1)
444 if err := json.Unmarshal(blob, &repos[0]); err != nil {
445 return nil, &md, err
446 }
447 }
448
449 if md.ID == "" {
450 if len(repos) == 0 {
451 return nil, nil, fmt.Errorf("len(repos)=0. Cannot backfill ID")
452 }
453 md.ID = backfillID(repos[0].Name)
454 }
455
456 return repos, &md, nil
457}
458
459const ngramEncoding = 8
460
461func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSection) (btreeIndex, error) {
462 bi := btreeIndex{file: d.file}
463
464 textContent, err := d.readSectionBlob(ngramSec)
465 if err != nil {
466 return btreeIndex{}, err
467 }
468
469 // For 500k trigams we can expect approx 1000 leaf nodes (500k divided by
470 // half the bucketSize) and 20 nodes on level 2 (all but the rightmost
471 // inner nodes will have exactly v=50 children) plus a root node.
472 bt := newBtree(btreeOpts{bucketSize: btreeBucketSize, v: 50})
473 for i := 0; i < len(textContent); i += ngramEncoding {
474 ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding]))
475 bt.insert(ng)
476 }
477 bt.freeze()
478
479 bi.bt = bt
480
481 // hold on to simple sections (8 bytes each)
482 bi.ngramSec = ngramSec
483 bi.postingIndex = postings.index
484
485 return bi, nil
486}
487
488func (d *indexData) verify() error {
489 // This is not an exhaustive check: the postings can easily
490 // generate OOB acccesses, and are expensive to check, but this lets us rule out
491 // other sources of OOB access.
492 n := len(d.fileNameIndex)
493 if n == 0 {
494 return nil
495 }
496
497 n--
498 for what, got := range map[string]int{
499 "boundaries": len(d.boundaries) - 1,
500 "branch masks": len(d.fileBranchMasks),
501 "doc section index": len(d.docSectionsIndex) - 1,
502 "newlines index": len(d.newlinesIndex) - 1,
503 } {
504 if got != n {
505 return fmt.Errorf("got %s %d, want %d", what, got, n)
506 }
507 }
508 return nil
509}
510
511func (d *indexData) readContents(i uint32) ([]byte, error) {
512 return d.readSectionBlob(simpleSection{
513 off: d.boundariesStart + d.boundaries[i],
514 sz: d.boundaries[i+1] - d.boundaries[i],
515 })
516}
517
518func (d *indexData) readContentSlice(off uint32, sz uint32) ([]byte, error) {
519 // TODO(hanwen): cap result if it is at the end of the content
520 // section.
521 return d.readSectionBlob(simpleSection{
522 off: d.boundariesStart + off,
523 sz: sz,
524 })
525}
526
527func (d *indexData) readNewlines(i uint32, buf []uint32) ([]uint32, uint32, error) {
528 sec := simpleSection{
529 off: d.newlinesStart + d.newlinesIndex[i],
530 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i],
531 }
532 blob, err := d.readSectionBlob(sec)
533 if err != nil {
534 return nil, 0, err
535 }
536
537 nl := fromSizedDeltas(blob, buf)
538
539 // can be nil if buf is nil and there are no doc sections. However, we rely
540 // on it being non-nil to cache the read.
541 if nl == nil {
542 nl = make([]uint32, 0)
543 }
544 return nl, sec.sz, nil
545}
546
547func (d *indexData) readDocSections(i uint32, buf []DocumentSection) ([]DocumentSection, uint32, error) {
548 sec := simpleSection{
549 off: d.docSectionsStart + d.docSectionsIndex[i],
550 sz: d.docSectionsIndex[i+1] - d.docSectionsIndex[i],
551 }
552 blob, err := d.readSectionBlob(sec)
553 if err != nil {
554 return nil, 0, err
555 }
556
557 ds := unmarshalDocSections(blob, buf)
558
559 // can be nil if buf is nil and there are no doc sections. However, we rely
560 // on it being non-nil to cache the read.
561 if ds == nil {
562 ds = make([]DocumentSection, 0)
563 }
564
565 return ds, sec.sz, nil
566}
567
568// NewSearcher creates a Searcher for a single index file. Search
569// results coming from this searcher are valid only for the lifetime
570// of the Searcher itself, ie. []byte members should be copied into
571// fresh buffers if the result is to survive closing the shard.
572func NewSearcher(r IndexFile) (zoekt.Searcher, error) {
573 rd := &reader{r: r}
574
575 var toc indexTOC
576 if err := rd.readTOC(&toc); err != nil {
577 return nil, err
578 }
579 indexData, err := rd.readIndexData(&toc)
580 if err != nil {
581 return nil, err
582 }
583 indexData.file = r
584 return indexData, nil
585}
586
587// ReadMetadata returns the metadata of index shard without reading
588// the index data. The IndexFile is not closed.
589func ReadMetadata(inf IndexFile) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) {
590 rd := &reader{r: inf}
591 var toc indexTOC
592 err := rd.readTOCSections(&toc, []string{"metaData", "repoMetaData"})
593 if err != nil {
594 return nil, nil, err
595 }
596 return rd.parseMetadata(toc.metaData, toc.repoMetaData)
597}
598
599// ReadMetadataPathAlive is like ReadMetadataPath except that it only returns
600// alive repositories.
601func ReadMetadataPathAlive(p string) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) {
602 repos, id, err := ReadMetadataPath(p)
603 if err != nil {
604 return nil, nil, err
605 }
606 alive := repos[:0]
607 for _, repo := range repos {
608 if !repo.Tombstone {
609 alive = append(alive, repo)
610 }
611 }
612 return alive, id, nil
613}
614
615// ReadMetadataPath returns the metadata of index shard at p without reading
616// the index data. ReadMetadataPath is a helper for ReadMetadata which opens
617// the IndexFile at p.
618func ReadMetadataPath(p string) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) {
619 f, err := os.Open(p)
620 if err != nil {
621 return nil, nil, err
622 }
623 defer f.Close()
624
625 iFile, err := NewIndexFile(f)
626 if err != nil {
627 return nil, nil, err
628 }
629 defer iFile.Close()
630
631 return ReadMetadata(iFile)
632}
633
634// IndexFilePaths returns all paths for the IndexFile at filepath p that
635// exist. Note: if no files exist this will return an empty slice and nil
636// error.
637//
638// This is p and the ".meta" file for p.
639func IndexFilePaths(p string) ([]string, error) {
640 paths := []string{p, p + ".meta"}
641 exist := paths[:0]
642 for _, p := range paths {
643 if _, err := os.Stat(p); err == nil {
644 exist = append(exist, p)
645 } else if !os.IsNotExist(err) {
646 return nil, err
647 }
648 }
649 return exist, nil
650}
651
652func loadIndexData(r IndexFile) (*indexData, error) {
653 rd := &reader{r: r}
654
655 var toc indexTOC
656 if err := rd.readTOC(&toc); err != nil {
657 return nil, err
658 }
659 return rd.readIndexData(&toc)
660}
661
662// PrintNgramStats outputs a list of the form
663//
664// n_1 trigram_1
665// n_2 trigram_2
666// ...
667//
668// where n_i is the length of the postings list of trigram_i stored in r.
669func PrintNgramStats(r IndexFile) error {
670 id, err := loadIndexData(r)
671 if err != nil {
672 return err
673 }
674
675 var rNgram [3]rune
676 for ngram, ss := range id.contentNgrams.DumpMap() {
677 rNgram = ngramToRunes(ngram)
678 fmt.Printf("%d\t%q\n", ss.sz, string(rNgram[:]))
679 }
680 return nil
681}
682
683var crc64Table = crc64.MakeTable(crc64.ECMA)
684
685// backfillID returns a 20 char long sortable ID. The ID only depends on s. It
686// should only be used to set the ID of simple v16 shards on read.
687func backfillID(s string) string {
688 var id xid.ID
689
690 // Our timestamps are based on Unix time. Shards without IDs are assigned IDs
691 // based on the 0 epoch.
692 binary.BigEndian.PutUint32(id[:], 0)
693 binary.BigEndian.PutUint64(id[4:], crc64.Checksum([]byte(s), crc64Table))
694 return id.String()
695}