fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "encoding/binary"
19 "encoding/json"
20 "fmt"
21 "hash/crc64"
22 "log"
23 "os"
24 "sort"
25
26 "github.com/rs/xid"
27)
28
29// IndexFile is a file suitable for concurrent read access. For performance
30// reasons, it allows a mmap'd implementation.
31type IndexFile interface {
32 Read(off uint32, sz uint32) ([]byte, error)
33 Size() (uint32, error)
34 Close()
35 Name() string
36}
37
38// reader is a stateful file
39type reader struct {
40 r IndexFile
41 off uint32
42}
43
44func (r *reader) seek(off uint32) {
45 r.off = off
46}
47
48func (r *reader) U32() (uint32, error) {
49 b, err := r.r.Read(r.off, 4)
50 r.off += 4
51 if err != nil {
52 return 0, err
53 }
54 return binary.BigEndian.Uint32(b), nil
55}
56
57func (r *reader) U64() (uint64, error) {
58 b, err := r.r.Read(r.off, 8)
59 r.off += 8
60 if err != nil {
61 return 0, err
62 }
63 return binary.BigEndian.Uint64(b), nil
64}
65
66func (r *reader) ReadByte() (byte, error) {
67 b, err := r.r.Read(r.off, 1)
68 r.off += 1
69 if err != nil {
70 return 0, err
71 }
72 return b[0], nil
73}
74
75func (r *reader) Varint() (uint64, error) {
76 v, err := binary.ReadUvarint(r)
77 if err != nil {
78 return 0, err
79 }
80 return v, nil
81}
82
83func (r *reader) Str() (string, error) {
84 slen, err := r.Varint()
85 if err != nil {
86 return "", err
87 }
88 b, err := r.r.Read(r.off, uint32(slen))
89 if err != nil {
90 return "", err
91 }
92 r.off += uint32(slen)
93 return string(b), nil
94}
95
96func (r *reader) readTOC(toc *indexTOC) error {
97 sz, err := r.r.Size()
98 if err != nil {
99 return err
100 }
101 r.off = sz - 8
102
103 var tocSection simpleSection
104 if err := tocSection.read(r); err != nil {
105 return err
106 }
107
108 r.seek(tocSection.off)
109
110 sectionCount, err := r.U32()
111 if err != nil {
112 return err
113 }
114
115 if sectionCount == 0 {
116 // tagged sections are indicated by a 0 sectionCount,
117 // and then a list of string-tagged type-indicated sections.
118 secs := toc.sectionsTagged()
119 for r.off < tocSection.off+tocSection.sz {
120 tag, err := r.Str()
121 if err != nil {
122 return err
123 }
124 kind, err := r.Varint()
125 if err != nil {
126 return err
127 }
128 sec := secs[tag]
129 if sec != nil && sec.kind() == sectionKind(kind) {
130 // happy path
131 if err := sec.read(r); err != nil {
132 return err
133 }
134 continue
135 }
136 // error case: skip over unknown section
137 if sec == nil {
138 log.Printf("file %s TOC has unknown section %q", r.r.Name(), tag)
139 } else {
140 return fmt.Errorf("file %s TOC section %q expects kind %d, got kind %d", r.r.Name(), tag,
141 kind, sec.kind())
142 }
143 if kind == 0 {
144 if err := (&simpleSection{}).read(r); err != nil {
145 return err
146 }
147 } else if kind == 1 {
148 if err := (&compoundSection{}).read(r); err != nil {
149 return err
150 }
151 }
152 }
153 } else {
154 // TODO: Remove this branch when ReaderMinFeatureVersion >= 10
155
156 secs := toc.sections()
157
158 if len(secs) != int(sectionCount) {
159 secs = toc.sectionsNext()
160 }
161
162 if len(secs) != int(sectionCount) {
163 return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs))
164 }
165
166 for _, s := range secs {
167 if err := s.read(r); err != nil {
168 return err
169 }
170 }
171 }
172 return nil
173}
174
175func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) {
176 return r.file.Read(sec.off, sec.sz)
177}
178
179func readSectionU32(f IndexFile, sec simpleSection) ([]uint32, error) {
180 if sec.sz%4 != 0 {
181 return nil, fmt.Errorf("barf: section size %% 4 != 0: sz %d ", sec.sz)
182 }
183 blob, err := f.Read(sec.off, sec.sz)
184 if err != nil {
185 return nil, err
186 }
187 arr := make([]uint32, 0, len(blob)/4)
188 for len(blob) > 0 {
189 arr = append(arr, binary.BigEndian.Uint32(blob))
190 blob = blob[4:]
191 }
192 return arr, nil
193}
194
195func readSectionU64(f IndexFile, sec simpleSection) ([]uint64, error) {
196 if sec.sz%8 != 0 {
197 return nil, fmt.Errorf("barf: section size %% 8 != 0: sz %d ", sec.sz)
198 }
199 blob, err := f.Read(sec.off, sec.sz)
200 if err != nil {
201 return nil, err
202 }
203 arr := make([]uint64, 0, len(blob)/8)
204 for len(blob) > 0 {
205 arr = append(arr, binary.BigEndian.Uint64(blob))
206 blob = blob[8:]
207 }
208 return arr, nil
209}
210
211func (r *reader) readJSON(data interface{}, sec *simpleSection) error {
212 blob, err := r.r.Read(sec.off, sec.sz)
213 if err != nil {
214 return err
215 }
216
217 return json.Unmarshal(blob, data)
218}
219
220// canReadVersion returns checks if zoekt can read in md. If it can't a
221// non-nil error is returned.
222func canReadVersion(md *IndexMetadata) bool {
223 // Backwards compatible with v16
224 return md.IndexFormatVersion == IndexFormatVersion || md.IndexFormatVersion == NextIndexFormatVersion
225}
226
227func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
228 d := indexData{
229 file: r.r,
230 branchIDs: []map[string]uint{},
231 branchNames: []map[uint]string{},
232 }
233
234 repos, md, err := r.readMetadata(toc)
235 if md != nil && !canReadVersion(md) {
236 return nil, fmt.Errorf("file is v%d, want v%d", md.IndexFormatVersion, IndexFormatVersion)
237 } else if err != nil {
238 return nil, err
239 }
240
241 d.metaData = *md
242 d.repoMetaData = make([]Repository, 0, len(repos))
243 for _, r := range repos {
244 d.repoMetaData = append(d.repoMetaData, *r)
245 }
246
247 if d.metaData.IndexFeatureVersion < ReadMinFeatureVersion {
248 return nil, fmt.Errorf("file is feature version %d, want feature version >= %d", d.metaData.IndexFeatureVersion, ReadMinFeatureVersion)
249 }
250
251 if d.metaData.IndexMinReaderVersion > FeatureVersion {
252 return nil, fmt.Errorf("file needs read feature version >= %d, have read feature version %d", d.metaData.IndexMinReaderVersion, FeatureVersion)
253 }
254
255 d.boundariesStart = toc.fileContents.data.off
256 d.boundaries = toc.fileContents.relativeIndex()
257 d.newlinesStart = toc.newlines.data.off
258 d.newlinesIndex = toc.newlines.relativeIndex()
259 d.docSectionsStart = toc.fileSections.data.off
260 d.docSectionsIndex = toc.fileSections.relativeIndex()
261
262 d.symbols.symKindIndex = toc.symbolKindMap.relativeIndex()
263 d.fileEndSymbol, err = readSectionU32(d.file, toc.fileEndSymbol)
264 if err != nil {
265 return nil, err
266 }
267
268 // Call readSectionBlob on each section key, and store the result in
269 // the blob value.
270 for sect, blob := range map[simpleSection]*[]byte{
271 toc.symbolMap.index: &d.symbols.symIndex,
272 toc.symbolMap.data: &d.symbols.symContent,
273 toc.symbolKindMap.data: &d.symbols.symKindContent,
274 toc.symbolMetaData: &d.symbols.symMetaData,
275 } {
276 if *blob, err = d.readSectionBlob(sect); err != nil {
277 return nil, err
278 }
279 }
280
281 d.checksums, err = d.readSectionBlob(toc.contentChecksums)
282 if err != nil {
283 return nil, err
284 }
285
286 d.languages, err = d.readSectionBlob(toc.languages)
287 if err != nil {
288 return nil, err
289 }
290
291 if os.Getenv("ZOEKT_DISABLE_BTREE") != "" {
292 offsetMap, err := d.readNgrams(toc)
293 if err != nil {
294 return nil, err
295 }
296 d.ngrams = offsetMap
297 } else {
298 bt, err := d.newBtreeIndex(toc.ngramText, toc.postings)
299 if err != nil {
300 return nil, err
301 }
302 d.ngrams = bt
303 }
304
305 d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks)
306 if err != nil {
307 return nil, err
308 }
309
310 d.fileNameContent, err = d.readSectionBlob(toc.fileNames.data)
311 if err != nil {
312 return nil, err
313 }
314
315 d.fileNameIndex = toc.fileNames.relativeIndex()
316
317 if os.Getenv("ZOEKT_DISABLE_BTREE") != "" {
318 d.fileNameNgrams.m, err = d.readFileNameNgrams(toc)
319 } else {
320 d.fileNameNgrams.bt, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings)
321 }
322 if err != nil {
323 return nil, err
324 }
325
326 for _, md := range d.repoMetaData {
327 repoBranchIDs := make(map[string]uint, len(md.Branches))
328 repoBranchNames := make(map[uint]string, len(md.Branches))
329 for j, br := range md.Branches {
330 id := uint(1) << uint(j)
331 repoBranchIDs[br.Name] = id
332 repoBranchNames[id] = br.Name
333 }
334 d.branchIDs = append(d.branchIDs, repoBranchIDs)
335 d.branchNames = append(d.branchNames, repoBranchNames)
336 d.rawConfigMasks = append(d.rawConfigMasks, encodeRawConfig(md.RawConfig))
337 }
338
339 blob, err := d.readSectionBlob(toc.runeDocSections)
340 if err != nil {
341 return nil, err
342 }
343
344 if os.Getenv("ZOEKT_ENABLE_LAZY_DOC_SECTIONS") != "" {
345 d.runeDocSectionsRaw = blob
346 } else {
347 d.runeDocSections = unmarshalDocSections(blob, nil)
348 }
349
350 var runeOffsets, fileNameRuneOffsets []uint32
351
352 for sect, dest := range map[simpleSection]*[]uint32{
353 toc.subRepos: &d.subRepos,
354 toc.runeOffsets: &runeOffsets,
355 toc.nameRuneOffsets: &fileNameRuneOffsets,
356 toc.nameEndRunes: &d.fileNameEndRunes,
357 toc.fileEndRunes: &d.fileEndRunes,
358 } {
359 if blob, err := d.readSectionBlob(sect); err != nil {
360 return nil, err
361 } else {
362 *dest = fromSizedDeltas(blob, nil)
363 }
364 }
365
366 d.runeOffsets = makeRuneOffsetMap(runeOffsets)
367 d.fileNameRuneOffsets = makeRuneOffsetMap(fileNameRuneOffsets)
368
369 d.subRepoPaths = make([][]string, 0, len(d.repoMetaData))
370 for i := 0; i < len(d.repoMetaData); i++ {
371 keys := make([]string, 0, len(d.repoMetaData[i].SubRepoMap)+1)
372 keys = append(keys, "")
373 for k := range d.repoMetaData[i].SubRepoMap {
374 if k != "" {
375 keys = append(keys, k)
376 }
377 }
378 sort.Strings(keys)
379 d.subRepoPaths = append(d.subRepoPaths, keys)
380 }
381
382 d.languageMap = map[uint16]string{}
383 for k, v := range d.metaData.LanguageMap {
384 d.languageMap[v] = k
385 }
386
387 if err := d.verify(); err != nil {
388 return nil, err
389 }
390
391 // roc.ranks.sz = 0 indicates that we are reading a shard without ranks, in
392 // which case we skip reading the section and leave d.ranks = nil
393 if toc.ranks.sz > 0 {
394 err = d.readRanks(toc)
395 if err != nil {
396 return nil, err
397 }
398 }
399
400 if d.metaData.IndexFormatVersion >= 17 {
401 blob, err := d.readSectionBlob(toc.repos)
402 if err != nil {
403 return nil, err
404 }
405 d.repos = fromSizedDeltas16(blob, nil)
406 } else {
407 // every document is for repo index 0 (default value of uint16)
408 d.repos = make([]uint16, len(d.fileBranchMasks))
409 }
410
411 if err := d.calculateStats(); err != nil {
412 return nil, err
413 }
414
415 return &d, nil
416}
417
418func (r *reader) readMetadata(toc *indexTOC) ([]*Repository, *IndexMetadata, error) {
419 var md IndexMetadata
420 if err := r.readJSON(&md, &toc.metaData); err != nil {
421 return nil, nil, err
422 }
423
424 // Sourcegraph specific: we support mutating metadata via an additional
425 // ".meta" file. This is to support tombstoning. An additional benefit is we
426 // can update metadata (such as Rank and Name) without re-indexing content.
427 blob, err := os.ReadFile(r.r.Name() + ".meta")
428 if err != nil && !os.IsNotExist(err) {
429 return nil, &md, fmt.Errorf("failed to read meta file: %w", err)
430 }
431
432 if len(blob) == 0 {
433 blob, err = r.r.Read(toc.repoMetaData.off, toc.repoMetaData.sz)
434 if err != nil {
435 return nil, &md, err
436 }
437 }
438
439 var repos []*Repository
440 if md.IndexFormatVersion >= 17 {
441 if err := json.Unmarshal(blob, &repos); err != nil {
442 return nil, &md, err
443 }
444 } else {
445 repos = make([]*Repository, 1)
446 if err := json.Unmarshal(blob, &repos[0]); err != nil {
447 return nil, &md, err
448 }
449 }
450
451 if md.ID == "" {
452 if len(repos) == 0 {
453 return nil, nil, fmt.Errorf("len(repos)=0. Cannot backfill ID")
454 }
455 md.ID = backfillID(repos[0].Name)
456 }
457
458 return repos, &md, nil
459}
460
461const ngramEncoding = 8
462
463func (d *indexData) readNgrams(toc *indexTOC) (combinedNgramOffset, error) {
464 textContent, err := d.readSectionBlob(toc.ngramText)
465 if err != nil {
466 return combinedNgramOffset{}, err
467 }
468 postingsIndex := toc.postings.relativeIndex()
469
470 for i := 0; i < len(postingsIndex); i++ {
471 postingsIndex[i] += toc.postings.data.off
472 }
473
474 ngrams := make([]ngram, 0, len(textContent)/ngramEncoding)
475 for i := 0; i < len(textContent); i += ngramEncoding {
476 ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding]))
477 ngrams = append(ngrams, ng)
478 }
479
480 return makeCombinedNgramOffset(ngrams, postingsIndex), nil
481}
482
483func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSection) (btreeIndex, error) {
484 bi := btreeIndex{file: d.file}
485
486 textContent, err := d.readSectionBlob(ngramSec)
487 if err != nil {
488 return btreeIndex{}, err
489 }
490
491 // For 500k trigams we can expect approx 1000 leaf nodes (500k divided by
492 // half the bucketSize) and 20 nodes on level 2 (all but the rightmost
493 // inner nodes will have exactly v=50 children) plus a root node.
494 bt := newBtree(btreeOpts{bucketSize: btreeBucketSize, v: 50})
495 for i := 0; i < len(textContent); i += ngramEncoding {
496 ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding]))
497 bt.insert(ng)
498 }
499 bt.freeze()
500
501 bi.bt = bt
502
503 // hold on to simple sections (8 bytes each)
504 bi.ngramSec = ngramSec
505 bi.postingIndex = postings.index
506
507 return bi, nil
508}
509
510func (d *indexData) readFileNameNgrams(toc *indexTOC) (map[ngram][]byte, error) {
511 nameNgramText, err := d.readSectionBlob(toc.nameNgramText)
512 if err != nil {
513 return nil, err
514 }
515
516 fileNamePostingsData, err := d.readSectionBlob(toc.namePostings.data)
517 if err != nil {
518 return nil, err
519 }
520
521 fileNamePostingsIndex := toc.namePostings.relativeIndex()
522
523 fileNameNgrams := make(map[ngram][]byte, len(nameNgramText)/ngramEncoding)
524 for i := 0; i < len(nameNgramText); i += ngramEncoding {
525 j := i / ngramEncoding
526 off := fileNamePostingsIndex[j]
527 end := fileNamePostingsIndex[j+1]
528 ng := ngram(binary.BigEndian.Uint64(nameNgramText[i : i+ngramEncoding]))
529 fileNameNgrams[ng] = fileNamePostingsData[off:end]
530 }
531
532 return fileNameNgrams, nil
533}
534
535func (d *indexData) verify() error {
536 // This is not an exhaustive check: the postings can easily
537 // generate OOB acccesses, and are expensive to check, but this lets us rule out
538 // other sources of OOB access.
539 n := len(d.fileNameIndex)
540 if n == 0 {
541 return nil
542 }
543
544 n--
545 for what, got := range map[string]int{
546 "boundaries": len(d.boundaries) - 1,
547 "branch masks": len(d.fileBranchMasks),
548 "doc section index": len(d.docSectionsIndex) - 1,
549 "newlines index": len(d.newlinesIndex) - 1,
550 } {
551 if got != n {
552 return fmt.Errorf("got %s %d, want %d", what, got, n)
553 }
554 }
555 return nil
556}
557
558func (d *indexData) readContents(i uint32) ([]byte, error) {
559 return d.readSectionBlob(simpleSection{
560 off: d.boundariesStart + d.boundaries[i],
561 sz: d.boundaries[i+1] - d.boundaries[i],
562 })
563}
564
565func (d *indexData) readContentSlice(off uint32, sz uint32) ([]byte, error) {
566 // TODO(hanwen): cap result if it is at the end of the content
567 // section.
568 return d.readSectionBlob(simpleSection{
569 off: d.boundariesStart + off,
570 sz: sz,
571 })
572}
573
574func (d *indexData) readNewlines(i uint32, buf []uint32) ([]uint32, uint32, error) {
575 sec := simpleSection{
576 off: d.newlinesStart + d.newlinesIndex[i],
577 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i],
578 }
579 blob, err := d.readSectionBlob(sec)
580 if err != nil {
581 return nil, 0, err
582 }
583
584 return fromSizedDeltas(blob, buf), sec.sz, nil
585}
586
587func (d *indexData) readDocSections(i uint32, buf []DocumentSection) ([]DocumentSection, uint32, error) {
588 sec := simpleSection{
589 off: d.docSectionsStart + d.docSectionsIndex[i],
590 sz: d.docSectionsIndex[i+1] - d.docSectionsIndex[i],
591 }
592 blob, err := d.readSectionBlob(sec)
593 if err != nil {
594 return nil, 0, err
595 }
596
597 return unmarshalDocSections(blob, buf), sec.sz, nil
598}
599
600func (d *indexData) readRanks(toc *indexTOC) error {
601 blob, err := d.readSectionBlob(toc.ranks)
602 if err != nil {
603 return err
604 }
605
606 return decodeRanks(blob, &d.ranks)
607}
608
609// NewSearcher creates a Searcher for a single index file. Search
610// results coming from this searcher are valid only for the lifetime
611// of the Searcher itself, ie. []byte members should be copied into
612// fresh buffers if the result is to survive closing the shard.
613func NewSearcher(r IndexFile) (Searcher, error) {
614 rd := &reader{r: r}
615
616 var toc indexTOC
617 if err := rd.readTOC(&toc); err != nil {
618 return nil, err
619 }
620 indexData, err := rd.readIndexData(&toc)
621 if err != nil {
622 return nil, err
623 }
624 indexData.file = r
625 return indexData, nil
626}
627
628// ReadMetadata returns the metadata of index shard without reading
629// the index data. The IndexFile is not closed.
630func ReadMetadata(inf IndexFile) ([]*Repository, *IndexMetadata, error) {
631 rd := &reader{r: inf}
632 var toc indexTOC
633 if err := rd.readTOC(&toc); err != nil {
634 return nil, nil, err
635 }
636
637 return rd.readMetadata(&toc)
638}
639
640// ReadMetadataPathAlive is like ReadMetadataPath except that it only returns
641// alive repositories.
642func ReadMetadataPathAlive(p string) ([]*Repository, *IndexMetadata, error) {
643 repos, id, err := ReadMetadataPath(p)
644 if err != nil {
645 return nil, nil, err
646 }
647 alive := repos[:0]
648 for _, repo := range repos {
649 if !repo.Tombstone {
650 alive = append(alive, repo)
651 }
652 }
653 return alive, id, nil
654}
655
656// ReadMetadataPath returns the metadata of index shard at p without reading
657// the index data. ReadMetadataPath is a helper for ReadMetadata which opens
658// the IndexFile at p.
659func ReadMetadataPath(p string) ([]*Repository, *IndexMetadata, error) {
660 f, err := os.Open(p)
661 if err != nil {
662 return nil, nil, err
663 }
664 defer f.Close()
665
666 iFile, err := NewIndexFile(f)
667 if err != nil {
668 return nil, nil, err
669 }
670 defer iFile.Close()
671
672 return ReadMetadata(iFile)
673}
674
675// IndexFilePaths returns all paths for the IndexFile at filepath p that
676// exist. Note: if no files exist this will return an empty slice and nil
677// error.
678//
679// This is p and the ".meta" file for p.
680func IndexFilePaths(p string) ([]string, error) {
681 paths := []string{p, p + ".meta"}
682 exist := paths[:0]
683 for _, p := range paths {
684 if _, err := os.Stat(p); err == nil {
685 exist = append(exist, p)
686 } else if !os.IsNotExist(err) {
687 return nil, err
688 }
689 }
690 return exist, nil
691}
692
693func loadIndexData(r IndexFile) (*indexData, error) {
694 rd := &reader{r: r}
695
696 var toc indexTOC
697 if err := rd.readTOC(&toc); err != nil {
698 return nil, err
699 }
700 return rd.readIndexData(&toc)
701}
702
703// PrintNgramStats outputs a list of the form
704//
705// n_1 trigram_1
706// n_2 trigram_2
707// ...
708//
709// where n_i is the length of the postings list of trigram_i stored in r.
710func PrintNgramStats(r IndexFile) error {
711 id, err := loadIndexData(r)
712 if err != nil {
713 return err
714 }
715
716 if id.ngrams == nil {
717 return fmt.Errorf("PrintNgramStats: ngrams=nil")
718 }
719 var rNgram [3]rune
720 for ngram, ss := range id.ngrams.DumpMap() {
721 rNgram = ngramToRunes(ngram)
722 fmt.Printf("%d\t%q\n", ss.sz, string(rNgram[:]))
723 }
724 return nil
725}
726
727var crc64Table = crc64.MakeTable(crc64.ECMA)
728
729// backfillID returns a 20 char long sortable ID. The ID only depends on s. It
730// should only be used to set the ID of simple v16 shards on read.
731func backfillID(s string) string {
732 var id xid.ID
733
734 // Our timestamps are based on Unix time. Shards without IDs are assigned IDs
735 // based on the 0 epoch.
736 binary.BigEndian.PutUint32(id[:], 0)
737 binary.BigEndian.PutUint64(id[4:], crc64.Checksum([]byte(s), crc64Table))
738 return id.String()
739}