fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "encoding/binary" 19 "encoding/json" 20 "fmt" 21 "hash/crc64" 22 "log" 23 "os" 24 "sort" 25 26 "github.com/rs/xid" 27) 28 29// IndexFile is a file suitable for concurrent read access. For performance 30// reasons, it allows a mmap'd implementation. 31type IndexFile interface { 32 Read(off uint32, sz uint32) ([]byte, error) 33 Size() (uint32, error) 34 Close() 35 Name() string 36} 37 38// reader is a stateful file 39type reader struct { 40 r IndexFile 41 off uint32 42} 43 44func (r *reader) seek(off uint32) { 45 r.off = off 46} 47 48func (r *reader) U32() (uint32, error) { 49 b, err := r.r.Read(r.off, 4) 50 r.off += 4 51 if err != nil { 52 return 0, err 53 } 54 return binary.BigEndian.Uint32(b), nil 55} 56 57func (r *reader) U64() (uint64, error) { 58 b, err := r.r.Read(r.off, 8) 59 r.off += 8 60 if err != nil { 61 return 0, err 62 } 63 return binary.BigEndian.Uint64(b), nil 64} 65 66func (r *reader) ReadByte() (byte, error) { 67 b, err := r.r.Read(r.off, 1) 68 r.off += 1 69 if err != nil { 70 return 0, err 71 } 72 return b[0], nil 73} 74 75func (r *reader) Varint() (uint64, error) { 76 v, err := binary.ReadUvarint(r) 77 if err != nil { 78 return 0, err 79 } 80 return v, nil 81} 82 83func (r *reader) Str() (string, error) { 84 slen, err := r.Varint() 85 if err != nil { 86 return "", err 87 } 88 b, err := r.r.Read(r.off, uint32(slen)) 89 if err != nil { 90 return "", err 91 } 92 r.off += uint32(slen) 93 return string(b), nil 94} 95 96func (r *reader) readTOC(toc *indexTOC) error { 97 sz, err := r.r.Size() 98 if err != nil { 99 return err 100 } 101 r.off = sz - 8 102 103 var tocSection simpleSection 104 if err := tocSection.read(r); err != nil { 105 return err 106 } 107 108 r.seek(tocSection.off) 109 110 sectionCount, err := r.U32() 111 if err != nil { 112 return err 113 } 114 115 if sectionCount == 0 { 116 // tagged sections are indicated by a 0 sectionCount, 117 // and then a list of string-tagged type-indicated sections. 118 secs := toc.sectionsTagged() 119 for r.off < tocSection.off+tocSection.sz { 120 tag, err := r.Str() 121 if err != nil { 122 return err 123 } 124 kind, err := r.Varint() 125 if err != nil { 126 return err 127 } 128 sec := secs[tag] 129 if sec != nil && sec.kind() == sectionKind(kind) { 130 // happy path 131 if err := sec.read(r); err != nil { 132 return err 133 } 134 continue 135 } 136 // error case: skip over unknown section 137 if sec == nil { 138 log.Printf("file %s TOC has unknown section %q", r.r.Name(), tag) 139 } else { 140 return fmt.Errorf("file %s TOC section %q expects kind %d, got kind %d", r.r.Name(), tag, 141 kind, sec.kind()) 142 } 143 if kind == 0 { 144 if err := (&simpleSection{}).read(r); err != nil { 145 return err 146 } 147 } else if kind == 1 { 148 if err := (&compoundSection{}).read(r); err != nil { 149 return err 150 } 151 } 152 } 153 } else { 154 // TODO: Remove this branch when ReaderMinFeatureVersion >= 10 155 156 secs := toc.sections() 157 158 if len(secs) != int(sectionCount) { 159 secs = toc.sectionsNext() 160 } 161 162 if len(secs) != int(sectionCount) { 163 return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs)) 164 } 165 166 for _, s := range secs { 167 if err := s.read(r); err != nil { 168 return err 169 } 170 } 171 } 172 return nil 173} 174 175func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) { 176 return r.file.Read(sec.off, sec.sz) 177} 178 179func readSectionU32(f IndexFile, sec simpleSection) ([]uint32, error) { 180 if sec.sz%4 != 0 { 181 return nil, fmt.Errorf("barf: section size %% 4 != 0: sz %d ", sec.sz) 182 } 183 blob, err := f.Read(sec.off, sec.sz) 184 if err != nil { 185 return nil, err 186 } 187 arr := make([]uint32, 0, len(blob)/4) 188 for len(blob) > 0 { 189 arr = append(arr, binary.BigEndian.Uint32(blob)) 190 blob = blob[4:] 191 } 192 return arr, nil 193} 194 195func readSectionU64(f IndexFile, sec simpleSection) ([]uint64, error) { 196 if sec.sz%8 != 0 { 197 return nil, fmt.Errorf("barf: section size %% 8 != 0: sz %d ", sec.sz) 198 } 199 blob, err := f.Read(sec.off, sec.sz) 200 if err != nil { 201 return nil, err 202 } 203 arr := make([]uint64, 0, len(blob)/8) 204 for len(blob) > 0 { 205 arr = append(arr, binary.BigEndian.Uint64(blob)) 206 blob = blob[8:] 207 } 208 return arr, nil 209} 210 211func (r *reader) readJSON(data interface{}, sec *simpleSection) error { 212 blob, err := r.r.Read(sec.off, sec.sz) 213 if err != nil { 214 return err 215 } 216 217 return json.Unmarshal(blob, data) 218} 219 220// canReadVersion returns checks if zoekt can read in md. If it can't a 221// non-nil error is returned. 222func canReadVersion(md *IndexMetadata) bool { 223 // Backwards compatible with v16 224 return md.IndexFormatVersion == IndexFormatVersion || md.IndexFormatVersion == NextIndexFormatVersion 225} 226 227func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) { 228 d := indexData{ 229 file: r.r, 230 branchIDs: []map[string]uint{}, 231 branchNames: []map[uint]string{}, 232 } 233 234 repos, md, err := r.readMetadata(toc) 235 if md != nil && !canReadVersion(md) { 236 return nil, fmt.Errorf("file is v%d, want v%d", md.IndexFormatVersion, IndexFormatVersion) 237 } else if err != nil { 238 return nil, err 239 } 240 241 d.metaData = *md 242 d.repoMetaData = make([]Repository, 0, len(repos)) 243 for _, r := range repos { 244 d.repoMetaData = append(d.repoMetaData, *r) 245 } 246 247 if d.metaData.IndexFeatureVersion < ReadMinFeatureVersion { 248 return nil, fmt.Errorf("file is feature version %d, want feature version >= %d", d.metaData.IndexFeatureVersion, ReadMinFeatureVersion) 249 } 250 251 if d.metaData.IndexMinReaderVersion > FeatureVersion { 252 return nil, fmt.Errorf("file needs read feature version >= %d, have read feature version %d", d.metaData.IndexMinReaderVersion, FeatureVersion) 253 } 254 255 d.boundariesStart = toc.fileContents.data.off 256 d.boundaries = toc.fileContents.relativeIndex() 257 d.newlinesStart = toc.newlines.data.off 258 d.newlinesIndex = toc.newlines.relativeIndex() 259 d.docSectionsStart = toc.fileSections.data.off 260 d.docSectionsIndex = toc.fileSections.relativeIndex() 261 262 d.symbols.symKindIndex = toc.symbolKindMap.relativeIndex() 263 d.fileEndSymbol, err = readSectionU32(d.file, toc.fileEndSymbol) 264 if err != nil { 265 return nil, err 266 } 267 268 // Call readSectionBlob on each section key, and store the result in 269 // the blob value. 270 for sect, blob := range map[simpleSection]*[]byte{ 271 toc.symbolMap.index: &d.symbols.symIndex, 272 toc.symbolMap.data: &d.symbols.symContent, 273 toc.symbolKindMap.data: &d.symbols.symKindContent, 274 toc.symbolMetaData: &d.symbols.symMetaData, 275 } { 276 if *blob, err = d.readSectionBlob(sect); err != nil { 277 return nil, err 278 } 279 } 280 281 d.checksums, err = d.readSectionBlob(toc.contentChecksums) 282 if err != nil { 283 return nil, err 284 } 285 286 d.languages, err = d.readSectionBlob(toc.languages) 287 if err != nil { 288 return nil, err 289 } 290 291 if os.Getenv("ZOEKT_DISABLE_BTREE") != "" { 292 offsetMap, err := d.readNgrams(toc) 293 if err != nil { 294 return nil, err 295 } 296 d.ngrams = offsetMap 297 } else { 298 bt, err := d.newBtreeIndex(toc.ngramText, toc.postings) 299 if err != nil { 300 return nil, err 301 } 302 d.ngrams = bt 303 } 304 305 d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks) 306 if err != nil { 307 return nil, err 308 } 309 310 d.fileNameContent, err = d.readSectionBlob(toc.fileNames.data) 311 if err != nil { 312 return nil, err 313 } 314 315 d.fileNameIndex = toc.fileNames.relativeIndex() 316 317 if os.Getenv("ZOEKT_DISABLE_BTREE") != "" { 318 d.fileNameNgrams.m, err = d.readFileNameNgrams(toc) 319 } else { 320 d.fileNameNgrams.bt, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings) 321 } 322 if err != nil { 323 return nil, err 324 } 325 326 for _, md := range d.repoMetaData { 327 repoBranchIDs := make(map[string]uint, len(md.Branches)) 328 repoBranchNames := make(map[uint]string, len(md.Branches)) 329 for j, br := range md.Branches { 330 id := uint(1) << uint(j) 331 repoBranchIDs[br.Name] = id 332 repoBranchNames[id] = br.Name 333 } 334 d.branchIDs = append(d.branchIDs, repoBranchIDs) 335 d.branchNames = append(d.branchNames, repoBranchNames) 336 d.rawConfigMasks = append(d.rawConfigMasks, encodeRawConfig(md.RawConfig)) 337 } 338 339 blob, err := d.readSectionBlob(toc.runeDocSections) 340 if err != nil { 341 return nil, err 342 } 343 344 if os.Getenv("ZOEKT_ENABLE_LAZY_DOC_SECTIONS") != "" { 345 d.runeDocSectionsRaw = blob 346 } else { 347 d.runeDocSections = unmarshalDocSections(blob, nil) 348 } 349 350 var runeOffsets, fileNameRuneOffsets []uint32 351 352 for sect, dest := range map[simpleSection]*[]uint32{ 353 toc.subRepos: &d.subRepos, 354 toc.runeOffsets: &runeOffsets, 355 toc.nameRuneOffsets: &fileNameRuneOffsets, 356 toc.nameEndRunes: &d.fileNameEndRunes, 357 toc.fileEndRunes: &d.fileEndRunes, 358 } { 359 if blob, err := d.readSectionBlob(sect); err != nil { 360 return nil, err 361 } else { 362 *dest = fromSizedDeltas(blob, nil) 363 } 364 } 365 366 d.runeOffsets = makeRuneOffsetMap(runeOffsets) 367 d.fileNameRuneOffsets = makeRuneOffsetMap(fileNameRuneOffsets) 368 369 d.subRepoPaths = make([][]string, 0, len(d.repoMetaData)) 370 for i := 0; i < len(d.repoMetaData); i++ { 371 keys := make([]string, 0, len(d.repoMetaData[i].SubRepoMap)+1) 372 keys = append(keys, "") 373 for k := range d.repoMetaData[i].SubRepoMap { 374 if k != "" { 375 keys = append(keys, k) 376 } 377 } 378 sort.Strings(keys) 379 d.subRepoPaths = append(d.subRepoPaths, keys) 380 } 381 382 d.languageMap = map[uint16]string{} 383 for k, v := range d.metaData.LanguageMap { 384 d.languageMap[v] = k 385 } 386 387 if err := d.verify(); err != nil { 388 return nil, err 389 } 390 391 // roc.ranks.sz = 0 indicates that we are reading a shard without ranks, in 392 // which case we skip reading the section and leave d.ranks = nil 393 if toc.ranks.sz > 0 { 394 err = d.readRanks(toc) 395 if err != nil { 396 return nil, err 397 } 398 } 399 400 if d.metaData.IndexFormatVersion >= 17 { 401 blob, err := d.readSectionBlob(toc.repos) 402 if err != nil { 403 return nil, err 404 } 405 d.repos = fromSizedDeltas16(blob, nil) 406 } else { 407 // every document is for repo index 0 (default value of uint16) 408 d.repos = make([]uint16, len(d.fileBranchMasks)) 409 } 410 411 if err := d.calculateStats(); err != nil { 412 return nil, err 413 } 414 415 return &d, nil 416} 417 418func (r *reader) readMetadata(toc *indexTOC) ([]*Repository, *IndexMetadata, error) { 419 var md IndexMetadata 420 if err := r.readJSON(&md, &toc.metaData); err != nil { 421 return nil, nil, err 422 } 423 424 // Sourcegraph specific: we support mutating metadata via an additional 425 // ".meta" file. This is to support tombstoning. An additional benefit is we 426 // can update metadata (such as Rank and Name) without re-indexing content. 427 blob, err := os.ReadFile(r.r.Name() + ".meta") 428 if err != nil && !os.IsNotExist(err) { 429 return nil, &md, fmt.Errorf("failed to read meta file: %w", err) 430 } 431 432 if len(blob) == 0 { 433 blob, err = r.r.Read(toc.repoMetaData.off, toc.repoMetaData.sz) 434 if err != nil { 435 return nil, &md, err 436 } 437 } 438 439 var repos []*Repository 440 if md.IndexFormatVersion >= 17 { 441 if err := json.Unmarshal(blob, &repos); err != nil { 442 return nil, &md, err 443 } 444 } else { 445 repos = make([]*Repository, 1) 446 if err := json.Unmarshal(blob, &repos[0]); err != nil { 447 return nil, &md, err 448 } 449 } 450 451 if md.ID == "" { 452 if len(repos) == 0 { 453 return nil, nil, fmt.Errorf("len(repos)=0. Cannot backfill ID") 454 } 455 md.ID = backfillID(repos[0].Name) 456 } 457 458 return repos, &md, nil 459} 460 461const ngramEncoding = 8 462 463func (d *indexData) readNgrams(toc *indexTOC) (combinedNgramOffset, error) { 464 textContent, err := d.readSectionBlob(toc.ngramText) 465 if err != nil { 466 return combinedNgramOffset{}, err 467 } 468 postingsIndex := toc.postings.relativeIndex() 469 470 for i := 0; i < len(postingsIndex); i++ { 471 postingsIndex[i] += toc.postings.data.off 472 } 473 474 ngrams := make([]ngram, 0, len(textContent)/ngramEncoding) 475 for i := 0; i < len(textContent); i += ngramEncoding { 476 ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding])) 477 ngrams = append(ngrams, ng) 478 } 479 480 return makeCombinedNgramOffset(ngrams, postingsIndex), nil 481} 482 483func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSection) (btreeIndex, error) { 484 bi := btreeIndex{file: d.file} 485 486 textContent, err := d.readSectionBlob(ngramSec) 487 if err != nil { 488 return btreeIndex{}, err 489 } 490 491 // For 500k trigams we can expect approx 1000 leaf nodes (500k divided by 492 // half the bucketSize) and 20 nodes on level 2 (all but the rightmost 493 // inner nodes will have exactly v=50 children) plus a root node. 494 bt := newBtree(btreeOpts{bucketSize: btreeBucketSize, v: 50}) 495 for i := 0; i < len(textContent); i += ngramEncoding { 496 ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding])) 497 bt.insert(ng) 498 } 499 bt.freeze() 500 501 bi.bt = bt 502 503 // hold on to simple sections (8 bytes each) 504 bi.ngramSec = ngramSec 505 bi.postingIndex = postings.index 506 507 return bi, nil 508} 509 510func (d *indexData) readFileNameNgrams(toc *indexTOC) (map[ngram][]byte, error) { 511 nameNgramText, err := d.readSectionBlob(toc.nameNgramText) 512 if err != nil { 513 return nil, err 514 } 515 516 fileNamePostingsData, err := d.readSectionBlob(toc.namePostings.data) 517 if err != nil { 518 return nil, err 519 } 520 521 fileNamePostingsIndex := toc.namePostings.relativeIndex() 522 523 fileNameNgrams := make(map[ngram][]byte, len(nameNgramText)/ngramEncoding) 524 for i := 0; i < len(nameNgramText); i += ngramEncoding { 525 j := i / ngramEncoding 526 off := fileNamePostingsIndex[j] 527 end := fileNamePostingsIndex[j+1] 528 ng := ngram(binary.BigEndian.Uint64(nameNgramText[i : i+ngramEncoding])) 529 fileNameNgrams[ng] = fileNamePostingsData[off:end] 530 } 531 532 return fileNameNgrams, nil 533} 534 535func (d *indexData) verify() error { 536 // This is not an exhaustive check: the postings can easily 537 // generate OOB acccesses, and are expensive to check, but this lets us rule out 538 // other sources of OOB access. 539 n := len(d.fileNameIndex) 540 if n == 0 { 541 return nil 542 } 543 544 n-- 545 for what, got := range map[string]int{ 546 "boundaries": len(d.boundaries) - 1, 547 "branch masks": len(d.fileBranchMasks), 548 "doc section index": len(d.docSectionsIndex) - 1, 549 "newlines index": len(d.newlinesIndex) - 1, 550 } { 551 if got != n { 552 return fmt.Errorf("got %s %d, want %d", what, got, n) 553 } 554 } 555 return nil 556} 557 558func (d *indexData) readContents(i uint32) ([]byte, error) { 559 return d.readSectionBlob(simpleSection{ 560 off: d.boundariesStart + d.boundaries[i], 561 sz: d.boundaries[i+1] - d.boundaries[i], 562 }) 563} 564 565func (d *indexData) readContentSlice(off uint32, sz uint32) ([]byte, error) { 566 // TODO(hanwen): cap result if it is at the end of the content 567 // section. 568 return d.readSectionBlob(simpleSection{ 569 off: d.boundariesStart + off, 570 sz: sz, 571 }) 572} 573 574func (d *indexData) readNewlines(i uint32, buf []uint32) ([]uint32, uint32, error) { 575 sec := simpleSection{ 576 off: d.newlinesStart + d.newlinesIndex[i], 577 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i], 578 } 579 blob, err := d.readSectionBlob(sec) 580 if err != nil { 581 return nil, 0, err 582 } 583 584 return fromSizedDeltas(blob, buf), sec.sz, nil 585} 586 587func (d *indexData) readDocSections(i uint32, buf []DocumentSection) ([]DocumentSection, uint32, error) { 588 sec := simpleSection{ 589 off: d.docSectionsStart + d.docSectionsIndex[i], 590 sz: d.docSectionsIndex[i+1] - d.docSectionsIndex[i], 591 } 592 blob, err := d.readSectionBlob(sec) 593 if err != nil { 594 return nil, 0, err 595 } 596 597 return unmarshalDocSections(blob, buf), sec.sz, nil 598} 599 600func (d *indexData) readRanks(toc *indexTOC) error { 601 blob, err := d.readSectionBlob(toc.ranks) 602 if err != nil { 603 return err 604 } 605 606 return decodeRanks(blob, &d.ranks) 607} 608 609// NewSearcher creates a Searcher for a single index file. Search 610// results coming from this searcher are valid only for the lifetime 611// of the Searcher itself, ie. []byte members should be copied into 612// fresh buffers if the result is to survive closing the shard. 613func NewSearcher(r IndexFile) (Searcher, error) { 614 rd := &reader{r: r} 615 616 var toc indexTOC 617 if err := rd.readTOC(&toc); err != nil { 618 return nil, err 619 } 620 indexData, err := rd.readIndexData(&toc) 621 if err != nil { 622 return nil, err 623 } 624 indexData.file = r 625 return indexData, nil 626} 627 628// ReadMetadata returns the metadata of index shard without reading 629// the index data. The IndexFile is not closed. 630func ReadMetadata(inf IndexFile) ([]*Repository, *IndexMetadata, error) { 631 rd := &reader{r: inf} 632 var toc indexTOC 633 if err := rd.readTOC(&toc); err != nil { 634 return nil, nil, err 635 } 636 637 return rd.readMetadata(&toc) 638} 639 640// ReadMetadataPathAlive is like ReadMetadataPath except that it only returns 641// alive repositories. 642func ReadMetadataPathAlive(p string) ([]*Repository, *IndexMetadata, error) { 643 repos, id, err := ReadMetadataPath(p) 644 if err != nil { 645 return nil, nil, err 646 } 647 alive := repos[:0] 648 for _, repo := range repos { 649 if !repo.Tombstone { 650 alive = append(alive, repo) 651 } 652 } 653 return alive, id, nil 654} 655 656// ReadMetadataPath returns the metadata of index shard at p without reading 657// the index data. ReadMetadataPath is a helper for ReadMetadata which opens 658// the IndexFile at p. 659func ReadMetadataPath(p string) ([]*Repository, *IndexMetadata, error) { 660 f, err := os.Open(p) 661 if err != nil { 662 return nil, nil, err 663 } 664 defer f.Close() 665 666 iFile, err := NewIndexFile(f) 667 if err != nil { 668 return nil, nil, err 669 } 670 defer iFile.Close() 671 672 return ReadMetadata(iFile) 673} 674 675// IndexFilePaths returns all paths for the IndexFile at filepath p that 676// exist. Note: if no files exist this will return an empty slice and nil 677// error. 678// 679// This is p and the ".meta" file for p. 680func IndexFilePaths(p string) ([]string, error) { 681 paths := []string{p, p + ".meta"} 682 exist := paths[:0] 683 for _, p := range paths { 684 if _, err := os.Stat(p); err == nil { 685 exist = append(exist, p) 686 } else if !os.IsNotExist(err) { 687 return nil, err 688 } 689 } 690 return exist, nil 691} 692 693func loadIndexData(r IndexFile) (*indexData, error) { 694 rd := &reader{r: r} 695 696 var toc indexTOC 697 if err := rd.readTOC(&toc); err != nil { 698 return nil, err 699 } 700 return rd.readIndexData(&toc) 701} 702 703// PrintNgramStats outputs a list of the form 704// 705// n_1 trigram_1 706// n_2 trigram_2 707// ... 708// 709// where n_i is the length of the postings list of trigram_i stored in r. 710func PrintNgramStats(r IndexFile) error { 711 id, err := loadIndexData(r) 712 if err != nil { 713 return err 714 } 715 716 if id.ngrams == nil { 717 return fmt.Errorf("PrintNgramStats: ngrams=nil") 718 } 719 var rNgram [3]rune 720 for ngram, ss := range id.ngrams.DumpMap() { 721 rNgram = ngramToRunes(ngram) 722 fmt.Printf("%d\t%q\n", ss.sz, string(rNgram[:])) 723 } 724 return nil 725} 726 727var crc64Table = crc64.MakeTable(crc64.ECMA) 728 729// backfillID returns a 20 char long sortable ID. The ID only depends on s. It 730// should only be used to set the ID of simple v16 shards on read. 731func backfillID(s string) string { 732 var id xid.ID 733 734 // Our timestamps are based on Unix time. Shards without IDs are assigned IDs 735 // based on the 0 epoch. 736 binary.BigEndian.PutUint32(id[:], 0) 737 binary.BigEndian.PutUint64(id[4:], crc64.Checksum([]byte(s), crc64Table)) 738 return id.String() 739}