fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "encoding/binary" 19 "encoding/json" 20 "fmt" 21 "hash/crc64" 22 "log" 23 "os" 24 "slices" 25 "sort" 26 27 "github.com/rs/xid" 28 "github.com/sourcegraph/zoekt" 29) 30 31// IndexFile is a file suitable for concurrent read access. For performance 32// reasons, it allows a mmap'd implementation. 33type IndexFile interface { 34 Read(off uint32, sz uint32) ([]byte, error) 35 Size() (uint32, error) 36 Close() 37 Name() string 38} 39 40// reader is a stateful file 41type reader struct { 42 r IndexFile 43 off uint32 44} 45 46func (r *reader) seek(off uint32) { 47 r.off = off 48} 49 50func (r *reader) U32() (uint32, error) { 51 b, err := r.r.Read(r.off, 4) 52 r.off += 4 53 if err != nil { 54 return 0, err 55 } 56 return binary.BigEndian.Uint32(b), nil 57} 58 59func (r *reader) U64() (uint64, error) { 60 b, err := r.r.Read(r.off, 8) 61 r.off += 8 62 if err != nil { 63 return 0, err 64 } 65 return binary.BigEndian.Uint64(b), nil 66} 67 68func (r *reader) ReadByte() (byte, error) { 69 b, err := r.r.Read(r.off, 1) 70 r.off += 1 71 if err != nil { 72 return 0, err 73 } 74 return b[0], nil 75} 76 77func (r *reader) Varint() (uint64, error) { 78 v, err := binary.ReadUvarint(r) 79 if err != nil { 80 return 0, err 81 } 82 return v, nil 83} 84 85func (r *reader) Str() (string, error) { 86 slen, err := r.Varint() 87 if err != nil { 88 return "", err 89 } 90 b, err := r.r.Read(r.off, uint32(slen)) 91 if err != nil { 92 return "", err 93 } 94 r.off += uint32(slen) 95 return string(b), nil 96} 97 98func (r *reader) readTOC(toc *indexTOC) error { 99 return r.readTOCSections(toc, []string{}) 100} 101 102// readTOCSections reads the table of contents of the index file. 103// 104// If the tags parameter is non-empty, it reads only those tagged sections for efficiency 105// and does not populate the other sections. 106func (r *reader) readTOCSections(toc *indexTOC, tags []string) error { 107 tocSection, sectionCount, err := r.readHeader() 108 if err != nil { 109 return err 110 } 111 112 if sectionCount == 0 { 113 // tagged sections are indicated by a 0 sectionCount, 114 // and then a list of string-tagged type-indicated sections. 115 secs := toc.sectionsTagged() 116 for r.off < tocSection.off+tocSection.sz { 117 tag, err := r.Str() 118 if err != nil { 119 return err 120 } 121 kind, err := r.Varint() 122 if err != nil { 123 return err 124 } 125 126 skipSection := len(tags) > 0 && !slices.Contains(tags, tag) 127 sec := secs[tag] 128 if sec == nil || sec.kind() != sectionKind(kind) { 129 // If we don't recognize the section, we may be reading a newer index than the current version. Use 130 // a "dummy section" struct to skip over it. 131 skipSection = true 132 log.Printf("encountered unrecognized index section (%s), skipping over it", tag) 133 134 switch sectionKind(kind) { 135 case sectionKindSimple: 136 sec = &simpleSection{} 137 case sectionKindCompound: 138 sec = &compoundSection{} 139 case sectionKindCompoundLazy: 140 sec = &lazyCompoundSection{} 141 default: 142 return fmt.Errorf("unknown section kind %d", kind) 143 } 144 } 145 146 if skipSection { 147 if err := sec.skip(r); err != nil { 148 return err 149 } 150 } else { 151 if err := sec.read(r); err != nil { 152 return err 153 } 154 } 155 } 156 } else { 157 // TODO: Remove this branch when ReaderMinFeatureVersion >= 10 158 159 secs := toc.sections() 160 161 if len(secs) != int(sectionCount) { 162 secs = toc.sectionsNext() 163 } 164 165 if len(secs) != int(sectionCount) { 166 return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs)) 167 } 168 169 for _, s := range secs { 170 if err := s.read(r); err != nil { 171 return err 172 } 173 } 174 } 175 return nil 176} 177 178func (r *reader) readHeader() (simpleSection, uint32, error) { 179 sz, err := r.r.Size() 180 if err != nil { 181 return simpleSection{}, 0, err 182 } 183 r.off = sz - 8 184 185 var tocSection simpleSection 186 if err := tocSection.read(r); err != nil { 187 return simpleSection{}, 0, err 188 } 189 190 r.seek(tocSection.off) 191 192 sectionCount, err := r.U32() 193 if err != nil { 194 return simpleSection{}, 0, err 195 } 196 return tocSection, sectionCount, nil 197} 198 199func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) { 200 return r.file.Read(sec.off, sec.sz) 201} 202 203func readSectionU32(f IndexFile, sec simpleSection) ([]uint32, error) { 204 if sec.sz%4 != 0 { 205 return nil, fmt.Errorf("barf: section size %% 4 != 0: sz %d ", sec.sz) 206 } 207 blob, err := f.Read(sec.off, sec.sz) 208 if err != nil { 209 return nil, err 210 } 211 arr := make([]uint32, 0, len(blob)/4) 212 for len(blob) > 0 { 213 arr = append(arr, binary.BigEndian.Uint32(blob)) 214 blob = blob[4:] 215 } 216 return arr, nil 217} 218 219func readSectionU64(f IndexFile, sec simpleSection) ([]uint64, error) { 220 if sec.sz%8 != 0 { 221 return nil, fmt.Errorf("barf: section size %% 8 != 0: sz %d ", sec.sz) 222 } 223 blob, err := f.Read(sec.off, sec.sz) 224 if err != nil { 225 return nil, err 226 } 227 arr := make([]uint64, 0, len(blob)/8) 228 for len(blob) > 0 { 229 arr = append(arr, binary.BigEndian.Uint64(blob)) 230 blob = blob[8:] 231 } 232 return arr, nil 233} 234 235func (r *reader) readJSON(data interface{}, sec simpleSection) error { 236 blob, err := r.r.Read(sec.off, sec.sz) 237 if err != nil { 238 return err 239 } 240 241 return json.Unmarshal(blob, data) 242} 243 244// canReadVersion returns checks if zoekt can read in md. If it can't a 245// non-nil error is returned. 246func canReadVersion(md *zoekt.IndexMetadata) bool { 247 // Backwards compatible with v16 248 return md.IndexFormatVersion == IndexFormatVersion || md.IndexFormatVersion == NextIndexFormatVersion 249} 250 251func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) { 252 d := indexData{ 253 file: r.r, 254 branchIDs: []map[string]uint{}, 255 branchNames: []map[uint]string{}, 256 } 257 258 repos, md, err := r.parseMetadata(toc.metaData, toc.repoMetaData) 259 if md != nil && !canReadVersion(md) { 260 return nil, fmt.Errorf("file is v%d, want v%d", md.IndexFormatVersion, IndexFormatVersion) 261 } else if err != nil { 262 return nil, err 263 } 264 265 d.metaData = *md 266 d.repoMetaData = make([]zoekt.Repository, 0, len(repos)) 267 for _, r := range repos { 268 d.repoMetaData = append(d.repoMetaData, *r) 269 } 270 271 if d.metaData.IndexFeatureVersion < ReadMinFeatureVersion { 272 return nil, fmt.Errorf("file is feature version %d, want feature version >= %d", d.metaData.IndexFeatureVersion, ReadMinFeatureVersion) 273 } 274 275 if d.metaData.IndexMinReaderVersion > FeatureVersion { 276 return nil, fmt.Errorf("file needs read feature version >= %d, have read feature version %d", d.metaData.IndexMinReaderVersion, FeatureVersion) 277 } 278 279 d.boundariesStart = toc.fileContents.data.off 280 d.boundaries = toc.fileContents.relativeIndex() 281 d.newlinesStart = toc.newlines.data.off 282 d.newlinesIndex = toc.newlines.relativeIndex() 283 d.docSectionsStart = toc.fileSections.data.off 284 d.docSectionsIndex = toc.fileSections.relativeIndex() 285 286 d.symbols.symKindIndex = toc.symbolKindMap.relativeIndex() 287 d.fileEndSymbol, err = readSectionU32(d.file, toc.fileEndSymbol) 288 if err != nil { 289 return nil, err 290 } 291 292 // Call readSectionBlob on each section key, and store the result in 293 // the blob value. 294 for sect, blob := range map[simpleSection]*[]byte{ 295 toc.symbolMap.index: &d.symbols.symIndex, 296 toc.symbolMap.data: &d.symbols.symContent, 297 toc.symbolKindMap.data: &d.symbols.symKindContent, 298 toc.symbolMetaData: &d.symbols.symMetaData, 299 } { 300 if *blob, err = d.readSectionBlob(sect); err != nil { 301 return nil, err 302 } 303 } 304 305 d.checksums, err = d.readSectionBlob(toc.contentChecksums) 306 if err != nil { 307 return nil, err 308 } 309 310 d.languages, err = d.readSectionBlob(toc.languages) 311 if err != nil { 312 return nil, err 313 } 314 315 d.contentNgrams, err = d.newBtreeIndex(toc.ngramText, toc.postings) 316 if err != nil { 317 return nil, err 318 } 319 320 d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks) 321 if err != nil { 322 return nil, err 323 } 324 325 d.fileNameContent, err = d.readSectionBlob(toc.fileNames.data) 326 if err != nil { 327 return nil, err 328 } 329 330 d.fileNameIndex = toc.fileNames.relativeIndex() 331 332 d.fileNameNgrams, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings) 333 if err != nil { 334 return nil, err 335 } 336 337 for _, md := range d.repoMetaData { 338 repoBranchIDs := make(map[string]uint, len(md.Branches)) 339 repoBranchNames := make(map[uint]string, len(md.Branches)) 340 for j, br := range md.Branches { 341 id := uint(1) << uint(j) 342 repoBranchIDs[br.Name] = id 343 repoBranchNames[id] = br.Name 344 } 345 d.branchIDs = append(d.branchIDs, repoBranchIDs) 346 d.branchNames = append(d.branchNames, repoBranchNames) 347 d.rawConfigMasks = append(d.rawConfigMasks, encodeRawConfig(md.RawConfig)) 348 } 349 350 blob, err := d.readSectionBlob(toc.runeDocSections) 351 if err != nil { 352 return nil, err 353 } 354 355 d.runeDocSections = unmarshalDocSections(blob, nil) 356 357 var runeOffsets, fileNameRuneOffsets []uint32 358 359 for sect, dest := range map[simpleSection]*[]uint32{ 360 toc.subRepos: &d.subRepos, 361 toc.runeOffsets: &runeOffsets, 362 toc.nameRuneOffsets: &fileNameRuneOffsets, 363 toc.nameEndRunes: &d.fileNameEndRunes, 364 toc.fileEndRunes: &d.fileEndRunes, 365 } { 366 if blob, err := d.readSectionBlob(sect); err != nil { 367 return nil, err 368 } else { 369 *dest = fromSizedDeltas(blob, nil) 370 } 371 } 372 373 d.runeOffsets = makeRuneOffsetMap(runeOffsets) 374 d.fileNameRuneOffsets = makeRuneOffsetMap(fileNameRuneOffsets) 375 376 d.subRepoPaths = make([][]string, 0, len(d.repoMetaData)) 377 for i := 0; i < len(d.repoMetaData); i++ { 378 keys := make([]string, 0, len(d.repoMetaData[i].SubRepoMap)+1) 379 keys = append(keys, "") 380 for k := range d.repoMetaData[i].SubRepoMap { 381 if k != "" { 382 keys = append(keys, k) 383 } 384 } 385 sort.Strings(keys) 386 d.subRepoPaths = append(d.subRepoPaths, keys) 387 } 388 389 d.languageMap = map[uint16]string{} 390 for k, v := range d.metaData.LanguageMap { 391 d.languageMap[v] = k 392 } 393 394 if err := d.verify(); err != nil { 395 return nil, err 396 } 397 398 if d.metaData.IndexFormatVersion >= 17 { 399 blob, err := d.readSectionBlob(toc.repos) 400 if err != nil { 401 return nil, err 402 } 403 d.repos = fromSizedDeltas16(blob, nil) 404 } else { 405 // every document is for repo index 0 (default value of uint16) 406 d.repos = make([]uint16, len(d.fileBranchMasks)) 407 } 408 409 if err := d.calculateStats(); err != nil { 410 return nil, err 411 } 412 413 return &d, nil 414} 415 416func (r *reader) parseMetadata(metaData simpleSection, repoMetaData simpleSection) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) { 417 var md zoekt.IndexMetadata 418 if err := r.readJSON(&md, metaData); err != nil { 419 return nil, nil, err 420 } 421 422 // Sourcegraph specific: we support mutating metadata via an additional 423 // ".meta" file. This is to support tombstoning. An additional benefit is we 424 // can update metadata (such as Rank and Name) without re-indexing content. 425 blob, err := os.ReadFile(r.r.Name() + ".meta") 426 if err != nil && !os.IsNotExist(err) { 427 return nil, &md, fmt.Errorf("failed to read meta file: %w", err) 428 } 429 430 if len(blob) == 0 { 431 blob, err = r.r.Read(repoMetaData.off, repoMetaData.sz) 432 if err != nil { 433 return nil, &md, err 434 } 435 } 436 437 var repos []*zoekt.Repository 438 if md.IndexFormatVersion >= 17 { 439 if err := json.Unmarshal(blob, &repos); err != nil { 440 return nil, &md, err 441 } 442 } else { 443 repos = make([]*zoekt.Repository, 1) 444 if err := json.Unmarshal(blob, &repos[0]); err != nil { 445 return nil, &md, err 446 } 447 } 448 449 if md.ID == "" { 450 if len(repos) == 0 { 451 return nil, nil, fmt.Errorf("len(repos)=0. Cannot backfill ID") 452 } 453 md.ID = backfillID(repos[0].Name) 454 } 455 456 return repos, &md, nil 457} 458 459const ngramEncoding = 8 460 461func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSection) (btreeIndex, error) { 462 bi := btreeIndex{file: d.file} 463 464 textContent, err := d.readSectionBlob(ngramSec) 465 if err != nil { 466 return btreeIndex{}, err 467 } 468 469 // For 500k trigams we can expect approx 1000 leaf nodes (500k divided by 470 // half the bucketSize) and 20 nodes on level 2 (all but the rightmost 471 // inner nodes will have exactly v=50 children) plus a root node. 472 bt := newBtree(btreeOpts{bucketSize: btreeBucketSize, v: 50}) 473 for i := 0; i < len(textContent); i += ngramEncoding { 474 ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding])) 475 bt.insert(ng) 476 } 477 bt.freeze() 478 479 bi.bt = bt 480 481 // hold on to simple sections (8 bytes each) 482 bi.ngramSec = ngramSec 483 bi.postingIndex = postings.index 484 485 return bi, nil 486} 487 488func (d *indexData) verify() error { 489 // This is not an exhaustive check: the postings can easily 490 // generate OOB acccesses, and are expensive to check, but this lets us rule out 491 // other sources of OOB access. 492 n := len(d.fileNameIndex) 493 if n == 0 { 494 return nil 495 } 496 497 n-- 498 for what, got := range map[string]int{ 499 "boundaries": len(d.boundaries) - 1, 500 "branch masks": len(d.fileBranchMasks), 501 "doc section index": len(d.docSectionsIndex) - 1, 502 "newlines index": len(d.newlinesIndex) - 1, 503 } { 504 if got != n { 505 return fmt.Errorf("got %s %d, want %d", what, got, n) 506 } 507 } 508 return nil 509} 510 511func (d *indexData) readContents(i uint32) ([]byte, error) { 512 return d.readSectionBlob(simpleSection{ 513 off: d.boundariesStart + d.boundaries[i], 514 sz: d.boundaries[i+1] - d.boundaries[i], 515 }) 516} 517 518func (d *indexData) readContentSlice(off uint32, sz uint32) ([]byte, error) { 519 // TODO(hanwen): cap result if it is at the end of the content 520 // section. 521 return d.readSectionBlob(simpleSection{ 522 off: d.boundariesStart + off, 523 sz: sz, 524 }) 525} 526 527func (d *indexData) readNewlines(i uint32, buf []uint32) ([]uint32, uint32, error) { 528 sec := simpleSection{ 529 off: d.newlinesStart + d.newlinesIndex[i], 530 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i], 531 } 532 blob, err := d.readSectionBlob(sec) 533 if err != nil { 534 return nil, 0, err 535 } 536 537 nl := fromSizedDeltas(blob, buf) 538 539 // can be nil if buf is nil and there are no doc sections. However, we rely 540 // on it being non-nil to cache the read. 541 if nl == nil { 542 nl = make([]uint32, 0) 543 } 544 return nl, sec.sz, nil 545} 546 547func (d *indexData) readDocSections(i uint32, buf []DocumentSection) ([]DocumentSection, uint32, error) { 548 sec := simpleSection{ 549 off: d.docSectionsStart + d.docSectionsIndex[i], 550 sz: d.docSectionsIndex[i+1] - d.docSectionsIndex[i], 551 } 552 blob, err := d.readSectionBlob(sec) 553 if err != nil { 554 return nil, 0, err 555 } 556 557 ds := unmarshalDocSections(blob, buf) 558 559 // can be nil if buf is nil and there are no doc sections. However, we rely 560 // on it being non-nil to cache the read. 561 if ds == nil { 562 ds = make([]DocumentSection, 0) 563 } 564 565 return ds, sec.sz, nil 566} 567 568// NewSearcher creates a Searcher for a single index file. Search 569// results coming from this searcher are valid only for the lifetime 570// of the Searcher itself, ie. []byte members should be copied into 571// fresh buffers if the result is to survive closing the shard. 572func NewSearcher(r IndexFile) (zoekt.Searcher, error) { 573 rd := &reader{r: r} 574 575 var toc indexTOC 576 if err := rd.readTOC(&toc); err != nil { 577 return nil, err 578 } 579 indexData, err := rd.readIndexData(&toc) 580 if err != nil { 581 return nil, err 582 } 583 indexData.file = r 584 return indexData, nil 585} 586 587// ReadMetadata returns the metadata of index shard without reading 588// the index data. The IndexFile is not closed. 589func ReadMetadata(inf IndexFile) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) { 590 rd := &reader{r: inf} 591 var toc indexTOC 592 err := rd.readTOCSections(&toc, []string{"metaData", "repoMetaData"}) 593 if err != nil { 594 return nil, nil, err 595 } 596 return rd.parseMetadata(toc.metaData, toc.repoMetaData) 597} 598 599// ReadMetadataPathAlive is like ReadMetadataPath except that it only returns 600// alive repositories. 601func ReadMetadataPathAlive(p string) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) { 602 repos, id, err := ReadMetadataPath(p) 603 if err != nil { 604 return nil, nil, err 605 } 606 alive := repos[:0] 607 for _, repo := range repos { 608 if !repo.Tombstone { 609 alive = append(alive, repo) 610 } 611 } 612 return alive, id, nil 613} 614 615// ReadMetadataPath returns the metadata of index shard at p without reading 616// the index data. ReadMetadataPath is a helper for ReadMetadata which opens 617// the IndexFile at p. 618func ReadMetadataPath(p string) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) { 619 f, err := os.Open(p) 620 if err != nil { 621 return nil, nil, err 622 } 623 defer f.Close() 624 625 iFile, err := NewIndexFile(f) 626 if err != nil { 627 return nil, nil, err 628 } 629 defer iFile.Close() 630 631 return ReadMetadata(iFile) 632} 633 634// IndexFilePaths returns all paths for the IndexFile at filepath p that 635// exist. Note: if no files exist this will return an empty slice and nil 636// error. 637// 638// This is p and the ".meta" file for p. 639func IndexFilePaths(p string) ([]string, error) { 640 paths := []string{p, p + ".meta"} 641 exist := paths[:0] 642 for _, p := range paths { 643 if _, err := os.Stat(p); err == nil { 644 exist = append(exist, p) 645 } else if !os.IsNotExist(err) { 646 return nil, err 647 } 648 } 649 return exist, nil 650} 651 652func loadIndexData(r IndexFile) (*indexData, error) { 653 rd := &reader{r: r} 654 655 var toc indexTOC 656 if err := rd.readTOC(&toc); err != nil { 657 return nil, err 658 } 659 return rd.readIndexData(&toc) 660} 661 662// PrintNgramStats outputs a list of the form 663// 664// n_1 trigram_1 665// n_2 trigram_2 666// ... 667// 668// where n_i is the length of the postings list of trigram_i stored in r. 669func PrintNgramStats(r IndexFile) error { 670 id, err := loadIndexData(r) 671 if err != nil { 672 return err 673 } 674 675 var rNgram [3]rune 676 for ngram, ss := range id.contentNgrams.DumpMap() { 677 rNgram = ngramToRunes(ngram) 678 fmt.Printf("%d\t%q\n", ss.sz, string(rNgram[:])) 679 } 680 return nil 681} 682 683var crc64Table = crc64.MakeTable(crc64.ECMA) 684 685// backfillID returns a 20 char long sortable ID. The ID only depends on s. It 686// should only be used to set the ID of simple v16 shards on read. 687func backfillID(s string) string { 688 var id xid.ID 689 690 // Our timestamps are based on Unix time. Shards without IDs are assigned IDs 691 // based on the 0 epoch. 692 binary.BigEndian.PutUint32(id[:], 0) 693 binary.BigEndian.PutUint64(id[4:], crc64.Checksum([]byte(s), crc64Table)) 694 return id.String() 695}