fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "encoding/binary" 19 "encoding/json" 20 "fmt" 21 "hash/crc64" 22 "log" 23 "os" 24 "slices" 25 "sort" 26 27 "github.com/rs/xid" 28) 29 30// IndexFile is a file suitable for concurrent read access. For performance 31// reasons, it allows a mmap'd implementation. 32type IndexFile interface { 33 Read(off uint32, sz uint32) ([]byte, error) 34 Size() (uint32, error) 35 Close() 36 Name() string 37} 38 39// reader is a stateful file 40type reader struct { 41 r IndexFile 42 off uint32 43} 44 45func (r *reader) seek(off uint32) { 46 r.off = off 47} 48 49func (r *reader) U32() (uint32, error) { 50 b, err := r.r.Read(r.off, 4) 51 r.off += 4 52 if err != nil { 53 return 0, err 54 } 55 return binary.BigEndian.Uint32(b), nil 56} 57 58func (r *reader) U64() (uint64, error) { 59 b, err := r.r.Read(r.off, 8) 60 r.off += 8 61 if err != nil { 62 return 0, err 63 } 64 return binary.BigEndian.Uint64(b), nil 65} 66 67func (r *reader) ReadByte() (byte, error) { 68 b, err := r.r.Read(r.off, 1) 69 r.off += 1 70 if err != nil { 71 return 0, err 72 } 73 return b[0], nil 74} 75 76func (r *reader) Varint() (uint64, error) { 77 v, err := binary.ReadUvarint(r) 78 if err != nil { 79 return 0, err 80 } 81 return v, nil 82} 83 84func (r *reader) Str() (string, error) { 85 slen, err := r.Varint() 86 if err != nil { 87 return "", err 88 } 89 b, err := r.r.Read(r.off, uint32(slen)) 90 if err != nil { 91 return "", err 92 } 93 r.off += uint32(slen) 94 return string(b), nil 95} 96 97func (r *reader) readTOC(toc *indexTOC) error { 98 return r.readTOCSections(toc, []string{}) 99} 100 101// readTOCSections reads the table of contents of the index file. 102// 103// If the tags parameter is non-empty, it reads only those tagged sections for efficiency 104// and does not populate the other sections. 105func (r *reader) readTOCSections(toc *indexTOC, tags []string) error { 106 tocSection, sectionCount, err := r.readHeader() 107 if err != nil { 108 return err 109 } 110 111 if sectionCount == 0 { 112 // tagged sections are indicated by a 0 sectionCount, 113 // and then a list of string-tagged type-indicated sections. 114 secs := toc.sectionsTagged() 115 for r.off < tocSection.off+tocSection.sz { 116 tag, err := r.Str() 117 if err != nil { 118 return err 119 } 120 kind, err := r.Varint() 121 if err != nil { 122 return err 123 } 124 125 skipSection := len(tags) > 0 && !slices.Contains(tags, tag) 126 sec := secs[tag] 127 if sec == nil || sec.kind() != sectionKind(kind) { 128 // If we don't recognize the section, we may be reading a newer index than the current version. Use 129 // a "dummy section" struct to skip over it. 130 skipSection = true 131 log.Printf("encountered malformed index section (%s), skipping over it", tag) 132 133 switch sectionKind(kind) { 134 case sectionKindSimple: 135 sec = &simpleSection{} 136 case sectionKindCompound: 137 sec = &lazyCompoundSection{} 138 case sectionKindCompoundLazy: 139 sec = &lazyCompoundSection{} 140 default: 141 return fmt.Errorf("unknown section kind %d", kind) 142 } 143 } 144 145 if skipSection { 146 if err := sec.skip(r); err != nil { 147 return err 148 } 149 } else { 150 if err := sec.read(r); err != nil { 151 return err 152 } 153 } 154 } 155 } else { 156 // TODO: Remove this branch when ReaderMinFeatureVersion >= 10 157 158 secs := toc.sections() 159 160 if len(secs) != int(sectionCount) { 161 secs = toc.sectionsNext() 162 } 163 164 if len(secs) != int(sectionCount) { 165 return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs)) 166 } 167 168 for _, s := range secs { 169 if err := s.read(r); err != nil { 170 return err 171 } 172 } 173 } 174 return nil 175} 176 177func (r *reader) readHeader() (simpleSection, uint32, error) { 178 sz, err := r.r.Size() 179 if err != nil { 180 return simpleSection{}, 0, err 181 } 182 r.off = sz - 8 183 184 var tocSection simpleSection 185 if err := tocSection.read(r); err != nil { 186 return simpleSection{}, 0, err 187 } 188 189 r.seek(tocSection.off) 190 191 sectionCount, err := r.U32() 192 if err != nil { 193 return simpleSection{}, 0, err 194 } 195 return tocSection, sectionCount, nil 196} 197 198func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) { 199 return r.file.Read(sec.off, sec.sz) 200} 201 202func readSectionU32(f IndexFile, sec simpleSection) ([]uint32, error) { 203 if sec.sz%4 != 0 { 204 return nil, fmt.Errorf("barf: section size %% 4 != 0: sz %d ", sec.sz) 205 } 206 blob, err := f.Read(sec.off, sec.sz) 207 if err != nil { 208 return nil, err 209 } 210 arr := make([]uint32, 0, len(blob)/4) 211 for len(blob) > 0 { 212 arr = append(arr, binary.BigEndian.Uint32(blob)) 213 blob = blob[4:] 214 } 215 return arr, nil 216} 217 218func readSectionU64(f IndexFile, sec simpleSection) ([]uint64, error) { 219 if sec.sz%8 != 0 { 220 return nil, fmt.Errorf("barf: section size %% 8 != 0: sz %d ", sec.sz) 221 } 222 blob, err := f.Read(sec.off, sec.sz) 223 if err != nil { 224 return nil, err 225 } 226 arr := make([]uint64, 0, len(blob)/8) 227 for len(blob) > 0 { 228 arr = append(arr, binary.BigEndian.Uint64(blob)) 229 blob = blob[8:] 230 } 231 return arr, nil 232} 233 234func (r *reader) readJSON(data interface{}, sec simpleSection) error { 235 blob, err := r.r.Read(sec.off, sec.sz) 236 if err != nil { 237 return err 238 } 239 240 return json.Unmarshal(blob, data) 241} 242 243// canReadVersion returns checks if zoekt can read in md. If it can't a 244// non-nil error is returned. 245func canReadVersion(md *IndexMetadata) bool { 246 // Backwards compatible with v16 247 return md.IndexFormatVersion == IndexFormatVersion || md.IndexFormatVersion == NextIndexFormatVersion 248} 249 250func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) { 251 d := indexData{ 252 file: r.r, 253 branchIDs: []map[string]uint{}, 254 branchNames: []map[uint]string{}, 255 } 256 257 repos, md, err := r.parseMetadata(toc.metaData, toc.repoMetaData) 258 if md != nil && !canReadVersion(md) { 259 return nil, fmt.Errorf("file is v%d, want v%d", md.IndexFormatVersion, IndexFormatVersion) 260 } else if err != nil { 261 return nil, err 262 } 263 264 d.metaData = *md 265 d.repoMetaData = make([]Repository, 0, len(repos)) 266 for _, r := range repos { 267 d.repoMetaData = append(d.repoMetaData, *r) 268 } 269 270 if d.metaData.IndexFeatureVersion < ReadMinFeatureVersion { 271 return nil, fmt.Errorf("file is feature version %d, want feature version >= %d", d.metaData.IndexFeatureVersion, ReadMinFeatureVersion) 272 } 273 274 if d.metaData.IndexMinReaderVersion > FeatureVersion { 275 return nil, fmt.Errorf("file needs read feature version >= %d, have read feature version %d", d.metaData.IndexMinReaderVersion, FeatureVersion) 276 } 277 278 d.boundariesStart = toc.fileContents.data.off 279 d.boundaries = toc.fileContents.relativeIndex() 280 d.newlinesStart = toc.newlines.data.off 281 d.newlinesIndex = toc.newlines.relativeIndex() 282 d.docSectionsStart = toc.fileSections.data.off 283 d.docSectionsIndex = toc.fileSections.relativeIndex() 284 285 d.symbols.symKindIndex = toc.symbolKindMap.relativeIndex() 286 d.fileEndSymbol, err = readSectionU32(d.file, toc.fileEndSymbol) 287 if err != nil { 288 return nil, err 289 } 290 291 // Call readSectionBlob on each section key, and store the result in 292 // the blob value. 293 for sect, blob := range map[simpleSection]*[]byte{ 294 toc.symbolMap.index: &d.symbols.symIndex, 295 toc.symbolMap.data: &d.symbols.symContent, 296 toc.symbolKindMap.data: &d.symbols.symKindContent, 297 toc.symbolMetaData: &d.symbols.symMetaData, 298 } { 299 if *blob, err = d.readSectionBlob(sect); err != nil { 300 return nil, err 301 } 302 } 303 304 d.checksums, err = d.readSectionBlob(toc.contentChecksums) 305 if err != nil { 306 return nil, err 307 } 308 309 d.languages, err = d.readSectionBlob(toc.languages) 310 if err != nil { 311 return nil, err 312 } 313 314 d.contentNgrams, err = d.newBtreeIndex(toc.ngramText, toc.postings) 315 if err != nil { 316 return nil, err 317 } 318 319 d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks) 320 if err != nil { 321 return nil, err 322 } 323 324 d.fileNameContent, err = d.readSectionBlob(toc.fileNames.data) 325 if err != nil { 326 return nil, err 327 } 328 329 d.fileNameIndex = toc.fileNames.relativeIndex() 330 331 d.fileNameNgrams, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings) 332 if err != nil { 333 return nil, err 334 } 335 336 for _, md := range d.repoMetaData { 337 repoBranchIDs := make(map[string]uint, len(md.Branches)) 338 repoBranchNames := make(map[uint]string, len(md.Branches)) 339 for j, br := range md.Branches { 340 id := uint(1) << uint(j) 341 repoBranchIDs[br.Name] = id 342 repoBranchNames[id] = br.Name 343 } 344 d.branchIDs = append(d.branchIDs, repoBranchIDs) 345 d.branchNames = append(d.branchNames, repoBranchNames) 346 d.rawConfigMasks = append(d.rawConfigMasks, encodeRawConfig(md.RawConfig)) 347 } 348 349 blob, err := d.readSectionBlob(toc.runeDocSections) 350 if err != nil { 351 return nil, err 352 } 353 354 d.runeDocSections = unmarshalDocSections(blob, nil) 355 356 var runeOffsets, fileNameRuneOffsets []uint32 357 358 for sect, dest := range map[simpleSection]*[]uint32{ 359 toc.subRepos: &d.subRepos, 360 toc.runeOffsets: &runeOffsets, 361 toc.nameRuneOffsets: &fileNameRuneOffsets, 362 toc.nameEndRunes: &d.fileNameEndRunes, 363 toc.fileEndRunes: &d.fileEndRunes, 364 } { 365 if blob, err := d.readSectionBlob(sect); err != nil { 366 return nil, err 367 } else { 368 *dest = fromSizedDeltas(blob, nil) 369 } 370 } 371 372 d.runeOffsets = makeRuneOffsetMap(runeOffsets) 373 d.fileNameRuneOffsets = makeRuneOffsetMap(fileNameRuneOffsets) 374 375 d.subRepoPaths = make([][]string, 0, len(d.repoMetaData)) 376 for i := 0; i < len(d.repoMetaData); i++ { 377 keys := make([]string, 0, len(d.repoMetaData[i].SubRepoMap)+1) 378 keys = append(keys, "") 379 for k := range d.repoMetaData[i].SubRepoMap { 380 if k != "" { 381 keys = append(keys, k) 382 } 383 } 384 sort.Strings(keys) 385 d.subRepoPaths = append(d.subRepoPaths, keys) 386 } 387 388 d.languageMap = map[uint16]string{} 389 for k, v := range d.metaData.LanguageMap { 390 d.languageMap[v] = k 391 } 392 393 if err := d.verify(); err != nil { 394 return nil, err 395 } 396 397 // roc.ranks.sz = 0 indicates that we are reading a shard without ranks, in 398 // which case we skip reading the section and leave d.ranks = nil 399 if toc.ranks.sz > 0 { 400 err = d.readRanks(toc) 401 if err != nil { 402 return nil, err 403 } 404 } 405 406 if d.metaData.IndexFormatVersion >= 17 { 407 blob, err := d.readSectionBlob(toc.repos) 408 if err != nil { 409 return nil, err 410 } 411 d.repos = fromSizedDeltas16(blob, nil) 412 } else { 413 // every document is for repo index 0 (default value of uint16) 414 d.repos = make([]uint16, len(d.fileBranchMasks)) 415 } 416 417 if err := d.calculateStats(); err != nil { 418 return nil, err 419 } 420 421 return &d, nil 422} 423 424func (r *reader) parseMetadata(metaData simpleSection, repoMetaData simpleSection) ([]*Repository, *IndexMetadata, error) { 425 var md IndexMetadata 426 if err := r.readJSON(&md, metaData); err != nil { 427 return nil, nil, err 428 } 429 430 // Sourcegraph specific: we support mutating metadata via an additional 431 // ".meta" file. This is to support tombstoning. An additional benefit is we 432 // can update metadata (such as Rank and Name) without re-indexing content. 433 blob, err := os.ReadFile(r.r.Name() + ".meta") 434 if err != nil && !os.IsNotExist(err) { 435 return nil, &md, fmt.Errorf("failed to read meta file: %w", err) 436 } 437 438 if len(blob) == 0 { 439 blob, err = r.r.Read(repoMetaData.off, repoMetaData.sz) 440 if err != nil { 441 return nil, &md, err 442 } 443 } 444 445 var repos []*Repository 446 if md.IndexFormatVersion >= 17 { 447 if err := json.Unmarshal(blob, &repos); err != nil { 448 return nil, &md, err 449 } 450 } else { 451 repos = make([]*Repository, 1) 452 if err := json.Unmarshal(blob, &repos[0]); err != nil { 453 return nil, &md, err 454 } 455 } 456 457 if md.ID == "" { 458 if len(repos) == 0 { 459 return nil, nil, fmt.Errorf("len(repos)=0. Cannot backfill ID") 460 } 461 md.ID = backfillID(repos[0].Name) 462 } 463 464 return repos, &md, nil 465} 466 467const ngramEncoding = 8 468 469func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSection) (btreeIndex, error) { 470 bi := btreeIndex{file: d.file} 471 472 textContent, err := d.readSectionBlob(ngramSec) 473 if err != nil { 474 return btreeIndex{}, err 475 } 476 477 // For 500k trigams we can expect approx 1000 leaf nodes (500k divided by 478 // half the bucketSize) and 20 nodes on level 2 (all but the rightmost 479 // inner nodes will have exactly v=50 children) plus a root node. 480 bt := newBtree(btreeOpts{bucketSize: btreeBucketSize, v: 50}) 481 for i := 0; i < len(textContent); i += ngramEncoding { 482 ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding])) 483 bt.insert(ng) 484 } 485 bt.freeze() 486 487 bi.bt = bt 488 489 // hold on to simple sections (8 bytes each) 490 bi.ngramSec = ngramSec 491 bi.postingIndex = postings.index 492 493 return bi, nil 494} 495 496func (d *indexData) verify() error { 497 // This is not an exhaustive check: the postings can easily 498 // generate OOB acccesses, and are expensive to check, but this lets us rule out 499 // other sources of OOB access. 500 n := len(d.fileNameIndex) 501 if n == 0 { 502 return nil 503 } 504 505 n-- 506 for what, got := range map[string]int{ 507 "boundaries": len(d.boundaries) - 1, 508 "branch masks": len(d.fileBranchMasks), 509 "doc section index": len(d.docSectionsIndex) - 1, 510 "newlines index": len(d.newlinesIndex) - 1, 511 } { 512 if got != n { 513 return fmt.Errorf("got %s %d, want %d", what, got, n) 514 } 515 } 516 return nil 517} 518 519func (d *indexData) readContents(i uint32) ([]byte, error) { 520 return d.readSectionBlob(simpleSection{ 521 off: d.boundariesStart + d.boundaries[i], 522 sz: d.boundaries[i+1] - d.boundaries[i], 523 }) 524} 525 526func (d *indexData) readContentSlice(off uint32, sz uint32) ([]byte, error) { 527 // TODO(hanwen): cap result if it is at the end of the content 528 // section. 529 return d.readSectionBlob(simpleSection{ 530 off: d.boundariesStart + off, 531 sz: sz, 532 }) 533} 534 535func (d *indexData) readNewlines(i uint32, buf []uint32) ([]uint32, uint32, error) { 536 sec := simpleSection{ 537 off: d.newlinesStart + d.newlinesIndex[i], 538 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i], 539 } 540 blob, err := d.readSectionBlob(sec) 541 if err != nil { 542 return nil, 0, err 543 } 544 545 return fromSizedDeltas(blob, buf), sec.sz, nil 546} 547 548func (d *indexData) readDocSections(i uint32, buf []DocumentSection) ([]DocumentSection, uint32, error) { 549 sec := simpleSection{ 550 off: d.docSectionsStart + d.docSectionsIndex[i], 551 sz: d.docSectionsIndex[i+1] - d.docSectionsIndex[i], 552 } 553 blob, err := d.readSectionBlob(sec) 554 if err != nil { 555 return nil, 0, err 556 } 557 558 ds := unmarshalDocSections(blob, buf) 559 560 // can be nil if buf is nil and there are no doc sections. However, we rely 561 // on it being non-nil to cache the read. 562 if ds == nil { 563 ds = make([]DocumentSection, 0) 564 } 565 566 return ds, sec.sz, nil 567} 568 569func (d *indexData) readRanks(toc *indexTOC) error { 570 blob, err := d.readSectionBlob(toc.ranks) 571 if err != nil { 572 return err 573 } 574 575 return decodeRanks(blob, &d.ranks) 576} 577 578// NewSearcher creates a Searcher for a single index file. Search 579// results coming from this searcher are valid only for the lifetime 580// of the Searcher itself, ie. []byte members should be copied into 581// fresh buffers if the result is to survive closing the shard. 582func NewSearcher(r IndexFile) (Searcher, error) { 583 rd := &reader{r: r} 584 585 var toc indexTOC 586 if err := rd.readTOC(&toc); err != nil { 587 return nil, err 588 } 589 indexData, err := rd.readIndexData(&toc) 590 if err != nil { 591 return nil, err 592 } 593 indexData.file = r 594 return indexData, nil 595} 596 597// ReadMetadata returns the metadata of index shard without reading 598// the index data. The IndexFile is not closed. 599func ReadMetadata(inf IndexFile) ([]*Repository, *IndexMetadata, error) { 600 rd := &reader{r: inf} 601 var toc indexTOC 602 err := rd.readTOCSections(&toc, []string{"metaData", "repoMetaData"}) 603 if err != nil { 604 return nil, nil, err 605 } 606 return rd.parseMetadata(toc.metaData, toc.repoMetaData) 607} 608 609// ReadMetadataPathAlive is like ReadMetadataPath except that it only returns 610// alive repositories. 611func ReadMetadataPathAlive(p string) ([]*Repository, *IndexMetadata, error) { 612 repos, id, err := ReadMetadataPath(p) 613 if err != nil { 614 return nil, nil, err 615 } 616 alive := repos[:0] 617 for _, repo := range repos { 618 if !repo.Tombstone { 619 alive = append(alive, repo) 620 } 621 } 622 return alive, id, nil 623} 624 625// ReadMetadataPath returns the metadata of index shard at p without reading 626// the index data. ReadMetadataPath is a helper for ReadMetadata which opens 627// the IndexFile at p. 628func ReadMetadataPath(p string) ([]*Repository, *IndexMetadata, error) { 629 f, err := os.Open(p) 630 if err != nil { 631 return nil, nil, err 632 } 633 defer f.Close() 634 635 iFile, err := NewIndexFile(f) 636 if err != nil { 637 return nil, nil, err 638 } 639 defer iFile.Close() 640 641 return ReadMetadata(iFile) 642} 643 644// IndexFilePaths returns all paths for the IndexFile at filepath p that 645// exist. Note: if no files exist this will return an empty slice and nil 646// error. 647// 648// This is p and the ".meta" file for p. 649func IndexFilePaths(p string) ([]string, error) { 650 paths := []string{p, p + ".meta"} 651 exist := paths[:0] 652 for _, p := range paths { 653 if _, err := os.Stat(p); err == nil { 654 exist = append(exist, p) 655 } else if !os.IsNotExist(err) { 656 return nil, err 657 } 658 } 659 return exist, nil 660} 661 662func loadIndexData(r IndexFile) (*indexData, error) { 663 rd := &reader{r: r} 664 665 var toc indexTOC 666 if err := rd.readTOC(&toc); err != nil { 667 return nil, err 668 } 669 return rd.readIndexData(&toc) 670} 671 672// PrintNgramStats outputs a list of the form 673// 674// n_1 trigram_1 675// n_2 trigram_2 676// ... 677// 678// where n_i is the length of the postings list of trigram_i stored in r. 679func PrintNgramStats(r IndexFile) error { 680 id, err := loadIndexData(r) 681 if err != nil { 682 return err 683 } 684 685 var rNgram [3]rune 686 for ngram, ss := range id.contentNgrams.DumpMap() { 687 rNgram = ngramToRunes(ngram) 688 fmt.Printf("%d\t%q\n", ss.sz, string(rNgram[:])) 689 } 690 return nil 691} 692 693var crc64Table = crc64.MakeTable(crc64.ECMA) 694 695// backfillID returns a 20 char long sortable ID. The ID only depends on s. It 696// should only be used to set the ID of simple v16 shards on read. 697func backfillID(s string) string { 698 var id xid.ID 699 700 // Our timestamps are based on Unix time. Shards without IDs are assigned IDs 701 // based on the 0 epoch. 702 binary.BigEndian.PutUint32(id[:], 0) 703 binary.BigEndian.PutUint64(id[4:], crc64.Checksum([]byte(s), crc64Table)) 704 return id.String() 705}