fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 21 kB View raw
1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "encoding/binary" 19 "encoding/json" 20 "fmt" 21 "hash/crc64" 22 "log" 23 "os" 24 "slices" 25 "sort" 26 27 "github.com/RoaringBitmap/roaring" 28 "github.com/prometheus/client_golang/prometheus" 29 "github.com/prometheus/client_golang/prometheus/promauto" 30 "github.com/rs/xid" 31 32 "github.com/sourcegraph/zoekt" 33) 34 35// IndexFile is a file suitable for concurrent read access. For performance 36// reasons, it allows a mmap'd implementation. 37type IndexFile interface { 38 Read(off uint32, sz uint32) ([]byte, error) 39 Size() (uint32, error) 40 Close() 41 Name() string 42} 43 44// reader is a stateful file 45type reader struct { 46 r IndexFile 47 off uint32 48} 49 50func (r *reader) seek(off uint32) { 51 r.off = off 52} 53 54func (r *reader) U32() (uint32, error) { 55 b, err := r.r.Read(r.off, 4) 56 r.off += 4 57 if err != nil { 58 return 0, err 59 } 60 return binary.BigEndian.Uint32(b), nil 61} 62 63func (r *reader) U64() (uint64, error) { 64 b, err := r.r.Read(r.off, 8) 65 r.off += 8 66 if err != nil { 67 return 0, err 68 } 69 return binary.BigEndian.Uint64(b), nil 70} 71 72func (r *reader) ReadByte() (byte, error) { 73 b, err := r.r.Read(r.off, 1) 74 r.off += 1 75 if err != nil { 76 return 0, err 77 } 78 return b[0], nil 79} 80 81func (r *reader) Varint() (uint64, error) { 82 v, err := binary.ReadUvarint(r) 83 if err != nil { 84 return 0, err 85 } 86 return v, nil 87} 88 89func (r *reader) Str() (string, error) { 90 slen, err := r.Varint() 91 if err != nil { 92 return "", err 93 } 94 b, err := r.r.Read(r.off, uint32(slen)) 95 if err != nil { 96 return "", err 97 } 98 r.off += uint32(slen) 99 return string(b), nil 100} 101 102func (r *reader) readTOC(toc *indexTOC) error { 103 return r.readTOCSections(toc, []string{}) 104} 105 106// readTOCSections reads the table of contents of the index file. 107// 108// If the tags parameter is non-empty, it reads only those tagged sections for efficiency 109// and does not populate the other sections. 110func (r *reader) readTOCSections(toc *indexTOC, tags []string) error { 111 tocSection, sectionCount, err := r.readHeader() 112 if err != nil { 113 return err 114 } 115 116 if sectionCount == 0 { 117 // tagged sections are indicated by a 0 sectionCount, 118 // and then a list of string-tagged type-indicated sections. 119 secs := toc.sectionsTagged() 120 for r.off < tocSection.off+tocSection.sz { 121 tag, err := r.Str() 122 if err != nil { 123 return err 124 } 125 kind, err := r.Varint() 126 if err != nil { 127 return err 128 } 129 130 skipSection := len(tags) > 0 && !slices.Contains(tags, tag) 131 sec := secs[tag] 132 if sec == nil || sec.kind() != sectionKind(kind) { 133 // If we don't recognize the section, we may be reading a newer index than the current version. Use 134 // a "dummy section" struct to skip over it. 135 skipSection = true 136 log.Printf("encountered unrecognized index section (%s), skipping over it", tag) 137 138 switch sectionKind(kind) { 139 case sectionKindSimple: 140 sec = &simpleSection{} 141 case sectionKindCompound: 142 sec = &compoundSection{} 143 case sectionKindCompoundLazy: 144 sec = &lazyCompoundSection{} 145 default: 146 return fmt.Errorf("unknown section kind %d", kind) 147 } 148 } 149 150 if skipSection { 151 if err := sec.skip(r); err != nil { 152 return err 153 } 154 } else { 155 if err := sec.read(r); err != nil { 156 return err 157 } 158 } 159 } 160 } else { 161 // TODO: Remove this branch when ReaderMinFeatureVersion >= 10 162 163 secs := toc.sections() 164 165 if len(secs) != int(sectionCount) { 166 secs = toc.sectionsNext() 167 } 168 169 if len(secs) != int(sectionCount) { 170 return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs)) 171 } 172 173 for _, s := range secs { 174 if err := s.read(r); err != nil { 175 return err 176 } 177 } 178 } 179 return nil 180} 181 182func (r *reader) readHeader() (simpleSection, uint32, error) { 183 sz, err := r.r.Size() 184 if err != nil { 185 return simpleSection{}, 0, err 186 } 187 r.off = sz - 8 188 189 var tocSection simpleSection 190 if err := tocSection.read(r); err != nil { 191 return simpleSection{}, 0, err 192 } 193 194 r.seek(tocSection.off) 195 196 sectionCount, err := r.U32() 197 if err != nil { 198 return simpleSection{}, 0, err 199 } 200 return tocSection, sectionCount, nil 201} 202 203func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) { 204 return r.file.Read(sec.off, sec.sz) 205} 206 207func readSectionU32(f IndexFile, sec simpleSection) ([]uint32, error) { 208 if sec.sz%4 != 0 { 209 return nil, fmt.Errorf("barf: section size %% 4 != 0: sz %d ", sec.sz) 210 } 211 blob, err := f.Read(sec.off, sec.sz) 212 if err != nil { 213 return nil, err 214 } 215 arr := make([]uint32, 0, len(blob)/4) 216 for len(blob) > 0 { 217 arr = append(arr, binary.BigEndian.Uint32(blob)) 218 blob = blob[4:] 219 } 220 return arr, nil 221} 222 223func readSectionU64(f IndexFile, sec simpleSection) ([]uint64, error) { 224 if sec.sz%8 != 0 { 225 return nil, fmt.Errorf("barf: section size %% 8 != 0: sz %d ", sec.sz) 226 } 227 blob, err := f.Read(sec.off, sec.sz) 228 if err != nil { 229 return nil, err 230 } 231 arr := make([]uint64, 0, len(blob)/8) 232 for len(blob) > 0 { 233 arr = append(arr, binary.BigEndian.Uint64(blob)) 234 blob = blob[8:] 235 } 236 return arr, nil 237} 238 239func (r *reader) readJSON(data any, sec simpleSection) error { 240 blob, err := r.r.Read(sec.off, sec.sz) 241 if err != nil { 242 return err 243 } 244 245 return json.Unmarshal(blob, data) 246} 247 248// canReadVersion returns checks if zoekt can read in md. If it can't a 249// non-nil error is returned. 250func canReadVersion(md *zoekt.IndexMetadata) bool { 251 // Backwards compatible with v16 252 return md.IndexFormatVersion == IndexFormatVersion || md.IndexFormatVersion == NextIndexFormatVersion 253} 254 255func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) { 256 d := indexData{ 257 file: r.r, 258 branchIDs: []map[string]uint{}, 259 branchNames: []map[uint]string{}, 260 261 // docMatchTreeCache is disabled by default. 262 // The number of max entries can be set with environment variable ZOEKT_DOCMATCHTREE_CACHE 263 docMatchTreeCache: newDocMatchTreeCache(0), 264 } 265 266 repos, md, err := r.parseMetadata(toc.metaData, toc.repoMetaData) 267 if md != nil && !canReadVersion(md) { 268 return nil, fmt.Errorf("file is v%d, want v%d", md.IndexFormatVersion, IndexFormatVersion) 269 } else if err != nil { 270 return nil, err 271 } 272 273 d.metaData = *md 274 d.repoMetaData = make([]zoekt.Repository, 0, len(repos)) 275 for _, r := range repos { 276 d.repoMetaData = append(d.repoMetaData, *r) 277 } 278 279 if d.metaData.IndexFeatureVersion < ReadMinFeatureVersion { 280 return nil, fmt.Errorf("file is feature version %d, want feature version >= %d", d.metaData.IndexFeatureVersion, ReadMinFeatureVersion) 281 } 282 283 if d.metaData.IndexMinReaderVersion > FeatureVersion { 284 return nil, fmt.Errorf("file needs read feature version >= %d, have read feature version %d", d.metaData.IndexMinReaderVersion, FeatureVersion) 285 } 286 287 d.boundariesStart = toc.fileContents.data.off 288 d.boundaries = toc.fileContents.relativeIndex() 289 d.newlinesStart = toc.newlines.data.off 290 d.newlinesIndex = toc.newlines.relativeIndex() 291 d.docSectionsStart = toc.fileSections.data.off 292 d.docSectionsIndex = toc.fileSections.relativeIndex() 293 294 d.symbols.symKindIndex = toc.symbolKindMap.relativeIndex() 295 d.fileEndSymbol, err = readSectionU32(d.file, toc.fileEndSymbol) 296 if err != nil { 297 return nil, err 298 } 299 300 // Call readSectionBlob on each section key, and store the result in 301 // the blob value. 302 for sect, blob := range map[simpleSection]*[]byte{ 303 toc.symbolMap.index: &d.symbols.symIndex, 304 toc.symbolMap.data: &d.symbols.symContent, 305 toc.symbolKindMap.data: &d.symbols.symKindContent, 306 toc.symbolMetaData: &d.symbols.symMetaData, 307 } { 308 if *blob, err = d.readSectionBlob(sect); err != nil { 309 return nil, err 310 } 311 } 312 313 d.checksums, err = d.readSectionBlob(toc.contentChecksums) 314 if err != nil { 315 return nil, err 316 } 317 318 d.languages, err = d.readSectionBlob(toc.languages) 319 if err != nil { 320 return nil, err 321 } 322 323 d.categories, err = d.readSectionBlob(toc.categories) 324 if err != nil { 325 return nil, err 326 } 327 328 d.contentNgrams, err = d.newBtreeIndex(toc.ngramText, toc.postings) 329 if err != nil { 330 return nil, err 331 } 332 333 d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks) 334 if err != nil { 335 return nil, err 336 } 337 338 d.fileNameContent, err = d.readSectionBlob(toc.fileNames.data) 339 if err != nil { 340 return nil, err 341 } 342 343 d.fileNameIndex = toc.fileNames.relativeIndex() 344 345 d.fileNameNgrams, err = d.newBtreeIndex(toc.nameNgramText, toc.namePostings) 346 if err != nil { 347 return nil, err 348 } 349 350 for _, md := range d.repoMetaData { 351 repoBranchIDs := make(map[string]uint, len(md.Branches)) 352 repoBranchNames := make(map[uint]string, len(md.Branches)) 353 for j, br := range md.Branches { 354 id := uint(1) << uint(j) 355 repoBranchIDs[br.Name] = id 356 repoBranchNames[id] = br.Name 357 } 358 d.branchIDs = append(d.branchIDs, repoBranchIDs) 359 d.branchNames = append(d.branchNames, repoBranchNames) 360 d.rawConfigMasks = append(d.rawConfigMasks, encodeRawConfig(md.RawConfig)) 361 } 362 363 blob, err := d.readSectionBlob(toc.runeDocSections) 364 if err != nil { 365 return nil, err 366 } 367 368 d.runeDocSections = unmarshalDocSections(blob, nil) 369 370 var runeOffsets, fileNameRuneOffsets []uint32 371 372 for sect, dest := range map[simpleSection]*[]uint32{ 373 toc.subRepos: &d.subRepos, 374 toc.runeOffsets: &runeOffsets, 375 toc.nameRuneOffsets: &fileNameRuneOffsets, 376 toc.nameEndRunes: &d.fileNameEndRunes, 377 toc.fileEndRunes: &d.fileEndRunes, 378 } { 379 if blob, err := d.readSectionBlob(sect); err != nil { 380 return nil, err 381 } else { 382 *dest = fromSizedDeltas(blob, nil) 383 } 384 } 385 386 d.runeOffsets = makeRuneOffsetMap(runeOffsets) 387 d.fileNameRuneOffsets = makeRuneOffsetMap(fileNameRuneOffsets) 388 389 d.subRepoPaths = make([][]string, 0, len(d.repoMetaData)) 390 for i := range d.repoMetaData { 391 keys := make([]string, 0, len(d.repoMetaData[i].SubRepoMap)+1) 392 keys = append(keys, "") 393 for k := range d.repoMetaData[i].SubRepoMap { 394 if k != "" { 395 keys = append(keys, k) 396 } 397 } 398 sort.Strings(keys) 399 d.subRepoPaths = append(d.subRepoPaths, keys) 400 } 401 402 d.languageMap = map[uint16]string{} 403 for k, v := range d.metaData.LanguageMap { 404 d.languageMap[v] = k 405 } 406 407 if err := d.verify(); err != nil { 408 return nil, err 409 } 410 411 if d.metaData.IndexFormatVersion >= 17 { 412 blob, err := d.readSectionBlob(toc.repos) 413 if err != nil { 414 return nil, err 415 } 416 d.repos = fromSizedDeltas16(blob, nil) 417 } else { 418 // every document is for repo index 0 (default value of uint16) 419 d.repos = make([]uint16, len(d.fileBranchMasks)) 420 } 421 422 if err := d.calculateStats(); err != nil { 423 return nil, err 424 } 425 426 return &d, nil 427} 428 429func (r *reader) parseMetadata(metaData simpleSection, repoMetaData simpleSection) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) { 430 var md zoekt.IndexMetadata 431 if err := r.readJSON(&md, metaData); err != nil { 432 return nil, nil, err 433 } 434 435 // Sourcegraph specific: we support mutating metadata via an additional 436 // ".meta" file. This is to support tombstoning. An additional benefit is we 437 // can update metadata (such as Rank and Name) without re-indexing content. 438 blob, err := os.ReadFile(r.r.Name() + ".meta") 439 if err != nil && !os.IsNotExist(err) { 440 return nil, &md, fmt.Errorf("failed to read meta file: %w", err) 441 } 442 443 if len(blob) == 0 { 444 blob, err = r.r.Read(repoMetaData.off, repoMetaData.sz) 445 if err != nil { 446 return nil, &md, err 447 } 448 } 449 450 var repos []*zoekt.Repository 451 if md.IndexFormatVersion >= 17 { 452 if err := json.Unmarshal(blob, &repos); err != nil { 453 return nil, &md, err 454 } 455 } else { 456 repos = make([]*zoekt.Repository, 1) 457 if err := json.Unmarshal(blob, &repos[0]); err != nil { 458 return nil, &md, err 459 } 460 } 461 462 if md.ID == "" { 463 if len(repos) == 0 { 464 return nil, nil, fmt.Errorf("len(repos)=0. Cannot backfill ID") 465 } 466 md.ID = backfillID(repos[0].Name) 467 } 468 469 return repos, &md, nil 470} 471 472const ngramEncoding = 8 473 474func (d *indexData) newBtreeIndex(ngramSec simpleSection, postings compoundSection) (btreeIndex, error) { 475 bi := btreeIndex{file: d.file} 476 477 textContent, err := d.readSectionBlob(ngramSec) 478 if err != nil { 479 return btreeIndex{}, err 480 } 481 482 // For 500k trigams we can expect approx 1000 leaf nodes (500k divided by 483 // half the bucketSize) and 20 nodes on level 2 (all but the rightmost 484 // inner nodes will have exactly v=50 children) plus a root node. 485 bt := newBtree(btreeOpts{bucketSize: btreeBucketSize, v: 50}) 486 for i := 0; i < len(textContent); i += ngramEncoding { 487 ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding])) 488 bt.insert(ng) 489 } 490 bt.freeze() 491 492 bi.bt = bt 493 494 // hold on to simple sections (8 bytes each) 495 bi.ngramSec = ngramSec 496 bi.postingIndex = postings.index 497 498 return bi, nil 499} 500 501func (d *indexData) verify() error { 502 // This is not an exhaustive check: the postings can easily 503 // generate OOB acccesses, and are expensive to check, but this lets us rule out 504 // other sources of OOB access. 505 n := len(d.fileNameIndex) 506 if n == 0 { 507 return nil 508 } 509 510 n-- 511 for what, got := range map[string]int{ 512 "boundaries": len(d.boundaries) - 1, 513 "branch masks": len(d.fileBranchMasks), 514 "doc section index": len(d.docSectionsIndex) - 1, 515 "newlines index": len(d.newlinesIndex) - 1, 516 } { 517 if got != n { 518 return fmt.Errorf("got %s %d, want %d", what, got, n) 519 } 520 } 521 return nil 522} 523 524func (d *indexData) readContents(i uint32) ([]byte, error) { 525 return d.readSectionBlob(simpleSection{ 526 off: d.boundariesStart + d.boundaries[i], 527 sz: d.boundaries[i+1] - d.boundaries[i], 528 }) 529} 530 531func (d *indexData) readContentSlice(off uint32, sz uint32) ([]byte, error) { 532 // TODO(hanwen): cap result if it is at the end of the content 533 // section. 534 return d.readSectionBlob(simpleSection{ 535 off: d.boundariesStart + off, 536 sz: sz, 537 }) 538} 539 540func (d *indexData) readNewlines(i uint32, buf []uint32) ([]uint32, uint32, error) { 541 sec := simpleSection{ 542 off: d.newlinesStart + d.newlinesIndex[i], 543 sz: d.newlinesIndex[i+1] - d.newlinesIndex[i], 544 } 545 blob, err := d.readSectionBlob(sec) 546 if err != nil { 547 return nil, 0, err 548 } 549 550 nl := fromSizedDeltas(blob, buf) 551 552 // can be nil if buf is nil and there are no doc sections. However, we rely 553 // on it being non-nil to cache the read. 554 if nl == nil { 555 nl = make([]uint32, 0) 556 } 557 return nl, sec.sz, nil 558} 559 560func (d *indexData) readDocSections(i uint32, buf []DocumentSection) ([]DocumentSection, uint32, error) { 561 sec := simpleSection{ 562 off: d.docSectionsStart + d.docSectionsIndex[i], 563 sz: d.docSectionsIndex[i+1] - d.docSectionsIndex[i], 564 } 565 blob, err := d.readSectionBlob(sec) 566 if err != nil { 567 return nil, 0, err 568 } 569 570 ds := unmarshalDocSections(blob, buf) 571 572 // can be nil if buf is nil and there are no doc sections. However, we rely 573 // on it being non-nil to cache the read. 574 if ds == nil { 575 ds = make([]DocumentSection, 0) 576 } 577 578 return ds, sec.sz, nil 579} 580 581// NewSearcher creates a Searcher for a single index file. Search 582// results coming from this searcher are valid only for the lifetime 583// of the Searcher itself, ie. []byte members should be copied into 584// fresh buffers if the result is to survive closing the shard. 585func NewSearcher(r IndexFile) (zoekt.Searcher, error) { 586 rd := &reader{r: r} 587 588 var toc indexTOC 589 if err := rd.readTOC(&toc); err != nil { 590 return nil, err 591 } 592 indexData, err := rd.readIndexData(&toc) 593 if err != nil { 594 return nil, err 595 } 596 indexData.file = r 597 return indexData, nil 598} 599 600// ReadMetadata returns the metadata of index shard without reading 601// the index data. The IndexFile is not closed. 602func ReadMetadata(inf IndexFile) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) { 603 rd := &reader{r: inf} 604 var toc indexTOC 605 err := rd.readTOCSections(&toc, []string{"metaData", "repoMetaData"}) 606 if err != nil { 607 return nil, nil, err 608 } 609 return rd.parseMetadata(toc.metaData, toc.repoMetaData) 610} 611 612// ReadMetadataPathAlive is like ReadMetadataPath except that it only returns 613// alive repositories. 614func ReadMetadataPathAlive(p string) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) { 615 repos, id, err := ReadMetadataPath(p) 616 if err != nil { 617 return nil, nil, err 618 } 619 alive := repos[:0] 620 for _, repo := range repos { 621 if !repo.Tombstone { 622 alive = append(alive, repo) 623 } 624 } 625 return alive, id, nil 626} 627 628// ReadMetadataPath returns the metadata of index shard at p without reading 629// the index data. ReadMetadataPath is a helper for ReadMetadata which opens 630// the IndexFile at p. 631func ReadMetadataPath(p string) ([]*zoekt.Repository, *zoekt.IndexMetadata, error) { 632 f, err := os.Open(p) 633 if err != nil { 634 return nil, nil, err 635 } 636 defer f.Close() 637 638 iFile, err := NewIndexFile(f) 639 if err != nil { 640 return nil, nil, err 641 } 642 defer iFile.Close() 643 644 return ReadMetadata(iFile) 645} 646 647// IndexFilePaths returns all paths for the IndexFile at filepath p that 648// exist. Note: if no files exist this will return an empty slice and nil 649// error. 650// 651// This is p and the ".meta" file for p. 652func IndexFilePaths(p string) ([]string, error) { 653 paths := []string{p, p + ".meta"} 654 exist := paths[:0] 655 for _, p := range paths { 656 if _, err := os.Stat(p); err == nil { 657 exist = append(exist, p) 658 } else if !os.IsNotExist(err) { 659 return nil, err 660 } 661 } 662 return exist, nil 663} 664 665// maybeContainsRepo is a performance optimization mainly intended to be used by 666// containsRepo to avoid unmarshalling large metadata files for compound shards. 667// It is best-effort, so if it encounters any error returns true (ie indicating 668// you need to do more checks). 669func maybeContainsRepo(inf IndexFile, repoID uint32) bool { 670 rd := &reader{r: inf} 671 var toc indexTOC 672 err := rd.readTOCSections(&toc, []string{"reposIDsBitmap"}) 673 if err != nil { 674 return true 675 } 676 677 // shard does not yet contain reposIDsBitmap so we can't tell if it contains 678 // repo. 679 if toc.reposIDsBitmap.sz == 0 { 680 return true 681 } 682 683 blob, err := inf.Read(toc.reposIDsBitmap.off, toc.reposIDsBitmap.sz) 684 if err != nil { 685 return true 686 } 687 688 var rb roaring.Bitmap 689 _, err = rb.FromUnsafeBytes(blob) 690 if err != nil { 691 return true 692 } 693 694 return rb.Contains(repoID) 695} 696 697var metricCompoundShardLookups = promauto.NewCounterVec(prometheus.CounterOpts{ 698 Name: "zoekt_compound_shard_lookups", 699 Help: "Number of compound shard lookups and how much work was done.", 700}, []string{"state"}) 701 702// containsRepo returns true if the shard at path contains a repo with id. The 703// function returns false if the shard does not contain the repo or if it 704// encounters an error. 705func containsRepo(p string, id uint32) bool { 706 var err error 707 earlyReturn := false 708 709 defer func() { 710 if err != nil { 711 metricCompoundShardLookups.WithLabelValues("error").Inc() 712 return 713 } 714 if earlyReturn { 715 metricCompoundShardLookups.WithLabelValues("skipped").Inc() 716 return 717 } 718 metricCompoundShardLookups.WithLabelValues("full_lookup").Inc() 719 }() 720 721 f, err := os.Open(p) 722 if err != nil { 723 return false 724 } 725 defer f.Close() 726 727 inf, err := NewIndexFile(f) 728 if err != nil { 729 return false 730 } 731 defer inf.Close() 732 733 // PERF: Looping over repos can be relatively slow on instances with thousands 734 // of tiny repos in compound shards. This is a much faster check to see if we 735 // need to do more work. 736 // 737 // If we are still seeing performance issues, we should consider adding 738 // some sort of global oracle here to avoid filepath.Glob and checking 739 // each compound shard. 740 if !maybeContainsRepo(inf, id) { 741 earlyReturn = true 742 return false 743 } 744 745 repos, _, err := ReadMetadata(inf) 746 if err != nil { 747 return false 748 } 749 for _, repo := range repos { 750 if repo.Tombstone { 751 continue 752 } 753 if repo.ID == id { 754 return true 755 } 756 } 757 758 return false 759} 760 761func loadIndexData(r IndexFile) (*indexData, error) { 762 rd := &reader{r: r} 763 764 var toc indexTOC 765 if err := rd.readTOC(&toc); err != nil { 766 return nil, err 767 } 768 return rd.readIndexData(&toc) 769} 770 771// PrintNgramStats outputs a list of the form 772// 773// n_1 trigram_1 774// n_2 trigram_2 775// ... 776// 777// where n_i is the length of the postings list of trigram_i stored in r. 778func PrintNgramStats(r IndexFile) error { 779 id, err := loadIndexData(r) 780 if err != nil { 781 return err 782 } 783 784 var rNgram [3]rune 785 for ngram, ss := range id.contentNgrams.DumpMap() { 786 rNgram = ngramToRunes(ngram) 787 fmt.Printf("%d\t%q\n", ss.sz, string(rNgram[:])) 788 } 789 return nil 790} 791 792var crc64Table = crc64.MakeTable(crc64.ECMA) 793 794// backfillID returns a 20 char long sortable ID. The ID only depends on s. It 795// should only be used to set the ID of simple v16 shards on read. 796func backfillID(s string) string { 797 var id xid.ID 798 799 // Our timestamps are based on Unix time. Shards without IDs are assigned IDs 800 // based on the 0 epoch. 801 binary.BigEndian.PutUint32(id[:], 0) 802 binary.BigEndian.PutUint64(id[4:], crc64.Checksum([]byte(s), crc64Table)) 803 return id.String() 804}