fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// Package index contains logic for building Zoekt indexes. NOTE: this package is not considered 16// part of the public API, and it is not recommended to rely on it in external code. 17package index 18 19import ( 20 "crypto/sha1" 21 "flag" 22 "fmt" 23 "log" 24 "net/url" 25 "os" 26 "os/exec" 27 "path" 28 "path/filepath" 29 "reflect" 30 "runtime" 31 "runtime/pprof" 32 "sort" 33 "strconv" 34 "strings" 35 "sync" 36 "time" 37 38 "github.com/bmatcuk/doublestar" 39 "github.com/dustin/go-humanize" 40 "github.com/go-enry/go-enry/v2" 41 "github.com/rs/xid" 42 "golang.org/x/sys/unix" 43 44 "maps" 45 46 "github.com/sourcegraph/zoekt" 47 "github.com/sourcegraph/zoekt/internal/ctags" 48 "github.com/sourcegraph/zoekt/internal/tenant" 49) 50 51var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt") 52 53// Branch describes a single branch version. 54type Branch struct { 55 Name string 56 Version string 57} 58 59// Options sets options for the index building. 60type Options struct { 61 // IndexDir is a directory that holds *.zoekt index files. 62 IndexDir string 63 64 // SizeMax is the maximum file size 65 SizeMax int 66 67 // Parallelism is the maximum number of shards to index in parallel 68 Parallelism int 69 70 // ShardMax sets the maximum corpus size for a single shard 71 ShardMax int 72 73 // TrigramMax sets the maximum number of distinct trigrams per document. 74 TrigramMax int 75 76 // RepositoryDescription holds names and URLs for the repository. 77 RepositoryDescription zoekt.Repository 78 79 // SubRepositories is a path => sub repository map. 80 SubRepositories map[string]*zoekt.Repository 81 82 // DisableCTags disables the generation of ctags metadata. 83 DisableCTags bool 84 85 // CtagsPath is the path to the ctags binary to run, or empty 86 // if a valid binary couldn't be found. 87 CTagsPath string 88 89 // Same as CTagsPath but for scip-ctags 90 ScipCTagsPath string 91 92 // If set, ctags must succeed. 93 CTagsMustSucceed bool 94 95 // LargeFiles is a slice of glob patterns, including ** for any number 96 // of directories, where matching file paths should be indexed 97 // regardless of their size. The full pattern syntax is here: 98 // https://github.com/bmatcuk/doublestar/tree/v1#patterns. 99 LargeFiles []string 100 101 // IsDelta is true if this run contains only the changed documents since the 102 // last run. 103 IsDelta bool 104 105 // changedOrRemovedFiles is a list of file paths that have been changed or removed 106 // since the last indexing job for this repository. These files will be tombstoned 107 // in the older shards for this repository. 108 changedOrRemovedFiles []string 109 110 LanguageMap ctags.LanguageMap 111 112 // ShardMerging is true if builder should respect compound shards. This is a 113 // Sourcegraph specific option. 114 ShardMerging bool 115 116 // HeapProfileTriggerBytes is the heap allocation in bytes that will trigger a memory profile. If 0, no memory profile 117 // will be triggered. Note this trigger looks at total heap allocation (which includes both inuse and garbage objects). 118 // 119 // Profiles will be written to files named `index-memory.prof.n` in the index directory. No more than 10 files are written. 120 // 121 // Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile. 122 HeapProfileTriggerBytes uint64 123 124 // TenantID is the ID of the tenant this shard belongs to. 125 TenantID int 126 127 // RepoID is the ID of the repository this shard belongs to. 128 RepoID uint32 129} 130 131// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building. 132type HashOptions struct { 133 sizeMax int 134 disableCTags bool 135 ctagsPath string 136 cTagsMustSucceed bool 137 largeFiles []string 138} 139 140func (o *Options) HashOptions() HashOptions { 141 return HashOptions{ 142 sizeMax: o.SizeMax, 143 disableCTags: o.DisableCTags, 144 ctagsPath: o.CTagsPath, 145 cTagsMustSucceed: o.CTagsMustSucceed, 146 largeFiles: o.LargeFiles, 147 } 148} 149 150func (o *Options) GetHash() string { 151 h := o.HashOptions() 152 hasher := sha1.New() 153 154 hasher.Write([]byte(h.ctagsPath)) 155 hasher.Write(fmt.Appendf(nil, "%t", h.cTagsMustSucceed)) 156 hasher.Write(fmt.Appendf(nil, "%d", h.sizeMax)) 157 hasher.Write(fmt.Appendf(nil, "%q", h.largeFiles)) 158 hasher.Write(fmt.Appendf(nil, "%t", h.disableCTags)) 159 160 return fmt.Sprintf("%x", hasher.Sum(nil)) 161} 162 163type largeFilesFlag struct{ *Options } 164 165func (f largeFilesFlag) String() string { 166 // From flag.Value documentation: 167 // 168 // The flag package may call the String method with a zero-valued receiver, 169 // such as a nil pointer. 170 if f.Options == nil { 171 return "" 172 } 173 s := append([]string{""}, f.LargeFiles...) 174 return strings.Join(s, "-large_file ") 175} 176 177func (f largeFilesFlag) Set(value string) error { 178 f.LargeFiles = append(f.LargeFiles, value) 179 return nil 180} 181 182// Flags adds flags for build options to fs. It is the "inverse" of Args. 183func (o *Options) Flags(fs *flag.FlagSet) { 184 x := *o 185 x.SetDefaults() 186 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size") 187 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document") 188 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard") 189 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.") 190 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices") 191 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.") 192 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.") 193 194 // Sourcegraph specific 195 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.") 196 fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.") 197} 198 199// Args generates command line arguments for o. It is the "inverse" of Flags. 200func (o *Options) Args() []string { 201 var args []string 202 203 if o.SizeMax != 0 { 204 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax)) 205 } 206 207 if o.TrigramMax != 0 { 208 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax)) 209 } 210 211 if o.ShardMax != 0 { 212 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax)) 213 } 214 215 if o.Parallelism != 0 { 216 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism)) 217 } 218 219 if o.IndexDir != "" { 220 args = append(args, "-index", o.IndexDir) 221 } 222 223 if o.CTagsMustSucceed { 224 args = append(args, "-require_ctags") 225 } 226 227 for _, a := range o.LargeFiles { 228 args = append(args, "-large_file", a) 229 } 230 231 // Sourcegraph specific 232 if o.DisableCTags { 233 args = append(args, "-disable_ctags") 234 } 235 236 if o.ShardMerging { 237 args = append(args, "-shard_merging") 238 } 239 240 return args 241} 242 243// Builder manages (parallel) creation of uniformly sized shards. The 244// builder buffers up documents until it collects enough documents and 245// then builds a shard and writes. 246type Builder struct { 247 opts Options 248 throttle chan int 249 250 nextShardNum int 251 todo []*Document 252 docChecker DocChecker 253 size int 254 255 parserBins ctags.ParserBinMap 256 building sync.WaitGroup 257 258 errMu sync.Mutex 259 buildError error 260 261 // temp name => final name for finished shards. We only rename 262 // them once all shards succeed to avoid Frankstein corpuses. 263 finishedShards map[string]string 264 265 // indexTime is set by tests for doing reproducible builds. 266 indexTime time.Time 267 268 // heapProfileMu is used to ensure that only one memory profile is written at a time 269 heapProfileMu sync.Mutex 270 heapProfileNum int 271 272 // a sortable 20 chars long id. 273 id string 274 275 finishCalled bool 276} 277 278type finishedShard struct { 279 temp, final string 280} 281 282func checkCTags() string { 283 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" { 284 return ctags 285 } 286 287 if ctags, err := exec.LookPath("universal-ctags"); err == nil { 288 return ctags 289 } 290 291 return "" 292} 293 294func checkScipCTags() string { 295 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" { 296 return ctags 297 } 298 299 if ctags, err := exec.LookPath("scip-ctags"); err == nil { 300 return ctags 301 } 302 303 return "" 304} 305 306// SetDefaults sets reasonable default options. 307func (o *Options) SetDefaults() { 308 if o.CTagsPath == "" && !o.DisableCTags { 309 o.CTagsPath = checkCTags() 310 } 311 312 if o.ScipCTagsPath == "" && !o.DisableCTags { 313 o.ScipCTagsPath = checkScipCTags() 314 } 315 316 if o.Parallelism == 0 { 317 o.Parallelism = 4 318 } 319 if o.SizeMax == 0 { 320 o.SizeMax = 2 << 20 321 } 322 if o.ShardMax == 0 { 323 o.ShardMax = 100 << 20 324 } 325 if o.TrigramMax == 0 { 326 o.TrigramMax = 20000 327 } 328 329 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" { 330 parsed, _ := url.Parse(o.RepositoryDescription.URL) 331 if parsed != nil { 332 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path) 333 } 334 } 335} 336 337// ShardName returns the name the given index shard. 338func (o *Options) shardName(n int) string { 339 return o.shardNameVersion(IndexFormatVersion, n) 340} 341 342func (o *Options) shardNameVersion(version, n int) string { 343 var prefix string 344 345 // If tenant enforcement is enabled and we have tenant/repo IDs, use those to generate the prefix 346 if o.TenantID != 0 && o.RepoID != 0 && tenant.EnforceTenant() { 347 prefix = tenant.SrcPrefix(o.TenantID, o.RepoID) 348 } else { 349 prefix = o.RepositoryDescription.Name 350 } 351 352 return ShardName(o.IndexDir, prefix, version, n) 353} 354 355type IndexState string 356 357const ( 358 IndexStateMissing IndexState = "missing" 359 IndexStateCorrupt IndexState = "corrupt" 360 IndexStateVersion IndexState = "version-mismatch" 361 IndexStateOption IndexState = "option-mismatch" 362 IndexStateMeta IndexState = "meta-mismatch" 363 IndexStateContent IndexState = "content-mismatch" 364 IndexStateEqual IndexState = "equal" 365) 366 367var readVersions = []struct { 368 IndexFormatVersion int 369 FeatureVersion int 370}{{ 371 IndexFormatVersion: IndexFormatVersion, 372 FeatureVersion: FeatureVersion, 373}, { 374 IndexFormatVersion: NextIndexFormatVersion, 375 FeatureVersion: FeatureVersion, 376}} 377 378// IncrementalSkipIndexing returns true if the index present on disk matches 379// the build options. 380func (o *Options) IncrementalSkipIndexing() bool { 381 state, _ := o.IndexState() 382 return state == IndexStateEqual 383} 384 385// IndexState checks how the index present on disk compares to the build 386// options and returns the IndexState and the name of the first shard. 387func (o *Options) IndexState() (IndexState, string) { 388 // Open the latest version we support that is on disk. 389 fn := o.findShard() 390 if fn == "" { 391 return IndexStateMissing, fn 392 } 393 394 repos, index, err := ReadMetadataPathAlive(fn) 395 if os.IsNotExist(err) { 396 return IndexStateMissing, fn 397 } else if err != nil { 398 return IndexStateCorrupt, fn 399 } 400 401 for _, v := range readVersions { 402 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion { 403 return IndexStateVersion, fn 404 } 405 } 406 407 var repo *zoekt.Repository 408 for _, cand := range repos { 409 if cand.Name == o.RepositoryDescription.Name { 410 repo = cand 411 break 412 } 413 } 414 415 if repo == nil { 416 return IndexStateCorrupt, fn 417 } 418 419 if repo.IndexOptions != o.GetHash() { 420 return IndexStateOption, fn 421 } 422 423 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) { 424 return IndexStateContent, fn 425 } 426 427 // We can mutate repo since it lives in the scope of this function call. 428 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil { 429 // non-nil err means we are trying to update an immutable field => 430 // reindex content. 431 log.Printf("warn: immutable field changed, requires re-index: %s", err) 432 return IndexStateContent, fn 433 } else if updated { 434 return IndexStateMeta, fn 435 } 436 437 return IndexStateEqual, fn 438} 439 440// FindRepositoryMetadata returns the index metadata for the repository 441// specified in the options. 'ok' is false if the repository's metadata 442// couldn't be found or if an error occurred. 443func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) { 444 shard := o.findShard() 445 if shard == "" { 446 return nil, nil, false, nil 447 } 448 449 repositories, metadata, err := ReadMetadataPathAlive(shard) 450 if err != nil { 451 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err) 452 } 453 454 ID := o.RepositoryDescription.ID 455 for _, r := range repositories { 456 // compound shards contain multiple repositories, so we 457 // have to pick only the one we're looking for 458 if r.ID == ID { 459 return r, metadata, true, nil 460 } 461 } 462 463 // If we're here, then we're somehow in a state where we found a matching 464 // shard that's missing the repository metadata we're looking for. This 465 // should never happen. 466 name := o.RepositoryDescription.Name 467 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name) 468} 469 470func (o *Options) findShard() string { 471 for _, v := range readVersions { 472 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 473 if _, err := os.Stat(fn); err == nil { 474 return fn 475 } 476 } 477 478 // Brute force finding the shard in compound shards. We should only hit this 479 // code path for repositories that don't exist yet or are in compound shards. 480 return o.findCompoundShard() 481} 482 483func (o *Options) findCompoundShard() string { 484 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt")) 485 if err != nil { 486 return "" 487 } 488 for _, fn := range compoundShards { 489 if containsRepo(fn, o.RepositoryDescription.ID) { 490 return fn 491 } 492 } 493 494 return "" 495} 496 497func (o *Options) FindAllShards() []string { 498 for _, v := range readVersions { 499 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 500 if _, err := os.Stat(fn); err == nil { 501 shards := []string{fn} 502 for i := 1; ; i++ { 503 fn := o.shardNameVersion(v.IndexFormatVersion, i) 504 if _, err := os.Stat(fn); err != nil { 505 return shards 506 } 507 shards = append(shards, fn) 508 } 509 } 510 } 511 512 // lazily fallback to findShard which will look for a compound shard. 513 if fn := o.findShard(); fn != "" { 514 return []string{fn} 515 } 516 517 return nil 518} 519 520// IgnoreSizeMax determines whether the max size should be ignored. 521func (o *Options) IgnoreSizeMax(name string) bool { 522 // A pattern match will override preceding pattern matches. 523 for i := len(o.LargeFiles) - 1; i >= 0; i-- { 524 pattern := strings.TrimSpace(o.LargeFiles[i]) 525 negated, validatedPattern := checkIsNegatePattern(pattern) 526 527 if m, _ := doublestar.PathMatch(validatedPattern, name); m { 528 if negated { 529 return false 530 } else { 531 return true 532 } 533 } 534 } 535 536 return false 537} 538 539func checkIsNegatePattern(pattern string) (bool, string) { 540 negate := "!" 541 542 // if negated then strip prefix meta character which identifies negated filter pattern 543 if strings.HasPrefix(pattern, negate) { 544 return true, pattern[len(negate):] 545 } 546 547 return false, pattern 548} 549 550// NewBuilder creates a new Builder instance. 551func NewBuilder(opts Options) (*Builder, error) { 552 opts.SetDefaults() 553 if opts.RepositoryDescription.Name == "" { 554 return nil, fmt.Errorf("builder: must set Name") 555 } 556 557 b := &Builder{ 558 opts: opts, 559 throttle: make(chan int, opts.Parallelism), 560 finishedShards: map[string]string{}, 561 } 562 563 parserBins, err := ctags.NewParserBinMap( 564 b.opts.CTagsPath, 565 b.opts.ScipCTagsPath, 566 opts.LanguageMap, 567 b.opts.CTagsMustSucceed, 568 ) 569 if err != nil { 570 return nil, err 571 } 572 573 b.parserBins = parserBins 574 575 if opts.IsDelta { 576 // Delta shards build on top of previously existing shards. 577 // As a consequence, the shardNum for delta shards starts from 578 // the number following the most recently generated shard - not 0. 579 // 580 // Using this numbering scheme allows all the shards to be 581 // discovered as a set. 582 shards := b.opts.FindAllShards() 583 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one 584 } 585 586 if _, err := b.newShardBuilder(); err != nil { 587 return nil, err 588 } 589 590 now := time.Now() 591 b.indexTime = now 592 b.id = xid.NewWithTime(now).String() 593 594 return b, nil 595} 596 597// AddFile is a convenience wrapper for the Add method 598func (b *Builder) AddFile(name string, content []byte) error { 599 return b.Add(Document{Name: name, Content: content}) 600} 601 602func (b *Builder) Add(doc Document) error { 603 if b.finishCalled { 604 return nil 605 } 606 607 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name) 608 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile { 609 // We could pass the document on to the shardbuilder, but if 610 // we pass through a part of the source tree with binary/large 611 // files, the corresponding shard would be mostly empty, so 612 // insert a reason here too. 613 doc.SkipReason = SkipReasonTooLarge 614 } else if skip := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); skip != SkipReasonNone { 615 doc.SkipReason = skip 616 } 617 618 b.todo = append(b.todo, &doc) 619 620 if doc.SkipReason == SkipReasonNone { 621 b.size += len(doc.Name) + len(doc.Content) 622 } else { 623 b.size += len(doc.Name) 624 // Drop the content if we are skipping the document. Skipped content is not counted towards the 625 // shard size limit, so otherwise we might buffer too much data in memory before flushing. 626 doc.Content = nil 627 } 628 629 if b.size > b.opts.ShardMax { 630 return b.flush() 631 } 632 633 return nil 634} 635 636// MarkFileAsChangedOrRemoved indicates that the file specified by the given path 637// has been changed or removed since the last indexing job for this repository. 638// 639// If this build is a delta build, these files will be tombstoned in the older shards for this repository. 640func (b *Builder) MarkFileAsChangedOrRemoved(path string) { 641 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path) 642} 643 644// Finish creates a last shard from the buffered documents, and clears 645// stale shards from previous runs. This should always be called, also 646// in failure cases, to ensure cleanup. 647// 648// It is safe to call Finish() multiple times. 649func (b *Builder) Finish() error { 650 if b.finishCalled { 651 return b.buildError 652 } 653 654 b.finishCalled = true 655 656 b.flush() 657 b.building.Wait() 658 659 if b.buildError != nil { 660 for tmp := range b.finishedShards { 661 log.Printf("Builder.Finish %s", tmp) 662 os.Remove(tmp) 663 } 664 b.finishedShards = map[string]string{} 665 return b.buildError 666 } 667 668 // map of temporary -> final names for all updated shards + shard metadata files 669 artifactPaths := make(map[string]string) 670 maps.Copy(artifactPaths, b.finishedShards) 671 672 oldShards := b.opts.FindAllShards() 673 674 if b.opts.IsDelta { 675 // Delta shard builds need to update FileTombstone and branch commit information for all 676 // existing shards 677 for _, shard := range oldShards { 678 repositories, _, err := ReadMetadataPathAlive(shard) 679 if err != nil { 680 return fmt.Errorf("reading metadata from shard %q: %w", shard, err) 681 } 682 683 if len(repositories) > 1 { 684 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard) 685 } 686 687 if len(repositories) == 0 { 688 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard) 689 } 690 691 repository := repositories[0] 692 if repository.ID != b.opts.RepositoryDescription.ID { 693 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name) 694 } 695 696 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil { 697 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles)) 698 } 699 700 for _, f := range b.opts.changedOrRemovedFiles { 701 repository.FileTombstones[f] = struct{}{} 702 } 703 704 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) { 705 return deltaBranchSetError{ 706 shardName: shard, 707 old: repository.Branches, 708 new: b.opts.RepositoryDescription.Branches, 709 } 710 } 711 712 if b.opts.GetHash() != repository.IndexOptions { 713 return &deltaIndexOptionsMismatchError{ 714 shardName: shard, 715 newOptions: b.opts.HashOptions(), 716 } 717 } 718 719 repository.Branches = b.opts.RepositoryDescription.Branches 720 721 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate 722 723 tempPath, finalPath, err := JsonMarshalRepoMetaTemp(shard, repository) 724 if err != nil { 725 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err) 726 } 727 728 artifactPaths[tempPath] = finalPath 729 } 730 } 731 732 // We mark finished shards as empty when we successfully finish. Return now 733 // to allow call sites to call Finish idempotently. 734 if len(artifactPaths) == 0 { 735 return b.buildError 736 } 737 738 // Collect a map of the old shards on disk. For each new shard we replace we 739 // delete it from toDelete. Anything remaining in toDelete will be removed 740 // after we have renamed everything into place. 741 742 var toDelete map[string]struct{} 743 if !b.opts.IsDelta { 744 // Non-delta shard builds delete all existing shards before they write out 745 // new ones. 746 // By contrast, delta shard builds work by stacking changes on top of existing shards. 747 // So, we skip populating the toDelete map if we're building delta shards. 748 749 toDelete = make(map[string]struct{}) 750 for _, name := range oldShards { 751 paths, err := IndexFilePaths(name) 752 if err != nil { 753 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err) 754 } 755 for _, p := range paths { 756 toDelete[p] = struct{}{} 757 } 758 } 759 } 760 761 for tmp, final := range artifactPaths { 762 if err := os.Rename(tmp, final); err != nil { 763 b.buildError = err 764 continue 765 } 766 767 delete(toDelete, final) 768 } 769 770 b.finishedShards = map[string]string{} 771 772 for p := range toDelete { 773 // Don't delete compound shards, set tombstones instead. 774 if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") { 775 if !strings.HasSuffix(p, ".zoekt") { 776 continue 777 } 778 err := SetTombstone(p, b.opts.RepositoryDescription.ID) 779 b.buildError = err 780 continue 781 } 782 log.Printf("removing old shard file: %s", p) 783 if err := os.Remove(p); err != nil { 784 b.buildError = err 785 } 786 } 787 788 return b.buildError 789} 790 791// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true 792// iff both slices specify the same set of branch names in the same order. 793func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool { 794 if len(a) != len(b) { 795 return false 796 } 797 798 for i := range a { 799 x, y := a[i], b[i] 800 if x.Name != y.Name { 801 return false 802 } 803 } 804 805 return true 806} 807 808func (b *Builder) flush() error { 809 todo := b.todo 810 b.todo = nil 811 b.size = 0 812 b.errMu.Lock() 813 defer b.errMu.Unlock() 814 if b.buildError != nil { 815 return b.buildError 816 } 817 818 hasShard := b.nextShardNum > 0 819 if len(todo) == 0 && hasShard { 820 return nil 821 } 822 823 shard := b.nextShardNum 824 b.nextShardNum++ 825 826 if b.opts.Parallelism > 1 { 827 b.building.Add(1) 828 b.throttle <- 1 829 go func() { 830 done, err := b.buildShard(todo, shard) 831 <-b.throttle 832 833 b.errMu.Lock() 834 defer b.errMu.Unlock() 835 if err != nil && b.buildError == nil { 836 b.buildError = err 837 } 838 if err == nil { 839 b.finishedShards[done.temp] = done.final 840 } 841 b.building.Done() 842 }() 843 } else { 844 // No goroutines when we're not parallel. This 845 // simplifies memory profiling. 846 done, err := b.buildShard(todo, shard) 847 b.buildError = err 848 if err == nil { 849 b.finishedShards[done.temp] = done.final 850 } 851 852 return b.buildError 853 } 854 855 return nil 856} 857 858// map [0,inf) to [0,1) monotonically 859func squashRange(j int) float64 { 860 x := float64(j) 861 return x / (1 + x) 862} 863 864type rankedDoc struct { 865 *Document 866 rank []float64 867} 868 869// rank returns a vector of scores which is used at index-time to sort documents 870// before writing them to disk. The order of documents in the shard is important 871// at query time, because earlier documents receive a boost at query time and 872// have a higher chance of being searched before limits kick in. 873func rank(d *Document, origIdx int) []float64 { 874 skipped := 0.0 875 if d.SkipReason != SkipReasonNone { 876 skipped = 1.0 877 } 878 879 generated := 0.0 880 if enry.IsGenerated(d.Name, d.Content) { 881 generated = 1.0 882 } 883 884 vendor := 0.0 885 if enry.IsVendor(d.Name) { 886 vendor = 1.0 887 } 888 889 test := 0.0 890 if enry.IsTest(d.Name) { 891 test = 1.0 892 } 893 894 // Smaller is earlier (=better). 895 return []float64{ 896 // Always place skipped docs last 897 skipped, 898 899 // Prefer docs that are not generated 900 generated, 901 902 // Prefer docs that are not vendored 903 vendor, 904 905 // Prefer docs that are not tests 906 test, 907 908 // With short names 909 squashRange(len(d.Name)), 910 911 // With many symbols 912 1.0 - squashRange(len(d.Symbols)), 913 914 // With short content 915 squashRange(len(d.Content)), 916 917 // That is present is as many branches as possible 918 1.0 - squashRange(len(d.Branches)), 919 920 // Preserve original ordering. 921 squashRange(origIdx), 922 } 923} 924 925func sortDocuments(todo []*Document) { 926 rs := make([]rankedDoc, 0, len(todo)) 927 for i, t := range todo { 928 rd := rankedDoc{t, rank(t, i)} 929 rs = append(rs, rd) 930 } 931 sort.Slice(rs, func(i, j int) bool { 932 r1 := rs[i].rank 933 r2 := rs[j].rank 934 for i := range r1 { 935 if r1[i] < r2[i] { 936 return true 937 } 938 if r1[i] > r2[i] { 939 return false 940 } 941 } 942 943 return false 944 }) 945 for i := range todo { 946 todo[i] = rs[i].Document 947 } 948} 949 950func (b *Builder) buildShard(todo []*Document, nextShardNum int) (*finishedShard, error) { 951 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") { 952 err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins) 953 if b.opts.CTagsMustSucceed && err != nil { 954 return nil, err 955 } 956 if err != nil { 957 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err) 958 } 959 } 960 961 name := b.opts.shardName(nextShardNum) 962 963 shardBuilder, err := b.newShardBuilder() 964 if err != nil { 965 return nil, err 966 } 967 968 sortDocuments(todo) 969 970 for idx, t := range todo { 971 if err := shardBuilder.Add(*t); err != nil { 972 return nil, err 973 } 974 975 if idx%10_000 == 0 { 976 b.CheckMemoryUsage() 977 } 978 } 979 980 return b.writeShard(name, shardBuilder) 981} 982 983// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the 984// configured threshold. NOTE: this method is expensive and should only be used for debugging. 985func (b *Builder) CheckMemoryUsage() { 986 // Don't check memory if heap profiling is disabled, or we've already written 10 profiles 987 if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 { 988 return 989 } 990 991 var m runtime.MemStats 992 runtime.ReadMemStats(&m) 993 994 if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() { 995 defer b.heapProfileMu.Unlock() 996 997 log.Printf("writing memory profile, allocated heap: %s", humanize.Bytes(m.HeapAlloc)) 998 name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum)) 999 f, err := os.Create(name) 1000 if err != nil { 1001 log.Printf("failed to create memory profile file: %v", err) 1002 return 1003 } 1004 1005 err = pprof.WriteHeapProfile(f) 1006 if err != nil { 1007 log.Printf("failed to write memory profile: %v", err) 1008 } 1009 1010 b.heapProfileNum++ 1011 } 1012} 1013 1014func (b *Builder) newShardBuilder() (*ShardBuilder, error) { 1015 desc := b.opts.RepositoryDescription 1016 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != "" 1017 desc.SubRepoMap = b.opts.SubRepositories 1018 desc.IndexOptions = b.opts.GetHash() 1019 1020 shardBuilder, err := NewShardBuilder(&desc) 1021 if err != nil { 1022 return nil, err 1023 } 1024 shardBuilder.IndexTime = b.indexTime 1025 shardBuilder.ID = b.id 1026 return shardBuilder, nil 1027} 1028 1029func (b *Builder) writeShard(fn string, ib *ShardBuilder) (*finishedShard, error) { 1030 dir := filepath.Dir(fn) 1031 if err := os.MkdirAll(dir, 0o700); err != nil { 1032 return nil, err 1033 } 1034 1035 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp") 1036 if err != nil { 1037 return nil, err 1038 } 1039 if runtime.GOOS != "windows" { 1040 if err := f.Chmod(0o666 &^ umask); err != nil { 1041 return nil, err 1042 } 1043 } 1044 1045 defer f.Close() 1046 if err := ib.Write(f); err != nil { 1047 return nil, err 1048 } 1049 fi, err := f.Stat() 1050 if err != nil { 1051 return nil, err 1052 } 1053 if err := f.Close(); err != nil { 1054 return nil, err 1055 } 1056 1057 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n", 1058 fn, 1059 fi.Size(), 1060 float64(fi.Size())/float64(ib.ContentSize()+1), 1061 ib.NumFiles()) 1062 1063 return &finishedShard{f.Name(), fn}, nil 1064} 1065 1066type deltaBranchSetError struct { 1067 shardName string 1068 old, new []zoekt.RepositoryBranch 1069} 1070 1071func (e deltaBranchSetError) Error() string { 1072 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new) 1073} 1074 1075type deltaIndexOptionsMismatchError struct { 1076 shardName string 1077 newOptions HashOptions 1078} 1079 1080func (e *deltaIndexOptionsMismatchError) Error() string { 1081 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions) 1082} 1083 1084// umask holds the Umask of the current process 1085var umask os.FileMode 1086 1087func init() { 1088 umask = os.FileMode(unix.Umask(0)) 1089 unix.Umask(int(umask)) 1090}