fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// Package index contains logic for building Zoekt indexes. NOTE: this package is not considered 16// part of the public API, and it is not recommended to rely on it in external code. 17package index 18 19import ( 20 "crypto/sha1" 21 "flag" 22 "fmt" 23 "log" 24 "net/url" 25 "os" 26 "os/exec" 27 "path" 28 "path/filepath" 29 "reflect" 30 "runtime" 31 "runtime/pprof" 32 "sort" 33 "strconv" 34 "strings" 35 "sync" 36 "time" 37 38 "github.com/bmatcuk/doublestar" 39 "github.com/dustin/go-humanize" 40 "github.com/go-enry/go-enry/v2" 41 "github.com/rs/xid" 42 "golang.org/x/sys/unix" 43 44 "maps" 45 46 "github.com/sourcegraph/zoekt" 47 "github.com/sourcegraph/zoekt/internal/ctags" 48 "github.com/sourcegraph/zoekt/internal/tenant" 49) 50 51var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt") 52 53// Branch describes a single branch version. 54type Branch struct { 55 Name string 56 Version string 57} 58 59// Options sets options for the index building. 60type Options struct { 61 // IndexDir is a directory that holds *.zoekt index files. 62 IndexDir string 63 64 // ShardPrefixOverride sets the prefix for shards name 65 ShardPrefixOverride string 66 67 // SizeMax is the maximum file size 68 SizeMax int 69 70 // Parallelism is the maximum number of shards to index in parallel 71 Parallelism int 72 73 // ShardMax sets the maximum corpus size for a single shard 74 ShardMax int 75 76 // TrigramMax sets the maximum number of distinct trigrams per document. 77 TrigramMax int 78 79 // RepositoryDescription holds names and URLs for the repository. 80 RepositoryDescription zoekt.Repository 81 82 // SubRepositories is a path => sub repository map. 83 SubRepositories map[string]*zoekt.Repository 84 85 // DisableCTags disables the generation of ctags metadata. 86 DisableCTags bool 87 88 // CtagsPath is the path to the ctags binary to run, or empty 89 // if a valid binary couldn't be found. 90 CTagsPath string 91 92 // Same as CTagsPath but for scip-ctags 93 ScipCTagsPath string 94 95 // If set, ctags must succeed. 96 CTagsMustSucceed bool 97 98 // LargeFiles is a slice of glob patterns, including ** for any number 99 // of directories, where matching file paths should be indexed 100 // regardless of their size. The full pattern syntax is here: 101 // https://github.com/bmatcuk/doublestar/tree/v1#patterns. 102 LargeFiles []string 103 104 // IsDelta is true if this run contains only the changed documents since the 105 // last run. 106 IsDelta bool 107 108 // changedOrRemovedFiles is a list of file paths that have been changed or removed 109 // since the last indexing job for this repository. These files will be tombstoned 110 // in the older shards for this repository. 111 changedOrRemovedFiles []string 112 113 LanguageMap ctags.LanguageMap 114 115 // ShardMerging is true if builder should respect compound shards. This is a 116 // Sourcegraph specific option. 117 ShardMerging bool 118 119 // HeapProfileTriggerBytes is the heap allocation in bytes that will trigger a memory profile. If 0, no memory profile 120 // will be triggered. Note this trigger looks at total heap allocation (which includes both inuse and garbage objects). 121 // 122 // Profiles will be written to files named `index-memory.prof.n` in the index directory. No more than 10 files are written. 123 // 124 // Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile. 125 HeapProfileTriggerBytes uint64 126} 127 128// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building. 129type HashOptions struct { 130 sizeMax int 131 disableCTags bool 132 ctagsPath string 133 cTagsMustSucceed bool 134 largeFiles []string 135} 136 137func (o *Options) HashOptions() HashOptions { 138 return HashOptions{ 139 sizeMax: o.SizeMax, 140 disableCTags: o.DisableCTags, 141 ctagsPath: o.CTagsPath, 142 cTagsMustSucceed: o.CTagsMustSucceed, 143 largeFiles: o.LargeFiles, 144 } 145} 146 147func (o *Options) GetHash() string { 148 h := o.HashOptions() 149 hasher := sha1.New() 150 151 hasher.Write([]byte(h.ctagsPath)) 152 hasher.Write(fmt.Appendf(nil, "%t", h.cTagsMustSucceed)) 153 hasher.Write(fmt.Appendf(nil, "%d", h.sizeMax)) 154 hasher.Write(fmt.Appendf(nil, "%q", h.largeFiles)) 155 hasher.Write(fmt.Appendf(nil, "%t", h.disableCTags)) 156 157 return fmt.Sprintf("%x", hasher.Sum(nil)) 158} 159 160type largeFilesFlag struct{ *Options } 161 162func (f largeFilesFlag) String() string { 163 // From flag.Value documentation: 164 // 165 // The flag package may call the String method with a zero-valued receiver, 166 // such as a nil pointer. 167 if f.Options == nil { 168 return "" 169 } 170 s := append([]string{""}, f.LargeFiles...) 171 return strings.Join(s, "-large_file ") 172} 173 174func (f largeFilesFlag) Set(value string) error { 175 f.LargeFiles = append(f.LargeFiles, value) 176 return nil 177} 178 179// Flags adds flags for build options to fs. It is the "inverse" of Args. 180func (o *Options) Flags(fs *flag.FlagSet) { 181 x := *o 182 x.SetDefaults() 183 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size") 184 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document") 185 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard") 186 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.") 187 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices") 188 fs.StringVar(&o.ShardPrefixOverride, "shard_prefix_override", x.ShardPrefixOverride, "prefix for shard name") 189 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.") 190 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.") 191 192 // Sourcegraph specific 193 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.") 194 fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.") 195} 196 197// Args generates command line arguments for o. It is the "inverse" of Flags. 198func (o *Options) Args() []string { 199 var args []string 200 201 if o.SizeMax != 0 { 202 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax)) 203 } 204 205 if o.TrigramMax != 0 { 206 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax)) 207 } 208 209 if o.ShardMax != 0 { 210 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax)) 211 } 212 213 if o.Parallelism != 0 { 214 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism)) 215 } 216 217 if o.IndexDir != "" { 218 args = append(args, "-index", o.IndexDir) 219 } 220 221 if o.ShardPrefixOverride != "" { 222 args = append(args, "-shard_prefix_override", o.ShardPrefixOverride) 223 } 224 225 if o.CTagsMustSucceed { 226 args = append(args, "-require_ctags") 227 } 228 229 for _, a := range o.LargeFiles { 230 args = append(args, "-large_file", a) 231 } 232 233 // Sourcegraph specific 234 if o.DisableCTags { 235 args = append(args, "-disable_ctags") 236 } 237 238 if o.ShardMerging { 239 args = append(args, "-shard_merging") 240 } 241 242 return args 243} 244 245// Builder manages (parallel) creation of uniformly sized shards. The 246// builder buffers up documents until it collects enough documents and 247// then builds a shard and writes. 248type Builder struct { 249 opts Options 250 throttle chan int 251 252 nextShardNum int 253 todo []*Document 254 docChecker DocChecker 255 size int 256 257 parserBins ctags.ParserBinMap 258 building sync.WaitGroup 259 260 errMu sync.Mutex 261 buildError error 262 263 // temp name => final name for finished shards. We only rename 264 // them once all shards succeed to avoid Frankstein corpuses. 265 finishedShards map[string]string 266 267 // indexTime is set by tests for doing reproducible builds. 268 indexTime time.Time 269 270 // heapProfileMu is used to ensure that only one memory profile is written at a time 271 heapProfileMu sync.Mutex 272 heapProfileNum int 273 274 // a sortable 20 chars long id. 275 id string 276 277 finishCalled bool 278} 279 280type finishedShard struct { 281 temp, final string 282} 283 284func checkCTags() string { 285 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" { 286 return ctags 287 } 288 289 if ctags, err := exec.LookPath("universal-ctags"); err == nil { 290 return ctags 291 } 292 293 return "" 294} 295 296func checkScipCTags() string { 297 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" { 298 return ctags 299 } 300 301 if ctags, err := exec.LookPath("scip-ctags"); err == nil { 302 return ctags 303 } 304 305 return "" 306} 307 308// SetDefaults sets reasonable default options. 309func (o *Options) SetDefaults() { 310 if o.CTagsPath == "" && !o.DisableCTags { 311 o.CTagsPath = checkCTags() 312 } 313 314 if o.ScipCTagsPath == "" && !o.DisableCTags { 315 o.ScipCTagsPath = checkScipCTags() 316 } 317 318 if o.Parallelism == 0 { 319 o.Parallelism = 4 320 } 321 if o.SizeMax == 0 { 322 o.SizeMax = 2 << 20 323 } 324 if o.ShardMax == 0 { 325 o.ShardMax = 100 << 20 326 } 327 if o.TrigramMax == 0 { 328 o.TrigramMax = 20000 329 } 330 331 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" { 332 parsed, _ := url.Parse(o.RepositoryDescription.URL) 333 if parsed != nil { 334 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path) 335 } 336 } 337} 338 339// ShardName returns the name the given index shard. 340func (o *Options) shardName(n int) string { 341 return o.shardNameVersion(IndexFormatVersion, n) 342} 343 344func (o *Options) shardNameVersion(version, n int) string { 345 prefix := o.ShardPrefixOverride // ShardPrefixOverride takes precedence to support custom shard naming strategies 346 347 if prefix == "" { 348 // Sourcegraph specific: We use IDs in shard names on multi-tenant 349 // instances to prevent conflicts. 350 if tenant.UseIDBasedShardNames() { 351 prefix = fmt.Sprintf("%09d_%09d", o.RepositoryDescription.TenantID, o.RepositoryDescription.ID) 352 } else { 353 prefix = o.RepositoryDescription.Name 354 } 355 } 356 357 return shardName(o.IndexDir, prefix, version, n) 358} 359 360type IndexState string 361 362const ( 363 IndexStateMissing IndexState = "missing" 364 IndexStateCorrupt IndexState = "corrupt" 365 IndexStateVersion IndexState = "version-mismatch" 366 IndexStateOption IndexState = "option-mismatch" 367 IndexStateMeta IndexState = "meta-mismatch" 368 IndexStateContent IndexState = "content-mismatch" 369 IndexStateEqual IndexState = "equal" 370) 371 372var readVersions = []struct { 373 IndexFormatVersion int 374 FeatureVersion int 375}{{ 376 IndexFormatVersion: IndexFormatVersion, 377 FeatureVersion: FeatureVersion, 378}, { 379 IndexFormatVersion: NextIndexFormatVersion, 380 FeatureVersion: FeatureVersion, 381}} 382 383// IncrementalSkipIndexing returns true if the index present on disk matches 384// the build options. 385func (o *Options) IncrementalSkipIndexing() bool { 386 state, _ := o.IndexState() 387 return state == IndexStateEqual 388} 389 390// IndexState checks how the index present on disk compares to the build 391// options and returns the IndexState and the name of the first shard. 392func (o *Options) IndexState() (IndexState, string) { 393 // Open the latest version we support that is on disk. 394 fn := o.findShard() 395 if fn == "" { 396 return IndexStateMissing, fn 397 } 398 399 repos, index, err := ReadMetadataPathAlive(fn) 400 if os.IsNotExist(err) { 401 return IndexStateMissing, fn 402 } else if err != nil { 403 return IndexStateCorrupt, fn 404 } 405 406 for _, v := range readVersions { 407 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion { 408 return IndexStateVersion, fn 409 } 410 } 411 412 var repo *zoekt.Repository 413 for _, cand := range repos { 414 if cand.Name == o.RepositoryDescription.Name { 415 repo = cand 416 break 417 } 418 } 419 420 if repo == nil { 421 return IndexStateCorrupt, fn 422 } 423 424 if repo.IndexOptions != o.GetHash() { 425 return IndexStateOption, fn 426 } 427 428 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) { 429 return IndexStateContent, fn 430 } 431 432 // We can mutate repo since it lives in the scope of this function call. 433 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil { 434 // non-nil err means we are trying to update an immutable field => 435 // reindex content. 436 log.Printf("warn: immutable field changed, requires re-index: %s", err) 437 return IndexStateContent, fn 438 } else if updated { 439 return IndexStateMeta, fn 440 } 441 442 return IndexStateEqual, fn 443} 444 445// FindRepositoryMetadata returns the index metadata for the repository 446// specified in the options. 'ok' is false if the repository's metadata 447// couldn't be found or if an error occurred. 448func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) { 449 shard := o.findShard() 450 if shard == "" { 451 return nil, nil, false, nil 452 } 453 454 repositories, metadata, err := ReadMetadataPathAlive(shard) 455 if err != nil { 456 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err) 457 } 458 459 ID := o.RepositoryDescription.ID 460 for _, r := range repositories { 461 // compound shards contain multiple repositories, so we 462 // have to pick only the one we're looking for 463 if r.ID == ID { 464 return r, metadata, true, nil 465 } 466 } 467 468 // If we're here, then we're somehow in a state where we found a matching 469 // shard that's missing the repository metadata we're looking for. This 470 // should never happen. 471 name := o.RepositoryDescription.Name 472 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name) 473} 474 475func (o *Options) findShard() string { 476 for _, v := range readVersions { 477 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 478 if _, err := os.Stat(fn); err == nil { 479 return fn 480 } 481 } 482 483 // Brute force finding the shard in compound shards. We should only hit this 484 // code path for repositories that don't exist yet or are in compound shards. 485 return o.findCompoundShard() 486} 487 488func (o *Options) findCompoundShard() string { 489 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt")) 490 if err != nil { 491 return "" 492 } 493 for _, fn := range compoundShards { 494 if containsRepo(fn, o.RepositoryDescription.ID) { 495 return fn 496 } 497 } 498 499 return "" 500} 501 502func (o *Options) FindAllShards() []string { 503 for _, v := range readVersions { 504 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 505 if _, err := os.Stat(fn); err == nil { 506 shards := []string{fn} 507 for i := 1; ; i++ { 508 fn := o.shardNameVersion(v.IndexFormatVersion, i) 509 if _, err := os.Stat(fn); err != nil { 510 return shards 511 } 512 shards = append(shards, fn) 513 } 514 } 515 } 516 517 // lazily fallback to findShard which will look for a compound shard. 518 if fn := o.findShard(); fn != "" { 519 return []string{fn} 520 } 521 522 return nil 523} 524 525// IgnoreSizeMax determines whether the max size should be ignored. 526func (o *Options) IgnoreSizeMax(name string) bool { 527 // A pattern match will override preceding pattern matches. 528 for i := len(o.LargeFiles) - 1; i >= 0; i-- { 529 pattern := strings.TrimSpace(o.LargeFiles[i]) 530 negated, validatedPattern := checkIsNegatePattern(pattern) 531 532 if m, _ := doublestar.PathMatch(validatedPattern, name); m { 533 if negated { 534 return false 535 } else { 536 return true 537 } 538 } 539 } 540 541 return false 542} 543 544func checkIsNegatePattern(pattern string) (bool, string) { 545 negate := "!" 546 547 // if negated then strip prefix meta character which identifies negated filter pattern 548 if strings.HasPrefix(pattern, negate) { 549 return true, pattern[len(negate):] 550 } 551 552 return false, pattern 553} 554 555// NewBuilder creates a new Builder instance. 556func NewBuilder(opts Options) (*Builder, error) { 557 opts.SetDefaults() 558 if opts.RepositoryDescription.Name == "" { 559 return nil, fmt.Errorf("builder: must set Name") 560 } 561 562 b := &Builder{ 563 opts: opts, 564 throttle: make(chan int, opts.Parallelism), 565 finishedShards: map[string]string{}, 566 } 567 568 parserBins, err := ctags.NewParserBinMap( 569 b.opts.CTagsPath, 570 b.opts.ScipCTagsPath, 571 opts.LanguageMap, 572 b.opts.CTagsMustSucceed, 573 ) 574 if err != nil { 575 return nil, err 576 } 577 578 b.parserBins = parserBins 579 580 if opts.IsDelta { 581 // Delta shards build on top of previously existing shards. 582 // As a consequence, the shardNum for delta shards starts from 583 // the number following the most recently generated shard - not 0. 584 // 585 // Using this numbering scheme allows all the shards to be 586 // discovered as a set. 587 shards := b.opts.FindAllShards() 588 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one 589 } 590 591 if _, err := b.newShardBuilder(); err != nil { 592 return nil, err 593 } 594 595 now := time.Now() 596 b.indexTime = now 597 b.id = xid.NewWithTime(now).String() 598 599 return b, nil 600} 601 602// AddFile is a convenience wrapper for the Add method 603func (b *Builder) AddFile(name string, content []byte) error { 604 return b.Add(Document{Name: name, Content: content}) 605} 606 607func (b *Builder) Add(doc Document) error { 608 if b.finishCalled { 609 return nil 610 } 611 612 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name) 613 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile { 614 // We could pass the document on to the shardbuilder, but if 615 // we pass through a part of the source tree with binary/large 616 // files, the corresponding shard would be mostly empty, so 617 // insert a reason here too. 618 doc.SkipReason = SkipReasonTooLarge 619 } else if skip := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); skip != SkipReasonNone { 620 doc.SkipReason = skip 621 } 622 623 b.todo = append(b.todo, &doc) 624 625 if doc.SkipReason == SkipReasonNone { 626 b.size += len(doc.Name) + len(doc.Content) 627 } else { 628 b.size += len(doc.Name) 629 // Drop the content if we are skipping the document. Skipped content is not counted towards the 630 // shard size limit, so otherwise we might buffer too much data in memory before flushing. 631 doc.Content = nil 632 } 633 634 if b.size > b.opts.ShardMax { 635 return b.flush() 636 } 637 638 return nil 639} 640 641// MarkFileAsChangedOrRemoved indicates that the file specified by the given path 642// has been changed or removed since the last indexing job for this repository. 643// 644// If this build is a delta build, these files will be tombstoned in the older shards for this repository. 645func (b *Builder) MarkFileAsChangedOrRemoved(path string) { 646 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path) 647} 648 649// Finish creates a last shard from the buffered documents, and clears 650// stale shards from previous runs. This should always be called, also 651// in failure cases, to ensure cleanup. 652// 653// It is safe to call Finish() multiple times. 654func (b *Builder) Finish() error { 655 if b.finishCalled { 656 return b.buildError 657 } 658 659 b.finishCalled = true 660 661 b.flush() 662 b.building.Wait() 663 664 if b.buildError != nil { 665 for tmp := range b.finishedShards { 666 log.Printf("Builder.Finish %s", tmp) 667 os.Remove(tmp) 668 } 669 b.finishedShards = map[string]string{} 670 return b.buildError 671 } 672 673 // map of temporary -> final names for all updated shards + shard metadata files 674 artifactPaths := make(map[string]string) 675 maps.Copy(artifactPaths, b.finishedShards) 676 677 oldShards := b.opts.FindAllShards() 678 679 if b.opts.IsDelta { 680 // Delta shard builds need to update FileTombstone and branch commit information for all 681 // existing shards 682 for _, shard := range oldShards { 683 repositories, _, err := ReadMetadataPathAlive(shard) 684 if err != nil { 685 return fmt.Errorf("reading metadata from shard %q: %w", shard, err) 686 } 687 688 if len(repositories) > 1 { 689 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard) 690 } 691 692 if len(repositories) == 0 { 693 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard) 694 } 695 696 repository := repositories[0] 697 if repository.ID != b.opts.RepositoryDescription.ID { 698 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name) 699 } 700 701 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil { 702 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles)) 703 } 704 705 for _, f := range b.opts.changedOrRemovedFiles { 706 repository.FileTombstones[f] = struct{}{} 707 } 708 709 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) { 710 return deltaBranchSetError{ 711 shardName: shard, 712 old: repository.Branches, 713 new: b.opts.RepositoryDescription.Branches, 714 } 715 } 716 717 if b.opts.GetHash() != repository.IndexOptions { 718 return &deltaIndexOptionsMismatchError{ 719 shardName: shard, 720 newOptions: b.opts.HashOptions(), 721 } 722 } 723 724 repository.Branches = b.opts.RepositoryDescription.Branches 725 726 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate 727 728 repository.Metadata = b.opts.RepositoryDescription.Metadata 729 730 tempPath, finalPath, err := JsonMarshalRepoMetaTemp(shard, repository) 731 if err != nil { 732 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err) 733 } 734 735 artifactPaths[tempPath] = finalPath 736 } 737 } 738 739 // We mark finished shards as empty when we successfully finish. Return now 740 // to allow call sites to call Finish idempotently. 741 if len(artifactPaths) == 0 { 742 return b.buildError 743 } 744 745 // Collect a map of the old shards on disk. For each new shard we replace we 746 // delete it from toDelete. Anything remaining in toDelete will be removed 747 // after we have renamed everything into place. 748 749 var toDelete map[string]struct{} 750 if !b.opts.IsDelta { 751 // Non-delta shard builds delete all existing shards before they write out 752 // new ones. 753 // By contrast, delta shard builds work by stacking changes on top of existing shards. 754 // So, we skip populating the toDelete map if we're building delta shards. 755 756 toDelete = make(map[string]struct{}) 757 for _, name := range oldShards { 758 paths, err := IndexFilePaths(name) 759 if err != nil { 760 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err) 761 } 762 for _, p := range paths { 763 toDelete[p] = struct{}{} 764 } 765 } 766 } 767 768 for tmp, final := range artifactPaths { 769 if err := os.Rename(tmp, final); err != nil { 770 b.buildError = err 771 continue 772 } 773 774 delete(toDelete, final) 775 } 776 777 b.finishedShards = map[string]string{} 778 779 for p := range toDelete { 780 // Don't delete compound shards, set tombstones instead. 781 if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") { 782 if !strings.HasSuffix(p, ".zoekt") { 783 continue 784 } 785 err := SetTombstone(p, b.opts.RepositoryDescription.ID) 786 b.buildError = err 787 continue 788 } 789 log.Printf("removing old shard file: %s", p) 790 if err := os.Remove(p); err != nil { 791 b.buildError = err 792 } 793 } 794 795 return b.buildError 796} 797 798// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true 799// iff both slices specify the same set of branch names in the same order. 800func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool { 801 if len(a) != len(b) { 802 return false 803 } 804 805 for i := range a { 806 x, y := a[i], b[i] 807 if x.Name != y.Name { 808 return false 809 } 810 } 811 812 return true 813} 814 815func (b *Builder) flush() error { 816 todo := b.todo 817 b.todo = nil 818 b.size = 0 819 b.errMu.Lock() 820 defer b.errMu.Unlock() 821 if b.buildError != nil { 822 return b.buildError 823 } 824 825 hasShard := b.nextShardNum > 0 826 if len(todo) == 0 && hasShard { 827 return nil 828 } 829 830 shard := b.nextShardNum 831 b.nextShardNum++ 832 833 if b.opts.Parallelism > 1 { 834 b.building.Add(1) 835 b.throttle <- 1 836 go func() { 837 done, err := b.buildShard(todo, shard) 838 <-b.throttle 839 840 b.errMu.Lock() 841 defer b.errMu.Unlock() 842 if err != nil && b.buildError == nil { 843 b.buildError = err 844 } 845 if err == nil { 846 b.finishedShards[done.temp] = done.final 847 } 848 b.building.Done() 849 }() 850 } else { 851 // No goroutines when we're not parallel. This 852 // simplifies memory profiling. 853 done, err := b.buildShard(todo, shard) 854 b.buildError = err 855 if err == nil { 856 b.finishedShards[done.temp] = done.final 857 } 858 859 return b.buildError 860 } 861 862 return nil 863} 864 865// map [0,inf) to [0,1) monotonically 866func squashRange(j int) float64 { 867 x := float64(j) 868 return x / (1 + x) 869} 870 871type rankedDoc struct { 872 *Document 873 rank []float64 874} 875 876// rank returns a vector of scores which is used at index-time to sort documents 877// before writing them to disk. The order of documents in the shard is important 878// at query time, because earlier documents receive a boost at query time and 879// have a higher chance of being searched before limits kick in. 880func rank(d *Document, origIdx int) []float64 { 881 skipped := 0.0 882 if d.SkipReason != SkipReasonNone { 883 skipped = 1.0 884 } 885 886 generated := 0.0 887 if enry.IsGenerated(d.Name, d.Content) { 888 generated = 1.0 889 } 890 891 vendor := 0.0 892 if enry.IsVendor(d.Name) { 893 vendor = 1.0 894 } 895 896 test := 0.0 897 if enry.IsTest(d.Name) { 898 test = 1.0 899 } 900 901 // Smaller is earlier (=better). 902 return []float64{ 903 // Always place skipped docs last 904 skipped, 905 906 // Prefer docs that are not generated 907 generated, 908 909 // Prefer docs that are not vendored 910 vendor, 911 912 // Prefer docs that are not tests 913 test, 914 915 // With short names 916 squashRange(len(d.Name)), 917 918 // With many symbols 919 1.0 - squashRange(len(d.Symbols)), 920 921 // With short content 922 squashRange(len(d.Content)), 923 924 // That is present is as many branches as possible 925 1.0 - squashRange(len(d.Branches)), 926 927 // Preserve original ordering. 928 squashRange(origIdx), 929 } 930} 931 932func sortDocuments(todo []*Document) { 933 rs := make([]rankedDoc, 0, len(todo)) 934 for i, t := range todo { 935 rd := rankedDoc{t, rank(t, i)} 936 rs = append(rs, rd) 937 } 938 sort.Slice(rs, func(i, j int) bool { 939 r1 := rs[i].rank 940 r2 := rs[j].rank 941 for i := range r1 { 942 if r1[i] < r2[i] { 943 return true 944 } 945 if r1[i] > r2[i] { 946 return false 947 } 948 } 949 950 return false 951 }) 952 for i := range todo { 953 todo[i] = rs[i].Document 954 } 955} 956 957func (b *Builder) buildShard(todo []*Document, nextShardNum int) (*finishedShard, error) { 958 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") { 959 err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins) 960 if b.opts.CTagsMustSucceed && err != nil { 961 return nil, err 962 } 963 if err != nil { 964 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err) 965 } 966 } 967 968 name := b.opts.shardName(nextShardNum) 969 970 shardBuilder, err := b.newShardBuilder() 971 if err != nil { 972 return nil, err 973 } 974 975 sortDocuments(todo) 976 977 for idx, t := range todo { 978 if err := shardBuilder.Add(*t); err != nil { 979 return nil, err 980 } 981 982 if idx%10_000 == 0 { 983 b.CheckMemoryUsage() 984 } 985 } 986 987 return b.writeShard(name, shardBuilder) 988} 989 990// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the 991// configured threshold. NOTE: this method is expensive and should only be used for debugging. 992func (b *Builder) CheckMemoryUsage() { 993 // Don't check memory if heap profiling is disabled, or we've already written 10 profiles 994 if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 { 995 return 996 } 997 998 var m runtime.MemStats 999 runtime.ReadMemStats(&m) 1000 1001 if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() { 1002 defer b.heapProfileMu.Unlock() 1003 1004 log.Printf("writing memory profile, allocated heap: %s", humanize.Bytes(m.HeapAlloc)) 1005 name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum)) 1006 f, err := os.Create(name) 1007 if err != nil { 1008 log.Printf("failed to create memory profile file: %v", err) 1009 return 1010 } 1011 1012 err = pprof.WriteHeapProfile(f) 1013 if err != nil { 1014 log.Printf("failed to write memory profile: %v", err) 1015 } 1016 1017 b.heapProfileNum++ 1018 } 1019} 1020 1021func (b *Builder) newShardBuilder() (*ShardBuilder, error) { 1022 desc := b.opts.RepositoryDescription 1023 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != "" 1024 desc.SubRepoMap = b.opts.SubRepositories 1025 desc.IndexOptions = b.opts.GetHash() 1026 1027 shardBuilder, err := NewShardBuilder(&desc) 1028 if err != nil { 1029 return nil, err 1030 } 1031 shardBuilder.IndexTime = b.indexTime 1032 shardBuilder.ID = b.id 1033 return shardBuilder, nil 1034} 1035 1036func (b *Builder) writeShard(fn string, ib *ShardBuilder) (*finishedShard, error) { 1037 dir := filepath.Dir(fn) 1038 if err := os.MkdirAll(dir, 0o700); err != nil { 1039 return nil, err 1040 } 1041 1042 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp") 1043 if err != nil { 1044 return nil, err 1045 } 1046 if runtime.GOOS != "windows" { 1047 if err := f.Chmod(0o666 &^ umask); err != nil { 1048 return nil, err 1049 } 1050 } 1051 1052 defer f.Close() 1053 if err := ib.Write(f); err != nil { 1054 return nil, err 1055 } 1056 fi, err := f.Stat() 1057 if err != nil { 1058 return nil, err 1059 } 1060 if err := f.Close(); err != nil { 1061 return nil, err 1062 } 1063 1064 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n", 1065 fn, 1066 fi.Size(), 1067 float64(fi.Size())/float64(ib.ContentSize()+1), 1068 ib.NumFiles()) 1069 1070 return &finishedShard{f.Name(), fn}, nil 1071} 1072 1073type deltaBranchSetError struct { 1074 shardName string 1075 old, new []zoekt.RepositoryBranch 1076} 1077 1078func (e deltaBranchSetError) Error() string { 1079 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new) 1080} 1081 1082type deltaIndexOptionsMismatchError struct { 1083 shardName string 1084 newOptions HashOptions 1085} 1086 1087func (e *deltaIndexOptionsMismatchError) Error() string { 1088 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions) 1089} 1090 1091// umask holds the Umask of the current process 1092var umask os.FileMode 1093 1094func init() { 1095 umask = os.FileMode(unix.Umask(0)) 1096 unix.Umask(int(umask)) 1097}