fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// Package index contains logic for building Zoekt indexes. NOTE: this package is not considered 16// part of the public API, and it is not recommended to rely on it in external code. 17package index 18 19import ( 20 "cmp" 21 "crypto/sha1" 22 "flag" 23 "fmt" 24 "log" 25 "net/url" 26 "os" 27 "os/exec" 28 "path" 29 "path/filepath" 30 "reflect" 31 "runtime" 32 "runtime/pprof" 33 "sort" 34 "strconv" 35 "strings" 36 "sync" 37 "time" 38 39 "github.com/bmatcuk/doublestar" 40 "github.com/dustin/go-humanize" 41 "github.com/go-enry/go-enry/v2" 42 "github.com/rs/xid" 43 "golang.org/x/sys/unix" 44 45 "maps" 46 47 "github.com/sourcegraph/zoekt" 48 "github.com/sourcegraph/zoekt/internal/ctags" 49) 50 51var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt") 52 53// Branch describes a single branch version. 54type Branch struct { 55 Name string 56 Version string 57} 58 59// Options sets options for the index building. 60type Options struct { 61 // IndexDir is a directory that holds *.zoekt index files. 62 IndexDir string 63 64 // SizeMax is the maximum file size 65 SizeMax int 66 67 // Parallelism is the maximum number of shards to index in parallel 68 Parallelism int 69 70 // ShardMax sets the maximum corpus size for a single shard 71 ShardMax int 72 73 // TrigramMax sets the maximum number of distinct trigrams per document. 74 TrigramMax int 75 76 // RepositoryDescription holds names and URLs for the repository. 77 RepositoryDescription zoekt.Repository 78 79 // SubRepositories is a path => sub repository map. 80 SubRepositories map[string]*zoekt.Repository 81 82 // DisableCTags disables the generation of ctags metadata. 83 DisableCTags bool 84 85 // CtagsPath is the path to the ctags binary to run, or empty 86 // if a valid binary couldn't be found. 87 CTagsPath string 88 89 // Same as CTagsPath but for scip-ctags 90 ScipCTagsPath string 91 92 // If set, ctags must succeed. 93 CTagsMustSucceed bool 94 95 // LargeFiles is a slice of glob patterns, including ** for any number 96 // of directories, where matching file paths should be indexed 97 // regardless of their size. The full pattern syntax is here: 98 // https://github.com/bmatcuk/doublestar/tree/v1#patterns. 99 LargeFiles []string 100 101 // IsDelta is true if this run contains only the changed documents since the 102 // last run. 103 IsDelta bool 104 105 // changedOrRemovedFiles is a list of file paths that have been changed or removed 106 // since the last indexing job for this repository. These files will be tombstoned 107 // in the older shards for this repository. 108 changedOrRemovedFiles []string 109 110 LanguageMap ctags.LanguageMap 111 112 // ShardMerging is true if builder should respect compound shards. This is a 113 // Sourcegraph specific option. 114 ShardMerging bool 115 116 // HeapProfileTriggerBytes is the heap allocation in bytes that will trigger a memory profile. If 0, no memory profile 117 // will be triggered. Note this trigger looks at total heap allocation (which includes both inuse and garbage objects). 118 // 119 // Profiles will be written to files named `index-memory.prof.n` in the index directory. No more than 10 files are written. 120 // 121 // Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile. 122 HeapProfileTriggerBytes uint64 123 124 // ShardPrefix is the prefix of the shard. It defaults to the repository name. 125 ShardPrefix string 126} 127 128// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building. 129type HashOptions struct { 130 sizeMax int 131 disableCTags bool 132 ctagsPath string 133 cTagsMustSucceed bool 134 largeFiles []string 135} 136 137func (o *Options) HashOptions() HashOptions { 138 return HashOptions{ 139 sizeMax: o.SizeMax, 140 disableCTags: o.DisableCTags, 141 ctagsPath: o.CTagsPath, 142 cTagsMustSucceed: o.CTagsMustSucceed, 143 largeFiles: o.LargeFiles, 144 } 145} 146 147func (o *Options) GetHash() string { 148 h := o.HashOptions() 149 hasher := sha1.New() 150 151 hasher.Write([]byte(h.ctagsPath)) 152 hasher.Write(fmt.Appendf(nil, "%t", h.cTagsMustSucceed)) 153 hasher.Write(fmt.Appendf(nil, "%d", h.sizeMax)) 154 hasher.Write(fmt.Appendf(nil, "%q", h.largeFiles)) 155 hasher.Write(fmt.Appendf(nil, "%t", h.disableCTags)) 156 157 return fmt.Sprintf("%x", hasher.Sum(nil)) 158} 159 160type largeFilesFlag struct{ *Options } 161 162func (f largeFilesFlag) String() string { 163 // From flag.Value documentation: 164 // 165 // The flag package may call the String method with a zero-valued receiver, 166 // such as a nil pointer. 167 if f.Options == nil { 168 return "" 169 } 170 s := append([]string{""}, f.LargeFiles...) 171 return strings.Join(s, "-large_file ") 172} 173 174func (f largeFilesFlag) Set(value string) error { 175 f.LargeFiles = append(f.LargeFiles, value) 176 return nil 177} 178 179// Flags adds flags for build options to fs. It is the "inverse" of Args. 180func (o *Options) Flags(fs *flag.FlagSet) { 181 x := *o 182 x.SetDefaults() 183 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size") 184 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document") 185 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard") 186 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.") 187 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices") 188 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.") 189 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.") 190 fs.StringVar(&o.ShardPrefix, "shard_prefix", x.ShardPrefix, "the prefix of the shard. Defaults to repository name") 191 192 // Sourcegraph specific 193 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.") 194 fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.") 195} 196 197// Args generates command line arguments for o. It is the "inverse" of Flags. 198func (o *Options) Args() []string { 199 var args []string 200 201 if o.SizeMax != 0 { 202 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax)) 203 } 204 205 if o.TrigramMax != 0 { 206 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax)) 207 } 208 209 if o.ShardMax != 0 { 210 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax)) 211 } 212 213 if o.Parallelism != 0 { 214 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism)) 215 } 216 217 if o.IndexDir != "" { 218 args = append(args, "-index", o.IndexDir) 219 } 220 221 if o.CTagsMustSucceed { 222 args = append(args, "-require_ctags") 223 } 224 225 for _, a := range o.LargeFiles { 226 args = append(args, "-large_file", a) 227 } 228 229 // Sourcegraph specific 230 if o.DisableCTags { 231 args = append(args, "-disable_ctags") 232 } 233 234 if o.ShardMerging { 235 args = append(args, "-shard_merging") 236 } 237 238 if o.ShardPrefix != "" { 239 args = append(args, "-shard_prefix", o.ShardPrefix) 240 } 241 242 return args 243} 244 245// Builder manages (parallel) creation of uniformly sized shards. The 246// builder buffers up documents until it collects enough documents and 247// then builds a shard and writes. 248type Builder struct { 249 opts Options 250 throttle chan int 251 252 nextShardNum int 253 todo []*Document 254 docChecker DocChecker 255 size int 256 257 parserBins ctags.ParserBinMap 258 building sync.WaitGroup 259 260 errMu sync.Mutex 261 buildError error 262 263 // temp name => final name for finished shards. We only rename 264 // them once all shards succeed to avoid Frankstein corpuses. 265 finishedShards map[string]string 266 267 // indexTime is set by tests for doing reproducible builds. 268 indexTime time.Time 269 270 // heapProfileMu is used to ensure that only one memory profile is written at a time 271 heapProfileMu sync.Mutex 272 heapProfileNum int 273 274 // a sortable 20 chars long id. 275 id string 276 277 finishCalled bool 278} 279 280type finishedShard struct { 281 temp, final string 282} 283 284func checkCTags() string { 285 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" { 286 return ctags 287 } 288 289 if ctags, err := exec.LookPath("universal-ctags"); err == nil { 290 return ctags 291 } 292 293 return "" 294} 295 296func checkScipCTags() string { 297 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" { 298 return ctags 299 } 300 301 if ctags, err := exec.LookPath("scip-ctags"); err == nil { 302 return ctags 303 } 304 305 return "" 306} 307 308// SetDefaults sets reasonable default options. 309func (o *Options) SetDefaults() { 310 if o.CTagsPath == "" && !o.DisableCTags { 311 o.CTagsPath = checkCTags() 312 } 313 314 if o.ScipCTagsPath == "" && !o.DisableCTags { 315 o.ScipCTagsPath = checkScipCTags() 316 } 317 318 if o.Parallelism == 0 { 319 o.Parallelism = 4 320 } 321 if o.SizeMax == 0 { 322 o.SizeMax = 2 << 20 323 } 324 if o.ShardMax == 0 { 325 o.ShardMax = 100 << 20 326 } 327 if o.TrigramMax == 0 { 328 o.TrigramMax = 20000 329 } 330 331 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" { 332 parsed, _ := url.Parse(o.RepositoryDescription.URL) 333 if parsed != nil { 334 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path) 335 } 336 } 337} 338 339// ShardName returns the name the given index shard. 340func (o *Options) shardName(n int) string { 341 return o.shardNameVersion(IndexFormatVersion, n) 342} 343 344func (o *Options) shardNameVersion(version, n int) string { 345 return ShardName(o.IndexDir, cmp.Or(o.ShardPrefix, o.RepositoryDescription.Name), version, n) 346} 347 348type IndexState string 349 350const ( 351 IndexStateMissing IndexState = "missing" 352 IndexStateCorrupt IndexState = "corrupt" 353 IndexStateVersion IndexState = "version-mismatch" 354 IndexStateOption IndexState = "option-mismatch" 355 IndexStateMeta IndexState = "meta-mismatch" 356 IndexStateContent IndexState = "content-mismatch" 357 IndexStateEqual IndexState = "equal" 358) 359 360var readVersions = []struct { 361 IndexFormatVersion int 362 FeatureVersion int 363}{{ 364 IndexFormatVersion: IndexFormatVersion, 365 FeatureVersion: FeatureVersion, 366}, { 367 IndexFormatVersion: NextIndexFormatVersion, 368 FeatureVersion: FeatureVersion, 369}} 370 371// IncrementalSkipIndexing returns true if the index present on disk matches 372// the build options. 373func (o *Options) IncrementalSkipIndexing() bool { 374 state, _ := o.IndexState() 375 return state == IndexStateEqual 376} 377 378// IndexState checks how the index present on disk compares to the build 379// options and returns the IndexState and the name of the first shard. 380func (o *Options) IndexState() (IndexState, string) { 381 // Open the latest version we support that is on disk. 382 fn := o.findShard() 383 if fn == "" { 384 return IndexStateMissing, fn 385 } 386 387 repos, index, err := ReadMetadataPathAlive(fn) 388 if os.IsNotExist(err) { 389 return IndexStateMissing, fn 390 } else if err != nil { 391 return IndexStateCorrupt, fn 392 } 393 394 for _, v := range readVersions { 395 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion { 396 return IndexStateVersion, fn 397 } 398 } 399 400 var repo *zoekt.Repository 401 for _, cand := range repos { 402 if cand.Name == o.RepositoryDescription.Name { 403 repo = cand 404 break 405 } 406 } 407 408 if repo == nil { 409 return IndexStateCorrupt, fn 410 } 411 412 if repo.IndexOptions != o.GetHash() { 413 return IndexStateOption, fn 414 } 415 416 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) { 417 return IndexStateContent, fn 418 } 419 420 // We can mutate repo since it lives in the scope of this function call. 421 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil { 422 // non-nil err means we are trying to update an immutable field => 423 // reindex content. 424 log.Printf("warn: immutable field changed, requires re-index: %s", err) 425 return IndexStateContent, fn 426 } else if updated { 427 return IndexStateMeta, fn 428 } 429 430 return IndexStateEqual, fn 431} 432 433// FindRepositoryMetadata returns the index metadata for the repository 434// specified in the options. 'ok' is false if the repository's metadata 435// couldn't be found or if an error occurred. 436func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) { 437 shard := o.findShard() 438 if shard == "" { 439 return nil, nil, false, nil 440 } 441 442 repositories, metadata, err := ReadMetadataPathAlive(shard) 443 if err != nil { 444 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err) 445 } 446 447 ID := o.RepositoryDescription.ID 448 for _, r := range repositories { 449 // compound shards contain multiple repositories, so we 450 // have to pick only the one we're looking for 451 if r.ID == ID { 452 return r, metadata, true, nil 453 } 454 } 455 456 // If we're here, then we're somehow in a state where we found a matching 457 // shard that's missing the repository metadata we're looking for. This 458 // should never happen. 459 name := o.RepositoryDescription.Name 460 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name) 461} 462 463func (o *Options) findShard() string { 464 for _, v := range readVersions { 465 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 466 if _, err := os.Stat(fn); err == nil { 467 return fn 468 } 469 } 470 471 // Brute force finding the shard in compound shards. We should only hit this 472 // code path for repositories that don't exist yet or are in compound shards. 473 return o.findCompoundShard() 474} 475 476func (o *Options) findCompoundShard() string { 477 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt")) 478 if err != nil { 479 return "" 480 } 481 for _, fn := range compoundShards { 482 if containsRepo(fn, o.RepositoryDescription.ID) { 483 return fn 484 } 485 } 486 487 return "" 488} 489 490func (o *Options) FindAllShards() []string { 491 for _, v := range readVersions { 492 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 493 if _, err := os.Stat(fn); err == nil { 494 shards := []string{fn} 495 for i := 1; ; i++ { 496 fn := o.shardNameVersion(v.IndexFormatVersion, i) 497 if _, err := os.Stat(fn); err != nil { 498 return shards 499 } 500 shards = append(shards, fn) 501 } 502 } 503 } 504 505 // lazily fallback to findShard which will look for a compound shard. 506 if fn := o.findShard(); fn != "" { 507 return []string{fn} 508 } 509 510 return nil 511} 512 513// IgnoreSizeMax determines whether the max size should be ignored. 514func (o *Options) IgnoreSizeMax(name string) bool { 515 // A pattern match will override preceding pattern matches. 516 for i := len(o.LargeFiles) - 1; i >= 0; i-- { 517 pattern := strings.TrimSpace(o.LargeFiles[i]) 518 negated, validatedPattern := checkIsNegatePattern(pattern) 519 520 if m, _ := doublestar.PathMatch(validatedPattern, name); m { 521 if negated { 522 return false 523 } else { 524 return true 525 } 526 } 527 } 528 529 return false 530} 531 532func checkIsNegatePattern(pattern string) (bool, string) { 533 negate := "!" 534 535 // if negated then strip prefix meta character which identifies negated filter pattern 536 if strings.HasPrefix(pattern, negate) { 537 return true, pattern[len(negate):] 538 } 539 540 return false, pattern 541} 542 543// NewBuilder creates a new Builder instance. 544func NewBuilder(opts Options) (*Builder, error) { 545 opts.SetDefaults() 546 if opts.RepositoryDescription.Name == "" { 547 return nil, fmt.Errorf("builder: must set Name") 548 } 549 550 b := &Builder{ 551 opts: opts, 552 throttle: make(chan int, opts.Parallelism), 553 finishedShards: map[string]string{}, 554 } 555 556 parserBins, err := ctags.NewParserBinMap( 557 b.opts.CTagsPath, 558 b.opts.ScipCTagsPath, 559 opts.LanguageMap, 560 b.opts.CTagsMustSucceed, 561 ) 562 if err != nil { 563 return nil, err 564 } 565 566 b.parserBins = parserBins 567 568 if opts.IsDelta { 569 // Delta shards build on top of previously existing shards. 570 // As a consequence, the shardNum for delta shards starts from 571 // the number following the most recently generated shard - not 0. 572 // 573 // Using this numbering scheme allows all the shards to be 574 // discovered as a set. 575 shards := b.opts.FindAllShards() 576 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one 577 } 578 579 if _, err := b.newShardBuilder(); err != nil { 580 return nil, err 581 } 582 583 now := time.Now() 584 b.indexTime = now 585 b.id = xid.NewWithTime(now).String() 586 587 return b, nil 588} 589 590// AddFile is a convenience wrapper for the Add method 591func (b *Builder) AddFile(name string, content []byte) error { 592 return b.Add(Document{Name: name, Content: content}) 593} 594 595func (b *Builder) Add(doc Document) error { 596 if b.finishCalled { 597 return nil 598 } 599 600 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name) 601 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile { 602 // We could pass the document on to the shardbuilder, but if 603 // we pass through a part of the source tree with binary/large 604 // files, the corresponding shard would be mostly empty, so 605 // insert a reason here too. 606 doc.SkipReason = SkipReasonTooLarge 607 } else if skip := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); skip != SkipReasonNone { 608 doc.SkipReason = skip 609 } 610 611 b.todo = append(b.todo, &doc) 612 613 if doc.SkipReason == SkipReasonNone { 614 b.size += len(doc.Name) + len(doc.Content) 615 } else { 616 b.size += len(doc.Name) 617 // Drop the content if we are skipping the document. Skipped content is not counted towards the 618 // shard size limit, so otherwise we might buffer too much data in memory before flushing. 619 doc.Content = nil 620 } 621 622 if b.size > b.opts.ShardMax { 623 return b.flush() 624 } 625 626 return nil 627} 628 629// MarkFileAsChangedOrRemoved indicates that the file specified by the given path 630// has been changed or removed since the last indexing job for this repository. 631// 632// If this build is a delta build, these files will be tombstoned in the older shards for this repository. 633func (b *Builder) MarkFileAsChangedOrRemoved(path string) { 634 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path) 635} 636 637// Finish creates a last shard from the buffered documents, and clears 638// stale shards from previous runs. This should always be called, also 639// in failure cases, to ensure cleanup. 640// 641// It is safe to call Finish() multiple times. 642func (b *Builder) Finish() error { 643 if b.finishCalled { 644 return b.buildError 645 } 646 647 b.finishCalled = true 648 649 b.flush() 650 b.building.Wait() 651 652 if b.buildError != nil { 653 for tmp := range b.finishedShards { 654 log.Printf("Builder.Finish %s", tmp) 655 os.Remove(tmp) 656 } 657 b.finishedShards = map[string]string{} 658 return b.buildError 659 } 660 661 // map of temporary -> final names for all updated shards + shard metadata files 662 artifactPaths := make(map[string]string) 663 maps.Copy(artifactPaths, b.finishedShards) 664 665 oldShards := b.opts.FindAllShards() 666 667 if b.opts.IsDelta { 668 // Delta shard builds need to update FileTombstone and branch commit information for all 669 // existing shards 670 for _, shard := range oldShards { 671 repositories, _, err := ReadMetadataPathAlive(shard) 672 if err != nil { 673 return fmt.Errorf("reading metadata from shard %q: %w", shard, err) 674 } 675 676 if len(repositories) > 1 { 677 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard) 678 } 679 680 if len(repositories) == 0 { 681 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard) 682 } 683 684 repository := repositories[0] 685 if repository.ID != b.opts.RepositoryDescription.ID { 686 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name) 687 } 688 689 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil { 690 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles)) 691 } 692 693 for _, f := range b.opts.changedOrRemovedFiles { 694 repository.FileTombstones[f] = struct{}{} 695 } 696 697 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) { 698 return deltaBranchSetError{ 699 shardName: shard, 700 old: repository.Branches, 701 new: b.opts.RepositoryDescription.Branches, 702 } 703 } 704 705 if b.opts.GetHash() != repository.IndexOptions { 706 return &deltaIndexOptionsMismatchError{ 707 shardName: shard, 708 newOptions: b.opts.HashOptions(), 709 } 710 } 711 712 repository.Branches = b.opts.RepositoryDescription.Branches 713 714 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate 715 716 tempPath, finalPath, err := JsonMarshalRepoMetaTemp(shard, repository) 717 if err != nil { 718 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err) 719 } 720 721 artifactPaths[tempPath] = finalPath 722 } 723 } 724 725 // We mark finished shards as empty when we successfully finish. Return now 726 // to allow call sites to call Finish idempotently. 727 if len(artifactPaths) == 0 { 728 return b.buildError 729 } 730 731 // Collect a map of the old shards on disk. For each new shard we replace we 732 // delete it from toDelete. Anything remaining in toDelete will be removed 733 // after we have renamed everything into place. 734 735 var toDelete map[string]struct{} 736 if !b.opts.IsDelta { 737 // Non-delta shard builds delete all existing shards before they write out 738 // new ones. 739 // By contrast, delta shard builds work by stacking changes on top of existing shards. 740 // So, we skip populating the toDelete map if we're building delta shards. 741 742 toDelete = make(map[string]struct{}) 743 for _, name := range oldShards { 744 paths, err := IndexFilePaths(name) 745 if err != nil { 746 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err) 747 } 748 for _, p := range paths { 749 toDelete[p] = struct{}{} 750 } 751 } 752 } 753 754 for tmp, final := range artifactPaths { 755 if err := os.Rename(tmp, final); err != nil { 756 b.buildError = err 757 continue 758 } 759 760 delete(toDelete, final) 761 } 762 763 b.finishedShards = map[string]string{} 764 765 for p := range toDelete { 766 // Don't delete compound shards, set tombstones instead. 767 if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") { 768 if !strings.HasSuffix(p, ".zoekt") { 769 continue 770 } 771 err := SetTombstone(p, b.opts.RepositoryDescription.ID) 772 b.buildError = err 773 continue 774 } 775 log.Printf("removing old shard file: %s", p) 776 if err := os.Remove(p); err != nil { 777 b.buildError = err 778 } 779 } 780 781 return b.buildError 782} 783 784// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true 785// iff both slices specify the same set of branch names in the same order. 786func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool { 787 if len(a) != len(b) { 788 return false 789 } 790 791 for i := range a { 792 x, y := a[i], b[i] 793 if x.Name != y.Name { 794 return false 795 } 796 } 797 798 return true 799} 800 801func (b *Builder) flush() error { 802 todo := b.todo 803 b.todo = nil 804 b.size = 0 805 b.errMu.Lock() 806 defer b.errMu.Unlock() 807 if b.buildError != nil { 808 return b.buildError 809 } 810 811 hasShard := b.nextShardNum > 0 812 if len(todo) == 0 && hasShard { 813 return nil 814 } 815 816 shard := b.nextShardNum 817 b.nextShardNum++ 818 819 if b.opts.Parallelism > 1 { 820 b.building.Add(1) 821 b.throttle <- 1 822 go func() { 823 done, err := b.buildShard(todo, shard) 824 <-b.throttle 825 826 b.errMu.Lock() 827 defer b.errMu.Unlock() 828 if err != nil && b.buildError == nil { 829 b.buildError = err 830 } 831 if err == nil { 832 b.finishedShards[done.temp] = done.final 833 } 834 b.building.Done() 835 }() 836 } else { 837 // No goroutines when we're not parallel. This 838 // simplifies memory profiling. 839 done, err := b.buildShard(todo, shard) 840 b.buildError = err 841 if err == nil { 842 b.finishedShards[done.temp] = done.final 843 } 844 845 return b.buildError 846 } 847 848 return nil 849} 850 851// map [0,inf) to [0,1) monotonically 852func squashRange(j int) float64 { 853 x := float64(j) 854 return x / (1 + x) 855} 856 857type rankedDoc struct { 858 *Document 859 rank []float64 860} 861 862// rank returns a vector of scores which is used at index-time to sort documents 863// before writing them to disk. The order of documents in the shard is important 864// at query time, because earlier documents receive a boost at query time and 865// have a higher chance of being searched before limits kick in. 866func rank(d *Document, origIdx int) []float64 { 867 skipped := 0.0 868 if d.SkipReason != SkipReasonNone { 869 skipped = 1.0 870 } 871 872 generated := 0.0 873 if enry.IsGenerated(d.Name, d.Content) { 874 generated = 1.0 875 } 876 877 vendor := 0.0 878 if enry.IsVendor(d.Name) { 879 vendor = 1.0 880 } 881 882 test := 0.0 883 if enry.IsTest(d.Name) { 884 test = 1.0 885 } 886 887 // Smaller is earlier (=better). 888 return []float64{ 889 // Always place skipped docs last 890 skipped, 891 892 // Prefer docs that are not generated 893 generated, 894 895 // Prefer docs that are not vendored 896 vendor, 897 898 // Prefer docs that are not tests 899 test, 900 901 // With short names 902 squashRange(len(d.Name)), 903 904 // With many symbols 905 1.0 - squashRange(len(d.Symbols)), 906 907 // With short content 908 squashRange(len(d.Content)), 909 910 // That is present is as many branches as possible 911 1.0 - squashRange(len(d.Branches)), 912 913 // Preserve original ordering. 914 squashRange(origIdx), 915 } 916} 917 918func sortDocuments(todo []*Document) { 919 rs := make([]rankedDoc, 0, len(todo)) 920 for i, t := range todo { 921 rd := rankedDoc{t, rank(t, i)} 922 rs = append(rs, rd) 923 } 924 sort.Slice(rs, func(i, j int) bool { 925 r1 := rs[i].rank 926 r2 := rs[j].rank 927 for i := range r1 { 928 if r1[i] < r2[i] { 929 return true 930 } 931 if r1[i] > r2[i] { 932 return false 933 } 934 } 935 936 return false 937 }) 938 for i := range todo { 939 todo[i] = rs[i].Document 940 } 941} 942 943func (b *Builder) buildShard(todo []*Document, nextShardNum int) (*finishedShard, error) { 944 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") { 945 err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins) 946 if b.opts.CTagsMustSucceed && err != nil { 947 return nil, err 948 } 949 if err != nil { 950 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err) 951 } 952 } 953 954 name := b.opts.shardName(nextShardNum) 955 956 shardBuilder, err := b.newShardBuilder() 957 if err != nil { 958 return nil, err 959 } 960 961 sortDocuments(todo) 962 963 for idx, t := range todo { 964 if err := shardBuilder.Add(*t); err != nil { 965 return nil, err 966 } 967 968 if idx%10_000 == 0 { 969 b.CheckMemoryUsage() 970 } 971 } 972 973 return b.writeShard(name, shardBuilder) 974} 975 976// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the 977// configured threshold. NOTE: this method is expensive and should only be used for debugging. 978func (b *Builder) CheckMemoryUsage() { 979 // Don't check memory if heap profiling is disabled, or we've already written 10 profiles 980 if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 { 981 return 982 } 983 984 var m runtime.MemStats 985 runtime.ReadMemStats(&m) 986 987 if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() { 988 defer b.heapProfileMu.Unlock() 989 990 log.Printf("writing memory profile, allocated heap: %s", humanize.Bytes(m.HeapAlloc)) 991 name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum)) 992 f, err := os.Create(name) 993 if err != nil { 994 log.Printf("failed to create memory profile file: %v", err) 995 return 996 } 997 998 err = pprof.WriteHeapProfile(f) 999 if err != nil { 1000 log.Printf("failed to write memory profile: %v", err) 1001 } 1002 1003 b.heapProfileNum++ 1004 } 1005} 1006 1007func (b *Builder) newShardBuilder() (*ShardBuilder, error) { 1008 desc := b.opts.RepositoryDescription 1009 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != "" 1010 desc.SubRepoMap = b.opts.SubRepositories 1011 desc.IndexOptions = b.opts.GetHash() 1012 1013 shardBuilder, err := NewShardBuilder(&desc) 1014 if err != nil { 1015 return nil, err 1016 } 1017 shardBuilder.IndexTime = b.indexTime 1018 shardBuilder.ID = b.id 1019 return shardBuilder, nil 1020} 1021 1022func (b *Builder) writeShard(fn string, ib *ShardBuilder) (*finishedShard, error) { 1023 dir := filepath.Dir(fn) 1024 if err := os.MkdirAll(dir, 0o700); err != nil { 1025 return nil, err 1026 } 1027 1028 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp") 1029 if err != nil { 1030 return nil, err 1031 } 1032 if runtime.GOOS != "windows" { 1033 if err := f.Chmod(0o666 &^ umask); err != nil { 1034 return nil, err 1035 } 1036 } 1037 1038 defer f.Close() 1039 if err := ib.Write(f); err != nil { 1040 return nil, err 1041 } 1042 fi, err := f.Stat() 1043 if err != nil { 1044 return nil, err 1045 } 1046 if err := f.Close(); err != nil { 1047 return nil, err 1048 } 1049 1050 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n", 1051 fn, 1052 fi.Size(), 1053 float64(fi.Size())/float64(ib.ContentSize()+1), 1054 ib.NumFiles()) 1055 1056 return &finishedShard{f.Name(), fn}, nil 1057} 1058 1059type deltaBranchSetError struct { 1060 shardName string 1061 old, new []zoekt.RepositoryBranch 1062} 1063 1064func (e deltaBranchSetError) Error() string { 1065 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new) 1066} 1067 1068type deltaIndexOptionsMismatchError struct { 1069 shardName string 1070 newOptions HashOptions 1071} 1072 1073func (e *deltaIndexOptionsMismatchError) Error() string { 1074 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions) 1075} 1076 1077// umask holds the Umask of the current process 1078var umask os.FileMode 1079 1080func init() { 1081 umask = os.FileMode(unix.Umask(0)) 1082 unix.Umask(int(umask)) 1083}