fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// Package index contains logic for building Zoekt indexes. NOTE: this package is not considered 16// part of the public API, and it is not recommended to rely on it in external code. 17package index 18 19import ( 20 "cmp" 21 "crypto/sha1" 22 "flag" 23 "fmt" 24 "log" 25 "net/url" 26 "os" 27 "os/exec" 28 "path" 29 "path/filepath" 30 "reflect" 31 "runtime" 32 "runtime/pprof" 33 "sort" 34 "strconv" 35 "strings" 36 "sync" 37 "time" 38 39 "github.com/bmatcuk/doublestar" 40 "github.com/dustin/go-humanize" 41 "github.com/go-enry/go-enry/v2" 42 "github.com/rs/xid" 43 "golang.org/x/sys/unix" 44 45 "github.com/sourcegraph/zoekt" 46 "github.com/sourcegraph/zoekt/internal/ctags" 47) 48 49var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt") 50 51// Branch describes a single branch version. 52type Branch struct { 53 Name string 54 Version string 55} 56 57// Options sets options for the index building. 58type Options struct { 59 // IndexDir is a directory that holds *.zoekt index files. 60 IndexDir string 61 62 // SizeMax is the maximum file size 63 SizeMax int 64 65 // Parallelism is the maximum number of shards to index in parallel 66 Parallelism int 67 68 // ShardMax sets the maximum corpus size for a single shard 69 ShardMax int 70 71 // TrigramMax sets the maximum number of distinct trigrams per document. 72 TrigramMax int 73 74 // RepositoryDescription holds names and URLs for the repository. 75 RepositoryDescription zoekt.Repository 76 77 // SubRepositories is a path => sub repository map. 78 SubRepositories map[string]*zoekt.Repository 79 80 // DisableCTags disables the generation of ctags metadata. 81 DisableCTags bool 82 83 // CtagsPath is the path to the ctags binary to run, or empty 84 // if a valid binary couldn't be found. 85 CTagsPath string 86 87 // Same as CTagsPath but for scip-ctags 88 ScipCTagsPath string 89 90 // If set, ctags must succeed. 91 CTagsMustSucceed bool 92 93 // LargeFiles is a slice of glob patterns, including ** for any number 94 // of directories, where matching file paths should be indexed 95 // regardless of their size. The full pattern syntax is here: 96 // https://github.com/bmatcuk/doublestar/tree/v1#patterns. 97 LargeFiles []string 98 99 // IsDelta is true if this run contains only the changed documents since the 100 // last run. 101 IsDelta bool 102 103 // changedOrRemovedFiles is a list of file paths that have been changed or removed 104 // since the last indexing job for this repository. These files will be tombstoned 105 // in the older shards for this repository. 106 changedOrRemovedFiles []string 107 108 LanguageMap ctags.LanguageMap 109 110 // ShardMerging is true if builder should respect compound shards. This is a 111 // Sourcegraph specific option. 112 ShardMerging bool 113 114 // HeapProfileTriggerBytes is the heap allocation in bytes that will trigger a memory profile. If 0, no memory profile 115 // will be triggered. Note this trigger looks at total heap allocation (which includes both inuse and garbage objects). 116 // 117 // Profiles will be written to files named `index-memory.prof.n` in the index directory. No more than 10 files are written. 118 // 119 // Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile. 120 HeapProfileTriggerBytes uint64 121 122 // ShardPrefix is the prefix of the shard. It defaults to the repository name. 123 ShardPrefix string 124} 125 126// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building. 127type HashOptions struct { 128 sizeMax int 129 disableCTags bool 130 ctagsPath string 131 cTagsMustSucceed bool 132 largeFiles []string 133} 134 135func (o *Options) HashOptions() HashOptions { 136 return HashOptions{ 137 sizeMax: o.SizeMax, 138 disableCTags: o.DisableCTags, 139 ctagsPath: o.CTagsPath, 140 cTagsMustSucceed: o.CTagsMustSucceed, 141 largeFiles: o.LargeFiles, 142 } 143} 144 145func (o *Options) GetHash() string { 146 h := o.HashOptions() 147 hasher := sha1.New() 148 149 hasher.Write([]byte(h.ctagsPath)) 150 hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed))) 151 hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax))) 152 hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles))) 153 hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags))) 154 155 return fmt.Sprintf("%x", hasher.Sum(nil)) 156} 157 158type largeFilesFlag struct{ *Options } 159 160func (f largeFilesFlag) String() string { 161 // From flag.Value documentation: 162 // 163 // The flag package may call the String method with a zero-valued receiver, 164 // such as a nil pointer. 165 if f.Options == nil { 166 return "" 167 } 168 s := append([]string{""}, f.LargeFiles...) 169 return strings.Join(s, "-large_file ") 170} 171 172func (f largeFilesFlag) Set(value string) error { 173 f.LargeFiles = append(f.LargeFiles, value) 174 return nil 175} 176 177// Flags adds flags for build options to fs. It is the "inverse" of Args. 178func (o *Options) Flags(fs *flag.FlagSet) { 179 x := *o 180 x.SetDefaults() 181 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size") 182 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document") 183 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard") 184 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.") 185 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices") 186 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.") 187 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.") 188 fs.StringVar(&o.ShardPrefix, "shard_prefix", x.ShardPrefix, "the prefix of the shard. Defaults to repository name") 189 190 // Sourcegraph specific 191 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.") 192 fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.") 193} 194 195// Args generates command line arguments for o. It is the "inverse" of Flags. 196func (o *Options) Args() []string { 197 var args []string 198 199 if o.SizeMax != 0 { 200 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax)) 201 } 202 203 if o.TrigramMax != 0 { 204 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax)) 205 } 206 207 if o.ShardMax != 0 { 208 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax)) 209 } 210 211 if o.Parallelism != 0 { 212 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism)) 213 } 214 215 if o.IndexDir != "" { 216 args = append(args, "-index", o.IndexDir) 217 } 218 219 if o.CTagsMustSucceed { 220 args = append(args, "-require_ctags") 221 } 222 223 for _, a := range o.LargeFiles { 224 args = append(args, "-large_file", a) 225 } 226 227 // Sourcegraph specific 228 if o.DisableCTags { 229 args = append(args, "-disable_ctags") 230 } 231 232 if o.ShardMerging { 233 args = append(args, "-shard_merging") 234 } 235 236 if o.ShardPrefix != "" { 237 args = append(args, "-shard_prefix", o.ShardPrefix) 238 } 239 240 return args 241} 242 243// Builder manages (parallel) creation of uniformly sized shards. The 244// builder buffers up documents until it collects enough documents and 245// then builds a shard and writes. 246type Builder struct { 247 opts Options 248 throttle chan int 249 250 nextShardNum int 251 todo []*Document 252 docChecker DocChecker 253 size int 254 255 parserBins ctags.ParserBinMap 256 building sync.WaitGroup 257 258 errMu sync.Mutex 259 buildError error 260 261 // temp name => final name for finished shards. We only rename 262 // them once all shards succeed to avoid Frankstein corpuses. 263 finishedShards map[string]string 264 265 // indexTime is set by tests for doing reproducible builds. 266 indexTime time.Time 267 268 // heapProfileMu is used to ensure that only one memory profile is written at a time 269 heapProfileMu sync.Mutex 270 heapProfileNum int 271 272 // a sortable 20 chars long id. 273 id string 274 275 finishCalled bool 276} 277 278type finishedShard struct { 279 temp, final string 280} 281 282func checkCTags() string { 283 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" { 284 return ctags 285 } 286 287 if ctags, err := exec.LookPath("universal-ctags"); err == nil { 288 return ctags 289 } 290 291 return "" 292} 293 294func checkScipCTags() string { 295 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" { 296 return ctags 297 } 298 299 if ctags, err := exec.LookPath("scip-ctags"); err == nil { 300 return ctags 301 } 302 303 return "" 304} 305 306// SetDefaults sets reasonable default options. 307func (o *Options) SetDefaults() { 308 if o.CTagsPath == "" && !o.DisableCTags { 309 o.CTagsPath = checkCTags() 310 } 311 312 if o.ScipCTagsPath == "" && !o.DisableCTags { 313 o.ScipCTagsPath = checkScipCTags() 314 } 315 316 if o.Parallelism == 0 { 317 o.Parallelism = 4 318 } 319 if o.SizeMax == 0 { 320 o.SizeMax = 2 << 20 321 } 322 if o.ShardMax == 0 { 323 o.ShardMax = 100 << 20 324 } 325 if o.TrigramMax == 0 { 326 o.TrigramMax = 20000 327 } 328 329 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" { 330 parsed, _ := url.Parse(o.RepositoryDescription.URL) 331 if parsed != nil { 332 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path) 333 } 334 } 335} 336 337// ShardName returns the name the given index shard. 338func (o *Options) shardName(n int) string { 339 return o.shardNameVersion(IndexFormatVersion, n) 340} 341 342func (o *Options) shardNameVersion(version, n int) string { 343 return ShardName(o.IndexDir, cmp.Or(o.ShardPrefix, o.RepositoryDescription.Name), version, n) 344} 345 346type IndexState string 347 348const ( 349 IndexStateMissing IndexState = "missing" 350 IndexStateCorrupt IndexState = "corrupt" 351 IndexStateVersion IndexState = "version-mismatch" 352 IndexStateOption IndexState = "option-mismatch" 353 IndexStateMeta IndexState = "meta-mismatch" 354 IndexStateContent IndexState = "content-mismatch" 355 IndexStateEqual IndexState = "equal" 356) 357 358var readVersions = []struct { 359 IndexFormatVersion int 360 FeatureVersion int 361}{{ 362 IndexFormatVersion: IndexFormatVersion, 363 FeatureVersion: FeatureVersion, 364}, { 365 IndexFormatVersion: NextIndexFormatVersion, 366 FeatureVersion: FeatureVersion, 367}} 368 369// IncrementalSkipIndexing returns true if the index present on disk matches 370// the build options. 371func (o *Options) IncrementalSkipIndexing() bool { 372 state, _ := o.IndexState() 373 return state == IndexStateEqual 374} 375 376// IndexState checks how the index present on disk compares to the build 377// options and returns the IndexState and the name of the first shard. 378func (o *Options) IndexState() (IndexState, string) { 379 // Open the latest version we support that is on disk. 380 fn := o.findShard() 381 if fn == "" { 382 return IndexStateMissing, fn 383 } 384 385 repos, index, err := ReadMetadataPathAlive(fn) 386 if os.IsNotExist(err) { 387 return IndexStateMissing, fn 388 } else if err != nil { 389 return IndexStateCorrupt, fn 390 } 391 392 for _, v := range readVersions { 393 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion { 394 return IndexStateVersion, fn 395 } 396 } 397 398 var repo *zoekt.Repository 399 for _, cand := range repos { 400 if cand.Name == o.RepositoryDescription.Name { 401 repo = cand 402 break 403 } 404 } 405 406 if repo == nil { 407 return IndexStateCorrupt, fn 408 } 409 410 if repo.IndexOptions != o.GetHash() { 411 return IndexStateOption, fn 412 } 413 414 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) { 415 return IndexStateContent, fn 416 } 417 418 // We can mutate repo since it lives in the scope of this function call. 419 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil { 420 // non-nil err means we are trying to update an immutable field => 421 // reindex content. 422 log.Printf("warn: immutable field changed, requires re-index: %s", err) 423 return IndexStateContent, fn 424 } else if updated { 425 return IndexStateMeta, fn 426 } 427 428 return IndexStateEqual, fn 429} 430 431// FindRepositoryMetadata returns the index metadata for the repository 432// specified in the options. 'ok' is false if the repository's metadata 433// couldn't be found or if an error occurred. 434func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) { 435 shard := o.findShard() 436 if shard == "" { 437 return nil, nil, false, nil 438 } 439 440 repositories, metadata, err := ReadMetadataPathAlive(shard) 441 if err != nil { 442 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err) 443 } 444 445 ID := o.RepositoryDescription.ID 446 for _, r := range repositories { 447 // compound shards contain multiple repositories, so we 448 // have to pick only the one we're looking for 449 if r.ID == ID { 450 return r, metadata, true, nil 451 } 452 } 453 454 // If we're here, then we're somehow in a state where we found a matching 455 // shard that's missing the repository metadata we're looking for. This 456 // should never happen. 457 name := o.RepositoryDescription.Name 458 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name) 459} 460 461func (o *Options) findShard() string { 462 for _, v := range readVersions { 463 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 464 if _, err := os.Stat(fn); err == nil { 465 return fn 466 } 467 } 468 469 // Brute force finding the shard in compound shards. We should only hit this 470 // code path for repositories that don't exist yet or are in compound shards. 471 return o.findCompoundShard() 472} 473 474func (o *Options) findCompoundShard() string { 475 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt")) 476 if err != nil { 477 return "" 478 } 479 for _, fn := range compoundShards { 480 if containsRepo(fn, o.RepositoryDescription.ID) { 481 return fn 482 } 483 } 484 485 return "" 486} 487 488func (o *Options) FindAllShards() []string { 489 for _, v := range readVersions { 490 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 491 if _, err := os.Stat(fn); err == nil { 492 shards := []string{fn} 493 for i := 1; ; i++ { 494 fn := o.shardNameVersion(v.IndexFormatVersion, i) 495 if _, err := os.Stat(fn); err != nil { 496 return shards 497 } 498 shards = append(shards, fn) 499 } 500 } 501 } 502 503 // lazily fallback to findShard which will look for a compound shard. 504 if fn := o.findShard(); fn != "" { 505 return []string{fn} 506 } 507 508 return nil 509} 510 511// IgnoreSizeMax determines whether the max size should be ignored. 512func (o *Options) IgnoreSizeMax(name string) bool { 513 // A pattern match will override preceding pattern matches. 514 for i := len(o.LargeFiles) - 1; i >= 0; i-- { 515 pattern := strings.TrimSpace(o.LargeFiles[i]) 516 negated, validatedPattern := checkIsNegatePattern(pattern) 517 518 if m, _ := doublestar.PathMatch(validatedPattern, name); m { 519 if negated { 520 return false 521 } else { 522 return true 523 } 524 } 525 } 526 527 return false 528} 529 530func checkIsNegatePattern(pattern string) (bool, string) { 531 negate := "!" 532 533 // if negated then strip prefix meta character which identifies negated filter pattern 534 if strings.HasPrefix(pattern, negate) { 535 return true, pattern[len(negate):] 536 } 537 538 return false, pattern 539} 540 541// NewBuilder creates a new Builder instance. 542func NewBuilder(opts Options) (*Builder, error) { 543 opts.SetDefaults() 544 if opts.RepositoryDescription.Name == "" { 545 return nil, fmt.Errorf("builder: must set Name") 546 } 547 548 b := &Builder{ 549 opts: opts, 550 throttle: make(chan int, opts.Parallelism), 551 finishedShards: map[string]string{}, 552 } 553 554 parserBins, err := ctags.NewParserBinMap( 555 b.opts.CTagsPath, 556 b.opts.ScipCTagsPath, 557 opts.LanguageMap, 558 b.opts.CTagsMustSucceed, 559 ) 560 if err != nil { 561 return nil, err 562 } 563 564 b.parserBins = parserBins 565 566 if opts.IsDelta { 567 // Delta shards build on top of previously existing shards. 568 // As a consequence, the shardNum for delta shards starts from 569 // the number following the most recently generated shard - not 0. 570 // 571 // Using this numbering scheme allows all the shards to be 572 // discovered as a set. 573 shards := b.opts.FindAllShards() 574 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one 575 } 576 577 if _, err := b.newShardBuilder(); err != nil { 578 return nil, err 579 } 580 581 now := time.Now() 582 b.indexTime = now 583 b.id = xid.NewWithTime(now).String() 584 585 return b, nil 586} 587 588// AddFile is a convenience wrapper for the Add method 589func (b *Builder) AddFile(name string, content []byte) error { 590 return b.Add(Document{Name: name, Content: content}) 591} 592 593func (b *Builder) Add(doc Document) error { 594 if b.finishCalled { 595 return nil 596 } 597 598 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name) 599 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile { 600 // We could pass the document on to the shardbuilder, but if 601 // we pass through a part of the source tree with binary/large 602 // files, the corresponding shard would be mostly empty, so 603 // insert a reason here too. 604 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax) 605 } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil { 606 doc.SkipReason = err.Error() 607 } 608 609 b.todo = append(b.todo, &doc) 610 611 if doc.SkipReason == "" { 612 b.size += len(doc.Name) + len(doc.Content) 613 } else { 614 b.size += len(doc.Name) + len(doc.SkipReason) 615 // Drop the content if we are skipping the document. Skipped content is not counted towards the 616 // shard size limit, so otherwise we might buffer too much data in memory before flushing. 617 doc.Content = nil 618 } 619 620 if b.size > b.opts.ShardMax { 621 return b.flush() 622 } 623 624 return nil 625} 626 627// MarkFileAsChangedOrRemoved indicates that the file specified by the given path 628// has been changed or removed since the last indexing job for this repository. 629// 630// If this build is a delta build, these files will be tombstoned in the older shards for this repository. 631func (b *Builder) MarkFileAsChangedOrRemoved(path string) { 632 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path) 633} 634 635// Finish creates a last shard from the buffered documents, and clears 636// stale shards from previous runs. This should always be called, also 637// in failure cases, to ensure cleanup. 638// 639// It is safe to call Finish() multiple times. 640func (b *Builder) Finish() error { 641 if b.finishCalled { 642 return b.buildError 643 } 644 645 b.finishCalled = true 646 647 b.flush() 648 b.building.Wait() 649 650 if b.buildError != nil { 651 for tmp := range b.finishedShards { 652 log.Printf("Builder.Finish %s", tmp) 653 os.Remove(tmp) 654 } 655 b.finishedShards = map[string]string{} 656 return b.buildError 657 } 658 659 // map of temporary -> final names for all updated shards + shard metadata files 660 artifactPaths := make(map[string]string) 661 for tmp, final := range b.finishedShards { 662 artifactPaths[tmp] = final 663 } 664 665 oldShards := b.opts.FindAllShards() 666 667 if b.opts.IsDelta { 668 // Delta shard builds need to update FileTombstone and branch commit information for all 669 // existing shards 670 for _, shard := range oldShards { 671 repositories, _, err := ReadMetadataPathAlive(shard) 672 if err != nil { 673 return fmt.Errorf("reading metadata from shard %q: %w", shard, err) 674 } 675 676 if len(repositories) > 1 { 677 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard) 678 } 679 680 if len(repositories) == 0 { 681 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard) 682 } 683 684 repository := repositories[0] 685 if repository.ID != b.opts.RepositoryDescription.ID { 686 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name) 687 } 688 689 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil { 690 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles)) 691 } 692 693 for _, f := range b.opts.changedOrRemovedFiles { 694 repository.FileTombstones[f] = struct{}{} 695 } 696 697 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) { 698 return deltaBranchSetError{ 699 shardName: shard, 700 old: repository.Branches, 701 new: b.opts.RepositoryDescription.Branches, 702 } 703 } 704 705 if b.opts.GetHash() != repository.IndexOptions { 706 return &deltaIndexOptionsMismatchError{ 707 shardName: shard, 708 newOptions: b.opts.HashOptions(), 709 } 710 } 711 712 repository.Branches = b.opts.RepositoryDescription.Branches 713 714 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate 715 716 tempPath, finalPath, err := JsonMarshalRepoMetaTemp(shard, repository) 717 if err != nil { 718 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err) 719 } 720 721 artifactPaths[tempPath] = finalPath 722 } 723 } 724 725 // We mark finished shards as empty when we successfully finish. Return now 726 // to allow call sites to call Finish idempotently. 727 if len(artifactPaths) == 0 { 728 return b.buildError 729 } 730 731 // Collect a map of the old shards on disk. For each new shard we replace we 732 // delete it from toDelete. Anything remaining in toDelete will be removed 733 // after we have renamed everything into place. 734 735 var toDelete map[string]struct{} 736 if !b.opts.IsDelta { 737 // Non-delta shard builds delete all existing shards before they write out 738 // new ones. 739 // By contrast, delta shard builds work by stacking changes on top of existing shards. 740 // So, we skip populating the toDelete map if we're building delta shards. 741 742 toDelete = make(map[string]struct{}) 743 for _, name := range oldShards { 744 paths, err := IndexFilePaths(name) 745 if err != nil { 746 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err) 747 } 748 for _, p := range paths { 749 toDelete[p] = struct{}{} 750 } 751 } 752 } 753 754 for tmp, final := range artifactPaths { 755 if err := os.Rename(tmp, final); err != nil { 756 b.buildError = err 757 continue 758 } 759 760 delete(toDelete, final) 761 } 762 763 b.finishedShards = map[string]string{} 764 765 for p := range toDelete { 766 // Don't delete compound shards, set tombstones instead. 767 if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") { 768 if !strings.HasSuffix(p, ".zoekt") { 769 continue 770 } 771 err := SetTombstone(p, b.opts.RepositoryDescription.ID) 772 b.buildError = err 773 continue 774 } 775 log.Printf("removing old shard file: %s", p) 776 if err := os.Remove(p); err != nil { 777 b.buildError = err 778 } 779 } 780 781 return b.buildError 782} 783 784// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true 785// iff both slices specify the same set of branch names in the same order. 786func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool { 787 if len(a) != len(b) { 788 return false 789 } 790 791 for i := range a { 792 x, y := a[i], b[i] 793 if x.Name != y.Name { 794 return false 795 } 796 } 797 798 return true 799} 800 801func (b *Builder) flush() error { 802 todo := b.todo 803 b.todo = nil 804 b.size = 0 805 b.errMu.Lock() 806 defer b.errMu.Unlock() 807 if b.buildError != nil { 808 return b.buildError 809 } 810 811 hasShard := b.nextShardNum > 0 812 if len(todo) == 0 && hasShard { 813 return nil 814 } 815 816 shard := b.nextShardNum 817 b.nextShardNum++ 818 819 if b.opts.Parallelism > 1 { 820 b.building.Add(1) 821 b.throttle <- 1 822 go func() { 823 done, err := b.buildShard(todo, shard) 824 <-b.throttle 825 826 b.errMu.Lock() 827 defer b.errMu.Unlock() 828 if err != nil && b.buildError == nil { 829 b.buildError = err 830 } 831 if err == nil { 832 b.finishedShards[done.temp] = done.final 833 } 834 b.building.Done() 835 }() 836 } else { 837 // No goroutines when we're not parallel. This 838 // simplifies memory profiling. 839 done, err := b.buildShard(todo, shard) 840 b.buildError = err 841 if err == nil { 842 b.finishedShards[done.temp] = done.final 843 } 844 845 return b.buildError 846 } 847 848 return nil 849} 850 851// map [0,inf) to [0,1) monotonically 852func squashRange(j int) float64 { 853 x := float64(j) 854 return x / (1 + x) 855} 856 857type rankedDoc struct { 858 *Document 859 rank []float64 860} 861 862// rank returns a vector of scores which is used at index-time to sort documents 863// before writing them to disk. The order of documents in the shard is important 864// at query time, because earlier documents receive a boost at query time and 865// have a higher chance of being searched before limits kick in. 866func rank(d *Document, origIdx int) []float64 { 867 skipped := 0.0 868 if d.SkipReason != "" { 869 skipped = 1.0 870 } 871 872 generated := 0.0 873 if enry.IsGenerated(d.Name, d.Content) { 874 generated = 1.0 875 } 876 877 vendor := 0.0 878 if enry.IsVendor(d.Name) { 879 vendor = 1.0 880 } 881 882 test := 0.0 883 if enry.IsTest(d.Name) { 884 test = 1.0 885 } 886 887 // Smaller is earlier (=better). 888 return []float64{ 889 // Always place skipped docs last 890 skipped, 891 892 // Prefer docs that are not generated 893 generated, 894 895 // Prefer docs that are not vendored 896 vendor, 897 898 // Prefer docs that are not tests 899 test, 900 901 // With short names 902 squashRange(len(d.Name)), 903 904 // With many symbols 905 1.0 - squashRange(len(d.Symbols)), 906 907 // With short content 908 squashRange(len(d.Content)), 909 910 // That is present is as many branches as possible 911 1.0 - squashRange(len(d.Branches)), 912 913 // Preserve original ordering. 914 squashRange(origIdx), 915 } 916} 917 918func sortDocuments(todo []*Document) { 919 rs := make([]rankedDoc, 0, len(todo)) 920 for i, t := range todo { 921 rd := rankedDoc{t, rank(t, i)} 922 rs = append(rs, rd) 923 } 924 sort.Slice(rs, func(i, j int) bool { 925 r1 := rs[i].rank 926 r2 := rs[j].rank 927 for i := range r1 { 928 if r1[i] < r2[i] { 929 return true 930 } 931 if r1[i] > r2[i] { 932 return false 933 } 934 } 935 936 return false 937 }) 938 for i := range todo { 939 todo[i] = rs[i].Document 940 } 941} 942 943func (b *Builder) buildShard(todo []*Document, nextShardNum int) (*finishedShard, error) { 944 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") { 945 err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins) 946 if b.opts.CTagsMustSucceed && err != nil { 947 return nil, err 948 } 949 if err != nil { 950 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err) 951 } 952 } 953 954 name := b.opts.shardName(nextShardNum) 955 956 shardBuilder, err := b.newShardBuilder() 957 if err != nil { 958 return nil, err 959 } 960 961 sortDocuments(todo) 962 963 for idx, t := range todo { 964 if err := shardBuilder.Add(*t); err != nil { 965 return nil, err 966 } 967 968 if idx%10_000 == 0 { 969 b.CheckMemoryUsage() 970 } 971 } 972 973 return b.writeShard(name, shardBuilder) 974} 975 976// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the 977// configured threshold. NOTE: this method is expensive and should only be used for debugging. 978func (b *Builder) CheckMemoryUsage() { 979 // Don't check memory if heap profiling is disabled, or we've already written 10 profiles 980 if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 { 981 return 982 } 983 984 var m runtime.MemStats 985 runtime.ReadMemStats(&m) 986 987 if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() { 988 defer b.heapProfileMu.Unlock() 989 990 log.Printf("writing memory profile, allocated heap: %s", humanize.Bytes(m.HeapAlloc)) 991 name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum)) 992 f, err := os.Create(name) 993 if err != nil { 994 log.Printf("failed to create memory profile file: %v", err) 995 return 996 } 997 998 err = pprof.WriteHeapProfile(f) 999 if err != nil { 1000 log.Printf("failed to write memory profile: %v", err) 1001 } 1002 1003 b.heapProfileNum++ 1004 } 1005} 1006 1007func (b *Builder) newShardBuilder() (*ShardBuilder, error) { 1008 desc := b.opts.RepositoryDescription 1009 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != "" 1010 desc.SubRepoMap = b.opts.SubRepositories 1011 desc.IndexOptions = b.opts.GetHash() 1012 1013 shardBuilder, err := NewShardBuilder(&desc) 1014 if err != nil { 1015 return nil, err 1016 } 1017 shardBuilder.IndexTime = b.indexTime 1018 shardBuilder.ID = b.id 1019 return shardBuilder, nil 1020} 1021 1022func (b *Builder) writeShard(fn string, ib *ShardBuilder) (*finishedShard, error) { 1023 dir := filepath.Dir(fn) 1024 if err := os.MkdirAll(dir, 0o700); err != nil { 1025 return nil, err 1026 } 1027 1028 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp") 1029 if err != nil { 1030 return nil, err 1031 } 1032 if runtime.GOOS != "windows" { 1033 if err := f.Chmod(0o666 &^ umask); err != nil { 1034 return nil, err 1035 } 1036 } 1037 1038 defer f.Close() 1039 if err := ib.Write(f); err != nil { 1040 return nil, err 1041 } 1042 fi, err := f.Stat() 1043 if err != nil { 1044 return nil, err 1045 } 1046 if err := f.Close(); err != nil { 1047 return nil, err 1048 } 1049 1050 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n", 1051 fn, 1052 fi.Size(), 1053 float64(fi.Size())/float64(ib.ContentSize()+1), 1054 ib.NumFiles()) 1055 1056 return &finishedShard{f.Name(), fn}, nil 1057} 1058 1059type deltaBranchSetError struct { 1060 shardName string 1061 old, new []zoekt.RepositoryBranch 1062} 1063 1064func (e deltaBranchSetError) Error() string { 1065 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new) 1066} 1067 1068type deltaIndexOptionsMismatchError struct { 1069 shardName string 1070 newOptions HashOptions 1071} 1072 1073func (e *deltaIndexOptionsMismatchError) Error() string { 1074 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions) 1075} 1076 1077// Document holds a document (file) to index. 1078type Document struct { 1079 Name string 1080 Content []byte 1081 Branches []string 1082 SubRepositoryPath string 1083 Language string 1084 Category FileCategory 1085 1086 // If set, something is wrong with the file contents, and this 1087 // is the reason it wasn't indexed. 1088 SkipReason string 1089 1090 // Document sections for symbols. Offsets should use bytes. 1091 Symbols []DocumentSection 1092 SymbolsMetaData []*zoekt.Symbol 1093} 1094 1095type DocumentSection struct { 1096 Start, End uint32 1097} 1098 1099// umask holds the Umask of the current process 1100var umask os.FileMode 1101 1102func init() { 1103 umask = os.FileMode(unix.Umask(0)) 1104 unix.Umask(int(umask)) 1105}