fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// package build implements a more convenient interface for building 16// zoekt indices. 17package build 18 19import ( 20 "cmp" 21 "crypto/sha1" 22 "flag" 23 "fmt" 24 "log" 25 "net/url" 26 "os" 27 "os/exec" 28 "path" 29 "path/filepath" 30 "reflect" 31 "runtime" 32 "runtime/pprof" 33 "sort" 34 "strconv" 35 "strings" 36 "sync" 37 "time" 38 39 "github.com/bmatcuk/doublestar" 40 "github.com/dustin/go-humanize" 41 "github.com/go-enry/go-enry/v2" 42 "github.com/rs/xid" 43 44 "github.com/sourcegraph/zoekt" 45 "github.com/sourcegraph/zoekt/ctags" 46) 47 48var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt") 49 50// Branch describes a single branch version. 51type Branch struct { 52 Name string 53 Version string 54} 55 56// Options sets options for the index building. 57type Options struct { 58 // IndexDir is a directory that holds *.zoekt index files. 59 IndexDir string 60 61 // SizeMax is the maximum file size 62 SizeMax int 63 64 // Parallelism is the maximum number of shards to index in parallel 65 Parallelism int 66 67 // ShardMax sets the maximum corpus size for a single shard 68 ShardMax int 69 70 // TrigramMax sets the maximum number of distinct trigrams per document. 71 TrigramMax int 72 73 // RepositoryDescription holds names and URLs for the repository. 74 RepositoryDescription zoekt.Repository 75 76 // SubRepositories is a path => sub repository map. 77 SubRepositories map[string]*zoekt.Repository 78 79 // DisableCTags disables the generation of ctags metadata. 80 DisableCTags bool 81 82 // CtagsPath is the path to the ctags binary to run, or empty 83 // if a valid binary couldn't be found. 84 CTagsPath string 85 86 // Same as CTagsPath but for scip-ctags 87 ScipCTagsPath string 88 89 // If set, ctags must succeed. 90 CTagsMustSucceed bool 91 92 // LargeFiles is a slice of glob patterns, including ** for any number 93 // of directories, where matching file paths should be indexed 94 // regardless of their size. The full pattern syntax is here: 95 // https://github.com/bmatcuk/doublestar/tree/v1#patterns. 96 LargeFiles []string 97 98 // IsDelta is true if this run contains only the changed documents since the 99 // last run. 100 IsDelta bool 101 102 // changedOrRemovedFiles is a list of file paths that have been changed or removed 103 // since the last indexing job for this repository. These files will be tombstoned 104 // in the older shards for this repository. 105 changedOrRemovedFiles []string 106 107 LanguageMap ctags.LanguageMap 108 109 // ShardMerging is true if builder should respect compound shards. This is a 110 // Sourcegraph specific option. 111 ShardMerging bool 112 113 // HeapProfileTriggerBytes is the heap allocation in bytes that will trigger a memory profile. If 0, no memory profile 114 // will be triggered. Note this trigger looks at total heap allocation (which includes both inuse and garbage objects). 115 // 116 // Profiles will be written to files named `index-memory.prof.n` in the index directory. No more than 10 files are written. 117 // 118 // Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile. 119 HeapProfileTriggerBytes uint64 120 121 // ShardPrefix is the prefix of the shard. It defaults to the repository name. 122 ShardPrefix string 123} 124 125// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building. 126type HashOptions struct { 127 sizeMax int 128 disableCTags bool 129 ctagsPath string 130 cTagsMustSucceed bool 131 largeFiles []string 132} 133 134func (o *Options) HashOptions() HashOptions { 135 return HashOptions{ 136 sizeMax: o.SizeMax, 137 disableCTags: o.DisableCTags, 138 ctagsPath: o.CTagsPath, 139 cTagsMustSucceed: o.CTagsMustSucceed, 140 largeFiles: o.LargeFiles, 141 } 142} 143 144func (o *Options) GetHash() string { 145 h := o.HashOptions() 146 hasher := sha1.New() 147 148 hasher.Write([]byte(h.ctagsPath)) 149 hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed))) 150 hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax))) 151 hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles))) 152 hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags))) 153 154 return fmt.Sprintf("%x", hasher.Sum(nil)) 155} 156 157type largeFilesFlag struct{ *Options } 158 159func (f largeFilesFlag) String() string { 160 // From flag.Value documentation: 161 // 162 // The flag package may call the String method with a zero-valued receiver, 163 // such as a nil pointer. 164 if f.Options == nil { 165 return "" 166 } 167 s := append([]string{""}, f.LargeFiles...) 168 return strings.Join(s, "-large_file ") 169} 170 171func (f largeFilesFlag) Set(value string) error { 172 f.LargeFiles = append(f.LargeFiles, value) 173 return nil 174} 175 176// Flags adds flags for build options to fs. It is the "inverse" of Args. 177func (o *Options) Flags(fs *flag.FlagSet) { 178 x := *o 179 x.SetDefaults() 180 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size") 181 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document") 182 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard") 183 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.") 184 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices") 185 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.") 186 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.") 187 fs.StringVar(&o.ShardPrefix, "shard_prefix", x.ShardPrefix, "the prefix of the shard. Defaults to repository name") 188 189 // Sourcegraph specific 190 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.") 191 fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.") 192} 193 194// Args generates command line arguments for o. It is the "inverse" of Flags. 195func (o *Options) Args() []string { 196 var args []string 197 198 if o.SizeMax != 0 { 199 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax)) 200 } 201 202 if o.TrigramMax != 0 { 203 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax)) 204 } 205 206 if o.ShardMax != 0 { 207 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax)) 208 } 209 210 if o.Parallelism != 0 { 211 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism)) 212 } 213 214 if o.IndexDir != "" { 215 args = append(args, "-index", o.IndexDir) 216 } 217 218 if o.CTagsMustSucceed { 219 args = append(args, "-require_ctags") 220 } 221 222 for _, a := range o.LargeFiles { 223 args = append(args, "-large_file", a) 224 } 225 226 // Sourcegraph specific 227 if o.DisableCTags { 228 args = append(args, "-disable_ctags") 229 } 230 231 if o.ShardMerging { 232 args = append(args, "-shard_merging") 233 } 234 235 if o.ShardPrefix != "" { 236 args = append(args, "-shard_prefix", o.ShardPrefix) 237 } 238 239 return args 240} 241 242// Builder manages (parallel) creation of uniformly sized shards. The 243// builder buffers up documents until it collects enough documents and 244// then builds a shard and writes. 245type Builder struct { 246 opts Options 247 throttle chan int 248 249 nextShardNum int 250 todo []*zoekt.Document 251 docChecker zoekt.DocChecker 252 size int 253 254 parserBins ctags.ParserBinMap 255 building sync.WaitGroup 256 257 errMu sync.Mutex 258 buildError error 259 260 // temp name => final name for finished shards. We only rename 261 // them once all shards succeed to avoid Frankstein corpuses. 262 finishedShards map[string]string 263 264 // indexTime is set by tests for doing reproducible builds. 265 indexTime time.Time 266 267 // heapProfileMu is used to ensure that only one memory profile is written at a time 268 heapProfileMu sync.Mutex 269 heapProfileNum int 270 271 // a sortable 20 chars long id. 272 id string 273 274 finishCalled bool 275} 276 277type finishedShard struct { 278 temp, final string 279} 280 281func checkCTags() string { 282 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" { 283 return ctags 284 } 285 286 if ctags, err := exec.LookPath("universal-ctags"); err == nil { 287 return ctags 288 } 289 290 return "" 291} 292 293func checkScipCTags() string { 294 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" { 295 return ctags 296 } 297 298 if ctags, err := exec.LookPath("scip-ctags"); err == nil { 299 return ctags 300 } 301 302 return "" 303} 304 305// SetDefaults sets reasonable default options. 306func (o *Options) SetDefaults() { 307 if o.CTagsPath == "" && !o.DisableCTags { 308 o.CTagsPath = checkCTags() 309 } 310 311 if o.ScipCTagsPath == "" && !o.DisableCTags { 312 o.ScipCTagsPath = checkScipCTags() 313 } 314 315 if o.Parallelism == 0 { 316 o.Parallelism = 4 317 } 318 if o.SizeMax == 0 { 319 o.SizeMax = 2 << 20 320 } 321 if o.ShardMax == 0 { 322 o.ShardMax = 100 << 20 323 } 324 if o.TrigramMax == 0 { 325 o.TrigramMax = 20000 326 } 327 328 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" { 329 parsed, _ := url.Parse(o.RepositoryDescription.URL) 330 if parsed != nil { 331 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path) 332 } 333 } 334} 335 336// ShardName returns the name the given index shard. 337func (o *Options) shardName(n int) string { 338 return o.shardNameVersion(zoekt.IndexFormatVersion, n) 339} 340 341func (o *Options) shardNameVersion(version, n int) string { 342 return zoekt.ShardName(o.IndexDir, cmp.Or(o.ShardPrefix, o.RepositoryDescription.Name), version, n) 343} 344 345type IndexState string 346 347const ( 348 IndexStateMissing IndexState = "missing" 349 IndexStateCorrupt IndexState = "corrupt" 350 IndexStateVersion IndexState = "version-mismatch" 351 IndexStateOption IndexState = "option-mismatch" 352 IndexStateMeta IndexState = "meta-mismatch" 353 IndexStateContent IndexState = "content-mismatch" 354 IndexStateEqual IndexState = "equal" 355) 356 357var readVersions = []struct { 358 IndexFormatVersion int 359 FeatureVersion int 360}{{ 361 IndexFormatVersion: zoekt.IndexFormatVersion, 362 FeatureVersion: zoekt.FeatureVersion, 363}, { 364 IndexFormatVersion: zoekt.NextIndexFormatVersion, 365 FeatureVersion: zoekt.FeatureVersion, 366}} 367 368// IncrementalSkipIndexing returns true if the index present on disk matches 369// the build options. 370func (o *Options) IncrementalSkipIndexing() bool { 371 state, _ := o.IndexState() 372 return state == IndexStateEqual 373} 374 375// IndexState checks how the index present on disk compares to the build 376// options and returns the IndexState and the name of the first shard. 377func (o *Options) IndexState() (IndexState, string) { 378 // Open the latest version we support that is on disk. 379 fn := o.findShard() 380 if fn == "" { 381 return IndexStateMissing, fn 382 } 383 384 repos, index, err := zoekt.ReadMetadataPathAlive(fn) 385 if os.IsNotExist(err) { 386 return IndexStateMissing, fn 387 } else if err != nil { 388 return IndexStateCorrupt, fn 389 } 390 391 for _, v := range readVersions { 392 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion { 393 return IndexStateVersion, fn 394 } 395 } 396 397 var repo *zoekt.Repository 398 for _, cand := range repos { 399 if cand.Name == o.RepositoryDescription.Name { 400 repo = cand 401 break 402 } 403 } 404 405 if repo == nil { 406 return IndexStateCorrupt, fn 407 } 408 409 if repo.IndexOptions != o.GetHash() { 410 return IndexStateOption, fn 411 } 412 413 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) { 414 return IndexStateContent, fn 415 } 416 417 // We can mutate repo since it lives in the scope of this function call. 418 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil { 419 // non-nil err means we are trying to update an immutable field => 420 // reindex content. 421 log.Printf("warn: immutable field changed, requires re-index: %s", err) 422 return IndexStateContent, fn 423 } else if updated { 424 return IndexStateMeta, fn 425 } 426 427 return IndexStateEqual, fn 428} 429 430// FindRepositoryMetadata returns the index metadata for the repository 431// specified in the options. 'ok' is false if the repository's metadata 432// couldn't be found or if an error occurred. 433func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) { 434 shard := o.findShard() 435 if shard == "" { 436 return nil, nil, false, nil 437 } 438 439 repositories, metadata, err := zoekt.ReadMetadataPathAlive(shard) 440 if err != nil { 441 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err) 442 } 443 444 ID := o.RepositoryDescription.ID 445 for _, r := range repositories { 446 // compound shards contain multiple repositories, so we 447 // have to pick only the one we're looking for 448 if r.ID == ID { 449 return r, metadata, true, nil 450 } 451 } 452 453 // If we're here, then we're somehow in a state where we found a matching 454 // shard that's missing the repository metadata we're looking for. This 455 // should never happen. 456 name := o.RepositoryDescription.Name 457 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name) 458} 459 460func (o *Options) findShard() string { 461 for _, v := range readVersions { 462 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 463 if _, err := os.Stat(fn); err == nil { 464 return fn 465 } 466 } 467 468 // Brute force finding the shard in compound shards. We should only hit this 469 // code path for repositories that are not already existing or are in 470 // compound shards. 471 // 472 // TODO add an oracle which can speed this up in the case of repositories 473 // already in compound shards. 474 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt")) 475 if err != nil { 476 return "" 477 } 478 for _, fn := range compoundShards { 479 repos, _, err := zoekt.ReadMetadataPathAlive(fn) 480 if err != nil { 481 continue 482 } 483 for _, repo := range repos { 484 if repo.ID == o.RepositoryDescription.ID { 485 return fn 486 } 487 } 488 } 489 490 return "" 491} 492 493func (o *Options) FindAllShards() []string { 494 for _, v := range readVersions { 495 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 496 if _, err := os.Stat(fn); err == nil { 497 shards := []string{fn} 498 for i := 1; ; i++ { 499 fn := o.shardNameVersion(v.IndexFormatVersion, i) 500 if _, err := os.Stat(fn); err != nil { 501 return shards 502 } 503 shards = append(shards, fn) 504 } 505 } 506 } 507 508 // lazily fallback to findShard which will look for a compound shard. 509 if fn := o.findShard(); fn != "" { 510 return []string{fn} 511 } 512 513 return nil 514} 515 516// IgnoreSizeMax determines whether the max size should be ignored. 517func (o *Options) IgnoreSizeMax(name string) bool { 518 // A pattern match will override preceding pattern matches. 519 for i := len(o.LargeFiles) - 1; i >= 0; i-- { 520 pattern := strings.TrimSpace(o.LargeFiles[i]) 521 negated, validatedPattern := checkIsNegatePattern(pattern) 522 523 if m, _ := doublestar.PathMatch(validatedPattern, name); m { 524 if negated { 525 return false 526 } else { 527 return true 528 } 529 } 530 } 531 532 return false 533} 534 535func checkIsNegatePattern(pattern string) (bool, string) { 536 negate := "!" 537 538 // if negated then strip prefix meta character which identifies negated filter pattern 539 if strings.HasPrefix(pattern, negate) { 540 return true, pattern[len(negate):] 541 } 542 543 return false, pattern 544} 545 546// NewBuilder creates a new Builder instance. 547func NewBuilder(opts Options) (*Builder, error) { 548 opts.SetDefaults() 549 if opts.RepositoryDescription.Name == "" { 550 return nil, fmt.Errorf("builder: must set Name") 551 } 552 553 b := &Builder{ 554 opts: opts, 555 throttle: make(chan int, opts.Parallelism), 556 finishedShards: map[string]string{}, 557 } 558 559 parserBins, err := ctags.NewParserBinMap( 560 b.opts.CTagsPath, 561 b.opts.ScipCTagsPath, 562 opts.LanguageMap, 563 b.opts.CTagsMustSucceed, 564 ) 565 if err != nil { 566 return nil, err 567 } 568 569 b.parserBins = parserBins 570 571 if opts.IsDelta { 572 // Delta shards build on top of previously existing shards. 573 // As a consequence, the shardNum for delta shards starts from 574 // the number following the most recently generated shard - not 0. 575 // 576 // Using this numbering scheme allows all the shards to be 577 // discovered as a set. 578 shards := b.opts.FindAllShards() 579 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one 580 } 581 582 if _, err := b.newShardBuilder(); err != nil { 583 return nil, err 584 } 585 586 now := time.Now() 587 b.indexTime = now 588 b.id = xid.NewWithTime(now).String() 589 590 return b, nil 591} 592 593// AddFile is a convenience wrapper for the Add method 594func (b *Builder) AddFile(name string, content []byte) error { 595 return b.Add(zoekt.Document{Name: name, Content: content}) 596} 597 598func (b *Builder) Add(doc zoekt.Document) error { 599 if b.finishCalled { 600 return nil 601 } 602 603 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name) 604 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile { 605 // We could pass the document on to the shardbuilder, but if 606 // we pass through a part of the source tree with binary/large 607 // files, the corresponding shard would be mostly empty, so 608 // insert a reason here too. 609 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax) 610 } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil { 611 doc.SkipReason = err.Error() 612 doc.Language = "binary" 613 } 614 615 b.todo = append(b.todo, &doc) 616 617 if doc.SkipReason == "" { 618 b.size += len(doc.Name) + len(doc.Content) 619 } else { 620 b.size += len(doc.Name) + len(doc.SkipReason) 621 // Drop the content if we are skipping the document. Skipped content is not counted towards the 622 // shard size limit, so otherwise we might buffer too much data in memory before flushing. 623 doc.Content = nil 624 } 625 626 if b.size > b.opts.ShardMax { 627 return b.flush() 628 } 629 630 return nil 631} 632 633// MarkFileAsChangedOrRemoved indicates that the file specified by the given path 634// has been changed or removed since the last indexing job for this repository. 635// 636// If this build is a delta build, these files will be tombstoned in the older shards for this repository. 637func (b *Builder) MarkFileAsChangedOrRemoved(path string) { 638 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path) 639} 640 641// Finish creates a last shard from the buffered documents, and clears 642// stale shards from previous runs. This should always be called, also 643// in failure cases, to ensure cleanup. 644// 645// It is safe to call Finish() multiple times. 646func (b *Builder) Finish() error { 647 if b.finishCalled { 648 return b.buildError 649 } 650 651 b.finishCalled = true 652 653 b.flush() 654 b.building.Wait() 655 656 if b.buildError != nil { 657 for tmp := range b.finishedShards { 658 log.Printf("Builder.Finish %s", tmp) 659 os.Remove(tmp) 660 } 661 b.finishedShards = map[string]string{} 662 return b.buildError 663 } 664 665 // map of temporary -> final names for all updated shards + shard metadata files 666 artifactPaths := make(map[string]string) 667 for tmp, final := range b.finishedShards { 668 artifactPaths[tmp] = final 669 } 670 671 oldShards := b.opts.FindAllShards() 672 673 if b.opts.IsDelta { 674 // Delta shard builds need to update FileTombstone and branch commit information for all 675 // existing shards 676 for _, shard := range oldShards { 677 repositories, _, err := zoekt.ReadMetadataPathAlive(shard) 678 if err != nil { 679 return fmt.Errorf("reading metadata from shard %q: %w", shard, err) 680 } 681 682 if len(repositories) > 1 { 683 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard) 684 } 685 686 if len(repositories) == 0 { 687 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard) 688 } 689 690 repository := repositories[0] 691 if repository.ID != b.opts.RepositoryDescription.ID { 692 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name) 693 } 694 695 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil { 696 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles)) 697 } 698 699 for _, f := range b.opts.changedOrRemovedFiles { 700 repository.FileTombstones[f] = struct{}{} 701 } 702 703 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) { 704 return deltaBranchSetError{ 705 shardName: shard, 706 old: repository.Branches, 707 new: b.opts.RepositoryDescription.Branches, 708 } 709 } 710 711 if b.opts.GetHash() != repository.IndexOptions { 712 return &deltaIndexOptionsMismatchError{ 713 shardName: shard, 714 newOptions: b.opts.HashOptions(), 715 } 716 } 717 718 repository.Branches = b.opts.RepositoryDescription.Branches 719 720 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate 721 722 tempPath, finalPath, err := zoekt.JsonMarshalRepoMetaTemp(shard, repository) 723 if err != nil { 724 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err) 725 } 726 727 artifactPaths[tempPath] = finalPath 728 } 729 } 730 731 // We mark finished shards as empty when we successfully finish. Return now 732 // to allow call sites to call Finish idempotently. 733 if len(artifactPaths) == 0 { 734 return b.buildError 735 } 736 737 // Collect a map of the old shards on disk. For each new shard we replace we 738 // delete it from toDelete. Anything remaining in toDelete will be removed 739 // after we have renamed everything into place. 740 741 var toDelete map[string]struct{} 742 if !b.opts.IsDelta { 743 // Non-delta shard builds delete all existing shards before they write out 744 // new ones. 745 // By contrast, delta shard builds work by stacking changes on top of existing shards. 746 // So, we skip populating the toDelete map if we're building delta shards. 747 748 toDelete = make(map[string]struct{}) 749 for _, name := range oldShards { 750 paths, err := zoekt.IndexFilePaths(name) 751 if err != nil { 752 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err) 753 } 754 for _, p := range paths { 755 toDelete[p] = struct{}{} 756 } 757 } 758 } 759 760 for tmp, final := range artifactPaths { 761 if err := os.Rename(tmp, final); err != nil { 762 b.buildError = err 763 continue 764 } 765 766 delete(toDelete, final) 767 } 768 769 b.finishedShards = map[string]string{} 770 771 for p := range toDelete { 772 // Don't delete compound shards, set tombstones instead. 773 if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") { 774 if !strings.HasSuffix(p, ".zoekt") { 775 continue 776 } 777 err := zoekt.SetTombstone(p, b.opts.RepositoryDescription.ID) 778 b.buildError = err 779 continue 780 } 781 log.Printf("removing old shard file: %s", p) 782 if err := os.Remove(p); err != nil { 783 b.buildError = err 784 } 785 } 786 787 return b.buildError 788} 789 790// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true 791// iff both slices specify the same set of branch names in the same order. 792func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool { 793 if len(a) != len(b) { 794 return false 795 } 796 797 for i := range a { 798 x, y := a[i], b[i] 799 if x.Name != y.Name { 800 return false 801 } 802 } 803 804 return true 805} 806 807func (b *Builder) flush() error { 808 todo := b.todo 809 b.todo = nil 810 b.size = 0 811 b.errMu.Lock() 812 defer b.errMu.Unlock() 813 if b.buildError != nil { 814 return b.buildError 815 } 816 817 hasShard := b.nextShardNum > 0 818 if len(todo) == 0 && hasShard { 819 return nil 820 } 821 822 shard := b.nextShardNum 823 b.nextShardNum++ 824 825 if b.opts.Parallelism > 1 { 826 b.building.Add(1) 827 b.throttle <- 1 828 go func() { 829 done, err := b.buildShard(todo, shard) 830 <-b.throttle 831 832 b.errMu.Lock() 833 defer b.errMu.Unlock() 834 if err != nil && b.buildError == nil { 835 b.buildError = err 836 } 837 if err == nil { 838 b.finishedShards[done.temp] = done.final 839 } 840 b.building.Done() 841 }() 842 } else { 843 // No goroutines when we're not parallel. This 844 // simplifies memory profiling. 845 done, err := b.buildShard(todo, shard) 846 b.buildError = err 847 if err == nil { 848 b.finishedShards[done.temp] = done.final 849 } 850 851 return b.buildError 852 } 853 854 return nil 855} 856 857// map [0,inf) to [0,1) monotonically 858func squashRange(j int) float64 { 859 x := float64(j) 860 return x / (1 + x) 861} 862 863// IsLowPriority takes a file name and makes an educated guess about its priority 864// in search results. A file is considered low priority if it looks like a test, 865// vendored, or generated file. 866// 867// These 'priority' criteria affects how documents are ordered within a shard. It's 868// also used to help guess a file's rank when we're missing ranking information. 869func IsLowPriority(path string, content []byte) bool { 870 return enry.IsTest(path) || enry.IsVendor(path) || enry.IsGenerated(path, content) 871} 872 873type rankedDoc struct { 874 *zoekt.Document 875 rank []float64 876} 877 878// rank returns a vector of scores which is used at index-time to sort documents 879// before writing them to disk. The order of documents in the shard is important 880// at query time, because earlier documents receive a boost at query time and 881// have a higher chance of being searched before limits kick in. 882func rank(d *zoekt.Document, origIdx int) []float64 { 883 skipped := 0.0 884 if d.SkipReason != "" { 885 skipped = 1.0 886 } 887 888 generated := 0.0 889 if enry.IsGenerated(d.Name, d.Content) { 890 generated = 1.0 891 } 892 893 vendor := 0.0 894 if enry.IsVendor(d.Name) { 895 vendor = 1.0 896 } 897 898 test := 0.0 899 if enry.IsTest(d.Name) { 900 test = 1.0 901 } 902 903 // Smaller is earlier (=better). 904 return []float64{ 905 // Always place skipped docs last 906 skipped, 907 908 // Prefer docs that are not generated 909 generated, 910 911 // Prefer docs that are not vendored 912 vendor, 913 914 // Prefer docs that are not tests 915 test, 916 917 // With short names 918 squashRange(len(d.Name)), 919 920 // With many symbols 921 1.0 - squashRange(len(d.Symbols)), 922 923 // With short content 924 squashRange(len(d.Content)), 925 926 // That is present is as many branches as possible 927 1.0 - squashRange(len(d.Branches)), 928 929 // Preserve original ordering. 930 squashRange(origIdx), 931 } 932} 933 934func sortDocuments(todo []*zoekt.Document) { 935 rs := make([]rankedDoc, 0, len(todo)) 936 for i, t := range todo { 937 rd := rankedDoc{t, rank(t, i)} 938 rs = append(rs, rd) 939 } 940 sort.Slice(rs, func(i, j int) bool { 941 r1 := rs[i].rank 942 r2 := rs[j].rank 943 for i := range r1 { 944 if r1[i] < r2[i] { 945 return true 946 } 947 if r1[i] > r2[i] { 948 return false 949 } 950 } 951 952 return false 953 }) 954 for i := range todo { 955 todo[i] = rs[i].Document 956 } 957} 958 959func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) { 960 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") { 961 err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins) 962 if b.opts.CTagsMustSucceed && err != nil { 963 return nil, err 964 } 965 if err != nil { 966 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err) 967 } 968 } 969 970 name := b.opts.shardName(nextShardNum) 971 972 shardBuilder, err := b.newShardBuilder() 973 if err != nil { 974 return nil, err 975 } 976 977 sortDocuments(todo) 978 979 for idx, t := range todo { 980 if err := shardBuilder.Add(*t); err != nil { 981 return nil, err 982 } 983 984 if idx%10_000 == 0 { 985 b.CheckMemoryUsage() 986 } 987 } 988 989 return b.writeShard(name, shardBuilder) 990} 991 992// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the 993// configured threshold. NOTE: this method is expensive and should only be used for debugging. 994func (b *Builder) CheckMemoryUsage() { 995 // Don't check memory if heap profiling is disabled, or we've already written 10 profiles 996 if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 { 997 return 998 } 999 1000 var m runtime.MemStats 1001 runtime.ReadMemStats(&m) 1002 1003 if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() { 1004 defer b.heapProfileMu.Unlock() 1005 1006 log.Printf("writing memory profile, allocated heap: %s", humanize.Bytes(m.HeapAlloc)) 1007 name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum)) 1008 f, err := os.Create(name) 1009 if err != nil { 1010 log.Printf("failed to create memory profile file: %v", err) 1011 return 1012 } 1013 1014 err = pprof.WriteHeapProfile(f) 1015 if err != nil { 1016 log.Printf("failed to write memory profile: %v", err) 1017 } 1018 1019 b.heapProfileNum++ 1020 } 1021} 1022 1023func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) { 1024 desc := b.opts.RepositoryDescription 1025 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != "" 1026 desc.SubRepoMap = b.opts.SubRepositories 1027 desc.IndexOptions = b.opts.GetHash() 1028 1029 shardBuilder, err := zoekt.NewIndexBuilder(&desc) 1030 if err != nil { 1031 return nil, err 1032 } 1033 shardBuilder.IndexTime = b.indexTime 1034 shardBuilder.ID = b.id 1035 return shardBuilder, nil 1036} 1037 1038func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) { 1039 dir := filepath.Dir(fn) 1040 if err := os.MkdirAll(dir, 0o700); err != nil { 1041 return nil, err 1042 } 1043 1044 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp") 1045 if err != nil { 1046 return nil, err 1047 } 1048 if runtime.GOOS != "windows" { 1049 if err := f.Chmod(0o666 &^ umask); err != nil { 1050 return nil, err 1051 } 1052 } 1053 1054 defer f.Close() 1055 if err := ib.Write(f); err != nil { 1056 return nil, err 1057 } 1058 fi, err := f.Stat() 1059 if err != nil { 1060 return nil, err 1061 } 1062 if err := f.Close(); err != nil { 1063 return nil, err 1064 } 1065 1066 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n", 1067 fn, 1068 fi.Size(), 1069 float64(fi.Size())/float64(ib.ContentSize()+1), 1070 ib.NumFiles()) 1071 1072 return &finishedShard{f.Name(), fn}, nil 1073} 1074 1075type deltaBranchSetError struct { 1076 shardName string 1077 old, new []zoekt.RepositoryBranch 1078} 1079 1080func (e deltaBranchSetError) Error() string { 1081 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new) 1082} 1083 1084type deltaIndexOptionsMismatchError struct { 1085 shardName string 1086 newOptions HashOptions 1087} 1088 1089func (e *deltaIndexOptionsMismatchError) Error() string { 1090 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions) 1091} 1092 1093// umask holds the Umask of the current process 1094var umask os.FileMode