fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// package build implements a more convenient interface for building 16// zoekt indices. 17package build 18 19import ( 20 "crypto/sha1" 21 "flag" 22 "fmt" 23 "io" 24 "log" 25 "net/url" 26 "os" 27 "os/exec" 28 "path" 29 "path/filepath" 30 "reflect" 31 "runtime" 32 "runtime/pprof" 33 "sort" 34 "strconv" 35 "strings" 36 "sync" 37 "time" 38 39 "github.com/bmatcuk/doublestar" 40 "github.com/dustin/go-humanize" 41 "github.com/go-enry/go-enry/v2" 42 "github.com/rs/xid" 43 44 "github.com/sourcegraph/zoekt" 45 "github.com/sourcegraph/zoekt/ctags" 46) 47 48var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt") 49 50// Branch describes a single branch version. 51type Branch struct { 52 Name string 53 Version string 54} 55 56// Options sets options for the index building. 57type Options struct { 58 // IndexDir is a directory that holds *.zoekt index files. 59 IndexDir string 60 61 // SizeMax is the maximum file size 62 SizeMax int 63 64 // Parallelism is the maximum number of shards to index in parallel 65 Parallelism int 66 67 // ShardMax sets the maximum corpus size for a single shard 68 ShardMax int 69 70 // TrigramMax sets the maximum number of distinct trigrams per document. 71 TrigramMax int 72 73 // RepositoryDescription holds names and URLs for the repository. 74 RepositoryDescription zoekt.Repository 75 76 // SubRepositories is a path => sub repository map. 77 SubRepositories map[string]*zoekt.Repository 78 79 // DisableCTags disables the generation of ctags metadata. 80 DisableCTags bool 81 82 // CtagsPath is the path to the ctags binary to run, or empty 83 // if a valid binary couldn't be found. 84 CTagsPath string 85 86 // Same as CTagsPath but for scip-ctags 87 ScipCTagsPath string 88 89 // If set, ctags must succeed. 90 CTagsMustSucceed bool 91 92 // LargeFiles is a slice of glob patterns, including ** for any number 93 // of directories, where matching file paths should be indexed 94 // regardless of their size. The full pattern syntax is here: 95 // https://github.com/bmatcuk/doublestar/tree/v1#patterns. 96 LargeFiles []string 97 98 // IsDelta is true if this run contains only the changed documents since the 99 // last run. 100 IsDelta bool 101 102 // DocumentRanksPath is the path to the file with document ranks. If empty, 103 // ranks will be computed on-the-fly. 104 DocumentRanksPath string 105 106 // DocumentRanksVersion is a string which when changed will cause us to 107 // reindex a shard. This field is used so that when the contents of 108 // DocumentRanksPath changes, we can reindex. 109 DocumentRanksVersion string 110 111 // changedOrRemovedFiles is a list of file paths that have been changed or removed 112 // since the last indexing job for this repository. These files will be tombstoned 113 // in the older shards for this repository. 114 changedOrRemovedFiles []string 115 116 LanguageMap ctags.LanguageMap 117 118 // ShardMerging is true if builder should respect compound shards. This is a 119 // Sourcegraph specific option. 120 ShardMerging bool 121 122 // HeapProfileTriggerBytes is the heap usage in bytes that will trigger a memory profile. If 0, no memory profile will be triggered. 123 // Profiles will be written to files named `index-memory.prof.n` in the index directory. No more than 10 files are written. 124 // 125 // Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile. 126 HeapProfileTriggerBytes uint64 127} 128 129// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building. 130type HashOptions struct { 131 sizeMax int 132 disableCTags bool 133 ctagsPath string 134 cTagsMustSucceed bool 135 largeFiles []string 136 137 // documentRankVersion is an experimental field which will change when the 138 // DocumentRanksPath content changes. If empty we ignore it. 139 documentRankVersion string 140} 141 142func (o *Options) HashOptions() HashOptions { 143 return HashOptions{ 144 sizeMax: o.SizeMax, 145 disableCTags: o.DisableCTags, 146 ctagsPath: o.CTagsPath, 147 cTagsMustSucceed: o.CTagsMustSucceed, 148 largeFiles: o.LargeFiles, 149 documentRankVersion: o.DocumentRanksVersion, 150 } 151} 152 153func (o *Options) GetHash() string { 154 h := o.HashOptions() 155 hasher := sha1.New() 156 157 hasher.Write([]byte(h.ctagsPath)) 158 hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed))) 159 hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax))) 160 hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles))) 161 hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags))) 162 163 if h.documentRankVersion != "" { 164 hasher.Write([]byte{0}) 165 io.WriteString(hasher, h.documentRankVersion) 166 } 167 168 return fmt.Sprintf("%x", hasher.Sum(nil)) 169} 170 171type largeFilesFlag struct{ *Options } 172 173func (f largeFilesFlag) String() string { 174 // From flag.Value documentation: 175 // 176 // The flag package may call the String method with a zero-valued receiver, 177 // such as a nil pointer. 178 if f.Options == nil { 179 return "" 180 } 181 s := append([]string{""}, f.LargeFiles...) 182 return strings.Join(s, "-large_file ") 183} 184 185func (f largeFilesFlag) Set(value string) error { 186 f.LargeFiles = append(f.LargeFiles, value) 187 return nil 188} 189 190// Flags adds flags for build options to fs. It is the "inverse" of Args. 191func (o *Options) Flags(fs *flag.FlagSet) { 192 x := *o 193 x.SetDefaults() 194 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size") 195 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document") 196 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard") 197 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.") 198 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices") 199 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.") 200 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.") 201 202 // Sourcegraph specific 203 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.") 204 fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.") 205} 206 207// Args generates command line arguments for o. It is the "inverse" of Flags. 208func (o *Options) Args() []string { 209 var args []string 210 211 if o.SizeMax != 0 { 212 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax)) 213 } 214 215 if o.TrigramMax != 0 { 216 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax)) 217 } 218 219 if o.ShardMax != 0 { 220 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax)) 221 } 222 223 if o.Parallelism != 0 { 224 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism)) 225 } 226 227 if o.IndexDir != "" { 228 args = append(args, "-index", o.IndexDir) 229 } 230 231 if o.CTagsMustSucceed { 232 args = append(args, "-require_ctags") 233 } 234 235 for _, a := range o.LargeFiles { 236 args = append(args, "-large_file", a) 237 } 238 239 // Sourcegraph specific 240 if o.DisableCTags { 241 args = append(args, "-disable_ctags") 242 } 243 244 if o.ShardMerging { 245 args = append(args, "-shard_merging") 246 } 247 248 return args 249} 250 251// Builder manages (parallel) creation of uniformly sized shards. The 252// builder buffers up documents until it collects enough documents and 253// then builds a shard and writes. 254type Builder struct { 255 opts Options 256 throttle chan int 257 258 nextShardNum int 259 todo []*zoekt.Document 260 docChecker zoekt.DocChecker 261 size int 262 263 parserBins ctags.ParserBinMap 264 building sync.WaitGroup 265 266 errMu sync.Mutex 267 buildError error 268 269 // temp name => final name for finished shards. We only rename 270 // them once all shards succeed to avoid Frankstein corpuses. 271 finishedShards map[string]string 272 273 // indexTime is set by tests for doing reproducible builds. 274 indexTime time.Time 275 276 // heapProfileMu is used to ensure that only one memory profile is written at a time 277 heapProfileMu sync.Mutex 278 heapProfileNum int 279 280 // a sortable 20 chars long id. 281 id string 282 283 finishCalled bool 284} 285 286type finishedShard struct { 287 temp, final string 288} 289 290func checkCTags() string { 291 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" { 292 return ctags 293 } 294 295 if ctags, err := exec.LookPath("universal-ctags"); err == nil { 296 return ctags 297 } 298 299 return "" 300} 301 302func checkScipCTags() string { 303 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" { 304 return ctags 305 } 306 307 if ctags, err := exec.LookPath("scip-ctags"); err == nil { 308 return ctags 309 } 310 311 return "" 312} 313 314// SetDefaults sets reasonable default options. 315func (o *Options) SetDefaults() { 316 if o.CTagsPath == "" && !o.DisableCTags { 317 o.CTagsPath = checkCTags() 318 } 319 320 if o.ScipCTagsPath == "" && !o.DisableCTags { 321 o.ScipCTagsPath = checkScipCTags() 322 } 323 324 if o.Parallelism == 0 { 325 o.Parallelism = 4 326 } 327 if o.SizeMax == 0 { 328 o.SizeMax = 2 << 20 329 } 330 if o.ShardMax == 0 { 331 o.ShardMax = 100 << 20 332 } 333 if o.TrigramMax == 0 { 334 o.TrigramMax = 20000 335 } 336 337 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" { 338 parsed, _ := url.Parse(o.RepositoryDescription.URL) 339 if parsed != nil { 340 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path) 341 } 342 } 343} 344 345func hashString(s string) string { 346 h := sha1.New() 347 _, _ = io.WriteString(h, s) 348 return fmt.Sprintf("%x", h.Sum(nil)) 349} 350 351// ShardName returns the name the given index shard. 352func (o *Options) shardName(n int) string { 353 return o.shardNameVersion(zoekt.IndexFormatVersion, n) 354} 355 356func (o *Options) shardNameVersion(version, n int) string { 357 abs := url.QueryEscape(o.RepositoryDescription.Name) 358 if len(abs) > 200 { 359 abs = abs[:200] + hashString(abs)[:8] 360 } 361 return filepath.Join(o.IndexDir, 362 fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n)) 363} 364 365type IndexState string 366 367const ( 368 IndexStateMissing IndexState = "missing" 369 IndexStateCorrupt IndexState = "corrupt" 370 IndexStateVersion IndexState = "version-mismatch" 371 IndexStateOption IndexState = "option-mismatch" 372 IndexStateMeta IndexState = "meta-mismatch" 373 IndexStateContent IndexState = "content-mismatch" 374 IndexStateEqual IndexState = "equal" 375) 376 377var readVersions = []struct { 378 IndexFormatVersion int 379 FeatureVersion int 380}{{ 381 IndexFormatVersion: zoekt.IndexFormatVersion, 382 FeatureVersion: zoekt.FeatureVersion, 383}, { 384 IndexFormatVersion: zoekt.NextIndexFormatVersion, 385 FeatureVersion: zoekt.FeatureVersion, 386}} 387 388// IncrementalSkipIndexing returns true if the index present on disk matches 389// the build options. 390func (o *Options) IncrementalSkipIndexing() bool { 391 state, _ := o.IndexState() 392 return state == IndexStateEqual 393} 394 395// IndexState checks how the index present on disk compares to the build 396// options and returns the IndexState and the name of the first shard. 397func (o *Options) IndexState() (IndexState, string) { 398 // Open the latest version we support that is on disk. 399 fn := o.findShard() 400 if fn == "" { 401 return IndexStateMissing, fn 402 } 403 404 repos, index, err := zoekt.ReadMetadataPathAlive(fn) 405 if os.IsNotExist(err) { 406 return IndexStateMissing, fn 407 } else if err != nil { 408 return IndexStateCorrupt, fn 409 } 410 411 for _, v := range readVersions { 412 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion { 413 return IndexStateVersion, fn 414 } 415 } 416 417 var repo *zoekt.Repository 418 for _, cand := range repos { 419 if cand.Name == o.RepositoryDescription.Name { 420 repo = cand 421 break 422 } 423 } 424 425 if repo == nil { 426 return IndexStateCorrupt, fn 427 } 428 429 if repo.IndexOptions != o.GetHash() { 430 return IndexStateOption, fn 431 } 432 433 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) { 434 return IndexStateContent, fn 435 } 436 437 // We can mutate repo since it lives in the scope of this function call. 438 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil { 439 // non-nil err means we are trying to update an immutable field => 440 // reindex content. 441 log.Printf("warn: immutable field changed, requires re-index: %s", err) 442 return IndexStateContent, fn 443 } else if updated { 444 return IndexStateMeta, fn 445 } 446 447 return IndexStateEqual, fn 448} 449 450// FindRepositoryMetadata returns the index metadata for the repository 451// specified in the options. 'ok' is false if the repository's metadata 452// couldn't be found or if an error occurred. 453func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) { 454 shard := o.findShard() 455 if shard == "" { 456 return nil, nil, false, nil 457 } 458 459 repositories, metadata, err := zoekt.ReadMetadataPathAlive(shard) 460 if err != nil { 461 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err) 462 } 463 464 ID := o.RepositoryDescription.ID 465 for _, r := range repositories { 466 // compound shards contain multiple repositories, so we 467 // have to pick only the one we're looking for 468 if r.ID == ID { 469 return r, metadata, true, nil 470 } 471 } 472 473 // If we're here, then we're somehow in a state where we found a matching 474 // shard that's missing the repository metadata we're looking for. This 475 // should never happen. 476 name := o.RepositoryDescription.Name 477 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name) 478} 479 480func (o *Options) findShard() string { 481 for _, v := range readVersions { 482 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 483 if _, err := os.Stat(fn); err == nil { 484 return fn 485 } 486 } 487 488 // Brute force finding the shard in compound shards. We should only hit this 489 // code path for repositories that are not already existing or are in 490 // compound shards. 491 // 492 // TODO add an oracle which can speed this up in the case of repositories 493 // already in compound shards. 494 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt")) 495 if err != nil { 496 return "" 497 } 498 for _, fn := range compoundShards { 499 repos, _, err := zoekt.ReadMetadataPathAlive(fn) 500 if err != nil { 501 continue 502 } 503 for _, repo := range repos { 504 if repo.ID == o.RepositoryDescription.ID { 505 return fn 506 } 507 } 508 } 509 510 return "" 511} 512 513func (o *Options) FindAllShards() []string { 514 for _, v := range readVersions { 515 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 516 if _, err := os.Stat(fn); err == nil { 517 shards := []string{fn} 518 for i := 1; ; i++ { 519 fn := o.shardNameVersion(v.IndexFormatVersion, i) 520 if _, err := os.Stat(fn); err != nil { 521 return shards 522 } 523 shards = append(shards, fn) 524 } 525 } 526 } 527 528 // lazily fallback to findShard which will look for a compound shard. 529 if fn := o.findShard(); fn != "" { 530 return []string{fn} 531 } 532 533 return nil 534} 535 536// IgnoreSizeMax determines whether the max size should be ignored. 537func (o *Options) IgnoreSizeMax(name string) bool { 538 // A pattern match will override preceding pattern matches. 539 for i := len(o.LargeFiles) - 1; i >= 0; i-- { 540 pattern := strings.TrimSpace(o.LargeFiles[i]) 541 negated, validatedPattern := checkIsNegatePattern(pattern) 542 543 if m, _ := doublestar.PathMatch(validatedPattern, name); m { 544 if negated { 545 return false 546 } else { 547 return true 548 } 549 } 550 } 551 552 return false 553} 554 555func checkIsNegatePattern(pattern string) (bool, string) { 556 negate := "!" 557 558 // if negated then strip prefix meta character which identifies negated filter pattern 559 if strings.HasPrefix(pattern, negate) { 560 return true, pattern[len(negate):] 561 } 562 563 return false, pattern 564} 565 566// NewBuilder creates a new Builder instance. 567func NewBuilder(opts Options) (*Builder, error) { 568 opts.SetDefaults() 569 if opts.RepositoryDescription.Name == "" { 570 return nil, fmt.Errorf("builder: must set Name") 571 } 572 573 b := &Builder{ 574 opts: opts, 575 throttle: make(chan int, opts.Parallelism), 576 finishedShards: map[string]string{}, 577 } 578 579 parserBins, err := ctags.NewParserBinMap( 580 b.opts.CTagsPath, 581 b.opts.ScipCTagsPath, 582 opts.LanguageMap, 583 b.opts.CTagsMustSucceed, 584 ) 585 if err != nil { 586 return nil, err 587 } 588 589 b.parserBins = parserBins 590 591 if opts.IsDelta { 592 // Delta shards build on top of previously existing shards. 593 // As a consequence, the shardNum for delta shards starts from 594 // the number following the most recently generated shard - not 0. 595 // 596 // Using this numbering scheme allows all the shards to be 597 // discovered as a set. 598 shards := b.opts.FindAllShards() 599 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one 600 } 601 602 if _, err := b.newShardBuilder(); err != nil { 603 return nil, err 604 } 605 606 now := time.Now() 607 b.indexTime = now 608 b.id = xid.NewWithTime(now).String() 609 610 return b, nil 611} 612 613// AddFile is a convenience wrapper for the Add method 614func (b *Builder) AddFile(name string, content []byte) error { 615 return b.Add(zoekt.Document{Name: name, Content: content}) 616} 617 618func (b *Builder) Add(doc zoekt.Document) error { 619 if b.finishCalled { 620 return nil 621 } 622 623 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name) 624 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile { 625 // We could pass the document on to the shardbuilder, but if 626 // we pass through a part of the source tree with binary/large 627 // files, the corresponding shard would be mostly empty, so 628 // insert a reason here too. 629 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax) 630 } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil { 631 doc.SkipReason = err.Error() 632 doc.Language = "binary" 633 } 634 635 b.todo = append(b.todo, &doc) 636 637 if doc.SkipReason == "" { 638 b.size += len(doc.Name) + len(doc.Content) 639 } else { 640 b.size += len(doc.Name) + len(doc.SkipReason) 641 // Drop the content if we are skipping the document. Skipped content is not counted towards the 642 // shard size limit, so otherwise we might buffer too much data in memory before flushing. 643 doc.Content = nil 644 } 645 646 if b.size > b.opts.ShardMax { 647 return b.flush() 648 } 649 650 return nil 651} 652 653// MarkFileAsChangedOrRemoved indicates that the file specified by the given path 654// has been changed or removed since the last indexing job for this repository. 655// 656// If this build is a delta build, these files will be tombstoned in the older shards for this repository. 657func (b *Builder) MarkFileAsChangedOrRemoved(path string) { 658 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path) 659} 660 661// Finish creates a last shard from the buffered documents, and clears 662// stale shards from previous runs. This should always be called, also 663// in failure cases, to ensure cleanup. 664// 665// It is safe to call Finish() multiple times. 666func (b *Builder) Finish() error { 667 if b.finishCalled { 668 return b.buildError 669 } 670 671 b.finishCalled = true 672 673 b.flush() 674 b.building.Wait() 675 676 if b.buildError != nil { 677 for tmp := range b.finishedShards { 678 log.Printf("Builder.Finish %s", tmp) 679 os.Remove(tmp) 680 } 681 b.finishedShards = map[string]string{} 682 return b.buildError 683 } 684 685 // map of temporary -> final names for all updated shards + shard metadata files 686 artifactPaths := make(map[string]string) 687 for tmp, final := range b.finishedShards { 688 artifactPaths[tmp] = final 689 } 690 691 oldShards := b.opts.FindAllShards() 692 693 if b.opts.IsDelta { 694 // Delta shard builds need to update FileTombstone and branch commit information for all 695 // existing shards 696 for _, shard := range oldShards { 697 repositories, _, err := zoekt.ReadMetadataPathAlive(shard) 698 if err != nil { 699 return fmt.Errorf("reading metadata from shard %q: %w", shard, err) 700 } 701 702 if len(repositories) > 1 { 703 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard) 704 } 705 706 if len(repositories) == 0 { 707 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard) 708 } 709 710 repository := repositories[0] 711 if repository.ID != b.opts.RepositoryDescription.ID { 712 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name) 713 } 714 715 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil { 716 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles)) 717 } 718 719 for _, f := range b.opts.changedOrRemovedFiles { 720 repository.FileTombstones[f] = struct{}{} 721 } 722 723 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) { 724 return deltaBranchSetError{ 725 shardName: shard, 726 old: repository.Branches, 727 new: b.opts.RepositoryDescription.Branches, 728 } 729 } 730 731 if b.opts.GetHash() != repository.IndexOptions { 732 return &deltaIndexOptionsMismatchError{ 733 shardName: shard, 734 newOptions: b.opts.HashOptions(), 735 } 736 } 737 738 repository.Branches = b.opts.RepositoryDescription.Branches 739 740 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate 741 742 tempPath, finalPath, err := zoekt.JsonMarshalRepoMetaTemp(shard, repository) 743 if err != nil { 744 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err) 745 } 746 747 artifactPaths[tempPath] = finalPath 748 } 749 } 750 751 // We mark finished shards as empty when we successfully finish. Return now 752 // to allow call sites to call Finish idempotently. 753 if len(artifactPaths) == 0 { 754 return b.buildError 755 } 756 757 // Collect a map of the old shards on disk. For each new shard we replace we 758 // delete it from toDelete. Anything remaining in toDelete will be removed 759 // after we have renamed everything into place. 760 761 var toDelete map[string]struct{} 762 if !b.opts.IsDelta { 763 // Non-delta shard builds delete all existing shards before they write out 764 // new ones. 765 // By contrast, delta shard builds work by stacking changes on top of existing shards. 766 // So, we skip populating the toDelete map if we're building delta shards. 767 768 toDelete = make(map[string]struct{}) 769 for _, name := range oldShards { 770 paths, err := zoekt.IndexFilePaths(name) 771 if err != nil { 772 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err) 773 } 774 for _, p := range paths { 775 toDelete[p] = struct{}{} 776 } 777 } 778 } 779 780 for tmp, final := range artifactPaths { 781 if err := os.Rename(tmp, final); err != nil { 782 b.buildError = err 783 continue 784 } 785 786 delete(toDelete, final) 787 } 788 789 b.finishedShards = map[string]string{} 790 791 for p := range toDelete { 792 // Don't delete compound shards, set tombstones instead. 793 if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") { 794 if !strings.HasSuffix(p, ".zoekt") { 795 continue 796 } 797 err := zoekt.SetTombstone(p, b.opts.RepositoryDescription.ID) 798 b.buildError = err 799 continue 800 } 801 log.Printf("removing old shard file: %s", p) 802 if err := os.Remove(p); err != nil { 803 b.buildError = err 804 } 805 } 806 807 return b.buildError 808} 809 810// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true 811// iff both slices specify the same set of branch names in the same order. 812func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool { 813 if len(a) != len(b) { 814 return false 815 } 816 817 for i := range a { 818 x, y := a[i], b[i] 819 if x.Name != y.Name { 820 return false 821 } 822 } 823 824 return true 825} 826 827func (b *Builder) flush() error { 828 todo := b.todo 829 b.todo = nil 830 b.size = 0 831 b.errMu.Lock() 832 defer b.errMu.Unlock() 833 if b.buildError != nil { 834 return b.buildError 835 } 836 837 hasShard := b.nextShardNum > 0 838 if len(todo) == 0 && hasShard { 839 return nil 840 } 841 842 shard := b.nextShardNum 843 b.nextShardNum++ 844 845 if b.opts.Parallelism > 1 { 846 b.building.Add(1) 847 b.throttle <- 1 848 go func() { 849 done, err := b.buildShard(todo, shard) 850 <-b.throttle 851 852 b.errMu.Lock() 853 defer b.errMu.Unlock() 854 if err != nil && b.buildError == nil { 855 b.buildError = err 856 } 857 if err == nil { 858 b.finishedShards[done.temp] = done.final 859 } 860 b.building.Done() 861 }() 862 } else { 863 // No goroutines when we're not parallel. This 864 // simplifies memory profiling. 865 done, err := b.buildShard(todo, shard) 866 b.buildError = err 867 if err == nil { 868 b.finishedShards[done.temp] = done.final 869 } 870 871 return b.buildError 872 } 873 874 return nil 875} 876 877// map [0,inf) to [0,1) monotonically 878func squashRange(j int) float64 { 879 x := float64(j) 880 return x / (1 + x) 881} 882 883// IsLowPriority takes a file name and makes an educated guess about its priority 884// in search results. A file is considered low priority if it looks like a test, 885// vendored, or generated file. 886// 887// These 'priority' criteria affects how documents are ordered within a shard. It's 888// also used to help guess a file's rank when we're missing ranking information. 889func IsLowPriority(path string, content []byte) bool { 890 return enry.IsTest(path) || enry.IsVendor(path) || enry.IsGenerated(path, content) 891} 892 893type rankedDoc struct { 894 *zoekt.Document 895 rank []float64 896} 897 898// rank returns a vector of scores which is used at index-time to sort documents 899// before writing them to disk. The order of documents in the shard is important 900// at query time, because earlier documents receive a boost at query time and 901// have a higher chance of being searched before limits kick in. 902func rank(d *zoekt.Document, origIdx int) []float64 { 903 skipped := 0.0 904 if d.SkipReason != "" { 905 skipped = 1.0 906 } 907 908 generated := 0.0 909 if enry.IsGenerated(d.Name, d.Content) { 910 generated = 1.0 911 } 912 913 vendor := 0.0 914 if enry.IsVendor(d.Name) { 915 vendor = 1.0 916 } 917 918 test := 0.0 919 if enry.IsTest(d.Name) { 920 test = 1.0 921 } 922 923 // Smaller is earlier (=better). 924 return []float64{ 925 // Always place skipped docs last 926 skipped, 927 928 // Prefer docs that are not generated 929 generated, 930 931 // Prefer docs that are not vendored 932 vendor, 933 934 // Prefer docs that are not tests 935 test, 936 937 // With short names 938 squashRange(len(d.Name)), 939 940 // With many symbols 941 1.0 - squashRange(len(d.Symbols)), 942 943 // With short content 944 squashRange(len(d.Content)), 945 946 // That is present is as many branches as possible 947 1.0 - squashRange(len(d.Branches)), 948 949 // Preserve original ordering. 950 squashRange(origIdx), 951 } 952} 953 954func sortDocuments(todo []*zoekt.Document) { 955 rs := make([]rankedDoc, 0, len(todo)) 956 for i, t := range todo { 957 rd := rankedDoc{t, rank(t, i)} 958 rs = append(rs, rd) 959 } 960 sort.Slice(rs, func(i, j int) bool { 961 r1 := rs[i].rank 962 r2 := rs[j].rank 963 for i := range r1 { 964 if r1[i] < r2[i] { 965 return true 966 } 967 if r1[i] > r2[i] { 968 return false 969 } 970 } 971 972 return false 973 }) 974 for i := range todo { 975 todo[i] = rs[i].Document 976 } 977} 978 979func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) { 980 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") { 981 err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins) 982 if b.opts.CTagsMustSucceed && err != nil { 983 return nil, err 984 } 985 if err != nil { 986 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err) 987 } 988 } 989 990 name := b.opts.shardName(nextShardNum) 991 992 shardBuilder, err := b.newShardBuilder() 993 if err != nil { 994 return nil, err 995 } 996 997 sortDocuments(todo) 998 999 for idx, t := range todo { 1000 if err := shardBuilder.Add(*t); err != nil { 1001 return nil, err 1002 } 1003 1004 if idx%10_000 == 0 { 1005 b.CheckMemoryUsage() 1006 } 1007 } 1008 1009 return b.writeShard(name, shardBuilder) 1010} 1011 1012// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the 1013// configured threshold. NOTE: this method is expensive and should only be used for debugging. 1014func (b *Builder) CheckMemoryUsage() { 1015 // Don't check memory if heap profiling is disabled, or we've already written 10 profiles 1016 if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 { 1017 return 1018 } 1019 1020 var m runtime.MemStats 1021 runtime.ReadMemStats(&m) 1022 1023 if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() { 1024 defer b.heapProfileMu.Unlock() 1025 1026 log.Printf("writing memory profile, heap usage: %s", humanize.Bytes(m.HeapAlloc)) 1027 name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum)) 1028 f, err := os.Create(name) 1029 if err != nil { 1030 log.Printf("failed to create memory profile file: %v", err) 1031 return 1032 } 1033 1034 err = pprof.WriteHeapProfile(f) 1035 if err != nil { 1036 log.Printf("failed to write memory profile: %v", err) 1037 } 1038 1039 b.heapProfileNum++ 1040 } 1041} 1042 1043func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) { 1044 desc := b.opts.RepositoryDescription 1045 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != "" 1046 desc.SubRepoMap = b.opts.SubRepositories 1047 desc.IndexOptions = b.opts.GetHash() 1048 1049 shardBuilder, err := zoekt.NewIndexBuilder(&desc) 1050 if err != nil { 1051 return nil, err 1052 } 1053 shardBuilder.IndexTime = b.indexTime 1054 shardBuilder.ID = b.id 1055 return shardBuilder, nil 1056} 1057 1058func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) { 1059 dir := filepath.Dir(fn) 1060 if err := os.MkdirAll(dir, 0o700); err != nil { 1061 return nil, err 1062 } 1063 1064 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp") 1065 if err != nil { 1066 return nil, err 1067 } 1068 if runtime.GOOS != "windows" { 1069 if err := f.Chmod(0o666 &^ umask); err != nil { 1070 return nil, err 1071 } 1072 } 1073 1074 defer f.Close() 1075 if err := ib.Write(f); err != nil { 1076 return nil, err 1077 } 1078 fi, err := f.Stat() 1079 if err != nil { 1080 return nil, err 1081 } 1082 if err := f.Close(); err != nil { 1083 return nil, err 1084 } 1085 1086 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n", 1087 fn, 1088 fi.Size(), 1089 float64(fi.Size())/float64(ib.ContentSize()+1), 1090 ib.NumFiles()) 1091 1092 return &finishedShard{f.Name(), fn}, nil 1093} 1094 1095type deltaBranchSetError struct { 1096 shardName string 1097 old, new []zoekt.RepositoryBranch 1098} 1099 1100func (e deltaBranchSetError) Error() string { 1101 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new) 1102} 1103 1104type deltaIndexOptionsMismatchError struct { 1105 shardName string 1106 newOptions HashOptions 1107} 1108 1109func (e *deltaIndexOptionsMismatchError) Error() string { 1110 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions) 1111} 1112 1113// umask holds the Umask of the current process 1114var umask os.FileMode