fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// package build implements a more convenient interface for building 16// zoekt indices. 17package build 18 19import ( 20 "crypto/sha1" 21 "flag" 22 "fmt" 23 "io" 24 "log" 25 "net/url" 26 "os" 27 "os/exec" 28 "path" 29 "path/filepath" 30 "reflect" 31 "runtime" 32 "runtime/pprof" 33 "sort" 34 "strconv" 35 "strings" 36 "sync" 37 "time" 38 39 "github.com/bmatcuk/doublestar" 40 "github.com/dustin/go-humanize" 41 "github.com/go-enry/go-enry/v2" 42 "github.com/rs/xid" 43 44 "github.com/sourcegraph/zoekt" 45 "github.com/sourcegraph/zoekt/ctags" 46) 47 48var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt") 49 50// Branch describes a single branch version. 51type Branch struct { 52 Name string 53 Version string 54} 55 56// Options sets options for the index building. 57type Options struct { 58 // IndexDir is a directory that holds *.zoekt index files. 59 IndexDir string 60 61 // SizeMax is the maximum file size 62 SizeMax int 63 64 // Parallelism is the maximum number of shards to index in parallel 65 Parallelism int 66 67 // ShardMax sets the maximum corpus size for a single shard 68 ShardMax int 69 70 // TrigramMax sets the maximum number of distinct trigrams per document. 71 TrigramMax int 72 73 // RepositoryDescription holds names and URLs for the repository. 74 RepositoryDescription zoekt.Repository 75 76 // SubRepositories is a path => sub repository map. 77 SubRepositories map[string]*zoekt.Repository 78 79 // DisableCTags disables the generation of ctags metadata. 80 DisableCTags bool 81 82 // CtagsPath is the path to the ctags binary to run, or empty 83 // if a valid binary couldn't be found. 84 CTagsPath string 85 86 // Same as CTagsPath but for scip-ctags 87 ScipCTagsPath string 88 89 // If set, ctags must succeed. 90 CTagsMustSucceed bool 91 92 // LargeFiles is a slice of glob patterns, including ** for any number 93 // of directories, where matching file paths should be indexed 94 // regardless of their size. The full pattern syntax is here: 95 // https://github.com/bmatcuk/doublestar/tree/v1#patterns. 96 LargeFiles []string 97 98 // IsDelta is true if this run contains only the changed documents since the 99 // last run. 100 IsDelta bool 101 102 // changedOrRemovedFiles is a list of file paths that have been changed or removed 103 // since the last indexing job for this repository. These files will be tombstoned 104 // in the older shards for this repository. 105 changedOrRemovedFiles []string 106 107 LanguageMap ctags.LanguageMap 108 109 // ShardMerging is true if builder should respect compound shards. This is a 110 // Sourcegraph specific option. 111 ShardMerging bool 112 113 // HeapProfileTriggerBytes is the heap allocation in bytes that will trigger a memory profile. If 0, no memory profile 114 // will be triggered. Note this trigger looks at total heap allocation (which includes both inuse and garbage objects). 115 // 116 // Profiles will be written to files named `index-memory.prof.n` in the index directory. No more than 10 files are written. 117 // 118 // Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile. 119 HeapProfileTriggerBytes uint64 120} 121 122// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building. 123type HashOptions struct { 124 sizeMax int 125 disableCTags bool 126 ctagsPath string 127 cTagsMustSucceed bool 128 largeFiles []string 129} 130 131func (o *Options) HashOptions() HashOptions { 132 return HashOptions{ 133 sizeMax: o.SizeMax, 134 disableCTags: o.DisableCTags, 135 ctagsPath: o.CTagsPath, 136 cTagsMustSucceed: o.CTagsMustSucceed, 137 largeFiles: o.LargeFiles, 138 } 139} 140 141func (o *Options) GetHash() string { 142 h := o.HashOptions() 143 hasher := sha1.New() 144 145 hasher.Write([]byte(h.ctagsPath)) 146 hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed))) 147 hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax))) 148 hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles))) 149 hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags))) 150 151 return fmt.Sprintf("%x", hasher.Sum(nil)) 152} 153 154type largeFilesFlag struct{ *Options } 155 156func (f largeFilesFlag) String() string { 157 // From flag.Value documentation: 158 // 159 // The flag package may call the String method with a zero-valued receiver, 160 // such as a nil pointer. 161 if f.Options == nil { 162 return "" 163 } 164 s := append([]string{""}, f.LargeFiles...) 165 return strings.Join(s, "-large_file ") 166} 167 168func (f largeFilesFlag) Set(value string) error { 169 f.LargeFiles = append(f.LargeFiles, value) 170 return nil 171} 172 173// Flags adds flags for build options to fs. It is the "inverse" of Args. 174func (o *Options) Flags(fs *flag.FlagSet) { 175 x := *o 176 x.SetDefaults() 177 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size") 178 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document") 179 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard") 180 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.") 181 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices") 182 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.") 183 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.") 184 185 // Sourcegraph specific 186 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.") 187 fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.") 188} 189 190// Args generates command line arguments for o. It is the "inverse" of Flags. 191func (o *Options) Args() []string { 192 var args []string 193 194 if o.SizeMax != 0 { 195 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax)) 196 } 197 198 if o.TrigramMax != 0 { 199 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax)) 200 } 201 202 if o.ShardMax != 0 { 203 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax)) 204 } 205 206 if o.Parallelism != 0 { 207 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism)) 208 } 209 210 if o.IndexDir != "" { 211 args = append(args, "-index", o.IndexDir) 212 } 213 214 if o.CTagsMustSucceed { 215 args = append(args, "-require_ctags") 216 } 217 218 for _, a := range o.LargeFiles { 219 args = append(args, "-large_file", a) 220 } 221 222 // Sourcegraph specific 223 if o.DisableCTags { 224 args = append(args, "-disable_ctags") 225 } 226 227 if o.ShardMerging { 228 args = append(args, "-shard_merging") 229 } 230 231 return args 232} 233 234// Builder manages (parallel) creation of uniformly sized shards. The 235// builder buffers up documents until it collects enough documents and 236// then builds a shard and writes. 237type Builder struct { 238 opts Options 239 throttle chan int 240 241 nextShardNum int 242 todo []*zoekt.Document 243 docChecker zoekt.DocChecker 244 size int 245 246 parserBins ctags.ParserBinMap 247 building sync.WaitGroup 248 249 errMu sync.Mutex 250 buildError error 251 252 // temp name => final name for finished shards. We only rename 253 // them once all shards succeed to avoid Frankstein corpuses. 254 finishedShards map[string]string 255 256 // indexTime is set by tests for doing reproducible builds. 257 indexTime time.Time 258 259 // heapProfileMu is used to ensure that only one memory profile is written at a time 260 heapProfileMu sync.Mutex 261 heapProfileNum int 262 263 // a sortable 20 chars long id. 264 id string 265 266 finishCalled bool 267} 268 269type finishedShard struct { 270 temp, final string 271} 272 273func checkCTags() string { 274 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" { 275 return ctags 276 } 277 278 if ctags, err := exec.LookPath("universal-ctags"); err == nil { 279 return ctags 280 } 281 282 return "" 283} 284 285func checkScipCTags() string { 286 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" { 287 return ctags 288 } 289 290 if ctags, err := exec.LookPath("scip-ctags"); err == nil { 291 return ctags 292 } 293 294 return "" 295} 296 297// SetDefaults sets reasonable default options. 298func (o *Options) SetDefaults() { 299 if o.CTagsPath == "" && !o.DisableCTags { 300 o.CTagsPath = checkCTags() 301 } 302 303 if o.ScipCTagsPath == "" && !o.DisableCTags { 304 o.ScipCTagsPath = checkScipCTags() 305 } 306 307 if o.Parallelism == 0 { 308 o.Parallelism = 4 309 } 310 if o.SizeMax == 0 { 311 o.SizeMax = 2 << 20 312 } 313 if o.ShardMax == 0 { 314 o.ShardMax = 100 << 20 315 } 316 if o.TrigramMax == 0 { 317 o.TrigramMax = 20000 318 } 319 320 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" { 321 parsed, _ := url.Parse(o.RepositoryDescription.URL) 322 if parsed != nil { 323 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path) 324 } 325 } 326} 327 328func hashString(s string) string { 329 h := sha1.New() 330 _, _ = io.WriteString(h, s) 331 return fmt.Sprintf("%x", h.Sum(nil)) 332} 333 334// ShardName returns the name the given index shard. 335func (o *Options) shardName(n int) string { 336 return o.shardNameVersion(zoekt.IndexFormatVersion, n) 337} 338 339func (o *Options) shardNameVersion(version, n int) string { 340 abs := url.QueryEscape(o.RepositoryDescription.Name) 341 if len(abs) > 200 { 342 abs = abs[:200] + hashString(abs)[:8] 343 } 344 return filepath.Join(o.IndexDir, 345 fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n)) 346} 347 348type IndexState string 349 350const ( 351 IndexStateMissing IndexState = "missing" 352 IndexStateCorrupt IndexState = "corrupt" 353 IndexStateVersion IndexState = "version-mismatch" 354 IndexStateOption IndexState = "option-mismatch" 355 IndexStateMeta IndexState = "meta-mismatch" 356 IndexStateContent IndexState = "content-mismatch" 357 IndexStateEqual IndexState = "equal" 358) 359 360var readVersions = []struct { 361 IndexFormatVersion int 362 FeatureVersion int 363}{{ 364 IndexFormatVersion: zoekt.IndexFormatVersion, 365 FeatureVersion: zoekt.FeatureVersion, 366}, { 367 IndexFormatVersion: zoekt.NextIndexFormatVersion, 368 FeatureVersion: zoekt.FeatureVersion, 369}} 370 371// IncrementalSkipIndexing returns true if the index present on disk matches 372// the build options. 373func (o *Options) IncrementalSkipIndexing() bool { 374 state, _ := o.IndexState() 375 return state == IndexStateEqual 376} 377 378// IndexState checks how the index present on disk compares to the build 379// options and returns the IndexState and the name of the first shard. 380func (o *Options) IndexState() (IndexState, string) { 381 // Open the latest version we support that is on disk. 382 fn := o.findShard() 383 if fn == "" { 384 return IndexStateMissing, fn 385 } 386 387 repos, index, err := zoekt.ReadMetadataPathAlive(fn) 388 if os.IsNotExist(err) { 389 return IndexStateMissing, fn 390 } else if err != nil { 391 return IndexStateCorrupt, fn 392 } 393 394 for _, v := range readVersions { 395 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion { 396 return IndexStateVersion, fn 397 } 398 } 399 400 var repo *zoekt.Repository 401 for _, cand := range repos { 402 if cand.Name == o.RepositoryDescription.Name { 403 repo = cand 404 break 405 } 406 } 407 408 if repo == nil { 409 return IndexStateCorrupt, fn 410 } 411 412 if repo.IndexOptions != o.GetHash() { 413 return IndexStateOption, fn 414 } 415 416 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) { 417 return IndexStateContent, fn 418 } 419 420 // We can mutate repo since it lives in the scope of this function call. 421 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil { 422 // non-nil err means we are trying to update an immutable field => 423 // reindex content. 424 log.Printf("warn: immutable field changed, requires re-index: %s", err) 425 return IndexStateContent, fn 426 } else if updated { 427 return IndexStateMeta, fn 428 } 429 430 return IndexStateEqual, fn 431} 432 433// FindRepositoryMetadata returns the index metadata for the repository 434// specified in the options. 'ok' is false if the repository's metadata 435// couldn't be found or if an error occurred. 436func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) { 437 shard := o.findShard() 438 if shard == "" { 439 return nil, nil, false, nil 440 } 441 442 repositories, metadata, err := zoekt.ReadMetadataPathAlive(shard) 443 if err != nil { 444 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err) 445 } 446 447 ID := o.RepositoryDescription.ID 448 for _, r := range repositories { 449 // compound shards contain multiple repositories, so we 450 // have to pick only the one we're looking for 451 if r.ID == ID { 452 return r, metadata, true, nil 453 } 454 } 455 456 // If we're here, then we're somehow in a state where we found a matching 457 // shard that's missing the repository metadata we're looking for. This 458 // should never happen. 459 name := o.RepositoryDescription.Name 460 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name) 461} 462 463func (o *Options) findShard() string { 464 for _, v := range readVersions { 465 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 466 if _, err := os.Stat(fn); err == nil { 467 return fn 468 } 469 } 470 471 // Brute force finding the shard in compound shards. We should only hit this 472 // code path for repositories that are not already existing or are in 473 // compound shards. 474 // 475 // TODO add an oracle which can speed this up in the case of repositories 476 // already in compound shards. 477 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt")) 478 if err != nil { 479 return "" 480 } 481 for _, fn := range compoundShards { 482 repos, _, err := zoekt.ReadMetadataPathAlive(fn) 483 if err != nil { 484 continue 485 } 486 for _, repo := range repos { 487 if repo.ID == o.RepositoryDescription.ID { 488 return fn 489 } 490 } 491 } 492 493 return "" 494} 495 496func (o *Options) FindAllShards() []string { 497 for _, v := range readVersions { 498 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 499 if _, err := os.Stat(fn); err == nil { 500 shards := []string{fn} 501 for i := 1; ; i++ { 502 fn := o.shardNameVersion(v.IndexFormatVersion, i) 503 if _, err := os.Stat(fn); err != nil { 504 return shards 505 } 506 shards = append(shards, fn) 507 } 508 } 509 } 510 511 // lazily fallback to findShard which will look for a compound shard. 512 if fn := o.findShard(); fn != "" { 513 return []string{fn} 514 } 515 516 return nil 517} 518 519// IgnoreSizeMax determines whether the max size should be ignored. 520func (o *Options) IgnoreSizeMax(name string) bool { 521 // A pattern match will override preceding pattern matches. 522 for i := len(o.LargeFiles) - 1; i >= 0; i-- { 523 pattern := strings.TrimSpace(o.LargeFiles[i]) 524 negated, validatedPattern := checkIsNegatePattern(pattern) 525 526 if m, _ := doublestar.PathMatch(validatedPattern, name); m { 527 if negated { 528 return false 529 } else { 530 return true 531 } 532 } 533 } 534 535 return false 536} 537 538func checkIsNegatePattern(pattern string) (bool, string) { 539 negate := "!" 540 541 // if negated then strip prefix meta character which identifies negated filter pattern 542 if strings.HasPrefix(pattern, negate) { 543 return true, pattern[len(negate):] 544 } 545 546 return false, pattern 547} 548 549// NewBuilder creates a new Builder instance. 550func NewBuilder(opts Options) (*Builder, error) { 551 opts.SetDefaults() 552 if opts.RepositoryDescription.Name == "" { 553 return nil, fmt.Errorf("builder: must set Name") 554 } 555 556 b := &Builder{ 557 opts: opts, 558 throttle: make(chan int, opts.Parallelism), 559 finishedShards: map[string]string{}, 560 } 561 562 parserBins, err := ctags.NewParserBinMap( 563 b.opts.CTagsPath, 564 b.opts.ScipCTagsPath, 565 opts.LanguageMap, 566 b.opts.CTagsMustSucceed, 567 ) 568 if err != nil { 569 return nil, err 570 } 571 572 b.parserBins = parserBins 573 574 if opts.IsDelta { 575 // Delta shards build on top of previously existing shards. 576 // As a consequence, the shardNum for delta shards starts from 577 // the number following the most recently generated shard - not 0. 578 // 579 // Using this numbering scheme allows all the shards to be 580 // discovered as a set. 581 shards := b.opts.FindAllShards() 582 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one 583 } 584 585 if _, err := b.newShardBuilder(); err != nil { 586 return nil, err 587 } 588 589 now := time.Now() 590 b.indexTime = now 591 b.id = xid.NewWithTime(now).String() 592 593 return b, nil 594} 595 596// AddFile is a convenience wrapper for the Add method 597func (b *Builder) AddFile(name string, content []byte) error { 598 return b.Add(zoekt.Document{Name: name, Content: content}) 599} 600 601func (b *Builder) Add(doc zoekt.Document) error { 602 if b.finishCalled { 603 return nil 604 } 605 606 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name) 607 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile { 608 // We could pass the document on to the shardbuilder, but if 609 // we pass through a part of the source tree with binary/large 610 // files, the corresponding shard would be mostly empty, so 611 // insert a reason here too. 612 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax) 613 } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil { 614 doc.SkipReason = err.Error() 615 doc.Language = "binary" 616 } 617 618 b.todo = append(b.todo, &doc) 619 620 if doc.SkipReason == "" { 621 b.size += len(doc.Name) + len(doc.Content) 622 } else { 623 b.size += len(doc.Name) + len(doc.SkipReason) 624 // Drop the content if we are skipping the document. Skipped content is not counted towards the 625 // shard size limit, so otherwise we might buffer too much data in memory before flushing. 626 doc.Content = nil 627 } 628 629 if b.size > b.opts.ShardMax { 630 return b.flush() 631 } 632 633 return nil 634} 635 636// MarkFileAsChangedOrRemoved indicates that the file specified by the given path 637// has been changed or removed since the last indexing job for this repository. 638// 639// If this build is a delta build, these files will be tombstoned in the older shards for this repository. 640func (b *Builder) MarkFileAsChangedOrRemoved(path string) { 641 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path) 642} 643 644// Finish creates a last shard from the buffered documents, and clears 645// stale shards from previous runs. This should always be called, also 646// in failure cases, to ensure cleanup. 647// 648// It is safe to call Finish() multiple times. 649func (b *Builder) Finish() error { 650 if b.finishCalled { 651 return b.buildError 652 } 653 654 b.finishCalled = true 655 656 b.flush() 657 b.building.Wait() 658 659 if b.buildError != nil { 660 for tmp := range b.finishedShards { 661 log.Printf("Builder.Finish %s", tmp) 662 os.Remove(tmp) 663 } 664 b.finishedShards = map[string]string{} 665 return b.buildError 666 } 667 668 // map of temporary -> final names for all updated shards + shard metadata files 669 artifactPaths := make(map[string]string) 670 for tmp, final := range b.finishedShards { 671 artifactPaths[tmp] = final 672 } 673 674 oldShards := b.opts.FindAllShards() 675 676 if b.opts.IsDelta { 677 // Delta shard builds need to update FileTombstone and branch commit information for all 678 // existing shards 679 for _, shard := range oldShards { 680 repositories, _, err := zoekt.ReadMetadataPathAlive(shard) 681 if err != nil { 682 return fmt.Errorf("reading metadata from shard %q: %w", shard, err) 683 } 684 685 if len(repositories) > 1 { 686 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard) 687 } 688 689 if len(repositories) == 0 { 690 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard) 691 } 692 693 repository := repositories[0] 694 if repository.ID != b.opts.RepositoryDescription.ID { 695 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name) 696 } 697 698 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil { 699 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles)) 700 } 701 702 for _, f := range b.opts.changedOrRemovedFiles { 703 repository.FileTombstones[f] = struct{}{} 704 } 705 706 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) { 707 return deltaBranchSetError{ 708 shardName: shard, 709 old: repository.Branches, 710 new: b.opts.RepositoryDescription.Branches, 711 } 712 } 713 714 if b.opts.GetHash() != repository.IndexOptions { 715 return &deltaIndexOptionsMismatchError{ 716 shardName: shard, 717 newOptions: b.opts.HashOptions(), 718 } 719 } 720 721 repository.Branches = b.opts.RepositoryDescription.Branches 722 723 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate 724 725 tempPath, finalPath, err := zoekt.JsonMarshalRepoMetaTemp(shard, repository) 726 if err != nil { 727 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err) 728 } 729 730 artifactPaths[tempPath] = finalPath 731 } 732 } 733 734 // We mark finished shards as empty when we successfully finish. Return now 735 // to allow call sites to call Finish idempotently. 736 if len(artifactPaths) == 0 { 737 return b.buildError 738 } 739 740 // Collect a map of the old shards on disk. For each new shard we replace we 741 // delete it from toDelete. Anything remaining in toDelete will be removed 742 // after we have renamed everything into place. 743 744 var toDelete map[string]struct{} 745 if !b.opts.IsDelta { 746 // Non-delta shard builds delete all existing shards before they write out 747 // new ones. 748 // By contrast, delta shard builds work by stacking changes on top of existing shards. 749 // So, we skip populating the toDelete map if we're building delta shards. 750 751 toDelete = make(map[string]struct{}) 752 for _, name := range oldShards { 753 paths, err := zoekt.IndexFilePaths(name) 754 if err != nil { 755 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err) 756 } 757 for _, p := range paths { 758 toDelete[p] = struct{}{} 759 } 760 } 761 } 762 763 for tmp, final := range artifactPaths { 764 if err := os.Rename(tmp, final); err != nil { 765 b.buildError = err 766 continue 767 } 768 769 delete(toDelete, final) 770 } 771 772 b.finishedShards = map[string]string{} 773 774 for p := range toDelete { 775 // Don't delete compound shards, set tombstones instead. 776 if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") { 777 if !strings.HasSuffix(p, ".zoekt") { 778 continue 779 } 780 err := zoekt.SetTombstone(p, b.opts.RepositoryDescription.ID) 781 b.buildError = err 782 continue 783 } 784 log.Printf("removing old shard file: %s", p) 785 if err := os.Remove(p); err != nil { 786 b.buildError = err 787 } 788 } 789 790 return b.buildError 791} 792 793// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true 794// iff both slices specify the same set of branch names in the same order. 795func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool { 796 if len(a) != len(b) { 797 return false 798 } 799 800 for i := range a { 801 x, y := a[i], b[i] 802 if x.Name != y.Name { 803 return false 804 } 805 } 806 807 return true 808} 809 810func (b *Builder) flush() error { 811 todo := b.todo 812 b.todo = nil 813 b.size = 0 814 b.errMu.Lock() 815 defer b.errMu.Unlock() 816 if b.buildError != nil { 817 return b.buildError 818 } 819 820 hasShard := b.nextShardNum > 0 821 if len(todo) == 0 && hasShard { 822 return nil 823 } 824 825 shard := b.nextShardNum 826 b.nextShardNum++ 827 828 if b.opts.Parallelism > 1 { 829 b.building.Add(1) 830 b.throttle <- 1 831 go func() { 832 done, err := b.buildShard(todo, shard) 833 <-b.throttle 834 835 b.errMu.Lock() 836 defer b.errMu.Unlock() 837 if err != nil && b.buildError == nil { 838 b.buildError = err 839 } 840 if err == nil { 841 b.finishedShards[done.temp] = done.final 842 } 843 b.building.Done() 844 }() 845 } else { 846 // No goroutines when we're not parallel. This 847 // simplifies memory profiling. 848 done, err := b.buildShard(todo, shard) 849 b.buildError = err 850 if err == nil { 851 b.finishedShards[done.temp] = done.final 852 } 853 854 return b.buildError 855 } 856 857 return nil 858} 859 860// map [0,inf) to [0,1) monotonically 861func squashRange(j int) float64 { 862 x := float64(j) 863 return x / (1 + x) 864} 865 866// IsLowPriority takes a file name and makes an educated guess about its priority 867// in search results. A file is considered low priority if it looks like a test, 868// vendored, or generated file. 869// 870// These 'priority' criteria affects how documents are ordered within a shard. It's 871// also used to help guess a file's rank when we're missing ranking information. 872func IsLowPriority(path string, content []byte) bool { 873 return enry.IsTest(path) || enry.IsVendor(path) || enry.IsGenerated(path, content) 874} 875 876type rankedDoc struct { 877 *zoekt.Document 878 rank []float64 879} 880 881// rank returns a vector of scores which is used at index-time to sort documents 882// before writing them to disk. The order of documents in the shard is important 883// at query time, because earlier documents receive a boost at query time and 884// have a higher chance of being searched before limits kick in. 885func rank(d *zoekt.Document, origIdx int) []float64 { 886 skipped := 0.0 887 if d.SkipReason != "" { 888 skipped = 1.0 889 } 890 891 generated := 0.0 892 if enry.IsGenerated(d.Name, d.Content) { 893 generated = 1.0 894 } 895 896 vendor := 0.0 897 if enry.IsVendor(d.Name) { 898 vendor = 1.0 899 } 900 901 test := 0.0 902 if enry.IsTest(d.Name) { 903 test = 1.0 904 } 905 906 // Smaller is earlier (=better). 907 return []float64{ 908 // Always place skipped docs last 909 skipped, 910 911 // Prefer docs that are not generated 912 generated, 913 914 // Prefer docs that are not vendored 915 vendor, 916 917 // Prefer docs that are not tests 918 test, 919 920 // With short names 921 squashRange(len(d.Name)), 922 923 // With many symbols 924 1.0 - squashRange(len(d.Symbols)), 925 926 // With short content 927 squashRange(len(d.Content)), 928 929 // That is present is as many branches as possible 930 1.0 - squashRange(len(d.Branches)), 931 932 // Preserve original ordering. 933 squashRange(origIdx), 934 } 935} 936 937func sortDocuments(todo []*zoekt.Document) { 938 rs := make([]rankedDoc, 0, len(todo)) 939 for i, t := range todo { 940 rd := rankedDoc{t, rank(t, i)} 941 rs = append(rs, rd) 942 } 943 sort.Slice(rs, func(i, j int) bool { 944 r1 := rs[i].rank 945 r2 := rs[j].rank 946 for i := range r1 { 947 if r1[i] < r2[i] { 948 return true 949 } 950 if r1[i] > r2[i] { 951 return false 952 } 953 } 954 955 return false 956 }) 957 for i := range todo { 958 todo[i] = rs[i].Document 959 } 960} 961 962func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) { 963 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") { 964 err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins) 965 if b.opts.CTagsMustSucceed && err != nil { 966 return nil, err 967 } 968 if err != nil { 969 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err) 970 } 971 } 972 973 name := b.opts.shardName(nextShardNum) 974 975 shardBuilder, err := b.newShardBuilder() 976 if err != nil { 977 return nil, err 978 } 979 980 sortDocuments(todo) 981 982 for idx, t := range todo { 983 if err := shardBuilder.Add(*t); err != nil { 984 return nil, err 985 } 986 987 if idx%10_000 == 0 { 988 b.CheckMemoryUsage() 989 } 990 } 991 992 return b.writeShard(name, shardBuilder) 993} 994 995// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the 996// configured threshold. NOTE: this method is expensive and should only be used for debugging. 997func (b *Builder) CheckMemoryUsage() { 998 // Don't check memory if heap profiling is disabled, or we've already written 10 profiles 999 if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 { 1000 return 1001 } 1002 1003 var m runtime.MemStats 1004 runtime.ReadMemStats(&m) 1005 1006 if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() { 1007 defer b.heapProfileMu.Unlock() 1008 1009 log.Printf("writing memory profile, allocated heap: %s", humanize.Bytes(m.HeapAlloc)) 1010 name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum)) 1011 f, err := os.Create(name) 1012 if err != nil { 1013 log.Printf("failed to create memory profile file: %v", err) 1014 return 1015 } 1016 1017 err = pprof.WriteHeapProfile(f) 1018 if err != nil { 1019 log.Printf("failed to write memory profile: %v", err) 1020 } 1021 1022 b.heapProfileNum++ 1023 } 1024} 1025 1026func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) { 1027 desc := b.opts.RepositoryDescription 1028 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != "" 1029 desc.SubRepoMap = b.opts.SubRepositories 1030 desc.IndexOptions = b.opts.GetHash() 1031 1032 shardBuilder, err := zoekt.NewIndexBuilder(&desc) 1033 if err != nil { 1034 return nil, err 1035 } 1036 shardBuilder.IndexTime = b.indexTime 1037 shardBuilder.ID = b.id 1038 return shardBuilder, nil 1039} 1040 1041func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) { 1042 dir := filepath.Dir(fn) 1043 if err := os.MkdirAll(dir, 0o700); err != nil { 1044 return nil, err 1045 } 1046 1047 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp") 1048 if err != nil { 1049 return nil, err 1050 } 1051 if runtime.GOOS != "windows" { 1052 if err := f.Chmod(0o666 &^ umask); err != nil { 1053 return nil, err 1054 } 1055 } 1056 1057 defer f.Close() 1058 if err := ib.Write(f); err != nil { 1059 return nil, err 1060 } 1061 fi, err := f.Stat() 1062 if err != nil { 1063 return nil, err 1064 } 1065 if err := f.Close(); err != nil { 1066 return nil, err 1067 } 1068 1069 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n", 1070 fn, 1071 fi.Size(), 1072 float64(fi.Size())/float64(ib.ContentSize()+1), 1073 ib.NumFiles()) 1074 1075 return &finishedShard{f.Name(), fn}, nil 1076} 1077 1078type deltaBranchSetError struct { 1079 shardName string 1080 old, new []zoekt.RepositoryBranch 1081} 1082 1083func (e deltaBranchSetError) Error() string { 1084 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new) 1085} 1086 1087type deltaIndexOptionsMismatchError struct { 1088 shardName string 1089 newOptions HashOptions 1090} 1091 1092func (e *deltaIndexOptionsMismatchError) Error() string { 1093 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions) 1094} 1095 1096// umask holds the Umask of the current process 1097var umask os.FileMode