fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// package build implements a more convenient interface for building 16// zoekt indices. 17package build 18 19import ( 20 "crypto/sha1" 21 "flag" 22 "fmt" 23 "io" 24 "log" 25 "net/url" 26 "os" 27 "os/exec" 28 "path" 29 "path/filepath" 30 "reflect" 31 "runtime" 32 "runtime/pprof" 33 "sort" 34 "strconv" 35 "strings" 36 "sync" 37 "time" 38 39 "github.com/bmatcuk/doublestar" 40 "github.com/go-enry/go-enry/v2" 41 "github.com/rs/xid" 42 43 "github.com/sourcegraph/zoekt" 44 "github.com/sourcegraph/zoekt/ctags" 45) 46 47var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt") 48 49// Branch describes a single branch version. 50type Branch struct { 51 Name string 52 Version string 53} 54 55// Options sets options for the index building. 56type Options struct { 57 // IndexDir is a directory that holds *.zoekt index files. 58 IndexDir string 59 60 // SizeMax is the maximum file size 61 SizeMax int 62 63 // Parallelism is the maximum number of shards to index in parallel 64 Parallelism int 65 66 // ShardMax sets the maximum corpus size for a single shard 67 ShardMax int 68 69 // TrigramMax sets the maximum number of distinct trigrams per document. 70 TrigramMax int 71 72 // RepositoryDescription holds names and URLs for the repository. 73 RepositoryDescription zoekt.Repository 74 75 // SubRepositories is a path => sub repository map. 76 SubRepositories map[string]*zoekt.Repository 77 78 // DisableCTags disables the generation of ctags metadata. 79 DisableCTags bool 80 81 // CtagsPath is the path to the ctags binary to run, or empty 82 // if a valid binary couldn't be found. 83 CTagsPath string 84 85 // Same as CTagsPath but for scip-ctags 86 ScipCTagsPath string 87 88 // If set, ctags must succeed. 89 CTagsMustSucceed bool 90 91 // Write memory profiles to this file. 92 MemProfile string 93 94 // LargeFiles is a slice of glob patterns, including ** for any number 95 // of directories, where matching file paths should be indexed 96 // regardless of their size. The full pattern syntax is here: 97 // https://github.com/bmatcuk/doublestar/tree/v1#patterns. 98 LargeFiles []string 99 100 // IsDelta is true if this run contains only the changed documents since the 101 // last run. 102 IsDelta bool 103 104 // DocumentRanksPath is the path to the file with document ranks. If empty, 105 // ranks will be computed on-the-fly. 106 DocumentRanksPath string 107 108 // DocumentRanksVersion is a string which when changed will cause us to 109 // reindex a shard. This field is used so that when the contents of 110 // DocumentRanksPath changes, we can reindex. 111 DocumentRanksVersion string 112 113 // changedOrRemovedFiles is a list of file paths that have been changed or removed 114 // since the last indexing job for this repository. These files will be tombstoned 115 // in the older shards for this repository. 116 changedOrRemovedFiles []string 117 118 LanguageMap ctags.LanguageMap 119 120 // ShardMerging is true if builder should respect compound shards. This is a 121 // Sourcegraph specific option. 122 ShardMerging bool 123} 124 125// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building. 126type HashOptions struct { 127 sizeMax int 128 disableCTags bool 129 ctagsPath string 130 cTagsMustSucceed bool 131 largeFiles []string 132 133 // documentRankVersion is an experimental field which will change when the 134 // DocumentRanksPath content changes. If empty we ignore it. 135 documentRankVersion string 136} 137 138func (o *Options) HashOptions() HashOptions { 139 return HashOptions{ 140 sizeMax: o.SizeMax, 141 disableCTags: o.DisableCTags, 142 ctagsPath: o.CTagsPath, 143 cTagsMustSucceed: o.CTagsMustSucceed, 144 largeFiles: o.LargeFiles, 145 documentRankVersion: o.DocumentRanksVersion, 146 } 147} 148 149func (o *Options) GetHash() string { 150 h := o.HashOptions() 151 hasher := sha1.New() 152 153 hasher.Write([]byte(h.ctagsPath)) 154 hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed))) 155 hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax))) 156 hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles))) 157 hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags))) 158 159 if h.documentRankVersion != "" { 160 hasher.Write([]byte{0}) 161 io.WriteString(hasher, h.documentRankVersion) 162 } 163 164 return fmt.Sprintf("%x", hasher.Sum(nil)) 165} 166 167type largeFilesFlag struct{ *Options } 168 169func (f largeFilesFlag) String() string { 170 // From flag.Value documentation: 171 // 172 // The flag package may call the String method with a zero-valued receiver, 173 // such as a nil pointer. 174 if f.Options == nil { 175 return "" 176 } 177 s := append([]string{""}, f.LargeFiles...) 178 return strings.Join(s, "-large_file ") 179} 180 181func (f largeFilesFlag) Set(value string) error { 182 f.LargeFiles = append(f.LargeFiles, value) 183 return nil 184} 185 186// Flags adds flags for build options to fs. It is the "inverse" of Args. 187func (o *Options) Flags(fs *flag.FlagSet) { 188 x := *o 189 x.SetDefaults() 190 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size") 191 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document") 192 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard") 193 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.") 194 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices") 195 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.") 196 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.") 197 fs.StringVar(&o.MemProfile, "memprofile", "", "write memory profile(s) to `file.shardnum`. Note: sets parallelism to 1.") 198 199 // Sourcegraph specific 200 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.") 201 fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.") 202} 203 204// Args generates command line arguments for o. It is the "inverse" of Flags. 205func (o *Options) Args() []string { 206 var args []string 207 208 if o.SizeMax != 0 { 209 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax)) 210 } 211 212 if o.TrigramMax != 0 { 213 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax)) 214 } 215 216 if o.ShardMax != 0 { 217 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax)) 218 } 219 220 if o.Parallelism != 0 { 221 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism)) 222 } 223 224 if o.IndexDir != "" { 225 args = append(args, "-index", o.IndexDir) 226 } 227 228 if o.CTagsMustSucceed { 229 args = append(args, "-require_ctags") 230 } 231 232 for _, a := range o.LargeFiles { 233 args = append(args, "-large_file", a) 234 } 235 236 // Sourcegraph specific 237 if o.DisableCTags { 238 args = append(args, "-disable_ctags") 239 } 240 241 if o.ShardMerging { 242 args = append(args, "-shard_merging") 243 } 244 245 return args 246} 247 248// Builder manages (parallel) creation of uniformly sized shards. The 249// builder buffers up documents until it collects enough documents and 250// then builds a shard and writes. 251type Builder struct { 252 opts Options 253 throttle chan int 254 255 nextShardNum int 256 todo []*zoekt.Document 257 docChecker zoekt.DocChecker 258 size int 259 260 parserBins ctags.ParserBinMap 261 building sync.WaitGroup 262 263 errMu sync.Mutex 264 buildError error 265 266 // temp name => final name for finished shards. We only rename 267 // them once all shards succeed to avoid Frankstein corpuses. 268 finishedShards map[string]string 269 270 // indexTime is set by tests for doing reproducible builds. 271 indexTime time.Time 272 273 // a sortable 20 chars long id. 274 id string 275 276 finishCalled bool 277} 278 279type finishedShard struct { 280 temp, final string 281} 282 283func checkCTags() string { 284 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" { 285 return ctags 286 } 287 288 if ctags, err := exec.LookPath("universal-ctags"); err == nil { 289 return ctags 290 } 291 292 return "" 293} 294 295func checkScipCTags() string { 296 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" { 297 return ctags 298 } 299 300 if ctags, err := exec.LookPath("scip-ctags"); err == nil { 301 return ctags 302 } 303 304 return "" 305} 306 307// SetDefaults sets reasonable default options. 308func (o *Options) SetDefaults() { 309 if o.CTagsPath == "" && !o.DisableCTags { 310 o.CTagsPath = checkCTags() 311 } 312 313 if o.ScipCTagsPath == "" && !o.DisableCTags { 314 o.ScipCTagsPath = checkScipCTags() 315 } 316 317 if o.Parallelism == 0 { 318 o.Parallelism = 4 319 } 320 if o.SizeMax == 0 { 321 o.SizeMax = 2 << 20 322 } 323 if o.ShardMax == 0 { 324 o.ShardMax = 100 << 20 325 } 326 if o.TrigramMax == 0 { 327 o.TrigramMax = 20000 328 } 329 330 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" { 331 parsed, _ := url.Parse(o.RepositoryDescription.URL) 332 if parsed != nil { 333 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path) 334 } 335 } 336} 337 338func hashString(s string) string { 339 h := sha1.New() 340 _, _ = io.WriteString(h, s) 341 return fmt.Sprintf("%x", h.Sum(nil)) 342} 343 344// ShardName returns the name the given index shard. 345func (o *Options) shardName(n int) string { 346 return o.shardNameVersion(zoekt.IndexFormatVersion, n) 347} 348 349func (o *Options) shardNameVersion(version, n int) string { 350 abs := url.QueryEscape(o.RepositoryDescription.Name) 351 if len(abs) > 200 { 352 abs = abs[:200] + hashString(abs)[:8] 353 } 354 return filepath.Join(o.IndexDir, 355 fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n)) 356} 357 358type IndexState string 359 360const ( 361 IndexStateMissing IndexState = "missing" 362 IndexStateCorrupt IndexState = "corrupt" 363 IndexStateVersion IndexState = "version-mismatch" 364 IndexStateOption IndexState = "option-mismatch" 365 IndexStateMeta IndexState = "meta-mismatch" 366 IndexStateContent IndexState = "content-mismatch" 367 IndexStateEqual IndexState = "equal" 368) 369 370var readVersions = []struct { 371 IndexFormatVersion int 372 FeatureVersion int 373}{{ 374 IndexFormatVersion: zoekt.IndexFormatVersion, 375 FeatureVersion: zoekt.FeatureVersion, 376}, { 377 IndexFormatVersion: zoekt.NextIndexFormatVersion, 378 FeatureVersion: zoekt.FeatureVersion, 379}} 380 381// IncrementalSkipIndexing returns true if the index present on disk matches 382// the build options. 383func (o *Options) IncrementalSkipIndexing() bool { 384 state, _ := o.IndexState() 385 return state == IndexStateEqual 386} 387 388// IndexState checks how the index present on disk compares to the build 389// options and returns the IndexState and the name of the first shard. 390func (o *Options) IndexState() (IndexState, string) { 391 // Open the latest version we support that is on disk. 392 fn := o.findShard() 393 if fn == "" { 394 return IndexStateMissing, fn 395 } 396 397 repos, index, err := zoekt.ReadMetadataPathAlive(fn) 398 if os.IsNotExist(err) { 399 return IndexStateMissing, fn 400 } else if err != nil { 401 return IndexStateCorrupt, fn 402 } 403 404 for _, v := range readVersions { 405 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion { 406 return IndexStateVersion, fn 407 } 408 } 409 410 var repo *zoekt.Repository 411 for _, cand := range repos { 412 if cand.Name == o.RepositoryDescription.Name { 413 repo = cand 414 break 415 } 416 } 417 418 if repo == nil { 419 return IndexStateCorrupt, fn 420 } 421 422 if repo.IndexOptions != o.GetHash() { 423 return IndexStateOption, fn 424 } 425 426 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) { 427 return IndexStateContent, fn 428 } 429 430 // We can mutate repo since it lives in the scope of this function call. 431 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil { 432 // non-nil err means we are trying to update an immutable field => 433 // reindex content. 434 log.Printf("warn: immutable field changed, requires re-index: %s", err) 435 return IndexStateContent, fn 436 } else if updated { 437 return IndexStateMeta, fn 438 } 439 440 return IndexStateEqual, fn 441} 442 443// FindRepositoryMetadata returns the index metadata for the repository 444// specified in the options. 'ok' is false if the repository's metadata 445// couldn't be found or if an error occurred. 446func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) { 447 shard := o.findShard() 448 if shard == "" { 449 return nil, nil, false, nil 450 } 451 452 repositories, metadata, err := zoekt.ReadMetadataPathAlive(shard) 453 if err != nil { 454 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err) 455 } 456 457 ID := o.RepositoryDescription.ID 458 for _, r := range repositories { 459 // compound shards contain multiple repositories, so we 460 // have to pick only the one we're looking for 461 if r.ID == ID { 462 return r, metadata, true, nil 463 } 464 } 465 466 // If we're here, then we're somehow in a state where we found a matching 467 // shard that's missing the repository metadata we're looking for. This 468 // should never happen. 469 name := o.RepositoryDescription.Name 470 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name) 471} 472 473func (o *Options) findShard() string { 474 for _, v := range readVersions { 475 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 476 if _, err := os.Stat(fn); err == nil { 477 return fn 478 } 479 } 480 481 // Brute force finding the shard in compound shards. We should only hit this 482 // code path for repositories that are not already existing or are in 483 // compound shards. 484 // 485 // TODO add an oracle which can speed this up in the case of repositories 486 // already in compound shards. 487 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt")) 488 if err != nil { 489 return "" 490 } 491 for _, fn := range compoundShards { 492 repos, _, err := zoekt.ReadMetadataPathAlive(fn) 493 if err != nil { 494 continue 495 } 496 for _, repo := range repos { 497 if repo.ID == o.RepositoryDescription.ID { 498 return fn 499 } 500 } 501 } 502 503 return "" 504} 505 506func (o *Options) FindAllShards() []string { 507 for _, v := range readVersions { 508 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 509 if _, err := os.Stat(fn); err == nil { 510 shards := []string{fn} 511 for i := 1; ; i++ { 512 fn := o.shardNameVersion(v.IndexFormatVersion, i) 513 if _, err := os.Stat(fn); err != nil { 514 return shards 515 } 516 shards = append(shards, fn) 517 } 518 } 519 } 520 521 // lazily fallback to findShard which will look for a compound shard. 522 if fn := o.findShard(); fn != "" { 523 return []string{fn} 524 } 525 526 return nil 527} 528 529// IgnoreSizeMax determines whether the max size should be ignored. 530func (o *Options) IgnoreSizeMax(name string) bool { 531 // A pattern match will override preceding pattern matches. 532 for i := len(o.LargeFiles) - 1; i >= 0; i-- { 533 pattern := strings.TrimSpace(o.LargeFiles[i]) 534 negated, validatedPattern := checkIsNegatePattern(pattern) 535 536 if m, _ := doublestar.PathMatch(validatedPattern, name); m { 537 if negated { 538 return false 539 } else { 540 return true 541 } 542 } 543 } 544 545 return false 546} 547 548func checkIsNegatePattern(pattern string) (bool, string) { 549 negate := "!" 550 551 // if negated then strip prefix meta character which identifies negated filter pattern 552 if strings.HasPrefix(pattern, negate) { 553 return true, pattern[len(negate):] 554 } 555 556 return false, pattern 557} 558 559// NewBuilder creates a new Builder instance. 560func NewBuilder(opts Options) (*Builder, error) { 561 opts.SetDefaults() 562 if opts.RepositoryDescription.Name == "" { 563 return nil, fmt.Errorf("builder: must set Name") 564 } 565 566 b := &Builder{ 567 opts: opts, 568 throttle: make(chan int, opts.Parallelism), 569 finishedShards: map[string]string{}, 570 } 571 572 parserBins, err := ctags.NewParserBinMap( 573 b.opts.CTagsPath, 574 b.opts.ScipCTagsPath, 575 opts.LanguageMap, 576 b.opts.CTagsMustSucceed, 577 ) 578 if err != nil { 579 return nil, err 580 } 581 582 b.parserBins = parserBins 583 584 if opts.IsDelta { 585 // Delta shards build on top of previously existing shards. 586 // As a consequence, the shardNum for delta shards starts from 587 // the number following the most recently generated shard - not 0. 588 // 589 // Using this numbering scheme allows all the shards to be 590 // discovered as a set. 591 shards := b.opts.FindAllShards() 592 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one 593 } 594 595 if _, err := b.newShardBuilder(); err != nil { 596 return nil, err 597 } 598 599 now := time.Now() 600 b.indexTime = now 601 b.id = xid.NewWithTime(now).String() 602 603 return b, nil 604} 605 606// AddFile is a convenience wrapper for the Add method 607func (b *Builder) AddFile(name string, content []byte) error { 608 return b.Add(zoekt.Document{Name: name, Content: content}) 609} 610 611func (b *Builder) Add(doc zoekt.Document) error { 612 if b.finishCalled { 613 return nil 614 } 615 616 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name) 617 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile { 618 // We could pass the document on to the shardbuilder, but if 619 // we pass through a part of the source tree with binary/large 620 // files, the corresponding shard would be mostly empty, so 621 // insert a reason here too. 622 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax) 623 } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil { 624 doc.SkipReason = err.Error() 625 doc.Language = "binary" 626 } 627 628 b.todo = append(b.todo, &doc) 629 630 if doc.SkipReason == "" { 631 b.size += len(doc.Name) + len(doc.Content) 632 } else { 633 b.size += len(doc.Name) + len(doc.SkipReason) 634 // Drop the content if we are skipping the document. Skipped content is not counted towards the 635 // shard size limit, so otherwise we might buffer too much data in memory before flushing. 636 doc.Content = nil 637 } 638 639 if b.size > b.opts.ShardMax { 640 return b.flush() 641 } 642 643 return nil 644} 645 646// MarkFileAsChangedOrRemoved indicates that the file specified by the given path 647// has been changed or removed since the last indexing job for this repository. 648// 649// If this build is a delta build, these files will be tombstoned in the older shards for this repository. 650func (b *Builder) MarkFileAsChangedOrRemoved(path string) { 651 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path) 652} 653 654// Finish creates a last shard from the buffered documents, and clears 655// stale shards from previous runs. This should always be called, also 656// in failure cases, to ensure cleanup. 657// 658// It is safe to call Finish() multiple times. 659func (b *Builder) Finish() error { 660 if b.finishCalled { 661 return b.buildError 662 } 663 664 b.finishCalled = true 665 666 b.flush() 667 b.building.Wait() 668 669 if b.buildError != nil { 670 for tmp := range b.finishedShards { 671 log.Printf("Builder.Finish %s", tmp) 672 os.Remove(tmp) 673 } 674 b.finishedShards = map[string]string{} 675 return b.buildError 676 } 677 678 // map of temporary -> final names for all updated shards + shard metadata files 679 artifactPaths := make(map[string]string) 680 for tmp, final := range b.finishedShards { 681 artifactPaths[tmp] = final 682 } 683 684 oldShards := b.opts.FindAllShards() 685 686 if b.opts.IsDelta { 687 // Delta shard builds need to update FileTombstone and branch commit information for all 688 // existing shards 689 for _, shard := range oldShards { 690 repositories, _, err := zoekt.ReadMetadataPathAlive(shard) 691 if err != nil { 692 return fmt.Errorf("reading metadata from shard %q: %w", shard, err) 693 } 694 695 if len(repositories) > 1 { 696 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard) 697 } 698 699 if len(repositories) == 0 { 700 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard) 701 } 702 703 repository := repositories[0] 704 if repository.ID != b.opts.RepositoryDescription.ID { 705 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name) 706 } 707 708 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil { 709 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles)) 710 } 711 712 for _, f := range b.opts.changedOrRemovedFiles { 713 repository.FileTombstones[f] = struct{}{} 714 } 715 716 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) { 717 return deltaBranchSetError{ 718 shardName: shard, 719 old: repository.Branches, 720 new: b.opts.RepositoryDescription.Branches, 721 } 722 } 723 724 if b.opts.GetHash() != repository.IndexOptions { 725 return &deltaIndexOptionsMismatchError{ 726 shardName: shard, 727 newOptions: b.opts.HashOptions(), 728 } 729 } 730 731 repository.Branches = b.opts.RepositoryDescription.Branches 732 733 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate 734 735 tempPath, finalPath, err := zoekt.JsonMarshalRepoMetaTemp(shard, repository) 736 if err != nil { 737 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err) 738 } 739 740 artifactPaths[tempPath] = finalPath 741 } 742 } 743 744 // We mark finished shards as empty when we successfully finish. Return now 745 // to allow call sites to call Finish idempotently. 746 if len(artifactPaths) == 0 { 747 return b.buildError 748 } 749 750 // Collect a map of the old shards on disk. For each new shard we replace we 751 // delete it from toDelete. Anything remaining in toDelete will be removed 752 // after we have renamed everything into place. 753 754 var toDelete map[string]struct{} 755 if !b.opts.IsDelta { 756 // Non-delta shard builds delete all existing shards before they write out 757 // new ones. 758 // By contrast, delta shard builds work by stacking changes on top of existing shards. 759 // So, we skip populating the toDelete map if we're building delta shards. 760 761 toDelete = make(map[string]struct{}) 762 for _, name := range oldShards { 763 paths, err := zoekt.IndexFilePaths(name) 764 if err != nil { 765 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err) 766 } 767 for _, p := range paths { 768 toDelete[p] = struct{}{} 769 } 770 } 771 } 772 773 for tmp, final := range artifactPaths { 774 if err := os.Rename(tmp, final); err != nil { 775 b.buildError = err 776 continue 777 } 778 779 delete(toDelete, final) 780 } 781 782 b.finishedShards = map[string]string{} 783 784 for p := range toDelete { 785 // Don't delete compound shards, set tombstones instead. 786 if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") { 787 if !strings.HasSuffix(p, ".zoekt") { 788 continue 789 } 790 err := zoekt.SetTombstone(p, b.opts.RepositoryDescription.ID) 791 b.buildError = err 792 continue 793 } 794 log.Printf("removing old shard file: %s", p) 795 if err := os.Remove(p); err != nil { 796 b.buildError = err 797 } 798 } 799 800 return b.buildError 801} 802 803// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true 804// iff both slices specify the same set of branch names in the same order. 805func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool { 806 if len(a) != len(b) { 807 return false 808 } 809 810 for i := range a { 811 x, y := a[i], b[i] 812 if x.Name != y.Name { 813 return false 814 } 815 } 816 817 return true 818} 819 820func (b *Builder) flush() error { 821 todo := b.todo 822 b.todo = nil 823 b.size = 0 824 b.errMu.Lock() 825 defer b.errMu.Unlock() 826 if b.buildError != nil { 827 return b.buildError 828 } 829 830 hasShard := b.nextShardNum > 0 831 if len(todo) == 0 && hasShard { 832 return nil 833 } 834 835 shard := b.nextShardNum 836 b.nextShardNum++ 837 838 if b.opts.Parallelism > 1 && b.opts.MemProfile == "" { 839 b.building.Add(1) 840 b.throttle <- 1 841 go func() { 842 done, err := b.buildShard(todo, shard) 843 <-b.throttle 844 845 b.errMu.Lock() 846 defer b.errMu.Unlock() 847 if err != nil && b.buildError == nil { 848 b.buildError = err 849 } 850 if err == nil { 851 b.finishedShards[done.temp] = done.final 852 } 853 b.building.Done() 854 }() 855 } else { 856 // No goroutines when we're not parallel. This 857 // simplifies memory profiling. 858 done, err := b.buildShard(todo, shard) 859 b.buildError = err 860 if err == nil { 861 b.finishedShards[done.temp] = done.final 862 } 863 if b.opts.MemProfile != "" { 864 // drop memory, and profile. 865 todo = nil 866 b.writeMemProfile(b.opts.MemProfile) 867 } 868 869 return b.buildError 870 } 871 872 return nil 873} 874 875var profileNumber int 876 877func (b *Builder) writeMemProfile(name string) { 878 nm := fmt.Sprintf("%s.%d", name, profileNumber) 879 profileNumber++ 880 f, err := os.Create(nm) 881 if err != nil { 882 log.Fatal("could not create memory profile: ", err) 883 } 884 runtime.GC() // get up-to-date statistics 885 if err := pprof.WriteHeapProfile(f); err != nil { 886 log.Fatal("could not write memory profile: ", err) 887 } 888 f.Close() 889 log.Printf("wrote mem profile %q", nm) 890} 891 892// map [0,inf) to [0,1) monotonically 893func squashRange(j int) float64 { 894 x := float64(j) 895 return x / (1 + x) 896} 897 898// IsLowPriority takes a file name and makes an educated guess about its priority 899// in search results. A file is considered low priority if it looks like a test, 900// vendored, or generated file. 901// 902// These 'priority' criteria affects how documents are ordered within a shard. It's 903// also used to help guess a file's rank when we're missing ranking information. 904func IsLowPriority(path string, content []byte) bool { 905 return enry.IsTest(path) || enry.IsVendor(path) || enry.IsGenerated(path, content) 906} 907 908type rankedDoc struct { 909 *zoekt.Document 910 rank []float64 911} 912 913// rank returns a vector of scores which is used at index-time to sort documents 914// before writing them to disk. The order of documents in the shard is important 915// at query time, because earlier documents receive a boost at query time and 916// have a higher chance of being searched before limits kick in. 917func rank(d *zoekt.Document, origIdx int) []float64 { 918 skipped := 0.0 919 if d.SkipReason != "" { 920 skipped = 1.0 921 } 922 923 generated := 0.0 924 if enry.IsGenerated(d.Name, d.Content) { 925 generated = 1.0 926 } 927 928 vendor := 0.0 929 if enry.IsVendor(d.Name) { 930 vendor = 1.0 931 } 932 933 test := 0.0 934 if enry.IsTest(d.Name) { 935 test = 1.0 936 } 937 938 // Smaller is earlier (=better). 939 return []float64{ 940 // Always place skipped docs last 941 skipped, 942 943 // Prefer docs that are not generated 944 generated, 945 946 // Prefer docs that are not vendored 947 vendor, 948 949 // Prefer docs that are not tests 950 test, 951 952 // With short names 953 squashRange(len(d.Name)), 954 955 // With many symbols 956 1.0 - squashRange(len(d.Symbols)), 957 958 // With short content 959 squashRange(len(d.Content)), 960 961 // That is present is as many branches as possible 962 1.0 - squashRange(len(d.Branches)), 963 964 // Preserve original ordering. 965 squashRange(origIdx), 966 } 967} 968 969func sortDocuments(todo []*zoekt.Document) { 970 rs := make([]rankedDoc, 0, len(todo)) 971 for i, t := range todo { 972 rd := rankedDoc{t, rank(t, i)} 973 rs = append(rs, rd) 974 } 975 sort.Slice(rs, func(i, j int) bool { 976 r1 := rs[i].rank 977 r2 := rs[j].rank 978 for i := range r1 { 979 if r1[i] < r2[i] { 980 return true 981 } 982 if r1[i] > r2[i] { 983 return false 984 } 985 } 986 987 return false 988 }) 989 for i := range todo { 990 todo[i] = rs[i].Document 991 } 992} 993 994func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) { 995 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") { 996 err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins) 997 if b.opts.CTagsMustSucceed && err != nil { 998 return nil, err 999 } 1000 if err != nil { 1001 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err) 1002 } 1003 } 1004 1005 name := b.opts.shardName(nextShardNum) 1006 1007 shardBuilder, err := b.newShardBuilder() 1008 if err != nil { 1009 return nil, err 1010 } 1011 1012 sortDocuments(todo) 1013 1014 for _, t := range todo { 1015 if err := shardBuilder.Add(*t); err != nil { 1016 return nil, err 1017 } 1018 } 1019 1020 return b.writeShard(name, shardBuilder) 1021} 1022 1023func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) { 1024 desc := b.opts.RepositoryDescription 1025 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != "" 1026 desc.SubRepoMap = b.opts.SubRepositories 1027 desc.IndexOptions = b.opts.GetHash() 1028 1029 shardBuilder, err := zoekt.NewIndexBuilder(&desc) 1030 if err != nil { 1031 return nil, err 1032 } 1033 shardBuilder.IndexTime = b.indexTime 1034 shardBuilder.ID = b.id 1035 return shardBuilder, nil 1036} 1037 1038func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) { 1039 dir := filepath.Dir(fn) 1040 if err := os.MkdirAll(dir, 0o700); err != nil { 1041 return nil, err 1042 } 1043 1044 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp") 1045 if err != nil { 1046 return nil, err 1047 } 1048 if runtime.GOOS != "windows" { 1049 if err := f.Chmod(0o666 &^ umask); err != nil { 1050 return nil, err 1051 } 1052 } 1053 1054 defer f.Close() 1055 if err := ib.Write(f); err != nil { 1056 return nil, err 1057 } 1058 fi, err := f.Stat() 1059 if err != nil { 1060 return nil, err 1061 } 1062 if err := f.Close(); err != nil { 1063 return nil, err 1064 } 1065 1066 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n", 1067 fn, 1068 fi.Size(), 1069 float64(fi.Size())/float64(ib.ContentSize()+1), 1070 ib.NumFiles()) 1071 1072 return &finishedShard{f.Name(), fn}, nil 1073} 1074 1075type deltaBranchSetError struct { 1076 shardName string 1077 old, new []zoekt.RepositoryBranch 1078} 1079 1080func (e deltaBranchSetError) Error() string { 1081 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new) 1082} 1083 1084type deltaIndexOptionsMismatchError struct { 1085 shardName string 1086 newOptions HashOptions 1087} 1088 1089func (e *deltaIndexOptionsMismatchError) Error() string { 1090 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions) 1091} 1092 1093// umask holds the Umask of the current process 1094var umask os.FileMode