fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// package build implements a more convenient interface for building 16// zoekt indices. 17package build 18 19import ( 20 "crypto/sha1" 21 "flag" 22 "fmt" 23 "io" 24 "log" 25 "net/url" 26 "os" 27 "os/exec" 28 "path" 29 "path/filepath" 30 "reflect" 31 "runtime" 32 "runtime/pprof" 33 "sort" 34 "strconv" 35 "strings" 36 "sync" 37 "time" 38 39 "github.com/bmatcuk/doublestar" 40 "github.com/grafana/regexp" 41 "github.com/rs/xid" 42 43 "github.com/sourcegraph/zoekt" 44 "github.com/sourcegraph/zoekt/ctags" 45) 46 47var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt") 48 49// Branch describes a single branch version. 50type Branch struct { 51 Name string 52 Version string 53} 54 55// Options sets options for the index building. 56type Options struct { 57 // IndexDir is a directory that holds *.zoekt index files. 58 IndexDir string 59 60 // SizeMax is the maximum file size 61 SizeMax int 62 63 // Parallelism is the maximum number of shards to index in parallel 64 Parallelism int 65 66 // ShardMax sets the maximum corpus size for a single shard 67 ShardMax int 68 69 // TrigramMax sets the maximum number of distinct trigrams per document. 70 TrigramMax int 71 72 // RepositoryDescription holds names and URLs for the repository. 73 RepositoryDescription zoekt.Repository 74 75 // SubRepositories is a path => sub repository map. 76 SubRepositories map[string]*zoekt.Repository 77 78 // DisableCTags disables the generation of ctags metadata. 79 DisableCTags bool 80 81 // CtagsPath is the path to the ctags binary to run, or empty 82 // if a valid binary couldn't be found. 83 CTagsPath string 84 85 // Same as CTagsPath but for scip-ctags 86 ScipCTagsPath string 87 88 // If set, ctags must succeed. 89 CTagsMustSucceed bool 90 91 // Write memory profiles to this file. 92 MemProfile string 93 94 // LargeFiles is a slice of glob patterns, including ** for any number 95 // of directories, where matching file paths should be indexed 96 // regardless of their size. The full pattern syntax is here: 97 // https://github.com/bmatcuk/doublestar/tree/v1#patterns. 98 LargeFiles []string 99 100 // IsDelta is true if this run contains only the changed documents since the 101 // last run. 102 IsDelta bool 103 104 // DocumentRanksPath is the path to the file with document ranks. If empty, 105 // ranks will be computed on-the-fly. 106 DocumentRanksPath string 107 108 // DocumentRanksVersion is a string which when changed will cause us to 109 // reindex a shard. This field is used so that when the contents of 110 // DocumentRanksPath changes, we can reindex. 111 DocumentRanksVersion string 112 113 // changedOrRemovedFiles is a list of file paths that have been changed or removed 114 // since the last indexing job for this repository. These files will be tombstoned 115 // in the older shards for this repository. 116 changedOrRemovedFiles []string 117 118 LanguageMap ctags.LanguageMap 119} 120 121// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building. 122type HashOptions struct { 123 sizeMax int 124 disableCTags bool 125 ctagsPath string 126 cTagsMustSucceed bool 127 largeFiles []string 128 129 // documentRankVersion is an experimental field which will change when the 130 // DocumentRanksPath content changes. If empty we ignore it. 131 documentRankVersion string 132} 133 134func (o *Options) HashOptions() HashOptions { 135 return HashOptions{ 136 sizeMax: o.SizeMax, 137 disableCTags: o.DisableCTags, 138 ctagsPath: o.CTagsPath, 139 cTagsMustSucceed: o.CTagsMustSucceed, 140 largeFiles: o.LargeFiles, 141 documentRankVersion: o.DocumentRanksVersion, 142 } 143} 144 145func (o *Options) GetHash() string { 146 h := o.HashOptions() 147 hasher := sha1.New() 148 149 hasher.Write([]byte(h.ctagsPath)) 150 hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed))) 151 hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax))) 152 hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles))) 153 hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags))) 154 155 if h.documentRankVersion != "" { 156 hasher.Write([]byte{0}) 157 io.WriteString(hasher, h.documentRankVersion) 158 } 159 160 return fmt.Sprintf("%x", hasher.Sum(nil)) 161} 162 163type largeFilesFlag struct{ *Options } 164 165func (f largeFilesFlag) String() string { 166 // From flag.Value documentation: 167 // 168 // The flag package may call the String method with a zero-valued receiver, 169 // such as a nil pointer. 170 if f.Options == nil { 171 return "" 172 } 173 s := append([]string{""}, f.LargeFiles...) 174 return strings.Join(s, "-large_file ") 175} 176 177func (f largeFilesFlag) Set(value string) error { 178 f.LargeFiles = append(f.LargeFiles, value) 179 return nil 180} 181 182// Flags adds flags for build options to fs. It is the "inverse" of Args. 183func (o *Options) Flags(fs *flag.FlagSet) { 184 x := *o 185 x.SetDefaults() 186 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size") 187 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document") 188 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard") 189 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.") 190 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices") 191 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.") 192 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.") 193 fs.StringVar(&o.MemProfile, "memprofile", "", "write memory profile(s) to `file.shardnum`. Note: sets parallelism to 1.") 194 195 // Sourcegraph specific 196 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.") 197} 198 199// Args generates command line arguments for o. It is the "inverse" of Flags. 200func (o *Options) Args() []string { 201 var args []string 202 203 if o.SizeMax != 0 { 204 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax)) 205 } 206 207 if o.TrigramMax != 0 { 208 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax)) 209 } 210 211 if o.ShardMax != 0 { 212 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax)) 213 } 214 215 if o.Parallelism != 0 { 216 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism)) 217 } 218 219 if o.IndexDir != "" { 220 args = append(args, "-index", o.IndexDir) 221 } 222 223 if o.CTagsMustSucceed { 224 args = append(args, "-require_ctags") 225 } 226 227 for _, a := range o.LargeFiles { 228 args = append(args, "-large_file", a) 229 } 230 231 // Sourcegraph specific 232 if o.DisableCTags { 233 args = append(args, "-disable_ctags") 234 } 235 236 return args 237} 238 239// Builder manages (parallel) creation of uniformly sized shards. The 240// builder buffers up documents until it collects enough documents and 241// then builds a shard and writes. 242type Builder struct { 243 opts Options 244 throttle chan int 245 246 nextShardNum int 247 todo []*zoekt.Document 248 docChecker zoekt.DocChecker 249 size int 250 251 parserBins ctags.ParserBinMap 252 building sync.WaitGroup 253 254 errMu sync.Mutex 255 buildError error 256 257 // temp name => final name for finished shards. We only rename 258 // them once all shards succeed to avoid Frankstein corpuses. 259 finishedShards map[string]string 260 261 // indexTime is set by tests for doing reproducible builds. 262 indexTime time.Time 263 264 // a sortable 20 chars long id. 265 id string 266 267 finishCalled bool 268} 269 270type finishedShard struct { 271 temp, final string 272} 273 274func checkCTags() string { 275 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" { 276 return ctags 277 } 278 279 if ctags, err := exec.LookPath("universal-ctags"); err == nil { 280 return ctags 281 } 282 283 return "" 284} 285 286func checkScipCTags() string { 287 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" { 288 return ctags 289 } 290 291 if ctags, err := exec.LookPath("scip-ctags"); err == nil { 292 return ctags 293 } 294 295 return "" 296} 297 298// SetDefaults sets reasonable default options. 299func (o *Options) SetDefaults() { 300 if o.CTagsPath == "" && !o.DisableCTags { 301 o.CTagsPath = checkCTags() 302 } 303 304 if o.ScipCTagsPath == "" && !o.DisableCTags { 305 o.ScipCTagsPath = checkScipCTags() 306 } 307 308 if o.Parallelism == 0 { 309 o.Parallelism = 4 310 } 311 if o.SizeMax == 0 { 312 o.SizeMax = 2 << 20 313 } 314 if o.ShardMax == 0 { 315 o.ShardMax = 100 << 20 316 } 317 if o.TrigramMax == 0 { 318 o.TrigramMax = 20000 319 } 320 321 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" { 322 parsed, _ := url.Parse(o.RepositoryDescription.URL) 323 if parsed != nil { 324 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path) 325 } 326 } 327} 328 329func hashString(s string) string { 330 h := sha1.New() 331 _, _ = io.WriteString(h, s) 332 return fmt.Sprintf("%x", h.Sum(nil)) 333} 334 335// ShardName returns the name the given index shard. 336func (o *Options) shardName(n int) string { 337 return o.shardNameVersion(zoekt.IndexFormatVersion, n) 338} 339 340func (o *Options) shardNameVersion(version, n int) string { 341 abs := url.QueryEscape(o.RepositoryDescription.Name) 342 if len(abs) > 200 { 343 abs = abs[:200] + hashString(abs)[:8] 344 } 345 return filepath.Join(o.IndexDir, 346 fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n)) 347} 348 349type IndexState string 350 351const ( 352 IndexStateMissing IndexState = "missing" 353 IndexStateCorrupt IndexState = "corrupt" 354 IndexStateVersion IndexState = "version-mismatch" 355 IndexStateOption IndexState = "option-mismatch" 356 IndexStateMeta IndexState = "meta-mismatch" 357 IndexStateContent IndexState = "content-mismatch" 358 IndexStateEqual IndexState = "equal" 359) 360 361var readVersions = []struct { 362 IndexFormatVersion int 363 FeatureVersion int 364}{{ 365 IndexFormatVersion: zoekt.IndexFormatVersion, 366 FeatureVersion: zoekt.FeatureVersion, 367}, { 368 IndexFormatVersion: zoekt.NextIndexFormatVersion, 369 FeatureVersion: zoekt.FeatureVersion, 370}} 371 372// IncrementalSkipIndexing returns true if the index present on disk matches 373// the build options. 374func (o *Options) IncrementalSkipIndexing() bool { 375 state, _ := o.IndexState() 376 return state == IndexStateEqual 377} 378 379// IndexState checks how the index present on disk compares to the build 380// options and returns the IndexState and the name of the first shard. 381func (o *Options) IndexState() (IndexState, string) { 382 // Open the latest version we support that is on disk. 383 fn := o.findShard() 384 if fn == "" { 385 return IndexStateMissing, fn 386 } 387 388 repos, index, err := zoekt.ReadMetadataPathAlive(fn) 389 if os.IsNotExist(err) { 390 return IndexStateMissing, fn 391 } else if err != nil { 392 return IndexStateCorrupt, fn 393 } 394 395 for _, v := range readVersions { 396 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion { 397 return IndexStateVersion, fn 398 } 399 } 400 401 var repo *zoekt.Repository 402 for _, cand := range repos { 403 if cand.Name == o.RepositoryDescription.Name { 404 repo = cand 405 break 406 } 407 } 408 409 if repo == nil { 410 return IndexStateCorrupt, fn 411 } 412 413 if repo.IndexOptions != o.GetHash() { 414 return IndexStateOption, fn 415 } 416 417 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) { 418 return IndexStateContent, fn 419 } 420 421 // We can mutate repo since it lives in the scope of this function call. 422 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil { 423 // non-nil err means we are trying to update an immutable field => 424 // reindex content. 425 log.Printf("warn: immutable field changed, requires re-index: %s", err) 426 return IndexStateContent, fn 427 } else if updated { 428 return IndexStateMeta, fn 429 } 430 431 return IndexStateEqual, fn 432} 433 434// FindRepositoryMetadata returns the index metadata for the repository 435// specified in the options. 'ok' is false if the repository's metadata 436// couldn't be found or if an error occurred. 437func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) { 438 shard := o.findShard() 439 if shard == "" { 440 return nil, nil, false, nil 441 } 442 443 repositories, metadata, err := zoekt.ReadMetadataPathAlive(shard) 444 if err != nil { 445 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err) 446 } 447 448 ID := o.RepositoryDescription.ID 449 for _, r := range repositories { 450 // compound shards contain multiple repositories, so we 451 // have to pick only the one we're looking for 452 if r.ID == ID { 453 return r, metadata, true, nil 454 } 455 } 456 457 // If we're here, then we're somehow in a state where we found a matching 458 // shard that's missing the repository metadata we're looking for. This 459 // should never happen. 460 name := o.RepositoryDescription.Name 461 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name) 462} 463 464func (o *Options) findShard() string { 465 for _, v := range readVersions { 466 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 467 if _, err := os.Stat(fn); err == nil { 468 return fn 469 } 470 } 471 472 // Brute force finding the shard in compound shards. We should only hit this 473 // code path for repositories that are not already existing or are in 474 // compound shards. 475 // 476 // TODO add an oracle which can speed this up in the case of repositories 477 // already in compound shards. 478 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt")) 479 if err != nil { 480 return "" 481 } 482 for _, fn := range compoundShards { 483 repos, _, err := zoekt.ReadMetadataPathAlive(fn) 484 if err != nil { 485 continue 486 } 487 for _, repo := range repos { 488 if repo.ID == o.RepositoryDescription.ID { 489 return fn 490 } 491 } 492 } 493 494 return "" 495} 496 497func (o *Options) FindAllShards() []string { 498 for _, v := range readVersions { 499 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 500 if _, err := os.Stat(fn); err == nil { 501 shards := []string{fn} 502 for i := 1; ; i++ { 503 fn := o.shardNameVersion(v.IndexFormatVersion, i) 504 if _, err := os.Stat(fn); err != nil { 505 return shards 506 } 507 shards = append(shards, fn) 508 } 509 } 510 } 511 512 // lazily fallback to findShard which will look for a compound shard. 513 if fn := o.findShard(); fn != "" { 514 return []string{fn} 515 } 516 517 return nil 518} 519 520// IgnoreSizeMax determines whether the max size should be ignored. 521func (o *Options) IgnoreSizeMax(name string) bool { 522 // A pattern match will override preceding pattern matches. 523 for i := len(o.LargeFiles) - 1; i >= 0; i-- { 524 pattern := strings.TrimSpace(o.LargeFiles[i]) 525 negated, validatedPattern := checkIsNegatePattern(pattern) 526 527 if m, _ := doublestar.PathMatch(validatedPattern, name); m { 528 if negated { 529 return false 530 } else { 531 return true 532 } 533 } 534 } 535 536 return false 537} 538 539func checkIsNegatePattern(pattern string) (bool, string) { 540 negate := "!" 541 542 // if negated then strip prefix meta character which identifies negated filter pattern 543 if strings.HasPrefix(pattern, negate) { 544 return true, pattern[len(negate):] 545 } 546 547 return false, pattern 548} 549 550// NewBuilder creates a new Builder instance. 551func NewBuilder(opts Options) (*Builder, error) { 552 opts.SetDefaults() 553 if opts.RepositoryDescription.Name == "" { 554 return nil, fmt.Errorf("builder: must set Name") 555 } 556 557 b := &Builder{ 558 opts: opts, 559 throttle: make(chan int, opts.Parallelism), 560 finishedShards: map[string]string{}, 561 } 562 563 parserBins, err := ctags.NewParserBinMap( 564 b.opts.CTagsPath, 565 b.opts.ScipCTagsPath, 566 opts.LanguageMap, 567 b.opts.CTagsMustSucceed, 568 ) 569 if err != nil { 570 return nil, err 571 } 572 573 b.parserBins = parserBins 574 575 if opts.IsDelta { 576 // Delta shards build on top of previously existing shards. 577 // As a consequence, the shardNum for delta shards starts from 578 // the number following the most recently generated shard - not 0. 579 // 580 // Using this numbering scheme allows all the shards to be 581 // discovered as a set. 582 shards := b.opts.FindAllShards() 583 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one 584 } 585 586 if _, err := b.newShardBuilder(); err != nil { 587 return nil, err 588 } 589 590 now := time.Now() 591 b.indexTime = now 592 b.id = xid.NewWithTime(now).String() 593 594 return b, nil 595} 596 597// AddFile is a convenience wrapper for the Add method 598func (b *Builder) AddFile(name string, content []byte) error { 599 return b.Add(zoekt.Document{Name: name, Content: content}) 600} 601 602func (b *Builder) Add(doc zoekt.Document) error { 603 if b.finishCalled { 604 return nil 605 } 606 607 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name) 608 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile { 609 // We could pass the document on to the shardbuilder, but if 610 // we pass through a part of the source tree with binary/large 611 // files, the corresponding shard would be mostly empty, so 612 // insert a reason here too. 613 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax) 614 } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil { 615 doc.SkipReason = err.Error() 616 doc.Language = "binary" 617 } 618 619 b.todo = append(b.todo, &doc) 620 621 if doc.SkipReason == "" { 622 b.size += len(doc.Name) + len(doc.Content) 623 } else { 624 b.size += len(doc.Name) + len(doc.SkipReason) 625 // Drop the content if we are skipping the document. Skipped content is not counted towards the 626 // shard size limit, so otherwise we might buffer too much data in memory before flushing. 627 doc.Content = nil 628 } 629 630 if b.size > b.opts.ShardMax { 631 return b.flush() 632 } 633 634 return nil 635} 636 637// MarkFileAsChangedOrRemoved indicates that the file specified by the given path 638// has been changed or removed since the last indexing job for this repository. 639// 640// If this build is a delta build, these files will be tombstoned in the older shards for this repository. 641func (b *Builder) MarkFileAsChangedOrRemoved(path string) { 642 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path) 643} 644 645// Finish creates a last shard from the buffered documents, and clears 646// stale shards from previous runs. This should always be called, also 647// in failure cases, to ensure cleanup. 648// 649// It is safe to call Finish() multiple times. 650func (b *Builder) Finish() error { 651 if b.finishCalled { 652 return b.buildError 653 } 654 655 b.finishCalled = true 656 657 b.flush() 658 b.building.Wait() 659 660 if b.buildError != nil { 661 for tmp := range b.finishedShards { 662 log.Printf("Builder.Finish %s", tmp) 663 os.Remove(tmp) 664 } 665 b.finishedShards = map[string]string{} 666 return b.buildError 667 } 668 669 // map of temporary -> final names for all updated shards + shard metadata files 670 artifactPaths := make(map[string]string) 671 for tmp, final := range b.finishedShards { 672 artifactPaths[tmp] = final 673 } 674 675 oldShards := b.opts.FindAllShards() 676 677 if b.opts.IsDelta { 678 // Delta shard builds need to update FileTombstone and branch commit information for all 679 // existing shards 680 for _, shard := range oldShards { 681 repositories, _, err := zoekt.ReadMetadataPathAlive(shard) 682 if err != nil { 683 return fmt.Errorf("reading metadata from shard %q: %w", shard, err) 684 } 685 686 if len(repositories) > 1 { 687 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard) 688 } 689 690 if len(repositories) == 0 { 691 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard) 692 } 693 694 repository := repositories[0] 695 if repository.ID != b.opts.RepositoryDescription.ID { 696 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name) 697 } 698 699 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil { 700 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles)) 701 } 702 703 for _, f := range b.opts.changedOrRemovedFiles { 704 repository.FileTombstones[f] = struct{}{} 705 } 706 707 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) { 708 return deltaBranchSetError{ 709 shardName: shard, 710 old: repository.Branches, 711 new: b.opts.RepositoryDescription.Branches, 712 } 713 } 714 715 if b.opts.GetHash() != repository.IndexOptions { 716 return &deltaIndexOptionsMismatchError{ 717 shardName: shard, 718 newOptions: b.opts.HashOptions(), 719 } 720 } 721 722 repository.Branches = b.opts.RepositoryDescription.Branches 723 724 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate 725 726 tempPath, finalPath, err := zoekt.JsonMarshalRepoMetaTemp(shard, repository) 727 if err != nil { 728 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err) 729 } 730 731 artifactPaths[tempPath] = finalPath 732 } 733 } 734 735 // We mark finished shards as empty when we successfully finish. Return now 736 // to allow call sites to call Finish idempotently. 737 if len(artifactPaths) == 0 { 738 return b.buildError 739 } 740 741 // Collect a map of the old shards on disk. For each new shard we replace we 742 // delete it from toDelete. Anything remaining in toDelete will be removed 743 // after we have renamed everything into place. 744 745 var toDelete map[string]struct{} 746 if !b.opts.IsDelta { 747 // Non-delta shard builds delete all existing shards before they write out 748 // new ones. 749 // By contrast, delta shard builds work by stacking changes on top of existing shards. 750 // So, we skip populating the toDelete map if we're building delta shards. 751 752 toDelete = make(map[string]struct{}) 753 for _, name := range oldShards { 754 paths, err := zoekt.IndexFilePaths(name) 755 if err != nil { 756 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err) 757 } 758 for _, p := range paths { 759 toDelete[p] = struct{}{} 760 } 761 } 762 } 763 764 for tmp, final := range artifactPaths { 765 if err := os.Rename(tmp, final); err != nil { 766 b.buildError = err 767 continue 768 } 769 770 delete(toDelete, final) 771 } 772 773 b.finishedShards = map[string]string{} 774 775 for p := range toDelete { 776 // Don't delete compound shards, set tombstones instead. 777 if zoekt.ShardMergingEnabled() && strings.HasPrefix(filepath.Base(p), "compound-") { 778 if !strings.HasSuffix(p, ".zoekt") { 779 continue 780 } 781 err := zoekt.SetTombstone(p, b.opts.RepositoryDescription.ID) 782 b.buildError = err 783 continue 784 } 785 log.Printf("removing old shard file: %s", p) 786 if err := os.Remove(p); err != nil { 787 b.buildError = err 788 } 789 } 790 791 return b.buildError 792} 793 794// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true 795// iff both slices specify the same set of branch names in the same order. 796func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool { 797 if len(a) != len(b) { 798 return false 799 } 800 801 for i := range a { 802 x, y := a[i], b[i] 803 if x.Name != y.Name { 804 return false 805 } 806 } 807 808 return true 809} 810 811func (b *Builder) flush() error { 812 todo := b.todo 813 b.todo = nil 814 b.size = 0 815 b.errMu.Lock() 816 defer b.errMu.Unlock() 817 if b.buildError != nil { 818 return b.buildError 819 } 820 821 hasShard := b.nextShardNum > 0 822 if len(todo) == 0 && hasShard { 823 return nil 824 } 825 826 shard := b.nextShardNum 827 b.nextShardNum++ 828 829 if b.opts.Parallelism > 1 && b.opts.MemProfile == "" { 830 b.building.Add(1) 831 b.throttle <- 1 832 go func() { 833 done, err := b.buildShard(todo, shard) 834 <-b.throttle 835 836 b.errMu.Lock() 837 defer b.errMu.Unlock() 838 if err != nil && b.buildError == nil { 839 b.buildError = err 840 } 841 if err == nil { 842 b.finishedShards[done.temp] = done.final 843 } 844 b.building.Done() 845 }() 846 } else { 847 // No goroutines when we're not parallel. This 848 // simplifies memory profiling. 849 done, err := b.buildShard(todo, shard) 850 b.buildError = err 851 if err == nil { 852 b.finishedShards[done.temp] = done.final 853 } 854 if b.opts.MemProfile != "" { 855 // drop memory, and profile. 856 todo = nil 857 b.writeMemProfile(b.opts.MemProfile) 858 } 859 860 return b.buildError 861 } 862 863 return nil 864} 865 866var profileNumber int 867 868func (b *Builder) writeMemProfile(name string) { 869 nm := fmt.Sprintf("%s.%d", name, profileNumber) 870 profileNumber++ 871 f, err := os.Create(nm) 872 if err != nil { 873 log.Fatal("could not create memory profile: ", err) 874 } 875 runtime.GC() // get up-to-date statistics 876 if err := pprof.WriteHeapProfile(f); err != nil { 877 log.Fatal("could not write memory profile: ", err) 878 } 879 f.Close() 880 log.Printf("wrote mem profile %q", nm) 881} 882 883// map [0,inf) to [0,1) monotonically 884func squashRange(j int) float64 { 885 x := float64(j) 886 return x / (1 + x) 887} 888 889// IsLowPriority takes a file name and makes an educated guess about its priority 890// in search results. A file is considered low priority if it looks like a test, 891// vendored, or generated file. 892// 893// These 'priority' criteria affects how documents are ordered within a shard. It's 894// also used to help guess a file's rank when we're missing ranking information. 895func IsLowPriority(file string) bool { 896 return testRe.MatchString(file) || isGenerated(file) || isVendored(file) 897} 898 899var testRe = regexp.MustCompile("[Tt]est") 900 901func isGenerated(file string) bool { 902 return strings.HasSuffix(file, "min.js") || strings.HasSuffix(file, "js.map") 903} 904 905func isVendored(file string) bool { 906 return strings.Contains(file, "vendor/") || strings.Contains(file, "node_modules/") 907} 908 909type rankedDoc struct { 910 *zoekt.Document 911 rank []float64 912} 913 914// rank returns a vector of scores which is used at index-time to sort documents 915// before writing them to disk. The order of documents in the shard is important 916// at query time, because earlier documents receive a boost at query time and 917// have a higher chance of being searched before limits kick in. 918func rank(d *zoekt.Document, origIdx int) []float64 { 919 skipped := 0.0 920 if d.SkipReason != "" { 921 skipped = 1.0 922 } 923 924 generated := 0.0 925 if isGenerated(d.Name) { 926 generated = 1.0 927 } 928 929 vendor := 0.0 930 if isVendored(d.Name) { 931 vendor = 1.0 932 } 933 934 test := 0.0 935 if testRe.MatchString(d.Name) { 936 test = 1.0 937 } 938 939 // Smaller is earlier (=better). 940 return []float64{ 941 // Always place skipped docs last 942 skipped, 943 944 // Prefer docs that are not generated 945 generated, 946 947 // Prefer docs that are not vendored 948 vendor, 949 950 // Prefer docs that are not tests 951 test, 952 953 // With short names 954 squashRange(len(d.Name)), 955 956 // With many symbols 957 1.0 - squashRange(len(d.Symbols)), 958 959 // With short content 960 squashRange(len(d.Content)), 961 962 // That is present is as many branches as possible 963 1.0 - squashRange(len(d.Branches)), 964 965 // Preserve original ordering. 966 squashRange(origIdx), 967 } 968} 969 970func sortDocuments(todo []*zoekt.Document) { 971 rs := make([]rankedDoc, 0, len(todo)) 972 for i, t := range todo { 973 rd := rankedDoc{t, rank(t, i)} 974 rs = append(rs, rd) 975 } 976 sort.Slice(rs, func(i, j int) bool { 977 r1 := rs[i].rank 978 r2 := rs[j].rank 979 for i := range r1 { 980 if r1[i] < r2[i] { 981 return true 982 } 983 if r1[i] > r2[i] { 984 return false 985 } 986 } 987 988 return false 989 }) 990 for i := range todo { 991 todo[i] = rs[i].Document 992 } 993} 994 995func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) { 996 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") { 997 err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins) 998 if b.opts.CTagsMustSucceed && err != nil { 999 return nil, err 1000 } 1001 if err != nil { 1002 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err) 1003 } 1004 } 1005 1006 name := b.opts.shardName(nextShardNum) 1007 1008 shardBuilder, err := b.newShardBuilder() 1009 if err != nil { 1010 return nil, err 1011 } 1012 1013 sortDocuments(todo) 1014 1015 for _, t := range todo { 1016 if err := shardBuilder.Add(*t); err != nil { 1017 return nil, err 1018 } 1019 } 1020 1021 return b.writeShard(name, shardBuilder) 1022} 1023 1024func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) { 1025 desc := b.opts.RepositoryDescription 1026 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != "" 1027 desc.SubRepoMap = b.opts.SubRepositories 1028 desc.IndexOptions = b.opts.GetHash() 1029 1030 shardBuilder, err := zoekt.NewIndexBuilder(&desc) 1031 if err != nil { 1032 return nil, err 1033 } 1034 shardBuilder.IndexTime = b.indexTime 1035 shardBuilder.ID = b.id 1036 return shardBuilder, nil 1037} 1038 1039func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) { 1040 dir := filepath.Dir(fn) 1041 if err := os.MkdirAll(dir, 0o700); err != nil { 1042 return nil, err 1043 } 1044 1045 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp") 1046 if err != nil { 1047 return nil, err 1048 } 1049 if runtime.GOOS != "windows" { 1050 if err := f.Chmod(0o666 &^ umask); err != nil { 1051 return nil, err 1052 } 1053 } 1054 1055 defer f.Close() 1056 if err := ib.Write(f); err != nil { 1057 return nil, err 1058 } 1059 fi, err := f.Stat() 1060 if err != nil { 1061 return nil, err 1062 } 1063 if err := f.Close(); err != nil { 1064 return nil, err 1065 } 1066 1067 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n", 1068 fn, 1069 fi.Size(), 1070 float64(fi.Size())/float64(ib.ContentSize()+1), 1071 ib.NumFiles()) 1072 1073 return &finishedShard{f.Name(), fn}, nil 1074} 1075 1076type deltaBranchSetError struct { 1077 shardName string 1078 old, new []zoekt.RepositoryBranch 1079} 1080 1081func (e deltaBranchSetError) Error() string { 1082 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new) 1083} 1084 1085type deltaIndexOptionsMismatchError struct { 1086 shardName string 1087 newOptions HashOptions 1088} 1089 1090func (e *deltaIndexOptionsMismatchError) Error() string { 1091 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions) 1092} 1093 1094// umask holds the Umask of the current process 1095var umask os.FileMode