fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// package build implements a more convenient interface for building 16// zoekt indices. 17package build 18 19import ( 20 "crypto/sha1" 21 "flag" 22 "fmt" 23 "io" 24 "log" 25 "math" 26 "net/url" 27 "os" 28 "os/exec" 29 "path" 30 "path/filepath" 31 "reflect" 32 "runtime" 33 "runtime/pprof" 34 "sort" 35 "strconv" 36 "strings" 37 "sync" 38 "time" 39 40 "github.com/bmatcuk/doublestar" 41 "github.com/grafana/regexp" 42 "github.com/rs/xid" 43 "gopkg.in/natefinch/lumberjack.v2" 44 45 "github.com/sourcegraph/zoekt" 46 "github.com/sourcegraph/zoekt/ctags" 47) 48 49var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt") 50 51// Branch describes a single branch version. 52type Branch struct { 53 Name string 54 Version string 55} 56 57// Options sets options for the index building. 58type Options struct { 59 // IndexDir is a directory that holds *.zoekt index files. 60 IndexDir string 61 62 // SizeMax is the maximum file size 63 SizeMax int 64 65 // Parallelism is the maximum number of shards to index in parallel 66 Parallelism int 67 68 // ShardMax sets the maximum corpus size for a single shard 69 ShardMax int 70 71 // TrigramMax sets the maximum number of distinct trigrams per document. 72 TrigramMax int 73 74 // RepositoryDescription holds names and URLs for the repository. 75 RepositoryDescription zoekt.Repository 76 77 // SubRepositories is a path => sub repository map. 78 SubRepositories map[string]*zoekt.Repository 79 80 // DisableCTags disables the generation of ctags metadata. 81 DisableCTags bool 82 83 // CtagsPath is the path to the ctags binary to run, or empty 84 // if a valid binary couldn't be found. 85 CTagsPath string 86 87 // Same as CTagsPath but for scip-ctags 88 ScipCTagsPath string 89 90 // If set, ctags must succeed. 91 CTagsMustSucceed bool 92 93 // Write memory profiles to this file. 94 MemProfile string 95 96 // LargeFiles is a slice of glob patterns, including ** for any number 97 // of directories, where matching file paths should be indexed 98 // regardless of their size. The full pattern syntax is here: 99 // https://github.com/bmatcuk/doublestar/tree/v1#patterns. 100 LargeFiles []string 101 102 // IsDelta is true if this run contains only the changed documents since the 103 // last run. 104 IsDelta bool 105 106 // DocumentRanksPath is the path to the file with document ranks. If empty, 107 // ranks will be computed on-the-fly. 108 DocumentRanksPath string 109 110 // DocumentRanksVersion is a string which when changed will cause us to 111 // reindex a shard. This field is used so that when the contents of 112 // DocumentRanksPath changes, we can reindex. 113 DocumentRanksVersion string 114 115 // changedOrRemovedFiles is a list of file paths that have been changed or removed 116 // since the last indexing job for this repository. These files will be tombstoned 117 // in the older shards for this repository. 118 changedOrRemovedFiles []string 119 120 LanguageMap ctags.LanguageMap 121} 122 123// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building. 124type HashOptions struct { 125 sizeMax int 126 disableCTags bool 127 ctagsPath string 128 cTagsMustSucceed bool 129 largeFiles []string 130 131 // documentRankVersion is an experimental field which will change when the 132 // DocumentRanksPath content changes. If empty we ignore it. 133 documentRankVersion string 134} 135 136func (o *Options) HashOptions() HashOptions { 137 return HashOptions{ 138 sizeMax: o.SizeMax, 139 disableCTags: o.DisableCTags, 140 ctagsPath: o.CTagsPath, 141 cTagsMustSucceed: o.CTagsMustSucceed, 142 largeFiles: o.LargeFiles, 143 documentRankVersion: o.DocumentRanksVersion, 144 } 145} 146 147func (o *Options) GetHash() string { 148 h := o.HashOptions() 149 hasher := sha1.New() 150 151 hasher.Write([]byte(h.ctagsPath)) 152 hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed))) 153 hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax))) 154 hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles))) 155 hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags))) 156 157 if h.documentRankVersion != "" { 158 hasher.Write([]byte{0}) 159 io.WriteString(hasher, h.documentRankVersion) 160 } 161 162 return fmt.Sprintf("%x", hasher.Sum(nil)) 163} 164 165type largeFilesFlag struct{ *Options } 166 167func (f largeFilesFlag) String() string { 168 // From flag.Value documentation: 169 // 170 // The flag package may call the String method with a zero-valued receiver, 171 // such as a nil pointer. 172 if f.Options == nil { 173 return "" 174 } 175 s := append([]string{""}, f.LargeFiles...) 176 return strings.Join(s, "-large_file ") 177} 178 179func (f largeFilesFlag) Set(value string) error { 180 f.LargeFiles = append(f.LargeFiles, value) 181 return nil 182} 183 184// Flags adds flags for build options to fs. It is the "inverse" of Args. 185func (o *Options) Flags(fs *flag.FlagSet) { 186 x := *o 187 x.SetDefaults() 188 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size") 189 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document") 190 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard") 191 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.") 192 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices") 193 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.") 194 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.") 195 fs.StringVar(&o.MemProfile, "memprofile", "", "write memory profile(s) to `file.shardnum`. Note: sets parallelism to 1.") 196 197 // Sourcegraph specific 198 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.") 199} 200 201// Args generates command line arguments for o. It is the "inverse" of Flags. 202func (o *Options) Args() []string { 203 var args []string 204 205 if o.SizeMax != 0 { 206 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax)) 207 } 208 209 if o.TrigramMax != 0 { 210 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax)) 211 } 212 213 if o.ShardMax != 0 { 214 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax)) 215 } 216 217 if o.Parallelism != 0 { 218 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism)) 219 } 220 221 if o.IndexDir != "" { 222 args = append(args, "-index", o.IndexDir) 223 } 224 225 if o.CTagsMustSucceed { 226 args = append(args, "-require_ctags") 227 } 228 229 for _, a := range o.LargeFiles { 230 args = append(args, "-large_file", a) 231 } 232 233 // Sourcegraph specific 234 if o.DisableCTags { 235 args = append(args, "-disable_ctags") 236 } 237 238 return args 239} 240 241// Builder manages (parallel) creation of uniformly sized shards. The 242// builder buffers up documents until it collects enough documents and 243// then builds a shard and writes. 244type Builder struct { 245 opts Options 246 throttle chan int 247 248 nextShardNum int 249 todo []*zoekt.Document 250 size int 251 252 parserMap ctags.ParserMap 253 254 building sync.WaitGroup 255 256 errMu sync.Mutex 257 buildError error 258 259 // temp name => final name for finished shards. We only rename 260 // them once all shards succeed to avoid Frankstein corpuses. 261 finishedShards map[string]string 262 263 shardLogger io.WriteCloser 264 265 // indexTime is set by tests for doing reproducible builds. 266 indexTime time.Time 267 268 // a sortable 20 chars long id. 269 id string 270 271 finishCalled bool 272} 273 274type finishedShard struct { 275 temp, final string 276} 277 278func checkCTags() string { 279 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" { 280 return ctags 281 } 282 283 if ctags, err := exec.LookPath("universal-ctags"); err == nil { 284 return ctags 285 } 286 287 return "" 288} 289 290func checkScipCTags() string { 291 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" { 292 return ctags 293 } 294 295 if ctags, err := exec.LookPath("scip-ctags"); err == nil { 296 return ctags 297 } 298 299 return "" 300} 301 302// SetDefaults sets reasonable default options. 303func (o *Options) SetDefaults() { 304 if o.CTagsPath == "" && !o.DisableCTags { 305 o.CTagsPath = checkCTags() 306 } 307 308 if o.ScipCTagsPath == "" && !o.DisableCTags { 309 o.ScipCTagsPath = checkScipCTags() 310 } 311 312 if o.Parallelism == 0 { 313 o.Parallelism = 4 314 } 315 if o.SizeMax == 0 { 316 o.SizeMax = 2 << 20 317 } 318 if o.ShardMax == 0 { 319 o.ShardMax = 100 << 20 320 } 321 if o.TrigramMax == 0 { 322 o.TrigramMax = 20000 323 } 324 325 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" { 326 parsed, _ := url.Parse(o.RepositoryDescription.URL) 327 if parsed != nil { 328 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path) 329 } 330 } 331} 332 333func hashString(s string) string { 334 h := sha1.New() 335 _, _ = io.WriteString(h, s) 336 return fmt.Sprintf("%x", h.Sum(nil)) 337} 338 339// ShardName returns the name the given index shard. 340func (o *Options) shardName(n int) string { 341 return o.shardNameVersion(zoekt.IndexFormatVersion, n) 342} 343 344func (o *Options) shardNameVersion(version, n int) string { 345 abs := url.QueryEscape(o.RepositoryDescription.Name) 346 if len(abs) > 200 { 347 abs = abs[:200] + hashString(abs)[:8] 348 } 349 return filepath.Join(o.IndexDir, 350 fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n)) 351} 352 353type IndexState string 354 355const ( 356 IndexStateMissing IndexState = "missing" 357 IndexStateCorrupt IndexState = "corrupt" 358 IndexStateVersion IndexState = "version-mismatch" 359 IndexStateOption IndexState = "option-mismatch" 360 IndexStateMeta IndexState = "meta-mismatch" 361 IndexStateContent IndexState = "content-mismatch" 362 IndexStateEqual IndexState = "equal" 363) 364 365var readVersions = []struct { 366 IndexFormatVersion int 367 FeatureVersion int 368}{{ 369 IndexFormatVersion: zoekt.IndexFormatVersion, 370 FeatureVersion: zoekt.FeatureVersion, 371}, { 372 IndexFormatVersion: zoekt.NextIndexFormatVersion, 373 FeatureVersion: zoekt.FeatureVersion, 374}} 375 376// IncrementalSkipIndexing returns true if the index present on disk matches 377// the build options. 378func (o *Options) IncrementalSkipIndexing() bool { 379 state, _ := o.IndexState() 380 return state == IndexStateEqual 381} 382 383// IndexState checks how the index present on disk compares to the build 384// options and returns the IndexState and the name of the first shard. 385func (o *Options) IndexState() (IndexState, string) { 386 // Open the latest version we support that is on disk. 387 fn := o.findShard() 388 if fn == "" { 389 return IndexStateMissing, fn 390 } 391 392 repos, index, err := zoekt.ReadMetadataPathAlive(fn) 393 if os.IsNotExist(err) { 394 return IndexStateMissing, fn 395 } else if err != nil { 396 return IndexStateCorrupt, fn 397 } 398 399 for _, v := range readVersions { 400 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion { 401 return IndexStateVersion, fn 402 } 403 } 404 405 var repo *zoekt.Repository 406 for _, cand := range repos { 407 if cand.Name == o.RepositoryDescription.Name { 408 repo = cand 409 break 410 } 411 } 412 413 if repo == nil { 414 return IndexStateCorrupt, fn 415 } 416 417 if repo.IndexOptions != o.GetHash() { 418 return IndexStateOption, fn 419 } 420 421 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) { 422 return IndexStateContent, fn 423 } 424 425 // We can mutate repo since it lives in the scope of this function call. 426 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil { 427 // non-nil err means we are trying to update an immutable field => 428 // reindex content. 429 log.Printf("warn: immutable field changed, requires re-index: %s", err) 430 return IndexStateContent, fn 431 } else if updated { 432 return IndexStateMeta, fn 433 } 434 435 return IndexStateEqual, fn 436} 437 438// FindRepositoryMetadata returns the index metadata for the repository 439// specified in the options. 'ok' is false if the repository's metadata 440// couldn't be found or if an error occurred. 441func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) { 442 shard := o.findShard() 443 if shard == "" { 444 return nil, nil, false, nil 445 } 446 447 repositories, metadata, err := zoekt.ReadMetadataPathAlive(shard) 448 if err != nil { 449 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err) 450 } 451 452 ID := o.RepositoryDescription.ID 453 for _, r := range repositories { 454 // compound shards contain multiple repositories, so we 455 // have to pick only the one we're looking for 456 if r.ID == ID { 457 return r, metadata, true, nil 458 } 459 } 460 461 // If we're here, then we're somehow in a state where we found a matching 462 // shard that's missing the repository metadata we're looking for. This 463 // should never happen. 464 name := o.RepositoryDescription.Name 465 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name) 466} 467 468func (o *Options) findShard() string { 469 for _, v := range readVersions { 470 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 471 if _, err := os.Stat(fn); err == nil { 472 return fn 473 } 474 } 475 476 // Brute force finding the shard in compound shards. We should only hit this 477 // code path for repositories that are not already existing or are in 478 // compound shards. 479 // 480 // TODO add an oracle which can speed this up in the case of repositories 481 // already in compound shards. 482 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt")) 483 if err != nil { 484 return "" 485 } 486 for _, fn := range compoundShards { 487 repos, _, err := zoekt.ReadMetadataPathAlive(fn) 488 if err != nil { 489 continue 490 } 491 for _, repo := range repos { 492 if repo.ID == o.RepositoryDescription.ID { 493 return fn 494 } 495 } 496 } 497 498 return "" 499} 500 501func (o *Options) FindAllShards() []string { 502 for _, v := range readVersions { 503 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 504 if _, err := os.Stat(fn); err == nil { 505 shards := []string{fn} 506 for i := 1; ; i++ { 507 fn := o.shardNameVersion(v.IndexFormatVersion, i) 508 if _, err := os.Stat(fn); err != nil { 509 return shards 510 } 511 shards = append(shards, fn) 512 } 513 } 514 } 515 516 // lazily fallback to findShard which will look for a compound shard. 517 if fn := o.findShard(); fn != "" { 518 return []string{fn} 519 } 520 521 return nil 522} 523 524// IgnoreSizeMax determines whether the max size should be ignored. 525func (o *Options) IgnoreSizeMax(name string) bool { 526 // A pattern match will override preceding pattern matches. 527 for i := len(o.LargeFiles) - 1; i >= 0; i-- { 528 pattern := strings.TrimSpace(o.LargeFiles[i]) 529 negated, validatedPattern := checkIsNegatePattern(pattern) 530 531 if m, _ := doublestar.PathMatch(validatedPattern, name); m { 532 if negated { 533 return false 534 } else { 535 return true 536 } 537 } 538 } 539 540 return false 541} 542 543func checkIsNegatePattern(pattern string) (bool, string) { 544 negate := "!" 545 546 // if negated then strip prefix meta character which identifies negated filter pattern 547 if strings.HasPrefix(pattern, negate) { 548 return true, pattern[len(negate):] 549 } 550 551 return false, pattern 552} 553 554// NewBuilder creates a new Builder instance. 555func NewBuilder(opts Options) (*Builder, error) { 556 opts.SetDefaults() 557 if opts.RepositoryDescription.Name == "" { 558 return nil, fmt.Errorf("builder: must set Name") 559 } 560 561 b := &Builder{ 562 opts: opts, 563 throttle: make(chan int, opts.Parallelism), 564 finishedShards: map[string]string{}, 565 } 566 567 if b.opts.CTagsPath == "" && b.opts.CTagsMustSucceed { 568 return nil, fmt.Errorf("ctags binary not found, but CTagsMustSucceed set") 569 } 570 571 parserMap, err := ctags.NewParserMap(ctags.ParserBinMap{ 572 ctags.UniversalCTags: b.opts.CTagsPath, 573 ctags.ScipCTags: b.opts.ScipCTagsPath, 574 }, b.opts.CTagsMustSucceed) 575 576 if err != nil { 577 return nil, err 578 } 579 580 b.parserMap = parserMap 581 582 b.shardLogger = &lumberjack.Logger{ 583 Filename: filepath.Join(opts.IndexDir, "zoekt-builder-shard-log.tsv"), 584 MaxSize: 100, // Megabyte 585 MaxBackups: 5, 586 } 587 588 if opts.IsDelta { 589 // Delta shards build on top of previously existing shards. 590 // As a consequence, the shardNum for delta shards starts from 591 // the number following the most recently generated shard - not 0. 592 // 593 // Using this numbering scheme allows all the shards to be 594 // discovered as a set. 595 shards := b.opts.FindAllShards() 596 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one 597 } 598 599 if _, err := b.newShardBuilder(); err != nil { 600 return nil, err 601 } 602 603 now := time.Now() 604 b.indexTime = now 605 b.id = xid.NewWithTime(now).String() 606 607 return b, nil 608} 609 610// AddFile is a convenience wrapper for the Add method 611func (b *Builder) AddFile(name string, content []byte) error { 612 return b.Add(zoekt.Document{Name: name, Content: content}) 613} 614 615func (b *Builder) Add(doc zoekt.Document) error { 616 if b.finishCalled { 617 return nil 618 } 619 620 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name) 621 622 // Adjust trigramMax for allowed large files so we don't exclude them. 623 trigramMax := b.opts.TrigramMax 624 if allowLargeFile { 625 trigramMax = math.MaxInt64 626 } 627 628 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile { 629 // We could pass the document on to the shardbuilder, but if 630 // we pass through a part of the source tree with binary/large 631 // files, the corresponding shard would be mostly empty, so 632 // insert a reason here too. 633 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax) 634 } else if err := zoekt.CheckText(doc.Content, trigramMax); err != nil { 635 doc.SkipReason = err.Error() 636 doc.Language = "binary" 637 } 638 639 b.todo = append(b.todo, &doc) 640 641 if doc.SkipReason == "" { 642 b.size += len(doc.Name) + len(doc.Content) 643 } else { 644 b.size += len(doc.Name) + len(doc.SkipReason) 645 // Drop the content if we are skipping the document. Skipped content is not counted towards the 646 // shard size limit, so otherwise we might buffer too much data in memory before flushing. 647 doc.Content = nil 648 } 649 650 if b.size > b.opts.ShardMax { 651 return b.flush() 652 } 653 654 return nil 655} 656 657// MarkFileAsChangedOrRemoved indicates that the file specified by the given path 658// has been changed or removed since the last indexing job for this repository. 659// 660// If this build is a delta build, these files will be tombstoned in the older shards for this repository. 661func (b *Builder) MarkFileAsChangedOrRemoved(path string) { 662 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path) 663} 664 665// Finish creates a last shard from the buffered documents, and clears 666// stale shards from previous runs. This should always be called, also 667// in failure cases, to ensure cleanup. 668// 669// It is safe to call Finish() multiple times. 670func (b *Builder) Finish() error { 671 if b.finishCalled { 672 return b.buildError 673 } 674 675 b.finishCalled = true 676 677 b.flush() 678 b.building.Wait() 679 680 if b.buildError != nil { 681 for tmp := range b.finishedShards { 682 log.Printf("Builder.Finish %s", tmp) 683 os.Remove(tmp) 684 } 685 b.finishedShards = map[string]string{} 686 return b.buildError 687 } 688 689 // map of temporary -> final names for all updated shards + shard metadata files 690 artifactPaths := make(map[string]string) 691 for tmp, final := range b.finishedShards { 692 artifactPaths[tmp] = final 693 } 694 695 oldShards := b.opts.FindAllShards() 696 697 if b.opts.IsDelta { 698 // Delta shard builds need to update FileTombstone and branch commit information for all 699 // existing shards 700 for _, shard := range oldShards { 701 repositories, _, err := zoekt.ReadMetadataPathAlive(shard) 702 if err != nil { 703 return fmt.Errorf("reading metadata from shard %q: %w", shard, err) 704 } 705 706 if len(repositories) > 1 { 707 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard) 708 } 709 710 if len(repositories) == 0 { 711 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard) 712 } 713 714 repository := repositories[0] 715 if repository.ID != b.opts.RepositoryDescription.ID { 716 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name) 717 } 718 719 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil { 720 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles)) 721 } 722 723 for _, f := range b.opts.changedOrRemovedFiles { 724 repository.FileTombstones[f] = struct{}{} 725 } 726 727 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) { 728 return deltaBranchSetError{ 729 shardName: shard, 730 old: repository.Branches, 731 new: b.opts.RepositoryDescription.Branches, 732 } 733 } 734 735 if b.opts.GetHash() != repository.IndexOptions { 736 return &deltaIndexOptionsMismatchError{ 737 shardName: shard, 738 newOptions: b.opts.HashOptions(), 739 } 740 } 741 742 repository.Branches = b.opts.RepositoryDescription.Branches 743 744 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate 745 746 tempPath, finalPath, err := zoekt.JsonMarshalRepoMetaTemp(shard, repository) 747 if err != nil { 748 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err) 749 } 750 751 artifactPaths[tempPath] = finalPath 752 } 753 } 754 755 // We mark finished shards as empty when we successfully finish. Return now 756 // to allow call sites to call Finish idempotently. 757 if len(artifactPaths) == 0 { 758 return b.buildError 759 } 760 761 defer b.shardLogger.Close() 762 763 // Collect a map of the old shards on disk. For each new shard we replace we 764 // delete it from toDelete. Anything remaining in toDelete will be removed 765 // after we have renamed everything into place. 766 767 var toDelete map[string]struct{} 768 if !b.opts.IsDelta { 769 // Non-delta shard builds delete all existing shards before they write out 770 // new ones. 771 // By contrast, delta shard builds work by stacking changes on top of existing shards. 772 // So, we skip populating the toDelete map if we're building delta shards. 773 774 toDelete = make(map[string]struct{}) 775 for _, name := range oldShards { 776 paths, err := zoekt.IndexFilePaths(name) 777 if err != nil { 778 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err) 779 } 780 for _, p := range paths { 781 toDelete[p] = struct{}{} 782 } 783 } 784 } 785 786 for tmp, final := range artifactPaths { 787 if err := os.Rename(tmp, final); err != nil { 788 b.buildError = err 789 continue 790 } 791 792 delete(toDelete, final) 793 794 b.shardLog("upsert", final, b.opts.RepositoryDescription.Name) 795 } 796 797 b.finishedShards = map[string]string{} 798 799 for p := range toDelete { 800 // Don't delete compound shards, set tombstones instead. 801 if zoekt.ShardMergingEnabled() && strings.HasPrefix(filepath.Base(p), "compound-") { 802 if !strings.HasSuffix(p, ".zoekt") { 803 continue 804 } 805 b.shardLog("tomb", p, b.opts.RepositoryDescription.Name) 806 err := zoekt.SetTombstone(p, b.opts.RepositoryDescription.ID) 807 b.buildError = err 808 continue 809 } 810 log.Printf("removing old shard file: %s", p) 811 b.shardLog("remove", p, b.opts.RepositoryDescription.Name) 812 if err := os.Remove(p); err != nil { 813 b.buildError = err 814 } 815 } 816 817 return b.buildError 818} 819 820// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true 821// iff both slices specify the same set of branch names in the same order. 822func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool { 823 if len(a) != len(b) { 824 return false 825 } 826 827 for i := range a { 828 x, y := a[i], b[i] 829 if x.Name != y.Name { 830 return false 831 } 832 } 833 834 return true 835} 836 837func (b *Builder) flush() error { 838 todo := b.todo 839 b.todo = nil 840 b.size = 0 841 b.errMu.Lock() 842 defer b.errMu.Unlock() 843 if b.buildError != nil { 844 return b.buildError 845 } 846 847 hasShard := b.nextShardNum > 0 848 if len(todo) == 0 && hasShard { 849 return nil 850 } 851 852 shard := b.nextShardNum 853 b.nextShardNum++ 854 855 if b.opts.Parallelism > 1 && b.opts.MemProfile == "" { 856 b.building.Add(1) 857 b.throttle <- 1 858 go func() { 859 done, err := b.buildShard(todo, shard) 860 <-b.throttle 861 862 b.errMu.Lock() 863 defer b.errMu.Unlock() 864 if err != nil && b.buildError == nil { 865 b.buildError = err 866 } 867 if err == nil { 868 b.finishedShards[done.temp] = done.final 869 } 870 b.building.Done() 871 }() 872 } else { 873 // No goroutines when we're not parallel. This 874 // simplifies memory profiling. 875 done, err := b.buildShard(todo, shard) 876 b.buildError = err 877 if err == nil { 878 b.finishedShards[done.temp] = done.final 879 } 880 if b.opts.MemProfile != "" { 881 // drop memory, and profile. 882 todo = nil 883 b.writeMemProfile(b.opts.MemProfile) 884 } 885 886 return b.buildError 887 } 888 889 return nil 890} 891 892func (b *Builder) shardLog(action, shard string, repoName string) { 893 shard = filepath.Base(shard) 894 var shardSize int64 895 if fi, err := os.Stat(filepath.Join(b.opts.IndexDir, shard)); err == nil { 896 shardSize = fi.Size() 897 } 898 _, _ = fmt.Fprintf(b.shardLogger, "%s\t%s\t%s\t%d\t%s\n", time.Now().UTC().Format(time.RFC3339), action, shard, shardSize, repoName) 899} 900 901var profileNumber int 902 903func (b *Builder) writeMemProfile(name string) { 904 nm := fmt.Sprintf("%s.%d", name, profileNumber) 905 profileNumber++ 906 f, err := os.Create(nm) 907 if err != nil { 908 log.Fatal("could not create memory profile: ", err) 909 } 910 runtime.GC() // get up-to-date statistics 911 if err := pprof.WriteHeapProfile(f); err != nil { 912 log.Fatal("could not write memory profile: ", err) 913 } 914 f.Close() 915 log.Printf("wrote mem profile %q", nm) 916} 917 918// map [0,inf) to [0,1) monotonically 919func squashRange(j int) float64 { 920 x := float64(j) 921 return x / (1 + x) 922} 923 924// IsLowPriority takes a file name and makes an educated guess about its priority 925// in search results. A file is considered low priority if it looks like a test, 926// vendored, or generated file. 927// 928// These 'priority' criteria affects how documents are ordered within a shard. It's 929// also used to help guess a file's rank when we're missing ranking information. 930func IsLowPriority(file string) bool { 931 return testRe.MatchString(file) || isGenerated(file) || isVendored(file) 932} 933 934var testRe = regexp.MustCompile("[Tt]est") 935 936func isGenerated(file string) bool { 937 return strings.HasSuffix(file, "min.js") || strings.HasSuffix(file, "js.map") 938} 939 940func isVendored(file string) bool { 941 return strings.Contains(file, "vendor/") || strings.Contains(file, "node_modules/") 942} 943 944type rankedDoc struct { 945 *zoekt.Document 946 rank []float64 947} 948 949// rank returns a vector of scores which is used at index-time to sort documents 950// before writing them to disk. The order of documents in the shard is important 951// at query time, because earlier documents receive a boost at query time and 952// have a higher chance of being searched before limits kick in. 953func rank(d *zoekt.Document, origIdx int) []float64 { 954 skipped := 0.0 955 if d.SkipReason != "" { 956 skipped = 1.0 957 } 958 959 generated := 0.0 960 if isGenerated(d.Name) { 961 generated = 1.0 962 } 963 964 vendor := 0.0 965 if isVendored(d.Name) { 966 vendor = 1.0 967 } 968 969 test := 0.0 970 if testRe.MatchString(d.Name) { 971 test = 1.0 972 } 973 974 // Smaller is earlier (=better). 975 return []float64{ 976 // Always place skipped docs last 977 skipped, 978 979 // Prefer docs that are not generated 980 generated, 981 982 // Prefer docs that are not vendored 983 vendor, 984 985 // Prefer docs that are not tests 986 test, 987 988 // With short names 989 squashRange(len(d.Name)), 990 991 // With many symbols 992 1.0 - squashRange(len(d.Symbols)), 993 994 // With short content 995 squashRange(len(d.Content)), 996 997 // That is present is as many branches as possible 998 1.0 - squashRange(len(d.Branches)), 999 1000 // Preserve original ordering. 1001 squashRange(origIdx), 1002 } 1003} 1004 1005func sortDocuments(todo []*zoekt.Document) { 1006 rs := make([]rankedDoc, 0, len(todo)) 1007 for i, t := range todo { 1008 rd := rankedDoc{t, rank(t, i)} 1009 rs = append(rs, rd) 1010 } 1011 sort.Slice(rs, func(i, j int) bool { 1012 r1 := rs[i].rank 1013 r2 := rs[j].rank 1014 for i := range r1 { 1015 if r1[i] < r2[i] { 1016 return true 1017 } 1018 if r1[i] > r2[i] { 1019 return false 1020 } 1021 } 1022 1023 return false 1024 }) 1025 for i := range todo { 1026 todo[i] = rs[i].Document 1027 } 1028} 1029 1030func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) { 1031 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") { 1032 err := ctagsAddSymbolsParserMap(todo, b.opts.LanguageMap, b.parserMap) 1033 if b.opts.CTagsMustSucceed && err != nil { 1034 return nil, err 1035 } 1036 if err != nil { 1037 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err) 1038 } 1039 } 1040 1041 name := b.opts.shardName(nextShardNum) 1042 1043 shardBuilder, err := b.newShardBuilder() 1044 if err != nil { 1045 return nil, err 1046 } 1047 1048 sortDocuments(todo) 1049 1050 for _, t := range todo { 1051 if err := shardBuilder.Add(*t); err != nil { 1052 return nil, err 1053 } 1054 } 1055 1056 return b.writeShard(name, shardBuilder) 1057} 1058 1059func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) { 1060 desc := b.opts.RepositoryDescription 1061 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != "" 1062 desc.SubRepoMap = b.opts.SubRepositories 1063 desc.IndexOptions = b.opts.GetHash() 1064 1065 shardBuilder, err := zoekt.NewIndexBuilder(&desc) 1066 if err != nil { 1067 return nil, err 1068 } 1069 shardBuilder.IndexTime = b.indexTime 1070 shardBuilder.ID = b.id 1071 return shardBuilder, nil 1072} 1073 1074func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) { 1075 dir := filepath.Dir(fn) 1076 if err := os.MkdirAll(dir, 0o700); err != nil { 1077 return nil, err 1078 } 1079 1080 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp") 1081 if err != nil { 1082 return nil, err 1083 } 1084 if runtime.GOOS != "windows" { 1085 if err := f.Chmod(0o666 &^ umask); err != nil { 1086 return nil, err 1087 } 1088 } 1089 1090 defer f.Close() 1091 if err := ib.Write(f); err != nil { 1092 return nil, err 1093 } 1094 fi, err := f.Stat() 1095 if err != nil { 1096 return nil, err 1097 } 1098 if err := f.Close(); err != nil { 1099 return nil, err 1100 } 1101 1102 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n", 1103 fn, 1104 fi.Size(), 1105 float64(fi.Size())/float64(ib.ContentSize()+1), 1106 ib.NumFiles()) 1107 1108 return &finishedShard{f.Name(), fn}, nil 1109} 1110 1111type deltaBranchSetError struct { 1112 shardName string 1113 old, new []zoekt.RepositoryBranch 1114} 1115 1116func (e deltaBranchSetError) Error() string { 1117 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new) 1118} 1119 1120type deltaIndexOptionsMismatchError struct { 1121 shardName string 1122 newOptions HashOptions 1123} 1124 1125func (e *deltaIndexOptionsMismatchError) Error() string { 1126 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions) 1127} 1128 1129// umask holds the Umask of the current process 1130var umask os.FileMode