fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// package build implements a more convenient interface for building 16// zoekt indices. 17package build 18 19import ( 20 "crypto/sha1" 21 "flag" 22 "fmt" 23 "io" 24 "log" 25 "math" 26 "net/url" 27 "os" 28 "os/exec" 29 "path" 30 "path/filepath" 31 "reflect" 32 "runtime" 33 "runtime/pprof" 34 "sort" 35 "strconv" 36 "strings" 37 "sync" 38 "time" 39 40 "github.com/bmatcuk/doublestar" 41 "github.com/grafana/regexp" 42 "github.com/rs/xid" 43 "gopkg.in/natefinch/lumberjack.v2" 44 45 "github.com/sourcegraph/zoekt" 46 "github.com/sourcegraph/zoekt/ctags" 47) 48 49var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt") 50 51// Branch describes a single branch version. 52type Branch struct { 53 Name string 54 Version string 55} 56 57// Options sets options for the index building. 58type Options struct { 59 // IndexDir is a directory that holds *.zoekt index files. 60 IndexDir string 61 62 // SizeMax is the maximum file size 63 SizeMax int 64 65 // Parallelism is the maximum number of shards to index in parallel 66 Parallelism int 67 68 // ShardMax sets the maximum corpus size for a single shard 69 ShardMax int 70 71 // TrigramMax sets the maximum number of distinct trigrams per document. 72 TrigramMax int 73 74 // RepositoryDescription holds names and URLs for the repository. 75 RepositoryDescription zoekt.Repository 76 77 // SubRepositories is a path => sub repository map. 78 SubRepositories map[string]*zoekt.Repository 79 80 // DisableCTags disables the generation of ctags metadata. 81 DisableCTags bool 82 83 // CtagsPath is the path to the ctags binary to run, or empty 84 // if a valid binary couldn't be found. 85 CTagsPath string 86 87 // If set, ctags must succeed. 88 CTagsMustSucceed bool 89 90 // Write memory profiles to this file. 91 MemProfile string 92 93 // LargeFiles is a slice of glob patterns, including ** for any number 94 // of directories, where matching file paths should be indexed 95 // regardless of their size. The full pattern syntax is here: 96 // https://github.com/bmatcuk/doublestar/tree/v1#patterns. 97 LargeFiles []string 98 99 // IsDelta is true if this run contains only the changed documents since the 100 // last run. 101 IsDelta bool 102 103 // DocumentRanksPath is the path to the file with document ranks. If empty, 104 // ranks will be computed on-the-fly. 105 DocumentRanksPath string 106 107 // DocumentRanksVersion is a string which when changed will cause us to 108 // reindex a shard. This field is used so that when the contents of 109 // DocumentRanksPath changes, we can reindex. 110 DocumentRanksVersion string 111 112 // changedOrRemovedFiles is a list of file paths that have been changed or removed 113 // since the last indexing job for this repository. These files will be tombstoned 114 // in the older shards for this repository. 115 changedOrRemovedFiles []string 116} 117 118// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building. 119type HashOptions struct { 120 sizeMax int 121 disableCTags bool 122 ctagsPath string 123 cTagsMustSucceed bool 124 largeFiles []string 125 126 // documentRankVersion is an experimental field which will change when the 127 // DocumentRanksPath content changes. If empty we ignore it. 128 documentRankVersion string 129} 130 131func (o *Options) HashOptions() HashOptions { 132 return HashOptions{ 133 sizeMax: o.SizeMax, 134 disableCTags: o.DisableCTags, 135 ctagsPath: o.CTagsPath, 136 cTagsMustSucceed: o.CTagsMustSucceed, 137 largeFiles: o.LargeFiles, 138 documentRankVersion: o.DocumentRanksVersion, 139 } 140} 141 142func (o *Options) GetHash() string { 143 h := o.HashOptions() 144 hasher := sha1.New() 145 146 hasher.Write([]byte(h.ctagsPath)) 147 hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed))) 148 hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax))) 149 hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles))) 150 hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags))) 151 152 if h.documentRankVersion != "" { 153 hasher.Write([]byte{0}) 154 io.WriteString(hasher, h.documentRankVersion) 155 } 156 157 return fmt.Sprintf("%x", hasher.Sum(nil)) 158} 159 160type largeFilesFlag struct{ *Options } 161 162func (f largeFilesFlag) String() string { 163 // From flag.Value documentation: 164 // 165 // The flag package may call the String method with a zero-valued receiver, 166 // such as a nil pointer. 167 if f.Options == nil { 168 return "" 169 } 170 s := append([]string{""}, f.LargeFiles...) 171 return strings.Join(s, "-large_file ") 172} 173 174func (f largeFilesFlag) Set(value string) error { 175 f.LargeFiles = append(f.LargeFiles, value) 176 return nil 177} 178 179// Flags adds flags for build options to fs. It is the "inverse" of Args. 180func (o *Options) Flags(fs *flag.FlagSet) { 181 x := *o 182 x.SetDefaults() 183 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size") 184 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document") 185 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard") 186 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.") 187 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices") 188 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.") 189 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.") 190 fs.StringVar(&o.MemProfile, "memprofile", "", "write memory profile(s) to `file.shardnum`. Note: sets parallelism to 1.") 191 192 // Sourcegraph specific 193 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.") 194} 195 196// Args generates command line arguments for o. It is the "inverse" of Flags. 197func (o *Options) Args() []string { 198 var args []string 199 200 if o.SizeMax != 0 { 201 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax)) 202 } 203 204 if o.TrigramMax != 0 { 205 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax)) 206 } 207 208 if o.ShardMax != 0 { 209 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax)) 210 } 211 212 if o.Parallelism != 0 { 213 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism)) 214 } 215 216 if o.IndexDir != "" { 217 args = append(args, "-index", o.IndexDir) 218 } 219 220 if o.CTagsMustSucceed { 221 args = append(args, "-require_ctags") 222 } 223 224 for _, a := range o.LargeFiles { 225 args = append(args, "-large_file", a) 226 } 227 228 // Sourcegraph specific 229 if o.DisableCTags { 230 args = append(args, "-disable_ctags") 231 } 232 233 return args 234} 235 236// Builder manages (parallel) creation of uniformly sized shards. The 237// builder buffers up documents until it collects enough documents and 238// then builds a shard and writes. 239type Builder struct { 240 opts Options 241 throttle chan int 242 243 nextShardNum int 244 todo []*zoekt.Document 245 size int 246 247 parser ctags.Parser 248 249 building sync.WaitGroup 250 251 errMu sync.Mutex 252 buildError error 253 254 // temp name => final name for finished shards. We only rename 255 // them once all shards succeed to avoid Frankstein corpuses. 256 finishedShards map[string]string 257 258 shardLogger io.WriteCloser 259 260 // indexTime is set by tests for doing reproducible builds. 261 indexTime time.Time 262 263 // a sortable 20 chars long id. 264 id string 265 266 finishCalled bool 267} 268 269type finishedShard struct { 270 temp, final string 271} 272 273func checkCTags() string { 274 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" { 275 return ctags 276 } 277 278 if ctags, err := exec.LookPath("universal-ctags"); err == nil { 279 return ctags 280 } 281 282 if ctags, err := exec.LookPath("ctags-exuberant"); err == nil { 283 return ctags 284 } 285 return "" 286} 287 288// SetDefaults sets reasonable default options. 289func (o *Options) SetDefaults() { 290 if o.CTagsPath == "" && !o.DisableCTags { 291 o.CTagsPath = checkCTags() 292 } 293 294 if o.Parallelism == 0 { 295 o.Parallelism = 4 296 } 297 if o.SizeMax == 0 { 298 o.SizeMax = 2 << 20 299 } 300 if o.ShardMax == 0 { 301 o.ShardMax = 100 << 20 302 } 303 if o.TrigramMax == 0 { 304 o.TrigramMax = 20000 305 } 306 307 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" { 308 parsed, _ := url.Parse(o.RepositoryDescription.URL) 309 if parsed != nil { 310 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path) 311 } 312 } 313} 314 315func hashString(s string) string { 316 h := sha1.New() 317 _, _ = io.WriteString(h, s) 318 return fmt.Sprintf("%x", h.Sum(nil)) 319} 320 321// ShardName returns the name the given index shard. 322func (o *Options) shardName(n int) string { 323 return o.shardNameVersion(zoekt.IndexFormatVersion, n) 324} 325 326func (o *Options) shardNameVersion(version, n int) string { 327 abs := url.QueryEscape(o.RepositoryDescription.Name) 328 if len(abs) > 200 { 329 abs = abs[:200] + hashString(abs)[:8] 330 } 331 return filepath.Join(o.IndexDir, 332 fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n)) 333} 334 335type IndexState string 336 337const ( 338 IndexStateMissing IndexState = "missing" 339 IndexStateCorrupt IndexState = "corrupt" 340 IndexStateVersion IndexState = "version-mismatch" 341 IndexStateOption IndexState = "option-mismatch" 342 IndexStateMeta IndexState = "meta-mismatch" 343 IndexStateContent IndexState = "content-mismatch" 344 IndexStateEqual IndexState = "equal" 345) 346 347var readVersions = []struct { 348 IndexFormatVersion int 349 FeatureVersion int 350}{{ 351 IndexFormatVersion: zoekt.IndexFormatVersion, 352 FeatureVersion: zoekt.FeatureVersion, 353}, { 354 IndexFormatVersion: zoekt.NextIndexFormatVersion, 355 FeatureVersion: zoekt.FeatureVersion, 356}} 357 358// IncrementalSkipIndexing returns true if the index present on disk matches 359// the build options. 360func (o *Options) IncrementalSkipIndexing() bool { 361 state, _ := o.IndexState() 362 return state == IndexStateEqual 363} 364 365// IndexState checks how the index present on disk compares to the build 366// options and returns the IndexState and the name of the first shard. 367func (o *Options) IndexState() (IndexState, string) { 368 // Open the latest version we support that is on disk. 369 fn := o.findShard() 370 if fn == "" { 371 return IndexStateMissing, fn 372 } 373 374 repos, index, err := zoekt.ReadMetadataPathAlive(fn) 375 if os.IsNotExist(err) { 376 return IndexStateMissing, fn 377 } else if err != nil { 378 return IndexStateCorrupt, fn 379 } 380 381 for _, v := range readVersions { 382 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion { 383 return IndexStateVersion, fn 384 } 385 } 386 387 var repo *zoekt.Repository 388 for _, cand := range repos { 389 if cand.Name == o.RepositoryDescription.Name { 390 repo = cand 391 break 392 } 393 } 394 395 if repo == nil { 396 return IndexStateCorrupt, fn 397 } 398 399 if repo.IndexOptions != o.GetHash() { 400 return IndexStateOption, fn 401 } 402 403 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) { 404 return IndexStateContent, fn 405 } 406 407 // We can mutate repo since it lives in the scope of this function call. 408 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil { 409 // non-nil err means we are trying to update an immutable field => 410 // reindex content. 411 log.Printf("warn: immutable field changed, requires re-index: %s", err) 412 return IndexStateContent, fn 413 } else if updated { 414 return IndexStateMeta, fn 415 } 416 417 return IndexStateEqual, fn 418} 419 420// FindRepositoryMetadata returns the index metadata for the repository 421// specified in the options. 'ok' is false if the repository's metadata 422// couldn't be found or if an error occurred. 423func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, ok bool, err error) { 424 shard := o.findShard() 425 if shard == "" { 426 return nil, false, nil 427 } 428 429 repositories, _, err := zoekt.ReadMetadataPathAlive(shard) 430 if err != nil { 431 return nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err) 432 } 433 434 ID := o.RepositoryDescription.ID 435 for _, r := range repositories { 436 // compound shards contain multiple repositories, so we 437 // have to pick only the one we're looking for 438 if r.ID == ID { 439 return r, true, nil 440 } 441 } 442 443 // If we're here, then we're somehow in a state where we found a matching 444 // shard that's missing the repository metadata we're looking for. This 445 // should never happen. 446 name := o.RepositoryDescription.Name 447 return nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name) 448} 449 450func (o *Options) findShard() string { 451 for _, v := range readVersions { 452 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 453 if _, err := os.Stat(fn); err == nil { 454 return fn 455 } 456 } 457 458 // Brute force finding the shard in compound shards. We should only hit this 459 // code path for repositories that are not already existing or are in 460 // compound shards. 461 // 462 // TODO add an oracle which can speed this up in the case of repositories 463 // already in compound shards. 464 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt")) 465 if err != nil { 466 return "" 467 } 468 for _, fn := range compoundShards { 469 repos, _, err := zoekt.ReadMetadataPathAlive(fn) 470 if err != nil { 471 continue 472 } 473 for _, repo := range repos { 474 if repo.ID == o.RepositoryDescription.ID { 475 return fn 476 } 477 } 478 } 479 480 return "" 481} 482 483func (o *Options) FindAllShards() []string { 484 for _, v := range readVersions { 485 fn := o.shardNameVersion(v.IndexFormatVersion, 0) 486 if _, err := os.Stat(fn); err == nil { 487 shards := []string{fn} 488 for i := 1; ; i++ { 489 fn := o.shardNameVersion(v.IndexFormatVersion, i) 490 if _, err := os.Stat(fn); err != nil { 491 return shards 492 } 493 shards = append(shards, fn) 494 } 495 } 496 } 497 498 // lazily fallback to findShard which will look for a compound shard. 499 if fn := o.findShard(); fn != "" { 500 return []string{fn} 501 } 502 503 return nil 504} 505 506// IgnoreSizeMax determines whether the max size should be ignored. 507func (o *Options) IgnoreSizeMax(name string) bool { 508 // A pattern match will override preceding pattern matches. 509 for i := len(o.LargeFiles) - 1; i >= 0; i-- { 510 pattern := strings.TrimSpace(o.LargeFiles[i]) 511 negated, validatedPattern := checkIsNegatePattern(pattern) 512 513 if m, _ := doublestar.PathMatch(validatedPattern, name); m { 514 if negated { 515 return false 516 } else { 517 return true 518 } 519 } 520 } 521 522 return false 523} 524 525func checkIsNegatePattern(pattern string) (bool, string) { 526 negate := "!" 527 528 // if negated then strip prefix meta character which identifies negated filter pattern 529 if strings.HasPrefix(pattern, negate) { 530 return true, pattern[len(negate):] 531 } 532 533 return false, pattern 534} 535 536// NewBuilder creates a new Builder instance. 537func NewBuilder(opts Options) (*Builder, error) { 538 opts.SetDefaults() 539 if opts.RepositoryDescription.Name == "" { 540 return nil, fmt.Errorf("builder: must set Name") 541 } 542 543 b := &Builder{ 544 opts: opts, 545 throttle: make(chan int, opts.Parallelism), 546 finishedShards: map[string]string{}, 547 } 548 549 if b.opts.CTagsPath == "" && b.opts.CTagsMustSucceed { 550 return nil, fmt.Errorf("ctags binary not found, but CTagsMustSucceed set") 551 } 552 553 if strings.Contains(opts.CTagsPath, "universal-ctags") { 554 parser, err := ctags.NewParser(opts.CTagsPath) 555 if err != nil && opts.CTagsMustSucceed { 556 return nil, fmt.Errorf("ctags.NewParser: %v", err) 557 } 558 559 b.parser = parser 560 } 561 562 b.shardLogger = &lumberjack.Logger{ 563 Filename: filepath.Join(opts.IndexDir, "zoekt-builder-shard-log.tsv"), 564 MaxSize: 100, // Megabyte 565 MaxBackups: 5, 566 } 567 568 if opts.IsDelta { 569 // Delta shards build on top of previously existing shards. 570 // As a consequence, the shardNum for delta shards starts from 571 // the number following the most recently generated shard - not 0. 572 // 573 // Using this numbering scheme allows all the shards to be 574 // discovered as a set. 575 shards := b.opts.FindAllShards() 576 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one 577 } 578 579 if _, err := b.newShardBuilder(); err != nil { 580 return nil, err 581 } 582 583 now := time.Now() 584 b.indexTime = now 585 b.id = xid.NewWithTime(now).String() 586 587 return b, nil 588} 589 590// AddFile is a convenience wrapper for the Add method 591func (b *Builder) AddFile(name string, content []byte) error { 592 return b.Add(zoekt.Document{Name: name, Content: content}) 593} 594 595func (b *Builder) Add(doc zoekt.Document) error { 596 if b.finishCalled { 597 return nil 598 } 599 600 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name) 601 602 // Adjust trigramMax for allowed large files so we don't exclude them. 603 trigramMax := b.opts.TrigramMax 604 if allowLargeFile { 605 trigramMax = math.MaxInt64 606 } 607 608 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile { 609 // We could pass the document on to the shardbuilder, but if 610 // we pass through a part of the source tree with binary/large 611 // files, the corresponding shard would be mostly empty, so 612 // insert a reason here too. 613 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax) 614 } else if err := zoekt.CheckText(doc.Content, trigramMax); err != nil { 615 doc.SkipReason = err.Error() 616 doc.Language = "binary" 617 } 618 619 b.todo = append(b.todo, &doc) 620 621 if doc.SkipReason == "" { 622 b.size += len(doc.Name) + len(doc.Content) 623 } else { 624 b.size += len(doc.Name) + len(doc.SkipReason) 625 } 626 627 if b.size > b.opts.ShardMax { 628 return b.flush() 629 } 630 631 return nil 632} 633 634// MarkFileAsChangedOrRemoved indicates that the file specified by the given path 635// has been changed or removed since the last indexing job for this repository. 636// 637// If this build is a delta build, these files will be tombstoned in the older shards for this repository. 638func (b *Builder) MarkFileAsChangedOrRemoved(path string) { 639 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path) 640} 641 642// Finish creates a last shard from the buffered documents, and clears 643// stale shards from previous runs. This should always be called, also 644// in failure cases, to ensure cleanup. 645// 646// It is safe to call Finish() multiple times. 647func (b *Builder) Finish() error { 648 if b.finishCalled { 649 return b.buildError 650 } 651 652 b.finishCalled = true 653 654 b.flush() 655 b.building.Wait() 656 657 if b.buildError != nil { 658 for tmp := range b.finishedShards { 659 log.Printf("Builder.Finish %s", tmp) 660 os.Remove(tmp) 661 } 662 b.finishedShards = map[string]string{} 663 return b.buildError 664 } 665 666 // map of temporary -> final names for all updated shards + shard metadata files 667 artifactPaths := make(map[string]string) 668 for tmp, final := range b.finishedShards { 669 artifactPaths[tmp] = final 670 } 671 672 oldShards := b.opts.FindAllShards() 673 674 if b.opts.IsDelta { 675 // Delta shard builds need to update FileTombstone and branch commit information for all 676 // existing shards 677 for _, shard := range oldShards { 678 repositories, _, err := zoekt.ReadMetadataPathAlive(shard) 679 if err != nil { 680 return fmt.Errorf("reading metadata from shard %q: %w", shard, err) 681 } 682 683 if len(repositories) > 1 { 684 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard) 685 } 686 687 if len(repositories) == 0 { 688 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard) 689 } 690 691 repository := repositories[0] 692 if repository.ID != b.opts.RepositoryDescription.ID { 693 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name) 694 } 695 696 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil { 697 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles)) 698 } 699 700 for _, f := range b.opts.changedOrRemovedFiles { 701 repository.FileTombstones[f] = struct{}{} 702 } 703 704 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) { 705 return deltaBranchSetError{ 706 shardName: shard, 707 old: repository.Branches, 708 new: b.opts.RepositoryDescription.Branches, 709 } 710 } 711 712 if b.opts.GetHash() != repository.IndexOptions { 713 return &deltaIndexOptionsMismatchError{ 714 shardName: shard, 715 newOptions: b.opts.HashOptions(), 716 } 717 } 718 719 repository.Branches = b.opts.RepositoryDescription.Branches 720 721 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate 722 723 tempPath, finalPath, err := zoekt.JsonMarshalRepoMetaTemp(shard, repository) 724 if err != nil { 725 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err) 726 } 727 728 artifactPaths[tempPath] = finalPath 729 } 730 } 731 732 // We mark finished shards as empty when we successfully finish. Return now 733 // to allow call sites to call Finish idempotently. 734 if len(artifactPaths) == 0 { 735 return b.buildError 736 } 737 738 defer b.shardLogger.Close() 739 740 // Collect a map of the old shards on disk. For each new shard we replace we 741 // delete it from toDelete. Anything remaining in toDelete will be removed 742 // after we have renamed everything into place. 743 744 var toDelete map[string]struct{} 745 if !b.opts.IsDelta { 746 // Non-delta shard builds delete all existing shards before they write out 747 // new ones. 748 // By contrast, delta shard builds work by stacking changes on top of existing shards. 749 // So, we skip populating the toDelete map if we're building delta shards. 750 751 toDelete = make(map[string]struct{}) 752 for _, name := range oldShards { 753 paths, err := zoekt.IndexFilePaths(name) 754 if err != nil { 755 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err) 756 } 757 for _, p := range paths { 758 toDelete[p] = struct{}{} 759 } 760 } 761 } 762 763 for tmp, final := range artifactPaths { 764 if err := os.Rename(tmp, final); err != nil { 765 b.buildError = err 766 continue 767 } 768 769 delete(toDelete, final) 770 771 b.shardLog("upsert", final, b.opts.RepositoryDescription.Name) 772 } 773 774 b.finishedShards = map[string]string{} 775 776 for p := range toDelete { 777 // Don't delete compound shards, set tombstones instead. 778 if zoekt.ShardMergingEnabled() && strings.HasPrefix(filepath.Base(p), "compound-") { 779 if !strings.HasSuffix(p, ".zoekt") { 780 continue 781 } 782 b.shardLog("tomb", p, b.opts.RepositoryDescription.Name) 783 err := zoekt.SetTombstone(p, b.opts.RepositoryDescription.ID) 784 b.buildError = err 785 continue 786 } 787 log.Printf("removing old shard file: %s", p) 788 b.shardLog("remove", p, b.opts.RepositoryDescription.Name) 789 if err := os.Remove(p); err != nil { 790 b.buildError = err 791 } 792 } 793 794 return b.buildError 795} 796 797// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true 798// iff both slices specify the same set of branch names in the same order. 799func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool { 800 if len(a) != len(b) { 801 return false 802 } 803 804 for i := range a { 805 x, y := a[i], b[i] 806 if x.Name != y.Name { 807 return false 808 } 809 } 810 811 return true 812} 813 814func (b *Builder) flush() error { 815 todo := b.todo 816 b.todo = nil 817 b.size = 0 818 b.errMu.Lock() 819 defer b.errMu.Unlock() 820 if b.buildError != nil { 821 return b.buildError 822 } 823 824 hasShard := b.nextShardNum > 0 825 if len(todo) == 0 && hasShard { 826 return nil 827 } 828 829 shard := b.nextShardNum 830 b.nextShardNum++ 831 832 if b.opts.Parallelism > 1 && b.opts.MemProfile == "" { 833 b.building.Add(1) 834 go func() { 835 b.throttle <- 1 836 done, err := b.buildShard(todo, shard) 837 <-b.throttle 838 839 b.errMu.Lock() 840 defer b.errMu.Unlock() 841 if err != nil && b.buildError == nil { 842 b.buildError = err 843 } 844 if err == nil { 845 b.finishedShards[done.temp] = done.final 846 } 847 b.building.Done() 848 }() 849 } else { 850 // No goroutines when we're not parallel. This 851 // simplifies memory profiling. 852 done, err := b.buildShard(todo, shard) 853 b.buildError = err 854 if err == nil { 855 b.finishedShards[done.temp] = done.final 856 } 857 if b.opts.MemProfile != "" { 858 // drop memory, and profile. 859 todo = nil 860 b.writeMemProfile(b.opts.MemProfile) 861 } 862 863 return b.buildError 864 } 865 866 return nil 867} 868 869func (b *Builder) shardLog(action, shard string, repoName string) { 870 shard = filepath.Base(shard) 871 var shardSize int64 872 if fi, err := os.Stat(filepath.Join(b.opts.IndexDir, shard)); err == nil { 873 shardSize = fi.Size() 874 } 875 _, _ = fmt.Fprintf(b.shardLogger, "%s\t%s\t%s\t%d\t%s\n", time.Now().UTC().Format(time.RFC3339), action, shard, shardSize, repoName) 876} 877 878var profileNumber int 879 880func (b *Builder) writeMemProfile(name string) { 881 nm := fmt.Sprintf("%s.%d", name, profileNumber) 882 profileNumber++ 883 f, err := os.Create(nm) 884 if err != nil { 885 log.Fatal("could not create memory profile: ", err) 886 } 887 runtime.GC() // get up-to-date statistics 888 if err := pprof.WriteHeapProfile(f); err != nil { 889 log.Fatal("could not write memory profile: ", err) 890 } 891 f.Close() 892 log.Printf("wrote mem profile %q", nm) 893} 894 895// map [0,inf) to [0,1) monotonically 896func squashRange(j int) float64 { 897 x := float64(j) 898 return x / (1 + x) 899} 900 901// IsLowPriority takes a file name and makes an educated guess about its priority 902// in search results. A file is considered low priority if it looks like a test, 903// vendored, or generated file. 904// 905// These 'priority' criteria affects how documents are ordered within a shard. It's 906// also used to help guess a file's rank when we're missing ranking information. 907func IsLowPriority(file string) bool { 908 return testRe.MatchString(file) || isGenerated(file) || isVendored(file) 909} 910 911var testRe = regexp.MustCompile("[Tt]est") 912 913func isGenerated(file string) bool { 914 return strings.HasSuffix(file, "min.js") || strings.HasSuffix(file, "js.map") 915} 916 917func isVendored(file string) bool { 918 return strings.Contains(file, "vendor/") || strings.Contains(file, "node_modules/") 919} 920 921type rankedDoc struct { 922 *zoekt.Document 923 rank []float64 924} 925 926// rank returns a vector of scores which is used at index-time to sort documents 927// before writing them to disk. The order of documents in the shard is important 928// at query time, because earlier documents receive a boost at query time and 929// have a higher chance of being searched before limits kick in. 930func rank(d *zoekt.Document, origIdx int) []float64 { 931 generated := 0.0 932 if isGenerated(d.Name) { 933 generated = 1.0 934 } 935 936 vendor := 0.0 937 if isVendored(d.Name) { 938 vendor = 1.0 939 } 940 941 test := 0.0 942 if testRe.MatchString(d.Name) { 943 test = 1.0 944 } 945 946 // Smaller is earlier (=better). 947 return []float64{ 948 // Prefer docs that are not generated 949 generated, 950 951 // Prefer docs that are not vendored 952 vendor, 953 954 // Prefer docs that are not tests 955 test, 956 957 // With short names 958 squashRange(len(d.Name)), 959 960 // With many symbols 961 1.0 - squashRange(len(d.Symbols)), 962 963 // With short content 964 squashRange(len(d.Content)), 965 966 // That is present is as many branches as possible 967 1.0 - squashRange(len(d.Branches)), 968 969 // Preserve original ordering. 970 squashRange(origIdx), 971 } 972} 973 974func sortDocuments(todo []*zoekt.Document) { 975 rs := make([]rankedDoc, 0, len(todo)) 976 for i, t := range todo { 977 rd := rankedDoc{t, rank(t, i)} 978 rs = append(rs, rd) 979 } 980 sort.Slice(rs, func(i, j int) bool { 981 r1 := rs[i].rank 982 r2 := rs[j].rank 983 for i := range r1 { 984 if r1[i] < r2[i] { 985 return true 986 } 987 if r1[i] > r2[i] { 988 return false 989 } 990 } 991 992 return false 993 }) 994 for i := range todo { 995 todo[i] = rs[i].Document 996 } 997} 998 999func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) { 1000 if !b.opts.DisableCTags && b.opts.CTagsPath != "" { 1001 err := ctagsAddSymbols(todo, b.parser, b.opts.CTagsPath) 1002 if b.opts.CTagsMustSucceed && err != nil { 1003 return nil, err 1004 } 1005 if err != nil { 1006 log.Printf("ignoring %s error: %v", b.opts.CTagsPath, err) 1007 } 1008 } 1009 1010 name := b.opts.shardName(nextShardNum) 1011 1012 shardBuilder, err := b.newShardBuilder() 1013 if err != nil { 1014 return nil, err 1015 } 1016 1017 sortDocuments(todo) 1018 1019 for _, t := range todo { 1020 if err := shardBuilder.Add(*t); err != nil { 1021 return nil, err 1022 } 1023 } 1024 1025 return b.writeShard(name, shardBuilder) 1026} 1027 1028func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) { 1029 desc := b.opts.RepositoryDescription 1030 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != "" 1031 desc.SubRepoMap = b.opts.SubRepositories 1032 desc.IndexOptions = b.opts.GetHash() 1033 1034 shardBuilder, err := zoekt.NewIndexBuilder(&desc) 1035 if err != nil { 1036 return nil, err 1037 } 1038 shardBuilder.IndexTime = b.indexTime 1039 shardBuilder.ID = b.id 1040 return shardBuilder, nil 1041} 1042 1043func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) { 1044 dir := filepath.Dir(fn) 1045 if err := os.MkdirAll(dir, 0o700); err != nil { 1046 return nil, err 1047 } 1048 1049 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp") 1050 if err != nil { 1051 return nil, err 1052 } 1053 if runtime.GOOS != "windows" { 1054 if err := f.Chmod(0o666 &^ umask); err != nil { 1055 return nil, err 1056 } 1057 } 1058 1059 defer f.Close() 1060 if err := ib.Write(f); err != nil { 1061 return nil, err 1062 } 1063 fi, err := f.Stat() 1064 if err != nil { 1065 return nil, err 1066 } 1067 if err := f.Close(); err != nil { 1068 return nil, err 1069 } 1070 1071 log.Printf("finished %s: %d index bytes (overhead %3.1f)", fn, fi.Size(), 1072 float64(fi.Size())/float64(ib.ContentSize()+1)) 1073 1074 return &finishedShard{f.Name(), fn}, nil 1075} 1076 1077type deltaBranchSetError struct { 1078 shardName string 1079 old, new []zoekt.RepositoryBranch 1080} 1081 1082func (e deltaBranchSetError) Error() string { 1083 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new) 1084} 1085 1086type deltaIndexOptionsMismatchError struct { 1087 shardName string 1088 newOptions HashOptions 1089} 1090 1091func (e *deltaIndexOptionsMismatchError) Error() string { 1092 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions) 1093} 1094 1095// umask holds the Umask of the current process 1096var umask os.FileMode