build/builder.go at 650136078a98ef99acf4159310a2c12fd25075d7 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / build / builder.go
at 650136078a98ef99acf4159310a2c12fd25075d7 30 kB View raw
Julie Tibshirani Debug: write memory profile if heap exceeds threshold (#819) 2y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
// Package build implements a more convenient interface for building
// zoekt indices.
  17package build
  18
  19import (
  20	"crypto/sha1"
  21	"flag"
  22	"fmt"
  23	"io"
  24	"log"
  25	"net/url"
  26	"os"
  27	"os/exec"
  28	"path"
  29	"path/filepath"
  30	"reflect"
  31	"runtime"
  32	"runtime/pprof"
  33	"sort"
  34	"strconv"
  35	"strings"
  36	"sync"
  37	"time"
  38
  39	"github.com/bmatcuk/doublestar"
  40	"github.com/dustin/go-humanize"
  41	"github.com/go-enry/go-enry/v2"
  42	"github.com/rs/xid"
  43
  44	"github.com/sourcegraph/zoekt"
  45	"github.com/sourcegraph/zoekt/ctags"
  46)
  47
// DefaultDir is ".zoekt" joined onto the value of the HOME environment
// variable. If HOME is unset or empty this evaluates to the relative path
// ".zoekt".
var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
  49
// Branch describes a single branch version.
type Branch struct {
	// Name is the name of the branch.
	Name string
	// Version identifies the state of the branch.
	// NOTE(review): presumably a commit hash or similar revision id - confirm.
	Version string
}
  55
// Options sets options for the index building.
//
// Zero-valued numeric fields and empty ctags paths are replaced with defaults
// by SetDefaults.
type Options struct {
	// IndexDir is a directory that holds *.zoekt index files.
	IndexDir string

	// SizeMax is the maximum file size in bytes. Larger documents are recorded
	// with a skip reason instead of being indexed, unless they match one of
	// the LargeFiles patterns.
	SizeMax int

	// Parallelism is the maximum number of shards to index in parallel
	Parallelism int

	// ShardMax sets the maximum corpus size for a single shard
	ShardMax int

	// TrigramMax sets the maximum number of distinct trigrams per document.
	TrigramMax int

	// RepositoryDescription holds names and URLs for the repository.
	RepositoryDescription zoekt.Repository

	// SubRepositories is a path => sub repository map.
	SubRepositories map[string]*zoekt.Repository

	// DisableCTags disables the generation of ctags metadata.
	DisableCTags bool

	// CTagsPath is the path to the ctags binary to run, or empty
	// if a valid binary couldn't be found.
	CTagsPath string

	// Same as CTagsPath but for scip-ctags
	ScipCTagsPath string

	// If set, ctags must succeed.
	CTagsMustSucceed bool

	// LargeFiles is a slice of glob patterns, including ** for any number
	// of directories, where matching file paths should be indexed
	// regardless of their size. A pattern prefixed with "!" negates the
	// match, and later patterns take precedence over earlier ones. The full
	// pattern syntax is here:
	// https://github.com/bmatcuk/doublestar/tree/v1#patterns.
	LargeFiles []string

	// IsDelta is true if this run contains only the changed documents since the
	// last run.
	IsDelta bool

	// DocumentRanksPath is the path to the file with document ranks. If empty,
	// ranks will be computed on-the-fly.
	DocumentRanksPath string

	// DocumentRanksVersion is a string which when changed will cause us to
	// reindex a shard. This field is used so that when the contents of
	// DocumentRanksPath changes, we can reindex.
	DocumentRanksVersion string

	// changedOrRemovedFiles is a list of file paths that have been changed or removed
	// since the last indexing job for this repository. These files will be tombstoned
	// in the older shards for this repository.
	changedOrRemovedFiles []string

	// LanguageMap is handed to the ctags parser setup and to symbol parsing.
	// NOTE(review): presumably selects which ctags implementation handles
	// each language - confirm against the ctags package.
	LanguageMap ctags.LanguageMap

	// ShardMerging is true if builder should respect compound shards. This is a
	// Sourcegraph specific option.
	ShardMerging bool

	// HeapProfileTriggerBytes is the heap usage in bytes that will trigger a
	// memory profile. If 0, no memory profile will be triggered. Profiles will
	// be written to files named `indexmemory.prof.n` in the index directory.
	// No more than 10 files are written.
	//
	// Note: heap checking is "best effort", and it's possible for the process
	// to OOM without triggering the heap profile.
	HeapProfileTriggerBytes uint64
}
 128
// HashOptions contains only the options in Options that feed into GetHash.
// Changing any of them changes the hash, which makes IndexState report
// IndexStateOption during the next index building.
type HashOptions struct {
	sizeMax          int      // copy of Options.SizeMax
	disableCTags     bool     // copy of Options.DisableCTags
	ctagsPath        string   // copy of Options.CTagsPath
	cTagsMustSucceed bool     // copy of Options.CTagsMustSucceed
	largeFiles       []string // shares its backing array with Options.LargeFiles

	// documentRankVersion is an experimental field which will change when the
	// DocumentRanksPath content changes. If empty we ignore it.
	documentRankVersion string
}
 141
 142func (o *Options) HashOptions() HashOptions {
 143	return HashOptions{
 144		sizeMax:             o.SizeMax,
 145		disableCTags:        o.DisableCTags,
 146		ctagsPath:           o.CTagsPath,
 147		cTagsMustSucceed:    o.CTagsMustSucceed,
 148		largeFiles:          o.LargeFiles,
 149		documentRankVersion: o.DocumentRanksVersion,
 150	}
 151}
 152
 153func (o *Options) GetHash() string {
 154	h := o.HashOptions()
 155	hasher := sha1.New()
 156
 157	hasher.Write([]byte(h.ctagsPath))
 158	hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed)))
 159	hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax)))
 160	hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles)))
 161	hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags)))
 162
 163	if h.documentRankVersion != "" {
 164		hasher.Write([]byte{0})
 165		io.WriteString(hasher, h.documentRankVersion)
 166	}
 167
 168	return fmt.Sprintf("%x", hasher.Sum(nil))
 169}
 170
// largeFilesFlag wraps an *Options so that its LargeFiles field can be
// registered with a flag.FlagSet as a repeatable flag (see the String and Set
// methods).
type largeFilesFlag struct{ *Options }
 172
 173func (f largeFilesFlag) String() string {
 174	// From flag.Value documentation:
 175	//
 176	// The flag package may call the String method with a zero-valued receiver,
 177	// such as a nil pointer.
 178	if f.Options == nil {
 179		return ""
 180	}
 181	s := append([]string{""}, f.LargeFiles...)
 182	return strings.Join(s, "-large_file ")
 183}
 184
 185func (f largeFilesFlag) Set(value string) error {
 186	f.LargeFiles = append(f.LargeFiles, value)
 187	return nil
 188}
 189
 190// Flags adds flags for build options to fs. It is the "inverse" of Args.
 191func (o *Options) Flags(fs *flag.FlagSet) {
 192	x := *o
 193	x.SetDefaults()
 194	fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
 195	fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
 196	fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
 197	fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
 198	fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
 199	fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
 200	fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
 201
 202	// Sourcegraph specific
 203	fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
 204	fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.")
 205}
 206
 207// Args generates command line arguments for o. It is the "inverse" of Flags.
 208func (o *Options) Args() []string {
 209	var args []string
 210
 211	if o.SizeMax != 0 {
 212		args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
 213	}
 214
 215	if o.TrigramMax != 0 {
 216		args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
 217	}
 218
 219	if o.ShardMax != 0 {
 220		args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
 221	}
 222
 223	if o.Parallelism != 0 {
 224		args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
 225	}
 226
 227	if o.IndexDir != "" {
 228		args = append(args, "-index", o.IndexDir)
 229	}
 230
 231	if o.CTagsMustSucceed {
 232		args = append(args, "-require_ctags")
 233	}
 234
 235	for _, a := range o.LargeFiles {
 236		args = append(args, "-large_file", a)
 237	}
 238
 239	// Sourcegraph specific
 240	if o.DisableCTags {
 241		args = append(args, "-disable_ctags")
 242	}
 243
 244	if o.ShardMerging {
 245		args = append(args, "-shard_merging")
 246	}
 247
 248	return args
 249}
 250
// Builder manages (parallel) creation of uniformly sized shards. The
// builder buffers up documents until it collects enough documents and
// then builds a shard and writes.
type Builder struct {
	// opts holds the build options, with defaults applied by NewBuilder.
	opts Options
	// throttle bounds the number of concurrently running shard builds. It is
	// created with capacity opts.Parallelism; flush sends on it before starting
	// a build goroutine, and the goroutine receives from it when its build is done.
	throttle chan int

	// nextShardNum is the number that the next shard will be written under.
	nextShardNum int
	// todo buffers the documents that will go into the next shard.
	todo []*zoekt.Document
	// docChecker is used by Add to decide whether a document's content should
	// be skipped.
	docChecker zoekt.DocChecker
	// size is the running byte count of the buffered documents; Add flushes
	// once it exceeds opts.ShardMax.
	size int

	// parserBins is the ctags parser configuration passed to symbol parsing.
	parserBins ctags.ParserBinMap
	// building tracks shard builds that are running in goroutines.
	building sync.WaitGroup

	// errMu is held by flush and by the shard build goroutines while they
	// access buildError and finishedShards.
	errMu      sync.Mutex
	buildError error

	// temp name => final name for finished shards. We only rename
	// them once all shards succeed to avoid Frankenstein corpuses.
	finishedShards map[string]string

	// indexTime is set by tests for doing reproducible builds. NewBuilder
	// initializes it to the current time.
	indexTime time.Time

	// heapProfileMu is used to ensure that only one memory profile is written at a time
	heapProfileMu sync.Mutex
	// heapProfileNum counts the memory profiles started so far and is used as
	// the numeric suffix of the next profile file.
	heapProfileNum int

	// a sortable 20 chars long id.
	id string

	// finishCalled is set by Finish. Once set, Add ignores further documents
	// and Finish returns the stored build error immediately.
	finishCalled bool
}
 285
// finishedShard pairs the temporary file a shard was written to (temp) with
// the path it should eventually be renamed to (final).
type finishedShard struct {
	temp, final string
}
 289
 290func checkCTags() string {
 291	if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" {
 292		return ctags
 293	}
 294
 295	if ctags, err := exec.LookPath("universal-ctags"); err == nil {
 296		return ctags
 297	}
 298
 299	return ""
 300}
 301
 302func checkScipCTags() string {
 303	if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" {
 304		return ctags
 305	}
 306
 307	if ctags, err := exec.LookPath("scip-ctags"); err == nil {
 308		return ctags
 309	}
 310
 311	return ""
 312}
 313
 314// SetDefaults sets reasonable default options.
 315func (o *Options) SetDefaults() {
 316	if o.CTagsPath == "" && !o.DisableCTags {
 317		o.CTagsPath = checkCTags()
 318	}
 319
 320	if o.ScipCTagsPath == "" && !o.DisableCTags {
 321		o.ScipCTagsPath = checkScipCTags()
 322	}
 323
 324	if o.Parallelism == 0 {
 325		o.Parallelism = 4
 326	}
 327	if o.SizeMax == 0 {
 328		o.SizeMax = 2 << 20
 329	}
 330	if o.ShardMax == 0 {
 331		o.ShardMax = 100 << 20
 332	}
 333	if o.TrigramMax == 0 {
 334		o.TrigramMax = 20000
 335	}
 336
 337	if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
 338		parsed, _ := url.Parse(o.RepositoryDescription.URL)
 339		if parsed != nil {
 340			o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
 341		}
 342	}
 343}
 344
 345func hashString(s string) string {
 346	h := sha1.New()
 347	_, _ = io.WriteString(h, s)
 348	return fmt.Sprintf("%x", h.Sum(nil))
 349}
 350
 351// ShardName returns the name the given index shard.
 352func (o *Options) shardName(n int) string {
 353	return o.shardNameVersion(zoekt.IndexFormatVersion, n)
 354}
 355
 356func (o *Options) shardNameVersion(version, n int) string {
 357	abs := url.QueryEscape(o.RepositoryDescription.Name)
 358	if len(abs) > 200 {
 359		abs = abs[:200] + hashString(abs)[:8]
 360	}
 361	return filepath.Join(o.IndexDir,
 362		fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n))
 363}
 364
// IndexState describes how the index on disk compares to a set of build
// options, as reported by Options.IndexState.
type IndexState string

const (
	// IndexStateMissing: no shard was found, or it no longer existed when its
	// metadata was read.
	IndexStateMissing IndexState = "missing"
	// IndexStateCorrupt: the shard metadata could not be read, or the shard
	// holds no repository with the expected name.
	IndexStateCorrupt IndexState = "corrupt"
	// IndexStateVersion: the shard's feature version differs from the one
	// expected for its index format version.
	IndexStateVersion IndexState = "version-mismatch"
	// IndexStateOption: the stored index options hash differs from GetHash.
	IndexStateOption IndexState = "option-mismatch"
	// IndexStateMeta: only mutable repository metadata differs.
	IndexStateMeta IndexState = "meta-mismatch"
	// IndexStateContent: the branches differ, or an immutable repository
	// field changed.
	IndexStateContent IndexState = "content-mismatch"
	// IndexStateEqual: the index on disk matches the build options.
	IndexStateEqual IndexState = "equal"
)
 376
// readVersions lists the index format versions (each paired with the feature
// version expected for it) that are probed, in order, when looking for
// existing shards on disk. IndexState also uses it to detect shards whose
// feature version is out of date.
var readVersions = []struct {
	IndexFormatVersion int
	FeatureVersion     int
}{{
	IndexFormatVersion: zoekt.IndexFormatVersion,
	FeatureVersion:     zoekt.FeatureVersion,
}, {
	IndexFormatVersion: zoekt.NextIndexFormatVersion,
	FeatureVersion:     zoekt.FeatureVersion,
}}
 387
 388// IncrementalSkipIndexing returns true if the index present on disk matches
 389// the build options.
 390func (o *Options) IncrementalSkipIndexing() bool {
 391	state, _ := o.IndexState()
 392	return state == IndexStateEqual
 393}
 394
 395// IndexState checks how the index present on disk compares to the build
 396// options and returns the IndexState and the name of the first shard.
 397func (o *Options) IndexState() (IndexState, string) {
 398	// Open the latest version we support that is on disk.
 399	fn := o.findShard()
 400	if fn == "" {
 401		return IndexStateMissing, fn
 402	}
 403
 404	repos, index, err := zoekt.ReadMetadataPathAlive(fn)
 405	if os.IsNotExist(err) {
 406		return IndexStateMissing, fn
 407	} else if err != nil {
 408		return IndexStateCorrupt, fn
 409	}
 410
 411	for _, v := range readVersions {
 412		if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
 413			return IndexStateVersion, fn
 414		}
 415	}
 416
 417	var repo *zoekt.Repository
 418	for _, cand := range repos {
 419		if cand.Name == o.RepositoryDescription.Name {
 420			repo = cand
 421			break
 422		}
 423	}
 424
 425	if repo == nil {
 426		return IndexStateCorrupt, fn
 427	}
 428
 429	if repo.IndexOptions != o.GetHash() {
 430		return IndexStateOption, fn
 431	}
 432
 433	if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
 434		return IndexStateContent, fn
 435	}
 436
 437	// We can mutate repo since it lives in the scope of this function call.
 438	if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
 439		// non-nil err means we are trying to update an immutable field =>
 440		// reindex content.
 441		log.Printf("warn: immutable field changed, requires re-index: %s", err)
 442		return IndexStateContent, fn
 443	} else if updated {
 444		return IndexStateMeta, fn
 445	}
 446
 447	return IndexStateEqual, fn
 448}
 449
 450// FindRepositoryMetadata returns the index metadata for the repository
 451// specified in the options. 'ok' is false if the repository's metadata
 452// couldn't be found or if an error occurred.
 453func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) {
 454	shard := o.findShard()
 455	if shard == "" {
 456		return nil, nil, false, nil
 457	}
 458
 459	repositories, metadata, err := zoekt.ReadMetadataPathAlive(shard)
 460	if err != nil {
 461		return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
 462	}
 463
 464	ID := o.RepositoryDescription.ID
 465	for _, r := range repositories {
 466		// compound shards contain multiple repositories, so we
 467		// have to pick only the one we're looking for
 468		if r.ID == ID {
 469			return r, metadata, true, nil
 470		}
 471	}
 472
 473	// If we're here, then we're somehow in a state where we found a matching
 474	// shard that's missing the repository metadata we're looking for. This
 475	// should never happen.
 476	name := o.RepositoryDescription.Name
 477	return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
 478}
 479
 480func (o *Options) findShard() string {
 481	for _, v := range readVersions {
 482		fn := o.shardNameVersion(v.IndexFormatVersion, 0)
 483		if _, err := os.Stat(fn); err == nil {
 484			return fn
 485		}
 486	}
 487
 488	// Brute force finding the shard in compound shards. We should only hit this
 489	// code path for repositories that are not already existing or are in
 490	// compound shards.
 491	//
 492	// TODO add an oracle which can speed this up in the case of repositories
 493	// already in compound shards.
 494	compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
 495	if err != nil {
 496		return ""
 497	}
 498	for _, fn := range compoundShards {
 499		repos, _, err := zoekt.ReadMetadataPathAlive(fn)
 500		if err != nil {
 501			continue
 502		}
 503		for _, repo := range repos {
 504			if repo.ID == o.RepositoryDescription.ID {
 505				return fn
 506			}
 507		}
 508	}
 509
 510	return ""
 511}
 512
 513func (o *Options) FindAllShards() []string {
 514	for _, v := range readVersions {
 515		fn := o.shardNameVersion(v.IndexFormatVersion, 0)
 516		if _, err := os.Stat(fn); err == nil {
 517			shards := []string{fn}
 518			for i := 1; ; i++ {
 519				fn := o.shardNameVersion(v.IndexFormatVersion, i)
 520				if _, err := os.Stat(fn); err != nil {
 521					return shards
 522				}
 523				shards = append(shards, fn)
 524			}
 525		}
 526	}
 527
 528	// lazily fallback to findShard which will look for a compound shard.
 529	if fn := o.findShard(); fn != "" {
 530		return []string{fn}
 531	}
 532
 533	return nil
 534}
 535
 536// IgnoreSizeMax determines whether the max size should be ignored.
 537func (o *Options) IgnoreSizeMax(name string) bool {
 538	// A pattern match will override preceding pattern matches.
 539	for i := len(o.LargeFiles) - 1; i >= 0; i-- {
 540		pattern := strings.TrimSpace(o.LargeFiles[i])
 541		negated, validatedPattern := checkIsNegatePattern(pattern)
 542
 543		if m, _ := doublestar.PathMatch(validatedPattern, name); m {
 544			if negated {
 545				return false
 546			} else {
 547				return true
 548			}
 549		}
 550	}
 551
 552	return false
 553}
 554
 555func checkIsNegatePattern(pattern string) (bool, string) {
 556	negate := "!"
 557
 558	// if negated then strip prefix meta character which identifies negated filter pattern
 559	if strings.HasPrefix(pattern, negate) {
 560		return true, pattern[len(negate):]
 561	}
 562
 563	return false, pattern
 564}
 565
 566// NewBuilder creates a new Builder instance.
 567func NewBuilder(opts Options) (*Builder, error) {
 568	opts.SetDefaults()
 569	if opts.RepositoryDescription.Name == "" {
 570		return nil, fmt.Errorf("builder: must set Name")
 571	}
 572
 573	b := &Builder{
 574		opts:           opts,
 575		throttle:       make(chan int, opts.Parallelism),
 576		finishedShards: map[string]string{},
 577	}
 578
 579	parserBins, err := ctags.NewParserBinMap(
 580		b.opts.CTagsPath,
 581		b.opts.ScipCTagsPath,
 582		opts.LanguageMap,
 583		b.opts.CTagsMustSucceed,
 584	)
 585	if err != nil {
 586		return nil, err
 587	}
 588
 589	b.parserBins = parserBins
 590
 591	if opts.IsDelta {
 592		// Delta shards build on top of previously existing shards.
 593		// As a consequence, the shardNum for delta shards starts from
 594		// the number following the most recently generated shard - not 0.
 595		//
 596		// Using this numbering scheme allows all the shards to be
 597		// discovered as a set.
 598		shards := b.opts.FindAllShards()
 599		b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
 600	}
 601
 602	if _, err := b.newShardBuilder(); err != nil {
 603		return nil, err
 604	}
 605
 606	now := time.Now()
 607	b.indexTime = now
 608	b.id = xid.NewWithTime(now).String()
 609
 610	return b, nil
 611}
 612
 613// AddFile is a convenience wrapper for the Add method
 614func (b *Builder) AddFile(name string, content []byte) error {
 615	return b.Add(zoekt.Document{Name: name, Content: content})
 616}
 617
 618func (b *Builder) Add(doc zoekt.Document) error {
 619	if b.finishCalled {
 620		return nil
 621	}
 622
 623	allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
 624	if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
 625		// We could pass the document on to the shardbuilder, but if
 626		// we pass through a part of the source tree with binary/large
 627		// files, the corresponding shard would be mostly empty, so
 628		// insert a reason here too.
 629		doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
 630	} else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
 631		doc.SkipReason = err.Error()
 632		doc.Language = "binary"
 633	}
 634
 635	b.todo = append(b.todo, &doc)
 636
 637	if doc.SkipReason == "" {
 638		b.size += len(doc.Name) + len(doc.Content)
 639	} else {
 640		b.size += len(doc.Name) + len(doc.SkipReason)
 641		// Drop the content if we are skipping the document. Skipped content is not counted towards the
 642		// shard size limit, so otherwise we might buffer too much data in memory before flushing.
 643		doc.Content = nil
 644	}
 645
 646	if b.size > b.opts.ShardMax {
 647		return b.flush()
 648	}
 649
 650	return nil
 651}
 652
 653// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
 654// has been changed or removed since the last indexing job for this repository.
 655//
 656// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
 657func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
 658	b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
 659}
 660
 661// Finish creates a last shard from the buffered documents, and clears
 662// stale shards from previous runs. This should always be called, also
 663// in failure cases, to ensure cleanup.
 664//
 665// It is safe to call Finish() multiple times.
 666func (b *Builder) Finish() error {
 667	if b.finishCalled {
 668		return b.buildError
 669	}
 670
 671	b.finishCalled = true
 672
 673	b.flush()
 674	b.building.Wait()
 675
 676	if b.buildError != nil {
 677		for tmp := range b.finishedShards {
 678			log.Printf("Builder.Finish %s", tmp)
 679			os.Remove(tmp)
 680		}
 681		b.finishedShards = map[string]string{}
 682		return b.buildError
 683	}
 684
 685	// map of temporary -> final names for all updated shards + shard metadata files
 686	artifactPaths := make(map[string]string)
 687	for tmp, final := range b.finishedShards {
 688		artifactPaths[tmp] = final
 689	}
 690
 691	oldShards := b.opts.FindAllShards()
 692
 693	if b.opts.IsDelta {
 694		// Delta shard builds need to update FileTombstone and branch commit information for all
 695		// existing shards
 696		for _, shard := range oldShards {
 697			repositories, _, err := zoekt.ReadMetadataPathAlive(shard)
 698			if err != nil {
 699				return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
 700			}
 701
 702			if len(repositories) > 1 {
 703				return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
 704			}
 705
 706			if len(repositories) == 0 {
 707				return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
 708			}
 709
 710			repository := repositories[0]
 711			if repository.ID != b.opts.RepositoryDescription.ID {
 712				return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
 713			}
 714
 715			if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
 716				repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
 717			}
 718
 719			for _, f := range b.opts.changedOrRemovedFiles {
 720				repository.FileTombstones[f] = struct{}{}
 721			}
 722
 723			if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
 724				return deltaBranchSetError{
 725					shardName: shard,
 726					old:       repository.Branches,
 727					new:       b.opts.RepositoryDescription.Branches,
 728				}
 729			}
 730
 731			if b.opts.GetHash() != repository.IndexOptions {
 732				return &deltaIndexOptionsMismatchError{
 733					shardName:  shard,
 734					newOptions: b.opts.HashOptions(),
 735				}
 736			}
 737
 738			repository.Branches = b.opts.RepositoryDescription.Branches
 739
 740			repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
 741
 742			tempPath, finalPath, err := zoekt.JsonMarshalRepoMetaTemp(shard, repository)
 743			if err != nil {
 744				return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
 745			}
 746
 747			artifactPaths[tempPath] = finalPath
 748		}
 749	}
 750
 751	// We mark finished shards as empty when we successfully finish. Return now
 752	// to allow call sites to call Finish idempotently.
 753	if len(artifactPaths) == 0 {
 754		return b.buildError
 755	}
 756
 757	// Collect a map of the old shards on disk. For each new shard we replace we
 758	// delete it from toDelete. Anything remaining in toDelete will be removed
 759	// after we have renamed everything into place.
 760
 761	var toDelete map[string]struct{}
 762	if !b.opts.IsDelta {
 763		// Non-delta shard builds delete all existing shards before they write out
 764		// new ones.
 765		// By contrast, delta shard builds work by stacking changes on top of existing shards.
 766		// So, we skip populating the toDelete map if we're building delta shards.
 767
 768		toDelete = make(map[string]struct{})
 769		for _, name := range oldShards {
 770			paths, err := zoekt.IndexFilePaths(name)
 771			if err != nil {
 772				b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
 773			}
 774			for _, p := range paths {
 775				toDelete[p] = struct{}{}
 776			}
 777		}
 778	}
 779
 780	for tmp, final := range artifactPaths {
 781		if err := os.Rename(tmp, final); err != nil {
 782			b.buildError = err
 783			continue
 784		}
 785
 786		delete(toDelete, final)
 787	}
 788
 789	b.finishedShards = map[string]string{}
 790
 791	for p := range toDelete {
 792		// Don't delete compound shards, set tombstones instead.
 793		if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") {
 794			if !strings.HasSuffix(p, ".zoekt") {
 795				continue
 796			}
 797			err := zoekt.SetTombstone(p, b.opts.RepositoryDescription.ID)
 798			b.buildError = err
 799			continue
 800		}
 801		log.Printf("removing old shard file: %s", p)
 802		if err := os.Remove(p); err != nil {
 803			b.buildError = err
 804		}
 805	}
 806
 807	return b.buildError
 808}
 809
 810// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
 811// iff both slices specify the same set of branch names in the same order.
 812func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
 813	if len(a) != len(b) {
 814		return false
 815	}
 816
 817	for i := range a {
 818		x, y := a[i], b[i]
 819		if x.Name != y.Name {
 820			return false
 821		}
 822	}
 823
 824	return true
 825}
 826
// flush turns the currently buffered documents into a shard and resets the
// buffer.
//
// With Parallelism > 1 the shard is built in a new goroutine and flush
// returns nil without waiting for it; a failure is recorded in b.buildError.
// Otherwise the shard is built synchronously and the build error is
// returned. If an earlier build already failed, flush returns that error and
// discards the buffered documents.
func (b *Builder) flush() error {
	// Take ownership of the buffer and reset the bookkeeping up front, so the
	// builder starts collecting for the next shard regardless of the outcome.
	todo := b.todo
	b.todo = nil
	b.size = 0
	b.errMu.Lock()
	defer b.errMu.Unlock()
	if b.buildError != nil {
		return b.buildError
	}

	// An empty buffer still produces a shard when no shard number has been
	// handed out yet (nextShardNum == 0).
	hasShard := b.nextShardNum > 0
	if len(todo) == 0 && hasShard {
		return nil
	}

	shard := b.nextShardNum
	b.nextShardNum++

	if b.opts.Parallelism > 1 {
		b.building.Add(1)
		// Blocks while Parallelism builds are in flight. errMu is held here,
		// but running builds receive from throttle before they try to take
		// errMu, so they can still free a slot.
		b.throttle <- 1
		go func() {
			done, err := b.buildShard(todo, shard)
			<-b.throttle

			b.errMu.Lock()
			defer b.errMu.Unlock()
			// Only the first build error is kept.
			if err != nil && b.buildError == nil {
				b.buildError = err
			}
			if err == nil {
				b.finishedShards[done.temp] = done.final
			}
			b.building.Done()
		}()
	} else {
		// No goroutines when we're not parallel. This
		// simplifies memory profiling.
		done, err := b.buildShard(todo, shard)
		b.buildError = err
		if err == nil {
			b.finishedShards[done.temp] = done.final
		}

		return b.buildError
	}

	return nil
}
 876
 877// map [0,inf) to [0,1) monotonically
 878func squashRange(j int) float64 {
 879	x := float64(j)
 880	return x / (1 + x)
 881}
 882
 883// IsLowPriority takes a file name and makes an educated guess about its priority
 884// in search results. A file is considered low priority if it looks like a test,
 885// vendored, or generated file.
 886//
 887// These 'priority' criteria affects how documents are ordered within a shard. It's
 888// also used to help guess a file's rank when we're missing ranking information.
 889func IsLowPriority(path string, content []byte) bool {
 890	return enry.IsTest(path) || enry.IsVendor(path) || enry.IsGenerated(path, content)
 891}
 892
// rankedDoc pairs a document with the score vector computed for it by rank,
// so that sortDocuments can order documents by that vector.
type rankedDoc struct {
	*zoekt.Document
	rank []float64
}
 897
 898// rank returns a vector of scores which is used at index-time to sort documents
 899// before writing them to disk. The order of documents in the shard is important
 900// at query time, because earlier documents receive a boost at query time and
 901// have a higher chance of being searched before limits kick in.
 902func rank(d *zoekt.Document, origIdx int) []float64 {
 903	skipped := 0.0
 904	if d.SkipReason != "" {
 905		skipped = 1.0
 906	}
 907
 908	generated := 0.0
 909	if enry.IsGenerated(d.Name, d.Content) {
 910		generated = 1.0
 911	}
 912
 913	vendor := 0.0
 914	if enry.IsVendor(d.Name) {
 915		vendor = 1.0
 916	}
 917
 918	test := 0.0
 919	if enry.IsTest(d.Name) {
 920		test = 1.0
 921	}
 922
 923	// Smaller is earlier (=better).
 924	return []float64{
 925		// Always place skipped docs last
 926		skipped,
 927
 928		// Prefer docs that are not generated
 929		generated,
 930
 931		// Prefer docs that are not vendored
 932		vendor,
 933
 934		// Prefer docs that are not tests
 935		test,
 936
 937		// With short names
 938		squashRange(len(d.Name)),
 939
 940		// With many symbols
 941		1.0 - squashRange(len(d.Symbols)),
 942
 943		// With short content
 944		squashRange(len(d.Content)),
 945
 946		// That is present is as many branches as possible
 947		1.0 - squashRange(len(d.Branches)),
 948
 949		// Preserve original ordering.
 950		squashRange(origIdx),
 951	}
 952}
 953
 954func sortDocuments(todo []*zoekt.Document) {
 955	rs := make([]rankedDoc, 0, len(todo))
 956	for i, t := range todo {
 957		rd := rankedDoc{t, rank(t, i)}
 958		rs = append(rs, rd)
 959	}
 960	sort.Slice(rs, func(i, j int) bool {
 961		r1 := rs[i].rank
 962		r2 := rs[j].rank
 963		for i := range r1 {
 964			if r1[i] < r2[i] {
 965				return true
 966			}
 967			if r1[i] > r2[i] {
 968				return false
 969			}
 970		}
 971
 972		return false
 973	})
 974	for i := range todo {
 975		todo[i] = rs[i].Document
 976	}
 977}
 978
 979func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) {
 980	if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
 981		err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins)
 982		if b.opts.CTagsMustSucceed && err != nil {
 983			return nil, err
 984		}
 985		if err != nil {
 986			log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err)
 987		}
 988	}
 989
 990	name := b.opts.shardName(nextShardNum)
 991
 992	shardBuilder, err := b.newShardBuilder()
 993	if err != nil {
 994		return nil, err
 995	}
 996
 997	sortDocuments(todo)
 998
 999	for idx, t := range todo {
1000		if err := shardBuilder.Add(*t); err != nil {
1001			return nil, err
1002		}
1003
1004		if idx%10_000 == 0 {
1005			b.CheckMemoryUsage()
1006		}
1007	}
1008
1009	return b.writeShard(name, shardBuilder)
1010}
1011
1012// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the
1013// configured threshold. NOTE: this method is expensive and should only be used for debugging.
1014func (b *Builder) CheckMemoryUsage() {
1015	// Don't check memory if heap profiling is disabled, or we've already written 10 profiles
1016	if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 {
1017		return
1018	}
1019
1020	var m runtime.MemStats
1021	runtime.ReadMemStats(&m)
1022
1023	if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() {
1024		defer b.heapProfileMu.Unlock()
1025
1026		log.Printf("writing memory profile, heap usage: %s", humanize.Bytes(m.HeapAlloc))
1027		name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum))
1028		f, err := os.Create(name)
1029		if err != nil {
1030			log.Printf("failed to create memory profile file: %v", err)
1031			return
1032		}
1033
1034		err = pprof.WriteHeapProfile(f)
1035		if err != nil {
1036			log.Printf("failed to write memory profile: %v", err)
1037		}
1038
1039		b.heapProfileNum++
1040	}
1041}
1042
1043func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) {
1044	desc := b.opts.RepositoryDescription
1045	desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1046	desc.SubRepoMap = b.opts.SubRepositories
1047	desc.IndexOptions = b.opts.GetHash()
1048
1049	shardBuilder, err := zoekt.NewIndexBuilder(&desc)
1050	if err != nil {
1051		return nil, err
1052	}
1053	shardBuilder.IndexTime = b.indexTime
1054	shardBuilder.ID = b.id
1055	return shardBuilder, nil
1056}
1057
1058func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) {
1059	dir := filepath.Dir(fn)
1060	if err := os.MkdirAll(dir, 0o700); err != nil {
1061		return nil, err
1062	}
1063
1064	f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1065	if err != nil {
1066		return nil, err
1067	}
1068	if runtime.GOOS != "windows" {
1069		if err := f.Chmod(0o666 &^ umask); err != nil {
1070			return nil, err
1071		}
1072	}
1073
1074	defer f.Close()
1075	if err := ib.Write(f); err != nil {
1076		return nil, err
1077	}
1078	fi, err := f.Stat()
1079	if err != nil {
1080		return nil, err
1081	}
1082	if err := f.Close(); err != nil {
1083		return nil, err
1084	}
1085
1086	log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n",
1087		fn,
1088		fi.Size(),
1089		float64(fi.Size())/float64(ib.ContentSize()+1),
1090		ib.NumFiles())
1091
1092	return &finishedShard{f.Name(), fn}, nil
1093}
1094
1095type deltaBranchSetError struct {
1096	shardName string
1097	old, new  []zoekt.RepositoryBranch
1098}
1099
1100func (e deltaBranchSetError) Error() string {
1101	return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1102}
1103
1104type deltaIndexOptionsMismatchError struct {
1105	shardName  string
1106	newOptions HashOptions
1107}
1108
1109func (e *deltaIndexOptionsMismatchError) Error() string {
1110	return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1111}
1112
// umask holds the Umask of the current process. It is used to mask the
// permission bits of newly written shard files.
// NOTE(review): no assignment is visible in this file, so the value is the
// zero FileMode unless set elsewhere - presumably by platform-specific
// initialization code; confirm.
var umask os.FileMode
Configure Feed

Configure Feed