index/builder.go at 045b0e0b3cb2cdc6c66b941a17483007ed44f0a8 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / builder.go
at 045b0e0b3cb2cdc6c66b941a17483007ed44f0a8 29 kB View raw
Keegan Carruthers-Smith index: remove shard_prefix option (#947) 1y ago
5aa52060
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15// Package index contains logic for building Zoekt indexes. NOTE: this package is not considered
  16// part of the public API, and it is not recommended to rely on it in external code.
  17package index
  18
  19import (
  20	"crypto/sha1"
  21	"flag"
  22	"fmt"
  23	"log"
  24	"net/url"
  25	"os"
  26	"os/exec"
  27	"path"
  28	"path/filepath"
  29	"reflect"
  30	"runtime"
  31	"runtime/pprof"
  32	"sort"
  33	"strconv"
  34	"strings"
  35	"sync"
  36	"time"
  37
  38	"github.com/bmatcuk/doublestar"
  39	"github.com/dustin/go-humanize"
  40	"github.com/go-enry/go-enry/v2"
  41	"github.com/rs/xid"
  42	"golang.org/x/sys/unix"
  43
  44	"maps"
  45
  46	"github.com/sourcegraph/zoekt"
  47	"github.com/sourcegraph/zoekt/internal/ctags"
  48	"github.com/sourcegraph/zoekt/internal/tenant"
  49)
  50
  51var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
  52
  53// Branch describes a single branch version.
  54type Branch struct {
  55	Name    string
  56	Version string
  57}
  58
  59// Options sets options for the index building.
  60type Options struct {
  61	// IndexDir is a directory that holds *.zoekt index files.
  62	IndexDir string
  63
  64	// SizeMax is the maximum file size
  65	SizeMax int
  66
  67	// Parallelism is the maximum number of shards to index in parallel
  68	Parallelism int
  69
  70	// ShardMax sets the maximum corpus size for a single shard
  71	ShardMax int
  72
  73	// TrigramMax sets the maximum number of distinct trigrams per document.
  74	TrigramMax int
  75
  76	// RepositoryDescription holds names and URLs for the repository.
  77	RepositoryDescription zoekt.Repository
  78
  79	// SubRepositories is a path => sub repository map.
  80	SubRepositories map[string]*zoekt.Repository
  81
  82	// DisableCTags disables the generation of ctags metadata.
  83	DisableCTags bool
  84
  85	// CtagsPath is the path to the ctags binary to run, or empty
  86	// if a valid binary couldn't be found.
  87	CTagsPath string
  88
  89	// Same as CTagsPath but for scip-ctags
  90	ScipCTagsPath string
  91
  92	// If set, ctags must succeed.
  93	CTagsMustSucceed bool
  94
  95	// LargeFiles is a slice of glob patterns, including ** for any number
  96	// of directories, where matching file paths should be indexed
  97	// regardless of their size. The full pattern syntax is here:
  98	// https://github.com/bmatcuk/doublestar/tree/v1#patterns.
  99	LargeFiles []string
 100
 101	// IsDelta is true if this run contains only the changed documents since the
 102	// last run.
 103	IsDelta bool
 104
 105	// changedOrRemovedFiles is a list of file paths that have been changed or removed
 106	// since the last indexing job for this repository. These files will be tombstoned
 107	// in the older shards for this repository.
 108	changedOrRemovedFiles []string
 109
 110	LanguageMap ctags.LanguageMap
 111
 112	// ShardMerging is true if builder should respect compound shards. This is a
 113	// Sourcegraph specific option.
 114	ShardMerging bool
 115
 116	// HeapProfileTriggerBytes is the heap allocation in bytes that will trigger a memory profile. If 0, no memory profile
 117	// will be triggered. Note this trigger looks at total heap allocation (which includes both inuse and garbage objects).
 118	//
 119	// Profiles will be written to files named `index-memory.prof.n` in the index directory. No more than 10 files are written.
 120	//
 121	// Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile.
 122	HeapProfileTriggerBytes uint64
 123
 124	// TenantID is the ID of the tenant this shard belongs to.
 125	TenantID int
 126
 127	// RepoID is the ID of the repository this shard belongs to.
 128	RepoID uint32
 129}
 130
 131// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building.
 132type HashOptions struct {
 133	sizeMax          int
 134	disableCTags     bool
 135	ctagsPath        string
 136	cTagsMustSucceed bool
 137	largeFiles       []string
 138}
 139
 140func (o *Options) HashOptions() HashOptions {
 141	return HashOptions{
 142		sizeMax:          o.SizeMax,
 143		disableCTags:     o.DisableCTags,
 144		ctagsPath:        o.CTagsPath,
 145		cTagsMustSucceed: o.CTagsMustSucceed,
 146		largeFiles:       o.LargeFiles,
 147	}
 148}
 149
 150func (o *Options) GetHash() string {
 151	h := o.HashOptions()
 152	hasher := sha1.New()
 153
 154	hasher.Write([]byte(h.ctagsPath))
 155	hasher.Write(fmt.Appendf(nil, "%t", h.cTagsMustSucceed))
 156	hasher.Write(fmt.Appendf(nil, "%d", h.sizeMax))
 157	hasher.Write(fmt.Appendf(nil, "%q", h.largeFiles))
 158	hasher.Write(fmt.Appendf(nil, "%t", h.disableCTags))
 159
 160	return fmt.Sprintf("%x", hasher.Sum(nil))
 161}
 162
 163type largeFilesFlag struct{ *Options }
 164
 165func (f largeFilesFlag) String() string {
 166	// From flag.Value documentation:
 167	//
 168	// The flag package may call the String method with a zero-valued receiver,
 169	// such as a nil pointer.
 170	if f.Options == nil {
 171		return ""
 172	}
 173	s := append([]string{""}, f.LargeFiles...)
 174	return strings.Join(s, "-large_file ")
 175}
 176
 177func (f largeFilesFlag) Set(value string) error {
 178	f.LargeFiles = append(f.LargeFiles, value)
 179	return nil
 180}
 181
 182// Flags adds flags for build options to fs. It is the "inverse" of Args.
 183func (o *Options) Flags(fs *flag.FlagSet) {
 184	x := *o
 185	x.SetDefaults()
 186	fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
 187	fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
 188	fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
 189	fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
 190	fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
 191	fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
 192	fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
 193
 194	// Sourcegraph specific
 195	fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
 196	fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.")
 197}
 198
 199// Args generates command line arguments for o. It is the "inverse" of Flags.
 200func (o *Options) Args() []string {
 201	var args []string
 202
 203	if o.SizeMax != 0 {
 204		args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
 205	}
 206
 207	if o.TrigramMax != 0 {
 208		args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
 209	}
 210
 211	if o.ShardMax != 0 {
 212		args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
 213	}
 214
 215	if o.Parallelism != 0 {
 216		args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
 217	}
 218
 219	if o.IndexDir != "" {
 220		args = append(args, "-index", o.IndexDir)
 221	}
 222
 223	if o.CTagsMustSucceed {
 224		args = append(args, "-require_ctags")
 225	}
 226
 227	for _, a := range o.LargeFiles {
 228		args = append(args, "-large_file", a)
 229	}
 230
 231	// Sourcegraph specific
 232	if o.DisableCTags {
 233		args = append(args, "-disable_ctags")
 234	}
 235
 236	if o.ShardMerging {
 237		args = append(args, "-shard_merging")
 238	}
 239
 240	return args
 241}
 242
 243// Builder manages (parallel) creation of uniformly sized shards. The
 244// builder buffers up documents until it collects enough documents and
 245// then builds a shard and writes.
 246type Builder struct {
 247	opts     Options
 248	throttle chan int
 249
 250	nextShardNum int
 251	todo         []*Document
 252	docChecker   DocChecker
 253	size         int
 254
 255	parserBins ctags.ParserBinMap
 256	building   sync.WaitGroup
 257
 258	errMu      sync.Mutex
 259	buildError error
 260
 261	// temp name => final name for finished shards. We only rename
 262	// them once all shards succeed to avoid Frankstein corpuses.
 263	finishedShards map[string]string
 264
 265	// indexTime is set by tests for doing reproducible builds.
 266	indexTime time.Time
 267
 268	// heapProfileMu is used to ensure that only one memory profile is written at a time
 269	heapProfileMu  sync.Mutex
 270	heapProfileNum int
 271
 272	// a sortable 20 chars long id.
 273	id string
 274
 275	finishCalled bool
 276}
 277
 278type finishedShard struct {
 279	temp, final string
 280}
 281
 282func checkCTags() string {
 283	if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" {
 284		return ctags
 285	}
 286
 287	if ctags, err := exec.LookPath("universal-ctags"); err == nil {
 288		return ctags
 289	}
 290
 291	return ""
 292}
 293
 294func checkScipCTags() string {
 295	if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" {
 296		return ctags
 297	}
 298
 299	if ctags, err := exec.LookPath("scip-ctags"); err == nil {
 300		return ctags
 301	}
 302
 303	return ""
 304}
 305
 306// SetDefaults sets reasonable default options.
 307func (o *Options) SetDefaults() {
 308	if o.CTagsPath == "" && !o.DisableCTags {
 309		o.CTagsPath = checkCTags()
 310	}
 311
 312	if o.ScipCTagsPath == "" && !o.DisableCTags {
 313		o.ScipCTagsPath = checkScipCTags()
 314	}
 315
 316	if o.Parallelism == 0 {
 317		o.Parallelism = 4
 318	}
 319	if o.SizeMax == 0 {
 320		o.SizeMax = 2 << 20
 321	}
 322	if o.ShardMax == 0 {
 323		o.ShardMax = 100 << 20
 324	}
 325	if o.TrigramMax == 0 {
 326		o.TrigramMax = 20000
 327	}
 328
 329	if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
 330		parsed, _ := url.Parse(o.RepositoryDescription.URL)
 331		if parsed != nil {
 332			o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
 333		}
 334	}
 335}
 336
 337// ShardName returns the name the given index shard.
 338func (o *Options) shardName(n int) string {
 339	return o.shardNameVersion(IndexFormatVersion, n)
 340}
 341
 342func (o *Options) shardNameVersion(version, n int) string {
 343	var prefix string
 344
 345	// If tenant enforcement is enabled and we have tenant/repo IDs, use those to generate the prefix
 346	if o.TenantID != 0 && o.RepoID != 0 && tenant.EnforceTenant() {
 347		prefix = tenant.SrcPrefix(o.TenantID, o.RepoID)
 348	} else {
 349		prefix = o.RepositoryDescription.Name
 350	}
 351
 352	return ShardName(o.IndexDir, prefix, version, n)
 353}
 354
 355type IndexState string
 356
 357const (
 358	IndexStateMissing IndexState = "missing"
 359	IndexStateCorrupt IndexState = "corrupt"
 360	IndexStateVersion IndexState = "version-mismatch"
 361	IndexStateOption  IndexState = "option-mismatch"
 362	IndexStateMeta    IndexState = "meta-mismatch"
 363	IndexStateContent IndexState = "content-mismatch"
 364	IndexStateEqual   IndexState = "equal"
 365)
 366
 367var readVersions = []struct {
 368	IndexFormatVersion int
 369	FeatureVersion     int
 370}{{
 371	IndexFormatVersion: IndexFormatVersion,
 372	FeatureVersion:     FeatureVersion,
 373}, {
 374	IndexFormatVersion: NextIndexFormatVersion,
 375	FeatureVersion:     FeatureVersion,
 376}}
 377
 378// IncrementalSkipIndexing returns true if the index present on disk matches
 379// the build options.
 380func (o *Options) IncrementalSkipIndexing() bool {
 381	state, _ := o.IndexState()
 382	return state == IndexStateEqual
 383}
 384
 385// IndexState checks how the index present on disk compares to the build
 386// options and returns the IndexState and the name of the first shard.
 387func (o *Options) IndexState() (IndexState, string) {
 388	// Open the latest version we support that is on disk.
 389	fn := o.findShard()
 390	if fn == "" {
 391		return IndexStateMissing, fn
 392	}
 393
 394	repos, index, err := ReadMetadataPathAlive(fn)
 395	if os.IsNotExist(err) {
 396		return IndexStateMissing, fn
 397	} else if err != nil {
 398		return IndexStateCorrupt, fn
 399	}
 400
 401	for _, v := range readVersions {
 402		if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
 403			return IndexStateVersion, fn
 404		}
 405	}
 406
 407	var repo *zoekt.Repository
 408	for _, cand := range repos {
 409		if cand.Name == o.RepositoryDescription.Name {
 410			repo = cand
 411			break
 412		}
 413	}
 414
 415	if repo == nil {
 416		return IndexStateCorrupt, fn
 417	}
 418
 419	if repo.IndexOptions != o.GetHash() {
 420		return IndexStateOption, fn
 421	}
 422
 423	if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
 424		return IndexStateContent, fn
 425	}
 426
 427	// We can mutate repo since it lives in the scope of this function call.
 428	if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
 429		// non-nil err means we are trying to update an immutable field =>
 430		// reindex content.
 431		log.Printf("warn: immutable field changed, requires re-index: %s", err)
 432		return IndexStateContent, fn
 433	} else if updated {
 434		return IndexStateMeta, fn
 435	}
 436
 437	return IndexStateEqual, fn
 438}
 439
 440// FindRepositoryMetadata returns the index metadata for the repository
 441// specified in the options. 'ok' is false if the repository's metadata
 442// couldn't be found or if an error occurred.
 443func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) {
 444	shard := o.findShard()
 445	if shard == "" {
 446		return nil, nil, false, nil
 447	}
 448
 449	repositories, metadata, err := ReadMetadataPathAlive(shard)
 450	if err != nil {
 451		return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
 452	}
 453
 454	ID := o.RepositoryDescription.ID
 455	for _, r := range repositories {
 456		// compound shards contain multiple repositories, so we
 457		// have to pick only the one we're looking for
 458		if r.ID == ID {
 459			return r, metadata, true, nil
 460		}
 461	}
 462
 463	// If we're here, then we're somehow in a state where we found a matching
 464	// shard that's missing the repository metadata we're looking for. This
 465	// should never happen.
 466	name := o.RepositoryDescription.Name
 467	return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
 468}
 469
 470func (o *Options) findShard() string {
 471	for _, v := range readVersions {
 472		fn := o.shardNameVersion(v.IndexFormatVersion, 0)
 473		if _, err := os.Stat(fn); err == nil {
 474			return fn
 475		}
 476	}
 477
 478	// Brute force finding the shard in compound shards. We should only hit this
 479	// code path for repositories that don't exist yet or are in compound shards.
 480	return o.findCompoundShard()
 481}
 482
 483func (o *Options) findCompoundShard() string {
 484	compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
 485	if err != nil {
 486		return ""
 487	}
 488	for _, fn := range compoundShards {
 489		if containsRepo(fn, o.RepositoryDescription.ID) {
 490			return fn
 491		}
 492	}
 493
 494	return ""
 495}
 496
 497func (o *Options) FindAllShards() []string {
 498	for _, v := range readVersions {
 499		fn := o.shardNameVersion(v.IndexFormatVersion, 0)
 500		if _, err := os.Stat(fn); err == nil {
 501			shards := []string{fn}
 502			for i := 1; ; i++ {
 503				fn := o.shardNameVersion(v.IndexFormatVersion, i)
 504				if _, err := os.Stat(fn); err != nil {
 505					return shards
 506				}
 507				shards = append(shards, fn)
 508			}
 509		}
 510	}
 511
 512	// lazily fallback to findShard which will look for a compound shard.
 513	if fn := o.findShard(); fn != "" {
 514		return []string{fn}
 515	}
 516
 517	return nil
 518}
 519
 520// IgnoreSizeMax determines whether the max size should be ignored.
 521func (o *Options) IgnoreSizeMax(name string) bool {
 522	// A pattern match will override preceding pattern matches.
 523	for i := len(o.LargeFiles) - 1; i >= 0; i-- {
 524		pattern := strings.TrimSpace(o.LargeFiles[i])
 525		negated, validatedPattern := checkIsNegatePattern(pattern)
 526
 527		if m, _ := doublestar.PathMatch(validatedPattern, name); m {
 528			if negated {
 529				return false
 530			} else {
 531				return true
 532			}
 533		}
 534	}
 535
 536	return false
 537}
 538
 539func checkIsNegatePattern(pattern string) (bool, string) {
 540	negate := "!"
 541
 542	// if negated then strip prefix meta character which identifies negated filter pattern
 543	if strings.HasPrefix(pattern, negate) {
 544		return true, pattern[len(negate):]
 545	}
 546
 547	return false, pattern
 548}
 549
 550// NewBuilder creates a new Builder instance.
 551func NewBuilder(opts Options) (*Builder, error) {
 552	opts.SetDefaults()
 553	if opts.RepositoryDescription.Name == "" {
 554		return nil, fmt.Errorf("builder: must set Name")
 555	}
 556
 557	b := &Builder{
 558		opts:           opts,
 559		throttle:       make(chan int, opts.Parallelism),
 560		finishedShards: map[string]string{},
 561	}
 562
 563	parserBins, err := ctags.NewParserBinMap(
 564		b.opts.CTagsPath,
 565		b.opts.ScipCTagsPath,
 566		opts.LanguageMap,
 567		b.opts.CTagsMustSucceed,
 568	)
 569	if err != nil {
 570		return nil, err
 571	}
 572
 573	b.parserBins = parserBins
 574
 575	if opts.IsDelta {
 576		// Delta shards build on top of previously existing shards.
 577		// As a consequence, the shardNum for delta shards starts from
 578		// the number following the most recently generated shard - not 0.
 579		//
 580		// Using this numbering scheme allows all the shards to be
 581		// discovered as a set.
 582		shards := b.opts.FindAllShards()
 583		b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
 584	}
 585
 586	if _, err := b.newShardBuilder(); err != nil {
 587		return nil, err
 588	}
 589
 590	now := time.Now()
 591	b.indexTime = now
 592	b.id = xid.NewWithTime(now).String()
 593
 594	return b, nil
 595}
 596
 597// AddFile is a convenience wrapper for the Add method
 598func (b *Builder) AddFile(name string, content []byte) error {
 599	return b.Add(Document{Name: name, Content: content})
 600}
 601
 602func (b *Builder) Add(doc Document) error {
 603	if b.finishCalled {
 604		return nil
 605	}
 606
 607	allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
 608	if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
 609		// We could pass the document on to the shardbuilder, but if
 610		// we pass through a part of the source tree with binary/large
 611		// files, the corresponding shard would be mostly empty, so
 612		// insert a reason here too.
 613		doc.SkipReason = SkipReasonTooLarge
 614	} else if skip := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); skip != SkipReasonNone {
 615		doc.SkipReason = skip
 616	}
 617
 618	b.todo = append(b.todo, &doc)
 619
 620	if doc.SkipReason == SkipReasonNone {
 621		b.size += len(doc.Name) + len(doc.Content)
 622	} else {
 623		b.size += len(doc.Name)
 624		// Drop the content if we are skipping the document. Skipped content is not counted towards the
 625		// shard size limit, so otherwise we might buffer too much data in memory before flushing.
 626		doc.Content = nil
 627	}
 628
 629	if b.size > b.opts.ShardMax {
 630		return b.flush()
 631	}
 632
 633	return nil
 634}
 635
 636// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
 637// has been changed or removed since the last indexing job for this repository.
 638//
 639// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
 640func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
 641	b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
 642}
 643
 644// Finish creates a last shard from the buffered documents, and clears
 645// stale shards from previous runs. This should always be called, also
 646// in failure cases, to ensure cleanup.
 647//
 648// It is safe to call Finish() multiple times.
 649func (b *Builder) Finish() error {
 650	if b.finishCalled {
 651		return b.buildError
 652	}
 653
 654	b.finishCalled = true
 655
 656	b.flush()
 657	b.building.Wait()
 658
 659	if b.buildError != nil {
 660		for tmp := range b.finishedShards {
 661			log.Printf("Builder.Finish %s", tmp)
 662			os.Remove(tmp)
 663		}
 664		b.finishedShards = map[string]string{}
 665		return b.buildError
 666	}
 667
 668	// map of temporary -> final names for all updated shards + shard metadata files
 669	artifactPaths := make(map[string]string)
 670	maps.Copy(artifactPaths, b.finishedShards)
 671
 672	oldShards := b.opts.FindAllShards()
 673
 674	if b.opts.IsDelta {
 675		// Delta shard builds need to update FileTombstone and branch commit information for all
 676		// existing shards
 677		for _, shard := range oldShards {
 678			repositories, _, err := ReadMetadataPathAlive(shard)
 679			if err != nil {
 680				return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
 681			}
 682
 683			if len(repositories) > 1 {
 684				return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
 685			}
 686
 687			if len(repositories) == 0 {
 688				return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
 689			}
 690
 691			repository := repositories[0]
 692			if repository.ID != b.opts.RepositoryDescription.ID {
 693				return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
 694			}
 695
 696			if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
 697				repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
 698			}
 699
 700			for _, f := range b.opts.changedOrRemovedFiles {
 701				repository.FileTombstones[f] = struct{}{}
 702			}
 703
 704			if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
 705				return deltaBranchSetError{
 706					shardName: shard,
 707					old:       repository.Branches,
 708					new:       b.opts.RepositoryDescription.Branches,
 709				}
 710			}
 711
 712			if b.opts.GetHash() != repository.IndexOptions {
 713				return &deltaIndexOptionsMismatchError{
 714					shardName:  shard,
 715					newOptions: b.opts.HashOptions(),
 716				}
 717			}
 718
 719			repository.Branches = b.opts.RepositoryDescription.Branches
 720
 721			repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
 722
 723			tempPath, finalPath, err := JsonMarshalRepoMetaTemp(shard, repository)
 724			if err != nil {
 725				return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
 726			}
 727
 728			artifactPaths[tempPath] = finalPath
 729		}
 730	}
 731
 732	// We mark finished shards as empty when we successfully finish. Return now
 733	// to allow call sites to call Finish idempotently.
 734	if len(artifactPaths) == 0 {
 735		return b.buildError
 736	}
 737
 738	// Collect a map of the old shards on disk. For each new shard we replace we
 739	// delete it from toDelete. Anything remaining in toDelete will be removed
 740	// after we have renamed everything into place.
 741
 742	var toDelete map[string]struct{}
 743	if !b.opts.IsDelta {
 744		// Non-delta shard builds delete all existing shards before they write out
 745		// new ones.
 746		// By contrast, delta shard builds work by stacking changes on top of existing shards.
 747		// So, we skip populating the toDelete map if we're building delta shards.
 748
 749		toDelete = make(map[string]struct{})
 750		for _, name := range oldShards {
 751			paths, err := IndexFilePaths(name)
 752			if err != nil {
 753				b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
 754			}
 755			for _, p := range paths {
 756				toDelete[p] = struct{}{}
 757			}
 758		}
 759	}
 760
 761	for tmp, final := range artifactPaths {
 762		if err := os.Rename(tmp, final); err != nil {
 763			b.buildError = err
 764			continue
 765		}
 766
 767		delete(toDelete, final)
 768	}
 769
 770	b.finishedShards = map[string]string{}
 771
 772	for p := range toDelete {
 773		// Don't delete compound shards, set tombstones instead.
 774		if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") {
 775			if !strings.HasSuffix(p, ".zoekt") {
 776				continue
 777			}
 778			err := SetTombstone(p, b.opts.RepositoryDescription.ID)
 779			b.buildError = err
 780			continue
 781		}
 782		log.Printf("removing old shard file: %s", p)
 783		if err := os.Remove(p); err != nil {
 784			b.buildError = err
 785		}
 786	}
 787
 788	return b.buildError
 789}
 790
 791// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
 792// iff both slices specify the same set of branch names in the same order.
 793func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
 794	if len(a) != len(b) {
 795		return false
 796	}
 797
 798	for i := range a {
 799		x, y := a[i], b[i]
 800		if x.Name != y.Name {
 801			return false
 802		}
 803	}
 804
 805	return true
 806}
 807
 808func (b *Builder) flush() error {
 809	todo := b.todo
 810	b.todo = nil
 811	b.size = 0
 812	b.errMu.Lock()
 813	defer b.errMu.Unlock()
 814	if b.buildError != nil {
 815		return b.buildError
 816	}
 817
 818	hasShard := b.nextShardNum > 0
 819	if len(todo) == 0 && hasShard {
 820		return nil
 821	}
 822
 823	shard := b.nextShardNum
 824	b.nextShardNum++
 825
 826	if b.opts.Parallelism > 1 {
 827		b.building.Add(1)
 828		b.throttle <- 1
 829		go func() {
 830			done, err := b.buildShard(todo, shard)
 831			<-b.throttle
 832
 833			b.errMu.Lock()
 834			defer b.errMu.Unlock()
 835			if err != nil && b.buildError == nil {
 836				b.buildError = err
 837			}
 838			if err == nil {
 839				b.finishedShards[done.temp] = done.final
 840			}
 841			b.building.Done()
 842		}()
 843	} else {
 844		// No goroutines when we're not parallel. This
 845		// simplifies memory profiling.
 846		done, err := b.buildShard(todo, shard)
 847		b.buildError = err
 848		if err == nil {
 849			b.finishedShards[done.temp] = done.final
 850		}
 851
 852		return b.buildError
 853	}
 854
 855	return nil
 856}
 857
 858// map [0,inf) to [0,1) monotonically
 859func squashRange(j int) float64 {
 860	x := float64(j)
 861	return x / (1 + x)
 862}
 863
 864type rankedDoc struct {
 865	*Document
 866	rank []float64
 867}
 868
 869// rank returns a vector of scores which is used at index-time to sort documents
 870// before writing them to disk. The order of documents in the shard is important
 871// at query time, because earlier documents receive a boost at query time and
 872// have a higher chance of being searched before limits kick in.
 873func rank(d *Document, origIdx int) []float64 {
 874	skipped := 0.0
 875	if d.SkipReason != SkipReasonNone {
 876		skipped = 1.0
 877	}
 878
 879	generated := 0.0
 880	if enry.IsGenerated(d.Name, d.Content) {
 881		generated = 1.0
 882	}
 883
 884	vendor := 0.0
 885	if enry.IsVendor(d.Name) {
 886		vendor = 1.0
 887	}
 888
 889	test := 0.0
 890	if enry.IsTest(d.Name) {
 891		test = 1.0
 892	}
 893
 894	// Smaller is earlier (=better).
 895	return []float64{
 896		// Always place skipped docs last
 897		skipped,
 898
 899		// Prefer docs that are not generated
 900		generated,
 901
 902		// Prefer docs that are not vendored
 903		vendor,
 904
 905		// Prefer docs that are not tests
 906		test,
 907
 908		// With short names
 909		squashRange(len(d.Name)),
 910
 911		// With many symbols
 912		1.0 - squashRange(len(d.Symbols)),
 913
 914		// With short content
 915		squashRange(len(d.Content)),
 916
 917		// That is present is as many branches as possible
 918		1.0 - squashRange(len(d.Branches)),
 919
 920		// Preserve original ordering.
 921		squashRange(origIdx),
 922	}
 923}
 924
 925func sortDocuments(todo []*Document) {
 926	rs := make([]rankedDoc, 0, len(todo))
 927	for i, t := range todo {
 928		rd := rankedDoc{t, rank(t, i)}
 929		rs = append(rs, rd)
 930	}
 931	sort.Slice(rs, func(i, j int) bool {
 932		r1 := rs[i].rank
 933		r2 := rs[j].rank
 934		for i := range r1 {
 935			if r1[i] < r2[i] {
 936				return true
 937			}
 938			if r1[i] > r2[i] {
 939				return false
 940			}
 941		}
 942
 943		return false
 944	})
 945	for i := range todo {
 946		todo[i] = rs[i].Document
 947	}
 948}
 949
 950func (b *Builder) buildShard(todo []*Document, nextShardNum int) (*finishedShard, error) {
 951	if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
 952		err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins)
 953		if b.opts.CTagsMustSucceed && err != nil {
 954			return nil, err
 955		}
 956		if err != nil {
 957			log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err)
 958		}
 959	}
 960
 961	name := b.opts.shardName(nextShardNum)
 962
 963	shardBuilder, err := b.newShardBuilder()
 964	if err != nil {
 965		return nil, err
 966	}
 967
 968	sortDocuments(todo)
 969
 970	for idx, t := range todo {
 971		if err := shardBuilder.Add(*t); err != nil {
 972			return nil, err
 973		}
 974
 975		if idx%10_000 == 0 {
 976			b.CheckMemoryUsage()
 977		}
 978	}
 979
 980	return b.writeShard(name, shardBuilder)
 981}
 982
 983// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the
 984// configured threshold. NOTE: this method is expensive and should only be used for debugging.
 985func (b *Builder) CheckMemoryUsage() {
 986	// Don't check memory if heap profiling is disabled, or we've already written 10 profiles
 987	if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 {
 988		return
 989	}
 990
 991	var m runtime.MemStats
 992	runtime.ReadMemStats(&m)
 993
 994	if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() {
 995		defer b.heapProfileMu.Unlock()
 996
 997		log.Printf("writing memory profile, allocated heap: %s", humanize.Bytes(m.HeapAlloc))
 998		name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum))
 999		f, err := os.Create(name)
1000		if err != nil {
1001			log.Printf("failed to create memory profile file: %v", err)
1002			return
1003		}
1004
1005		err = pprof.WriteHeapProfile(f)
1006		if err != nil {
1007			log.Printf("failed to write memory profile: %v", err)
1008		}
1009
1010		b.heapProfileNum++
1011	}
1012}
1013
1014func (b *Builder) newShardBuilder() (*ShardBuilder, error) {
1015	desc := b.opts.RepositoryDescription
1016	desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1017	desc.SubRepoMap = b.opts.SubRepositories
1018	desc.IndexOptions = b.opts.GetHash()
1019
1020	shardBuilder, err := NewShardBuilder(&desc)
1021	if err != nil {
1022		return nil, err
1023	}
1024	shardBuilder.IndexTime = b.indexTime
1025	shardBuilder.ID = b.id
1026	return shardBuilder, nil
1027}
1028
1029func (b *Builder) writeShard(fn string, ib *ShardBuilder) (*finishedShard, error) {
1030	dir := filepath.Dir(fn)
1031	if err := os.MkdirAll(dir, 0o700); err != nil {
1032		return nil, err
1033	}
1034
1035	f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1036	if err != nil {
1037		return nil, err
1038	}
1039	if runtime.GOOS != "windows" {
1040		if err := f.Chmod(0o666 &^ umask); err != nil {
1041			return nil, err
1042		}
1043	}
1044
1045	defer f.Close()
1046	if err := ib.Write(f); err != nil {
1047		return nil, err
1048	}
1049	fi, err := f.Stat()
1050	if err != nil {
1051		return nil, err
1052	}
1053	if err := f.Close(); err != nil {
1054		return nil, err
1055	}
1056
1057	log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n",
1058		fn,
1059		fi.Size(),
1060		float64(fi.Size())/float64(ib.ContentSize()+1),
1061		ib.NumFiles())
1062
1063	return &finishedShard{f.Name(), fn}, nil
1064}
1065
1066type deltaBranchSetError struct {
1067	shardName string
1068	old, new  []zoekt.RepositoryBranch
1069}
1070
1071func (e deltaBranchSetError) Error() string {
1072	return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1073}
1074
1075type deltaIndexOptionsMismatchError struct {
1076	shardName  string
1077	newOptions HashOptions
1078}
1079
1080func (e *deltaIndexOptionsMismatchError) Error() string {
1081	return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1082}
1083
1084// umask holds the Umask of the current process
1085var umask os.FileMode
1086
1087func init() {
1088	umask = os.FileMode(unix.Umask(0))
1089	unix.Umask(int(umask))
1090}
Configure Feed

Configure Feed