index/builder.go at 1e121443f9f27758c3cb4f887bd4d028d66b0994 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / builder.go
at 1e121443f9f27758c3cb4f887bd4d028d66b0994 30 kB View raw
Ravi Kumar Add ShardPrefixOverride to support custom shard naming (#1005) 3mo ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15// Package index contains logic for building Zoekt indexes. NOTE: this package is not considered
  16// part of the public API, and it is not recommended to rely on it in external code.
  17package index
  18
  19import (
  20	"crypto/sha1"
  21	"flag"
  22	"fmt"
  23	"log"
  24	"net/url"
  25	"os"
  26	"os/exec"
  27	"path"
  28	"path/filepath"
  29	"reflect"
  30	"runtime"
  31	"runtime/pprof"
  32	"sort"
  33	"strconv"
  34	"strings"
  35	"sync"
  36	"time"
  37
  38	"github.com/bmatcuk/doublestar"
  39	"github.com/dustin/go-humanize"
  40	"github.com/go-enry/go-enry/v2"
  41	"github.com/rs/xid"
  42	"golang.org/x/sys/unix"
  43
  44	"maps"
  45
  46	"github.com/sourcegraph/zoekt"
  47	"github.com/sourcegraph/zoekt/internal/ctags"
  48	"github.com/sourcegraph/zoekt/internal/tenant"
  49)
  50
// DefaultDir is the default index directory: ".zoekt" inside the directory
// named by the HOME environment variable. It is computed once, when the
// package is initialized.
var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
  52
// Branch describes a single branch version.
type Branch struct {
	// Name is the name of the branch.
	Name string
	// Version identifies the version of the branch.
	// NOTE(review): presumably a commit hash — confirm against callers.
	Version string
}
  58
// Options sets options for the index building.
type Options struct {
	// IndexDir is a directory that holds *.zoekt index files.
	IndexDir string

	// ShardPrefixOverride sets the prefix used for shard file names. When
	// non-empty it takes precedence over the repository name (or the
	// tenant/repository ID based prefix).
	ShardPrefixOverride string

	// SizeMax is the maximum file size. Larger files are skipped unless they
	// match a pattern in LargeFiles.
	SizeMax int

	// Parallelism is the maximum number of shards to index in parallel
	Parallelism int

	// ShardMax sets the maximum corpus size for a single shard
	ShardMax int

	// TrigramMax sets the maximum number of distinct trigrams per document.
	TrigramMax int

	// RepositoryDescription holds names and URLs for the repository.
	RepositoryDescription zoekt.Repository

	// SubRepositories is a path => sub repository map.
	SubRepositories map[string]*zoekt.Repository

	// DisableCTags disables the generation of ctags metadata.
	DisableCTags bool

	// CTagsPath is the path to the ctags binary to run, or empty
	// if a valid binary couldn't be found.
	CTagsPath string

	// ScipCTagsPath is the same as CTagsPath, but for scip-ctags.
	ScipCTagsPath string

	// If set, ctags must succeed.
	CTagsMustSucceed bool

	// LargeFiles is a slice of glob patterns, including ** for any number
	// of directories, where matching file paths should be indexed
	// regardless of their size. The full pattern syntax is here:
	// https://github.com/bmatcuk/doublestar/tree/v1#patterns.
	//
	// A pattern prefixed with "!" is negated, and later patterns override
	// earlier ones (see IgnoreSizeMax).
	LargeFiles []string

	// IsDelta is true if this run contains only the changed documents since the
	// last run.
	IsDelta bool

	// changedOrRemovedFiles is a list of file paths that have been changed or removed
	// since the last indexing job for this repository. These files will be tombstoned
	// in the older shards for this repository.
	changedOrRemovedFiles []string

	// LanguageMap is handed to the ctags parser setup and to symbol parsing.
	// NOTE(review): presumably selects which ctags implementation is used
	// per language — confirm against the ctags package.
	LanguageMap ctags.LanguageMap

	// ShardMerging is true if builder should respect compound shards. This is a
	// Sourcegraph specific option.
	ShardMerging bool

	// HeapProfileTriggerBytes is the heap allocation in bytes that will trigger a memory profile. If 0, no memory profile
	// will be triggered. Note this trigger looks at total heap allocation (which includes both inuse and garbage objects).
	//
	// Profiles will be written to files named `indexmemory.prof.n` in the index directory. No more than 10 files are written.
	//
	// Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile.
	HeapProfileTriggerBytes uint64
}
 127
// HashOptions contains only the options in Options that, when modified, cause
// Options.IndexState to report IndexStateOption during the next index build.
// These are the fields that feed into Options.GetHash.
type HashOptions struct {
	sizeMax          int
	disableCTags     bool
	ctagsPath        string
	cTagsMustSucceed bool
	largeFiles       []string
}
 136
 137func (o *Options) HashOptions() HashOptions {
 138	return HashOptions{
 139		sizeMax:          o.SizeMax,
 140		disableCTags:     o.DisableCTags,
 141		ctagsPath:        o.CTagsPath,
 142		cTagsMustSucceed: o.CTagsMustSucceed,
 143		largeFiles:       o.LargeFiles,
 144	}
 145}
 146
 147func (o *Options) GetHash() string {
 148	h := o.HashOptions()
 149	hasher := sha1.New()
 150
 151	hasher.Write([]byte(h.ctagsPath))
 152	hasher.Write(fmt.Appendf(nil, "%t", h.cTagsMustSucceed))
 153	hasher.Write(fmt.Appendf(nil, "%d", h.sizeMax))
 154	hasher.Write(fmt.Appendf(nil, "%q", h.largeFiles))
 155	hasher.Write(fmt.Appendf(nil, "%t", h.disableCTags))
 156
 157	return fmt.Sprintf("%x", hasher.Sum(nil))
 158}
 159
 160type largeFilesFlag struct{ *Options }
 161
 162func (f largeFilesFlag) String() string {
 163	// From flag.Value documentation:
 164	//
 165	// The flag package may call the String method with a zero-valued receiver,
 166	// such as a nil pointer.
 167	if f.Options == nil {
 168		return ""
 169	}
 170	s := append([]string{""}, f.LargeFiles...)
 171	return strings.Join(s, "-large_file ")
 172}
 173
 174func (f largeFilesFlag) Set(value string) error {
 175	f.LargeFiles = append(f.LargeFiles, value)
 176	return nil
 177}
 178
 179// Flags adds flags for build options to fs. It is the "inverse" of Args.
 180func (o *Options) Flags(fs *flag.FlagSet) {
 181	x := *o
 182	x.SetDefaults()
 183	fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
 184	fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
 185	fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
 186	fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
 187	fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
 188	fs.StringVar(&o.ShardPrefixOverride, "shard_prefix_override", x.ShardPrefixOverride, "prefix for shard name")
 189	fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
 190	fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
 191
 192	// Sourcegraph specific
 193	fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
 194	fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.")
 195}
 196
 197// Args generates command line arguments for o. It is the "inverse" of Flags.
 198func (o *Options) Args() []string {
 199	var args []string
 200
 201	if o.SizeMax != 0 {
 202		args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
 203	}
 204
 205	if o.TrigramMax != 0 {
 206		args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
 207	}
 208
 209	if o.ShardMax != 0 {
 210		args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
 211	}
 212
 213	if o.Parallelism != 0 {
 214		args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
 215	}
 216
 217	if o.IndexDir != "" {
 218		args = append(args, "-index", o.IndexDir)
 219	}
 220
 221	if o.ShardPrefixOverride != "" {
 222		args = append(args, "-shard_prefix_override", o.ShardPrefixOverride)
 223	}
 224
 225	if o.CTagsMustSucceed {
 226		args = append(args, "-require_ctags")
 227	}
 228
 229	for _, a := range o.LargeFiles {
 230		args = append(args, "-large_file", a)
 231	}
 232
 233	// Sourcegraph specific
 234	if o.DisableCTags {
 235		args = append(args, "-disable_ctags")
 236	}
 237
 238	if o.ShardMerging {
 239		args = append(args, "-shard_merging")
 240	}
 241
 242	return args
 243}
 244
// Builder manages (parallel) creation of uniformly sized shards. The
// builder buffers up documents until it collects enough documents and
// then builds a shard and writes.
type Builder struct {
	opts Options
	// throttle bounds the number of shards being built concurrently; its
	// capacity is opts.Parallelism.
	throttle chan int

	// nextShardNum is the number given to the next shard that is flushed.
	nextShardNum int
	// todo holds the documents buffered for the next shard.
	todo       []*Document
	docChecker DocChecker
	// size is the accumulated size of the buffered documents; a flush is
	// triggered once it exceeds opts.ShardMax.
	size int

	parserBins ctags.ParserBinMap
	// building tracks shard builds that are running in goroutines.
	building sync.WaitGroup

	// errMu guards buildError and finishedShards while shards are being
	// built.
	errMu      sync.Mutex
	buildError error

	// temp name => final name for finished shards. We only rename
	// them once all shards succeed to avoid Frankenstein corpuses.
	finishedShards map[string]string

	// indexTime is set by tests for doing reproducible builds.
	indexTime time.Time

	// heapProfileMu is used to ensure that only one memory profile is written at a time
	heapProfileMu  sync.Mutex
	heapProfileNum int

	// a sortable 20 chars long id.
	id string

	// finishCalled is set by Finish; afterwards Add ignores documents and
	// Finish only returns the recorded build error.
	finishCalled bool
}
 279
// finishedShard records a shard that has been written to disk: temp is the
// path of the temporary file that holds it and final is the path it should be
// renamed to once the whole build has succeeded.
type finishedShard struct {
	temp, final string
}
 283
 284func checkCTags() string {
 285	if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" {
 286		return ctags
 287	}
 288
 289	if ctags, err := exec.LookPath("universal-ctags"); err == nil {
 290		return ctags
 291	}
 292
 293	return ""
 294}
 295
 296func checkScipCTags() string {
 297	if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" {
 298		return ctags
 299	}
 300
 301	if ctags, err := exec.LookPath("scip-ctags"); err == nil {
 302		return ctags
 303	}
 304
 305	return ""
 306}
 307
 308// SetDefaults sets reasonable default options.
 309func (o *Options) SetDefaults() {
 310	if o.CTagsPath == "" && !o.DisableCTags {
 311		o.CTagsPath = checkCTags()
 312	}
 313
 314	if o.ScipCTagsPath == "" && !o.DisableCTags {
 315		o.ScipCTagsPath = checkScipCTags()
 316	}
 317
 318	if o.Parallelism == 0 {
 319		o.Parallelism = 4
 320	}
 321	if o.SizeMax == 0 {
 322		o.SizeMax = 2 << 20
 323	}
 324	if o.ShardMax == 0 {
 325		o.ShardMax = 100 << 20
 326	}
 327	if o.TrigramMax == 0 {
 328		o.TrigramMax = 20000
 329	}
 330
 331	if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
 332		parsed, _ := url.Parse(o.RepositoryDescription.URL)
 333		if parsed != nil {
 334			o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
 335		}
 336	}
 337}
 338
 339// ShardName returns the name the given index shard.
 340func (o *Options) shardName(n int) string {
 341	return o.shardNameVersion(IndexFormatVersion, n)
 342}
 343
 344func (o *Options) shardNameVersion(version, n int) string {
 345	prefix := o.ShardPrefixOverride // ShardPrefixOverride takes precedence to support custom shard naming strategies
 346
 347	if prefix == "" {
 348		// Sourcegraph specific: We use IDs in shard names on multi-tenant
 349		// instances to prevent conflicts.
 350		if tenant.UseIDBasedShardNames() {
 351			prefix = fmt.Sprintf("%09d_%09d", o.RepositoryDescription.TenantID, o.RepositoryDescription.ID)
 352		} else {
 353			prefix = o.RepositoryDescription.Name
 354		}
 355	}
 356
 357	return shardName(o.IndexDir, prefix, version, n)
 358}
 359
// IndexState describes how the index on disk compares to a set of build
// options, as reported by Options.IndexState.
type IndexState string

const (
	// IndexStateMissing: no shard was found for the repository.
	IndexStateMissing IndexState = "missing"
	// IndexStateCorrupt: the shard metadata could not be read, or the shard
	// does not contain the repository.
	IndexStateCorrupt IndexState = "corrupt"
	// IndexStateVersion: the shard was written with a different feature version.
	IndexStateVersion IndexState = "version-mismatch"
	// IndexStateOption: the options hash stored in the shard differs from the
	// hash of the current options.
	IndexStateOption IndexState = "option-mismatch"
	// IndexStateMeta: only mutable repository metadata differs.
	IndexStateMeta IndexState = "meta-mismatch"
	// IndexStateContent: the branches differ, or an immutable metadata field
	// changed; the content needs to be re-indexed.
	IndexStateContent IndexState = "content-mismatch"
	// IndexStateEqual: the index on disk matches the build options.
	IndexStateEqual IndexState = "equal"
)
 371
// readVersions lists the (index format version, feature version) pairs that
// are considered when looking for existing shards on disk. Entries are tried
// in order, so the current format version is preferred over the next one.
var readVersions = []struct {
	IndexFormatVersion int
	FeatureVersion     int
}{{
	IndexFormatVersion: IndexFormatVersion,
	FeatureVersion:     FeatureVersion,
}, {
	IndexFormatVersion: NextIndexFormatVersion,
	FeatureVersion:     FeatureVersion,
}}
 382
 383// IncrementalSkipIndexing returns true if the index present on disk matches
 384// the build options.
 385func (o *Options) IncrementalSkipIndexing() bool {
 386	state, _ := o.IndexState()
 387	return state == IndexStateEqual
 388}
 389
 390// IndexState checks how the index present on disk compares to the build
 391// options and returns the IndexState and the name of the first shard.
 392func (o *Options) IndexState() (IndexState, string) {
 393	// Open the latest version we support that is on disk.
 394	fn := o.findShard()
 395	if fn == "" {
 396		return IndexStateMissing, fn
 397	}
 398
 399	repos, index, err := ReadMetadataPathAlive(fn)
 400	if os.IsNotExist(err) {
 401		return IndexStateMissing, fn
 402	} else if err != nil {
 403		return IndexStateCorrupt, fn
 404	}
 405
 406	for _, v := range readVersions {
 407		if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
 408			return IndexStateVersion, fn
 409		}
 410	}
 411
 412	var repo *zoekt.Repository
 413	for _, cand := range repos {
 414		if cand.Name == o.RepositoryDescription.Name {
 415			repo = cand
 416			break
 417		}
 418	}
 419
 420	if repo == nil {
 421		return IndexStateCorrupt, fn
 422	}
 423
 424	if repo.IndexOptions != o.GetHash() {
 425		return IndexStateOption, fn
 426	}
 427
 428	if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
 429		return IndexStateContent, fn
 430	}
 431
 432	// We can mutate repo since it lives in the scope of this function call.
 433	if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
 434		// non-nil err means we are trying to update an immutable field =>
 435		// reindex content.
 436		log.Printf("warn: immutable field changed, requires re-index: %s", err)
 437		return IndexStateContent, fn
 438	} else if updated {
 439		return IndexStateMeta, fn
 440	}
 441
 442	return IndexStateEqual, fn
 443}
 444
 445// FindRepositoryMetadata returns the index metadata for the repository
 446// specified in the options. 'ok' is false if the repository's metadata
 447// couldn't be found or if an error occurred.
 448func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) {
 449	shard := o.findShard()
 450	if shard == "" {
 451		return nil, nil, false, nil
 452	}
 453
 454	repositories, metadata, err := ReadMetadataPathAlive(shard)
 455	if err != nil {
 456		return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
 457	}
 458
 459	ID := o.RepositoryDescription.ID
 460	for _, r := range repositories {
 461		// compound shards contain multiple repositories, so we
 462		// have to pick only the one we're looking for
 463		if r.ID == ID {
 464			return r, metadata, true, nil
 465		}
 466	}
 467
 468	// If we're here, then we're somehow in a state where we found a matching
 469	// shard that's missing the repository metadata we're looking for. This
 470	// should never happen.
 471	name := o.RepositoryDescription.Name
 472	return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
 473}
 474
 475func (o *Options) findShard() string {
 476	for _, v := range readVersions {
 477		fn := o.shardNameVersion(v.IndexFormatVersion, 0)
 478		if _, err := os.Stat(fn); err == nil {
 479			return fn
 480		}
 481	}
 482
 483	// Brute force finding the shard in compound shards. We should only hit this
 484	// code path for repositories that don't exist yet or are in compound shards.
 485	return o.findCompoundShard()
 486}
 487
 488func (o *Options) findCompoundShard() string {
 489	compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
 490	if err != nil {
 491		return ""
 492	}
 493	for _, fn := range compoundShards {
 494		if containsRepo(fn, o.RepositoryDescription.ID) {
 495			return fn
 496		}
 497	}
 498
 499	return ""
 500}
 501
 502func (o *Options) FindAllShards() []string {
 503	for _, v := range readVersions {
 504		fn := o.shardNameVersion(v.IndexFormatVersion, 0)
 505		if _, err := os.Stat(fn); err == nil {
 506			shards := []string{fn}
 507			for i := 1; ; i++ {
 508				fn := o.shardNameVersion(v.IndexFormatVersion, i)
 509				if _, err := os.Stat(fn); err != nil {
 510					return shards
 511				}
 512				shards = append(shards, fn)
 513			}
 514		}
 515	}
 516
 517	// lazily fallback to findShard which will look for a compound shard.
 518	if fn := o.findShard(); fn != "" {
 519		return []string{fn}
 520	}
 521
 522	return nil
 523}
 524
 525// IgnoreSizeMax determines whether the max size should be ignored.
 526func (o *Options) IgnoreSizeMax(name string) bool {
 527	// A pattern match will override preceding pattern matches.
 528	for i := len(o.LargeFiles) - 1; i >= 0; i-- {
 529		pattern := strings.TrimSpace(o.LargeFiles[i])
 530		negated, validatedPattern := checkIsNegatePattern(pattern)
 531
 532		if m, _ := doublestar.PathMatch(validatedPattern, name); m {
 533			if negated {
 534				return false
 535			} else {
 536				return true
 537			}
 538		}
 539	}
 540
 541	return false
 542}
 543
 544func checkIsNegatePattern(pattern string) (bool, string) {
 545	negate := "!"
 546
 547	// if negated then strip prefix meta character which identifies negated filter pattern
 548	if strings.HasPrefix(pattern, negate) {
 549		return true, pattern[len(negate):]
 550	}
 551
 552	return false, pattern
 553}
 554
 555// NewBuilder creates a new Builder instance.
 556func NewBuilder(opts Options) (*Builder, error) {
 557	opts.SetDefaults()
 558	if opts.RepositoryDescription.Name == "" {
 559		return nil, fmt.Errorf("builder: must set Name")
 560	}
 561
 562	b := &Builder{
 563		opts:           opts,
 564		throttle:       make(chan int, opts.Parallelism),
 565		finishedShards: map[string]string{},
 566	}
 567
 568	parserBins, err := ctags.NewParserBinMap(
 569		b.opts.CTagsPath,
 570		b.opts.ScipCTagsPath,
 571		opts.LanguageMap,
 572		b.opts.CTagsMustSucceed,
 573	)
 574	if err != nil {
 575		return nil, err
 576	}
 577
 578	b.parserBins = parserBins
 579
 580	if opts.IsDelta {
 581		// Delta shards build on top of previously existing shards.
 582		// As a consequence, the shardNum for delta shards starts from
 583		// the number following the most recently generated shard - not 0.
 584		//
 585		// Using this numbering scheme allows all the shards to be
 586		// discovered as a set.
 587		shards := b.opts.FindAllShards()
 588		b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
 589	}
 590
 591	if _, err := b.newShardBuilder(); err != nil {
 592		return nil, err
 593	}
 594
 595	now := time.Now()
 596	b.indexTime = now
 597	b.id = xid.NewWithTime(now).String()
 598
 599	return b, nil
 600}
 601
 602// AddFile is a convenience wrapper for the Add method
 603func (b *Builder) AddFile(name string, content []byte) error {
 604	return b.Add(Document{Name: name, Content: content})
 605}
 606
 607func (b *Builder) Add(doc Document) error {
 608	if b.finishCalled {
 609		return nil
 610	}
 611
 612	allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
 613	if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
 614		// We could pass the document on to the shardbuilder, but if
 615		// we pass through a part of the source tree with binary/large
 616		// files, the corresponding shard would be mostly empty, so
 617		// insert a reason here too.
 618		doc.SkipReason = SkipReasonTooLarge
 619	} else if skip := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); skip != SkipReasonNone {
 620		doc.SkipReason = skip
 621	}
 622
 623	b.todo = append(b.todo, &doc)
 624
 625	if doc.SkipReason == SkipReasonNone {
 626		b.size += len(doc.Name) + len(doc.Content)
 627	} else {
 628		b.size += len(doc.Name)
 629		// Drop the content if we are skipping the document. Skipped content is not counted towards the
 630		// shard size limit, so otherwise we might buffer too much data in memory before flushing.
 631		doc.Content = nil
 632	}
 633
 634	if b.size > b.opts.ShardMax {
 635		return b.flush()
 636	}
 637
 638	return nil
 639}
 640
 641// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
 642// has been changed or removed since the last indexing job for this repository.
 643//
 644// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
 645func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
 646	b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
 647}
 648
 649// Finish creates a last shard from the buffered documents, and clears
 650// stale shards from previous runs. This should always be called, also
 651// in failure cases, to ensure cleanup.
 652//
 653// It is safe to call Finish() multiple times.
 654func (b *Builder) Finish() error {
 655	if b.finishCalled {
 656		return b.buildError
 657	}
 658
 659	b.finishCalled = true
 660
 661	b.flush()
 662	b.building.Wait()
 663
 664	if b.buildError != nil {
 665		for tmp := range b.finishedShards {
 666			log.Printf("Builder.Finish %s", tmp)
 667			os.Remove(tmp)
 668		}
 669		b.finishedShards = map[string]string{}
 670		return b.buildError
 671	}
 672
 673	// map of temporary -> final names for all updated shards + shard metadata files
 674	artifactPaths := make(map[string]string)
 675	maps.Copy(artifactPaths, b.finishedShards)
 676
 677	oldShards := b.opts.FindAllShards()
 678
 679	if b.opts.IsDelta {
 680		// Delta shard builds need to update FileTombstone and branch commit information for all
 681		// existing shards
 682		for _, shard := range oldShards {
 683			repositories, _, err := ReadMetadataPathAlive(shard)
 684			if err != nil {
 685				return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
 686			}
 687
 688			if len(repositories) > 1 {
 689				return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
 690			}
 691
 692			if len(repositories) == 0 {
 693				return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
 694			}
 695
 696			repository := repositories[0]
 697			if repository.ID != b.opts.RepositoryDescription.ID {
 698				return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
 699			}
 700
 701			if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
 702				repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
 703			}
 704
 705			for _, f := range b.opts.changedOrRemovedFiles {
 706				repository.FileTombstones[f] = struct{}{}
 707			}
 708
 709			if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
 710				return deltaBranchSetError{
 711					shardName: shard,
 712					old:       repository.Branches,
 713					new:       b.opts.RepositoryDescription.Branches,
 714				}
 715			}
 716
 717			if b.opts.GetHash() != repository.IndexOptions {
 718				return &deltaIndexOptionsMismatchError{
 719					shardName:  shard,
 720					newOptions: b.opts.HashOptions(),
 721				}
 722			}
 723
 724			repository.Branches = b.opts.RepositoryDescription.Branches
 725
 726			repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
 727
 728			repository.Metadata = b.opts.RepositoryDescription.Metadata
 729
 730			tempPath, finalPath, err := JsonMarshalRepoMetaTemp(shard, repository)
 731			if err != nil {
 732				return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
 733			}
 734
 735			artifactPaths[tempPath] = finalPath
 736		}
 737	}
 738
 739	// We mark finished shards as empty when we successfully finish. Return now
 740	// to allow call sites to call Finish idempotently.
 741	if len(artifactPaths) == 0 {
 742		return b.buildError
 743	}
 744
 745	// Collect a map of the old shards on disk. For each new shard we replace we
 746	// delete it from toDelete. Anything remaining in toDelete will be removed
 747	// after we have renamed everything into place.
 748
 749	var toDelete map[string]struct{}
 750	if !b.opts.IsDelta {
 751		// Non-delta shard builds delete all existing shards before they write out
 752		// new ones.
 753		// By contrast, delta shard builds work by stacking changes on top of existing shards.
 754		// So, we skip populating the toDelete map if we're building delta shards.
 755
 756		toDelete = make(map[string]struct{})
 757		for _, name := range oldShards {
 758			paths, err := IndexFilePaths(name)
 759			if err != nil {
 760				b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
 761			}
 762			for _, p := range paths {
 763				toDelete[p] = struct{}{}
 764			}
 765		}
 766	}
 767
 768	for tmp, final := range artifactPaths {
 769		if err := os.Rename(tmp, final); err != nil {
 770			b.buildError = err
 771			continue
 772		}
 773
 774		delete(toDelete, final)
 775	}
 776
 777	b.finishedShards = map[string]string{}
 778
 779	for p := range toDelete {
 780		// Don't delete compound shards, set tombstones instead.
 781		if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") {
 782			if !strings.HasSuffix(p, ".zoekt") {
 783				continue
 784			}
 785			err := SetTombstone(p, b.opts.RepositoryDescription.ID)
 786			b.buildError = err
 787			continue
 788		}
 789		log.Printf("removing old shard file: %s", p)
 790		if err := os.Remove(p); err != nil {
 791			b.buildError = err
 792		}
 793	}
 794
 795	return b.buildError
 796}
 797
 798// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
 799// iff both slices specify the same set of branch names in the same order.
 800func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
 801	if len(a) != len(b) {
 802		return false
 803	}
 804
 805	for i := range a {
 806		x, y := a[i], b[i]
 807		if x.Name != y.Name {
 808			return false
 809		}
 810	}
 811
 812	return true
 813}
 814
 815func (b *Builder) flush() error {
 816	todo := b.todo
 817	b.todo = nil
 818	b.size = 0
 819	b.errMu.Lock()
 820	defer b.errMu.Unlock()
 821	if b.buildError != nil {
 822		return b.buildError
 823	}
 824
 825	hasShard := b.nextShardNum > 0
 826	if len(todo) == 0 && hasShard {
 827		return nil
 828	}
 829
 830	shard := b.nextShardNum
 831	b.nextShardNum++
 832
 833	if b.opts.Parallelism > 1 {
 834		b.building.Add(1)
 835		b.throttle <- 1
 836		go func() {
 837			done, err := b.buildShard(todo, shard)
 838			<-b.throttle
 839
 840			b.errMu.Lock()
 841			defer b.errMu.Unlock()
 842			if err != nil && b.buildError == nil {
 843				b.buildError = err
 844			}
 845			if err == nil {
 846				b.finishedShards[done.temp] = done.final
 847			}
 848			b.building.Done()
 849		}()
 850	} else {
 851		// No goroutines when we're not parallel. This
 852		// simplifies memory profiling.
 853		done, err := b.buildShard(todo, shard)
 854		b.buildError = err
 855		if err == nil {
 856			b.finishedShards[done.temp] = done.final
 857		}
 858
 859		return b.buildError
 860	}
 861
 862	return nil
 863}
 864
 865// map [0,inf) to [0,1) monotonically
 866func squashRange(j int) float64 {
 867	x := float64(j)
 868	return x / (1 + x)
 869}
 870
// rankedDoc pairs a document with the score vector computed for it by rank.
// It is used by sortDocuments to order documents before they are written.
type rankedDoc struct {
	*Document
	rank []float64
}
 875
 876// rank returns a vector of scores which is used at index-time to sort documents
 877// before writing them to disk. The order of documents in the shard is important
 878// at query time, because earlier documents receive a boost at query time and
 879// have a higher chance of being searched before limits kick in.
 880func rank(d *Document, origIdx int) []float64 {
 881	skipped := 0.0
 882	if d.SkipReason != SkipReasonNone {
 883		skipped = 1.0
 884	}
 885
 886	generated := 0.0
 887	if enry.IsGenerated(d.Name, d.Content) {
 888		generated = 1.0
 889	}
 890
 891	vendor := 0.0
 892	if enry.IsVendor(d.Name) {
 893		vendor = 1.0
 894	}
 895
 896	test := 0.0
 897	if enry.IsTest(d.Name) {
 898		test = 1.0
 899	}
 900
 901	// Smaller is earlier (=better).
 902	return []float64{
 903		// Always place skipped docs last
 904		skipped,
 905
 906		// Prefer docs that are not generated
 907		generated,
 908
 909		// Prefer docs that are not vendored
 910		vendor,
 911
 912		// Prefer docs that are not tests
 913		test,
 914
 915		// With short names
 916		squashRange(len(d.Name)),
 917
 918		// With many symbols
 919		1.0 - squashRange(len(d.Symbols)),
 920
 921		// With short content
 922		squashRange(len(d.Content)),
 923
 924		// That is present is as many branches as possible
 925		1.0 - squashRange(len(d.Branches)),
 926
 927		// Preserve original ordering.
 928		squashRange(origIdx),
 929	}
 930}
 931
 932func sortDocuments(todo []*Document) {
 933	rs := make([]rankedDoc, 0, len(todo))
 934	for i, t := range todo {
 935		rd := rankedDoc{t, rank(t, i)}
 936		rs = append(rs, rd)
 937	}
 938	sort.Slice(rs, func(i, j int) bool {
 939		r1 := rs[i].rank
 940		r2 := rs[j].rank
 941		for i := range r1 {
 942			if r1[i] < r2[i] {
 943				return true
 944			}
 945			if r1[i] > r2[i] {
 946				return false
 947			}
 948		}
 949
 950		return false
 951	})
 952	for i := range todo {
 953		todo[i] = rs[i].Document
 954	}
 955}
 956
 957func (b *Builder) buildShard(todo []*Document, nextShardNum int) (*finishedShard, error) {
 958	if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
 959		err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins)
 960		if b.opts.CTagsMustSucceed && err != nil {
 961			return nil, err
 962		}
 963		if err != nil {
 964			log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err)
 965		}
 966	}
 967
 968	name := b.opts.shardName(nextShardNum)
 969
 970	shardBuilder, err := b.newShardBuilder()
 971	if err != nil {
 972		return nil, err
 973	}
 974
 975	sortDocuments(todo)
 976
 977	for idx, t := range todo {
 978		if err := shardBuilder.Add(*t); err != nil {
 979			return nil, err
 980		}
 981
 982		if idx%10_000 == 0 {
 983			b.CheckMemoryUsage()
 984		}
 985	}
 986
 987	return b.writeShard(name, shardBuilder)
 988}
 989
 990// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the
 991// configured threshold. NOTE: this method is expensive and should only be used for debugging.
 992func (b *Builder) CheckMemoryUsage() {
 993	// Don't check memory if heap profiling is disabled, or we've already written 10 profiles
 994	if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 {
 995		return
 996	}
 997
 998	var m runtime.MemStats
 999	runtime.ReadMemStats(&m)
1000
1001	if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() {
1002		defer b.heapProfileMu.Unlock()
1003
1004		log.Printf("writing memory profile, allocated heap: %s", humanize.Bytes(m.HeapAlloc))
1005		name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum))
1006		f, err := os.Create(name)
1007		if err != nil {
1008			log.Printf("failed to create memory profile file: %v", err)
1009			return
1010		}
1011
1012		err = pprof.WriteHeapProfile(f)
1013		if err != nil {
1014			log.Printf("failed to write memory profile: %v", err)
1015		}
1016
1017		b.heapProfileNum++
1018	}
1019}
1020
1021func (b *Builder) newShardBuilder() (*ShardBuilder, error) {
1022	desc := b.opts.RepositoryDescription
1023	desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1024	desc.SubRepoMap = b.opts.SubRepositories
1025	desc.IndexOptions = b.opts.GetHash()
1026
1027	shardBuilder, err := NewShardBuilder(&desc)
1028	if err != nil {
1029		return nil, err
1030	}
1031	shardBuilder.IndexTime = b.indexTime
1032	shardBuilder.ID = b.id
1033	return shardBuilder, nil
1034}
1035
1036func (b *Builder) writeShard(fn string, ib *ShardBuilder) (*finishedShard, error) {
1037	dir := filepath.Dir(fn)
1038	if err := os.MkdirAll(dir, 0o700); err != nil {
1039		return nil, err
1040	}
1041
1042	f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1043	if err != nil {
1044		return nil, err
1045	}
1046	if runtime.GOOS != "windows" {
1047		if err := f.Chmod(0o666 &^ umask); err != nil {
1048			return nil, err
1049		}
1050	}
1051
1052	defer f.Close()
1053	if err := ib.Write(f); err != nil {
1054		return nil, err
1055	}
1056	fi, err := f.Stat()
1057	if err != nil {
1058		return nil, err
1059	}
1060	if err := f.Close(); err != nil {
1061		return nil, err
1062	}
1063
1064	log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n",
1065		fn,
1066		fi.Size(),
1067		float64(fi.Size())/float64(ib.ContentSize()+1),
1068		ib.NumFiles())
1069
1070	return &finishedShard{f.Name(), fn}, nil
1071}
1072
1073type deltaBranchSetError struct {
1074	shardName string
1075	old, new  []zoekt.RepositoryBranch
1076}
1077
1078func (e deltaBranchSetError) Error() string {
1079	return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1080}
1081
1082type deltaIndexOptionsMismatchError struct {
1083	shardName  string
1084	newOptions HashOptions
1085}
1086
1087func (e *deltaIndexOptionsMismatchError) Error() string {
1088	return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1089}
1090
1091// umask holds the Umask of the current process
1092var umask os.FileMode
1093
1094func init() {
1095	umask = os.FileMode(unix.Umask(0))
1096	unix.Umask(int(umask))
1097}
Configure Feed

Configure Feed