index/builder.go at 261aae37dce6a46cdf1eb669d95d314adc0758e7 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / builder.go
at 261aae37dce6a46cdf1eb669d95d314adc0758e7 30 kB View raw
Julie Tibshirani Rename IndexBuilder -> ShardBuilder (#908) 1y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15// Package index contains logic for building Zoekt indexes. NOTE: this package is not considered
  16// part of the public API, and it is not recommended to rely on it in external code.
  17package index
  18
  19import (
  20	"cmp"
  21	"crypto/sha1"
  22	"flag"
  23	"fmt"
  24	"log"
  25	"net/url"
  26	"os"
  27	"os/exec"
  28	"path"
  29	"path/filepath"
  30	"reflect"
  31	"runtime"
  32	"runtime/pprof"
  33	"sort"
  34	"strconv"
  35	"strings"
  36	"sync"
  37	"time"
  38
  39	"github.com/bmatcuk/doublestar"
  40	"github.com/dustin/go-humanize"
  41	"github.com/go-enry/go-enry/v2"
  42	"github.com/rs/xid"
  43
  44	"github.com/sourcegraph/zoekt"
  45	"github.com/sourcegraph/zoekt/internal/ctags"
  46)
  47
  48var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
  49
  50// Branch describes a single branch version.
  51type Branch struct {
  52	Name    string
  53	Version string
  54}
  55
  56// Options sets options for the index building.
  57type Options struct {
  58	// IndexDir is a directory that holds *.zoekt index files.
  59	IndexDir string
  60
  61	// SizeMax is the maximum file size
  62	SizeMax int
  63
  64	// Parallelism is the maximum number of shards to index in parallel
  65	Parallelism int
  66
  67	// ShardMax sets the maximum corpus size for a single shard
  68	ShardMax int
  69
  70	// TrigramMax sets the maximum number of distinct trigrams per document.
  71	TrigramMax int
  72
  73	// RepositoryDescription holds names and URLs for the repository.
  74	RepositoryDescription zoekt.Repository
  75
  76	// SubRepositories is a path => sub repository map.
  77	SubRepositories map[string]*zoekt.Repository
  78
  79	// DisableCTags disables the generation of ctags metadata.
  80	DisableCTags bool
  81
  82	// CtagsPath is the path to the ctags binary to run, or empty
  83	// if a valid binary couldn't be found.
  84	CTagsPath string
  85
  86	// Same as CTagsPath but for scip-ctags
  87	ScipCTagsPath string
  88
  89	// If set, ctags must succeed.
  90	CTagsMustSucceed bool
  91
  92	// LargeFiles is a slice of glob patterns, including ** for any number
  93	// of directories, where matching file paths should be indexed
  94	// regardless of their size. The full pattern syntax is here:
  95	// https://github.com/bmatcuk/doublestar/tree/v1#patterns.
  96	LargeFiles []string
  97
  98	// IsDelta is true if this run contains only the changed documents since the
  99	// last run.
 100	IsDelta bool
 101
 102	// changedOrRemovedFiles is a list of file paths that have been changed or removed
 103	// since the last indexing job for this repository. These files will be tombstoned
 104	// in the older shards for this repository.
 105	changedOrRemovedFiles []string
 106
 107	LanguageMap ctags.LanguageMap
 108
 109	// ShardMerging is true if builder should respect compound shards. This is a
 110	// Sourcegraph specific option.
 111	ShardMerging bool
 112
 113	// HeapProfileTriggerBytes is the heap allocation in bytes that will trigger a memory profile. If 0, no memory profile
 114	// will be triggered. Note this trigger looks at total heap allocation (which includes both inuse and garbage objects).
 115	//
 116	// Profiles will be written to files named `index-memory.prof.n` in the index directory. No more than 10 files are written.
 117	//
 118	// Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile.
 119	HeapProfileTriggerBytes uint64
 120
 121	// ShardPrefix is the prefix of the shard. It defaults to the repository name.
 122	ShardPrefix string
 123}
 124
 125// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building.
 126type HashOptions struct {
 127	sizeMax          int
 128	disableCTags     bool
 129	ctagsPath        string
 130	cTagsMustSucceed bool
 131	largeFiles       []string
 132}
 133
 134func (o *Options) HashOptions() HashOptions {
 135	return HashOptions{
 136		sizeMax:          o.SizeMax,
 137		disableCTags:     o.DisableCTags,
 138		ctagsPath:        o.CTagsPath,
 139		cTagsMustSucceed: o.CTagsMustSucceed,
 140		largeFiles:       o.LargeFiles,
 141	}
 142}
 143
 144func (o *Options) GetHash() string {
 145	h := o.HashOptions()
 146	hasher := sha1.New()
 147
 148	hasher.Write([]byte(h.ctagsPath))
 149	hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed)))
 150	hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax)))
 151	hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles)))
 152	hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags)))
 153
 154	return fmt.Sprintf("%x", hasher.Sum(nil))
 155}
 156
 157type largeFilesFlag struct{ *Options }
 158
 159func (f largeFilesFlag) String() string {
 160	// From flag.Value documentation:
 161	//
 162	// The flag package may call the String method with a zero-valued receiver,
 163	// such as a nil pointer.
 164	if f.Options == nil {
 165		return ""
 166	}
 167	s := append([]string{""}, f.LargeFiles...)
 168	return strings.Join(s, "-large_file ")
 169}
 170
 171func (f largeFilesFlag) Set(value string) error {
 172	f.LargeFiles = append(f.LargeFiles, value)
 173	return nil
 174}
 175
 176// Flags adds flags for build options to fs. It is the "inverse" of Args.
 177func (o *Options) Flags(fs *flag.FlagSet) {
 178	x := *o
 179	x.SetDefaults()
 180	fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
 181	fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
 182	fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
 183	fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
 184	fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
 185	fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
 186	fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
 187	fs.StringVar(&o.ShardPrefix, "shard_prefix", x.ShardPrefix, "the prefix of the shard. Defaults to repository name")
 188
 189	// Sourcegraph specific
 190	fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
 191	fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.")
 192}
 193
 194// Args generates command line arguments for o. It is the "inverse" of Flags.
 195func (o *Options) Args() []string {
 196	var args []string
 197
 198	if o.SizeMax != 0 {
 199		args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
 200	}
 201
 202	if o.TrigramMax != 0 {
 203		args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
 204	}
 205
 206	if o.ShardMax != 0 {
 207		args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
 208	}
 209
 210	if o.Parallelism != 0 {
 211		args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
 212	}
 213
 214	if o.IndexDir != "" {
 215		args = append(args, "-index", o.IndexDir)
 216	}
 217
 218	if o.CTagsMustSucceed {
 219		args = append(args, "-require_ctags")
 220	}
 221
 222	for _, a := range o.LargeFiles {
 223		args = append(args, "-large_file", a)
 224	}
 225
 226	// Sourcegraph specific
 227	if o.DisableCTags {
 228		args = append(args, "-disable_ctags")
 229	}
 230
 231	if o.ShardMerging {
 232		args = append(args, "-shard_merging")
 233	}
 234
 235	if o.ShardPrefix != "" {
 236		args = append(args, "-shard_prefix", o.ShardPrefix)
 237	}
 238
 239	return args
 240}
 241
 242// Builder manages (parallel) creation of uniformly sized shards. The
 243// builder buffers up documents until it collects enough documents and
 244// then builds a shard and writes.
 245type Builder struct {
 246	opts     Options
 247	throttle chan int
 248
 249	nextShardNum int
 250	todo         []*Document
 251	docChecker   DocChecker
 252	size         int
 253
 254	parserBins ctags.ParserBinMap
 255	building   sync.WaitGroup
 256
 257	errMu      sync.Mutex
 258	buildError error
 259
 260	// temp name => final name for finished shards. We only rename
 261	// them once all shards succeed to avoid Frankstein corpuses.
 262	finishedShards map[string]string
 263
 264	// indexTime is set by tests for doing reproducible builds.
 265	indexTime time.Time
 266
 267	// heapProfileMu is used to ensure that only one memory profile is written at a time
 268	heapProfileMu  sync.Mutex
 269	heapProfileNum int
 270
 271	// a sortable 20 chars long id.
 272	id string
 273
 274	finishCalled bool
 275}
 276
 277type finishedShard struct {
 278	temp, final string
 279}
 280
 281func checkCTags() string {
 282	if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" {
 283		return ctags
 284	}
 285
 286	if ctags, err := exec.LookPath("universal-ctags"); err == nil {
 287		return ctags
 288	}
 289
 290	return ""
 291}
 292
 293func checkScipCTags() string {
 294	if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" {
 295		return ctags
 296	}
 297
 298	if ctags, err := exec.LookPath("scip-ctags"); err == nil {
 299		return ctags
 300	}
 301
 302	return ""
 303}
 304
 305// SetDefaults sets reasonable default options.
 306func (o *Options) SetDefaults() {
 307	if o.CTagsPath == "" && !o.DisableCTags {
 308		o.CTagsPath = checkCTags()
 309	}
 310
 311	if o.ScipCTagsPath == "" && !o.DisableCTags {
 312		o.ScipCTagsPath = checkScipCTags()
 313	}
 314
 315	if o.Parallelism == 0 {
 316		o.Parallelism = 4
 317	}
 318	if o.SizeMax == 0 {
 319		o.SizeMax = 2 << 20
 320	}
 321	if o.ShardMax == 0 {
 322		o.ShardMax = 100 << 20
 323	}
 324	if o.TrigramMax == 0 {
 325		o.TrigramMax = 20000
 326	}
 327
 328	if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
 329		parsed, _ := url.Parse(o.RepositoryDescription.URL)
 330		if parsed != nil {
 331			o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
 332		}
 333	}
 334}
 335
 336// ShardName returns the name the given index shard.
 337func (o *Options) shardName(n int) string {
 338	return o.shardNameVersion(IndexFormatVersion, n)
 339}
 340
 341func (o *Options) shardNameVersion(version, n int) string {
 342	return ShardName(o.IndexDir, cmp.Or(o.ShardPrefix, o.RepositoryDescription.Name), version, n)
 343}
 344
 345type IndexState string
 346
 347const (
 348	IndexStateMissing IndexState = "missing"
 349	IndexStateCorrupt IndexState = "corrupt"
 350	IndexStateVersion IndexState = "version-mismatch"
 351	IndexStateOption  IndexState = "option-mismatch"
 352	IndexStateMeta    IndexState = "meta-mismatch"
 353	IndexStateContent IndexState = "content-mismatch"
 354	IndexStateEqual   IndexState = "equal"
 355)
 356
 357var readVersions = []struct {
 358	IndexFormatVersion int
 359	FeatureVersion     int
 360}{{
 361	IndexFormatVersion: IndexFormatVersion,
 362	FeatureVersion:     FeatureVersion,
 363}, {
 364	IndexFormatVersion: NextIndexFormatVersion,
 365	FeatureVersion:     FeatureVersion,
 366}}
 367
 368// IncrementalSkipIndexing returns true if the index present on disk matches
 369// the build options.
 370func (o *Options) IncrementalSkipIndexing() bool {
 371	state, _ := o.IndexState()
 372	return state == IndexStateEqual
 373}
 374
 375// IndexState checks how the index present on disk compares to the build
 376// options and returns the IndexState and the name of the first shard.
 377func (o *Options) IndexState() (IndexState, string) {
 378	// Open the latest version we support that is on disk.
 379	fn := o.findShard()
 380	if fn == "" {
 381		return IndexStateMissing, fn
 382	}
 383
 384	repos, index, err := ReadMetadataPathAlive(fn)
 385	if os.IsNotExist(err) {
 386		return IndexStateMissing, fn
 387	} else if err != nil {
 388		return IndexStateCorrupt, fn
 389	}
 390
 391	for _, v := range readVersions {
 392		if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
 393			return IndexStateVersion, fn
 394		}
 395	}
 396
 397	var repo *zoekt.Repository
 398	for _, cand := range repos {
 399		if cand.Name == o.RepositoryDescription.Name {
 400			repo = cand
 401			break
 402		}
 403	}
 404
 405	if repo == nil {
 406		return IndexStateCorrupt, fn
 407	}
 408
 409	if repo.IndexOptions != o.GetHash() {
 410		return IndexStateOption, fn
 411	}
 412
 413	if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
 414		return IndexStateContent, fn
 415	}
 416
 417	// We can mutate repo since it lives in the scope of this function call.
 418	if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
 419		// non-nil err means we are trying to update an immutable field =>
 420		// reindex content.
 421		log.Printf("warn: immutable field changed, requires re-index: %s", err)
 422		return IndexStateContent, fn
 423	} else if updated {
 424		return IndexStateMeta, fn
 425	}
 426
 427	return IndexStateEqual, fn
 428}
 429
 430// FindRepositoryMetadata returns the index metadata for the repository
 431// specified in the options. 'ok' is false if the repository's metadata
 432// couldn't be found or if an error occurred.
 433func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) {
 434	shard := o.findShard()
 435	if shard == "" {
 436		return nil, nil, false, nil
 437	}
 438
 439	repositories, metadata, err := ReadMetadataPathAlive(shard)
 440	if err != nil {
 441		return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
 442	}
 443
 444	ID := o.RepositoryDescription.ID
 445	for _, r := range repositories {
 446		// compound shards contain multiple repositories, so we
 447		// have to pick only the one we're looking for
 448		if r.ID == ID {
 449			return r, metadata, true, nil
 450		}
 451	}
 452
 453	// If we're here, then we're somehow in a state where we found a matching
 454	// shard that's missing the repository metadata we're looking for. This
 455	// should never happen.
 456	name := o.RepositoryDescription.Name
 457	return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
 458}
 459
 460func (o *Options) findShard() string {
 461	for _, v := range readVersions {
 462		fn := o.shardNameVersion(v.IndexFormatVersion, 0)
 463		if _, err := os.Stat(fn); err == nil {
 464			return fn
 465		}
 466	}
 467
 468	// Brute force finding the shard in compound shards. We should only hit this
 469	// code path for repositories that are not already existing or are in
 470	// compound shards.
 471	//
 472	// TODO add an oracle which can speed this up in the case of repositories
 473	// already in compound shards.
 474	compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
 475	if err != nil {
 476		return ""
 477	}
 478	for _, fn := range compoundShards {
 479		repos, _, err := ReadMetadataPathAlive(fn)
 480		if err != nil {
 481			continue
 482		}
 483		for _, repo := range repos {
 484			if repo.ID == o.RepositoryDescription.ID {
 485				return fn
 486			}
 487		}
 488	}
 489
 490	return ""
 491}
 492
 493func (o *Options) FindAllShards() []string {
 494	for _, v := range readVersions {
 495		fn := o.shardNameVersion(v.IndexFormatVersion, 0)
 496		if _, err := os.Stat(fn); err == nil {
 497			shards := []string{fn}
 498			for i := 1; ; i++ {
 499				fn := o.shardNameVersion(v.IndexFormatVersion, i)
 500				if _, err := os.Stat(fn); err != nil {
 501					return shards
 502				}
 503				shards = append(shards, fn)
 504			}
 505		}
 506	}
 507
 508	// lazily fallback to findShard which will look for a compound shard.
 509	if fn := o.findShard(); fn != "" {
 510		return []string{fn}
 511	}
 512
 513	return nil
 514}
 515
 516// IgnoreSizeMax determines whether the max size should be ignored.
 517func (o *Options) IgnoreSizeMax(name string) bool {
 518	// A pattern match will override preceding pattern matches.
 519	for i := len(o.LargeFiles) - 1; i >= 0; i-- {
 520		pattern := strings.TrimSpace(o.LargeFiles[i])
 521		negated, validatedPattern := checkIsNegatePattern(pattern)
 522
 523		if m, _ := doublestar.PathMatch(validatedPattern, name); m {
 524			if negated {
 525				return false
 526			} else {
 527				return true
 528			}
 529		}
 530	}
 531
 532	return false
 533}
 534
 535func checkIsNegatePattern(pattern string) (bool, string) {
 536	negate := "!"
 537
 538	// if negated then strip prefix meta character which identifies negated filter pattern
 539	if strings.HasPrefix(pattern, negate) {
 540		return true, pattern[len(negate):]
 541	}
 542
 543	return false, pattern
 544}
 545
 546// NewBuilder creates a new Builder instance.
 547func NewBuilder(opts Options) (*Builder, error) {
 548	opts.SetDefaults()
 549	if opts.RepositoryDescription.Name == "" {
 550		return nil, fmt.Errorf("builder: must set Name")
 551	}
 552
 553	b := &Builder{
 554		opts:           opts,
 555		throttle:       make(chan int, opts.Parallelism),
 556		finishedShards: map[string]string{},
 557	}
 558
 559	parserBins, err := ctags.NewParserBinMap(
 560		b.opts.CTagsPath,
 561		b.opts.ScipCTagsPath,
 562		opts.LanguageMap,
 563		b.opts.CTagsMustSucceed,
 564	)
 565	if err != nil {
 566		return nil, err
 567	}
 568
 569	b.parserBins = parserBins
 570
 571	if opts.IsDelta {
 572		// Delta shards build on top of previously existing shards.
 573		// As a consequence, the shardNum for delta shards starts from
 574		// the number following the most recently generated shard - not 0.
 575		//
 576		// Using this numbering scheme allows all the shards to be
 577		// discovered as a set.
 578		shards := b.opts.FindAllShards()
 579		b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
 580	}
 581
 582	if _, err := b.newShardBuilder(); err != nil {
 583		return nil, err
 584	}
 585
 586	now := time.Now()
 587	b.indexTime = now
 588	b.id = xid.NewWithTime(now).String()
 589
 590	return b, nil
 591}
 592
 593// AddFile is a convenience wrapper for the Add method
 594func (b *Builder) AddFile(name string, content []byte) error {
 595	return b.Add(Document{Name: name, Content: content})
 596}
 597
 598func (b *Builder) Add(doc Document) error {
 599	if b.finishCalled {
 600		return nil
 601	}
 602
 603	allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
 604	if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
 605		// We could pass the document on to the shardbuilder, but if
 606		// we pass through a part of the source tree with binary/large
 607		// files, the corresponding shard would be mostly empty, so
 608		// insert a reason here too.
 609		doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
 610	} else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
 611		doc.SkipReason = err.Error()
 612		doc.Language = "binary"
 613	}
 614
 615	b.todo = append(b.todo, &doc)
 616
 617	if doc.SkipReason == "" {
 618		b.size += len(doc.Name) + len(doc.Content)
 619	} else {
 620		b.size += len(doc.Name) + len(doc.SkipReason)
 621		// Drop the content if we are skipping the document. Skipped content is not counted towards the
 622		// shard size limit, so otherwise we might buffer too much data in memory before flushing.
 623		doc.Content = nil
 624	}
 625
 626	if b.size > b.opts.ShardMax {
 627		return b.flush()
 628	}
 629
 630	return nil
 631}
 632
 633// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
 634// has been changed or removed since the last indexing job for this repository.
 635//
 636// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
 637func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
 638	b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
 639}
 640
 641// Finish creates a last shard from the buffered documents, and clears
 642// stale shards from previous runs. This should always be called, also
 643// in failure cases, to ensure cleanup.
 644//
 645// It is safe to call Finish() multiple times.
 646func (b *Builder) Finish() error {
 647	if b.finishCalled {
 648		return b.buildError
 649	}
 650
 651	b.finishCalled = true
 652
 653	b.flush()
 654	b.building.Wait()
 655
 656	if b.buildError != nil {
 657		for tmp := range b.finishedShards {
 658			log.Printf("Builder.Finish %s", tmp)
 659			os.Remove(tmp)
 660		}
 661		b.finishedShards = map[string]string{}
 662		return b.buildError
 663	}
 664
 665	// map of temporary -> final names for all updated shards + shard metadata files
 666	artifactPaths := make(map[string]string)
 667	for tmp, final := range b.finishedShards {
 668		artifactPaths[tmp] = final
 669	}
 670
 671	oldShards := b.opts.FindAllShards()
 672
 673	if b.opts.IsDelta {
 674		// Delta shard builds need to update FileTombstone and branch commit information for all
 675		// existing shards
 676		for _, shard := range oldShards {
 677			repositories, _, err := ReadMetadataPathAlive(shard)
 678			if err != nil {
 679				return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
 680			}
 681
 682			if len(repositories) > 1 {
 683				return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
 684			}
 685
 686			if len(repositories) == 0 {
 687				return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
 688			}
 689
 690			repository := repositories[0]
 691			if repository.ID != b.opts.RepositoryDescription.ID {
 692				return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
 693			}
 694
 695			if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
 696				repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
 697			}
 698
 699			for _, f := range b.opts.changedOrRemovedFiles {
 700				repository.FileTombstones[f] = struct{}{}
 701			}
 702
 703			if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
 704				return deltaBranchSetError{
 705					shardName: shard,
 706					old:       repository.Branches,
 707					new:       b.opts.RepositoryDescription.Branches,
 708				}
 709			}
 710
 711			if b.opts.GetHash() != repository.IndexOptions {
 712				return &deltaIndexOptionsMismatchError{
 713					shardName:  shard,
 714					newOptions: b.opts.HashOptions(),
 715				}
 716			}
 717
 718			repository.Branches = b.opts.RepositoryDescription.Branches
 719
 720			repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
 721
 722			tempPath, finalPath, err := JsonMarshalRepoMetaTemp(shard, repository)
 723			if err != nil {
 724				return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
 725			}
 726
 727			artifactPaths[tempPath] = finalPath
 728		}
 729	}
 730
 731	// We mark finished shards as empty when we successfully finish. Return now
 732	// to allow call sites to call Finish idempotently.
 733	if len(artifactPaths) == 0 {
 734		return b.buildError
 735	}
 736
 737	// Collect a map of the old shards on disk. For each new shard we replace we
 738	// delete it from toDelete. Anything remaining in toDelete will be removed
 739	// after we have renamed everything into place.
 740
 741	var toDelete map[string]struct{}
 742	if !b.opts.IsDelta {
 743		// Non-delta shard builds delete all existing shards before they write out
 744		// new ones.
 745		// By contrast, delta shard builds work by stacking changes on top of existing shards.
 746		// So, we skip populating the toDelete map if we're building delta shards.
 747
 748		toDelete = make(map[string]struct{})
 749		for _, name := range oldShards {
 750			paths, err := IndexFilePaths(name)
 751			if err != nil {
 752				b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
 753			}
 754			for _, p := range paths {
 755				toDelete[p] = struct{}{}
 756			}
 757		}
 758	}
 759
 760	for tmp, final := range artifactPaths {
 761		if err := os.Rename(tmp, final); err != nil {
 762			b.buildError = err
 763			continue
 764		}
 765
 766		delete(toDelete, final)
 767	}
 768
 769	b.finishedShards = map[string]string{}
 770
 771	for p := range toDelete {
 772		// Don't delete compound shards, set tombstones instead.
 773		if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") {
 774			if !strings.HasSuffix(p, ".zoekt") {
 775				continue
 776			}
 777			err := SetTombstone(p, b.opts.RepositoryDescription.ID)
 778			b.buildError = err
 779			continue
 780		}
 781		log.Printf("removing old shard file: %s", p)
 782		if err := os.Remove(p); err != nil {
 783			b.buildError = err
 784		}
 785	}
 786
 787	return b.buildError
 788}
 789
 790// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
 791// iff both slices specify the same set of branch names in the same order.
 792func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
 793	if len(a) != len(b) {
 794		return false
 795	}
 796
 797	for i := range a {
 798		x, y := a[i], b[i]
 799		if x.Name != y.Name {
 800			return false
 801		}
 802	}
 803
 804	return true
 805}
 806
 807func (b *Builder) flush() error {
 808	todo := b.todo
 809	b.todo = nil
 810	b.size = 0
 811	b.errMu.Lock()
 812	defer b.errMu.Unlock()
 813	if b.buildError != nil {
 814		return b.buildError
 815	}
 816
 817	hasShard := b.nextShardNum > 0
 818	if len(todo) == 0 && hasShard {
 819		return nil
 820	}
 821
 822	shard := b.nextShardNum
 823	b.nextShardNum++
 824
 825	if b.opts.Parallelism > 1 {
 826		b.building.Add(1)
 827		b.throttle <- 1
 828		go func() {
 829			done, err := b.buildShard(todo, shard)
 830			<-b.throttle
 831
 832			b.errMu.Lock()
 833			defer b.errMu.Unlock()
 834			if err != nil && b.buildError == nil {
 835				b.buildError = err
 836			}
 837			if err == nil {
 838				b.finishedShards[done.temp] = done.final
 839			}
 840			b.building.Done()
 841		}()
 842	} else {
 843		// No goroutines when we're not parallel. This
 844		// simplifies memory profiling.
 845		done, err := b.buildShard(todo, shard)
 846		b.buildError = err
 847		if err == nil {
 848			b.finishedShards[done.temp] = done.final
 849		}
 850
 851		return b.buildError
 852	}
 853
 854	return nil
 855}
 856
 857// map [0,inf) to [0,1) monotonically
 858func squashRange(j int) float64 {
 859	x := float64(j)
 860	return x / (1 + x)
 861}
 862
 863// IsLowPriority takes a file name and makes an educated guess about its priority
 864// in search results. A file is considered low priority if it looks like a test,
 865// vendored, or generated file.
 866//
 867// These 'priority' criteria affects how documents are ordered within a shard. It's
 868// also used to help guess a file's rank when we're missing ranking information.
 869func IsLowPriority(path string, content []byte) bool {
 870	return enry.IsTest(path) || enry.IsVendor(path) || enry.IsGenerated(path, content)
 871}
 872
 873type rankedDoc struct {
 874	*Document
 875	rank []float64
 876}
 877
 878// rank returns a vector of scores which is used at index-time to sort documents
 879// before writing them to disk. The order of documents in the shard is important
 880// at query time, because earlier documents receive a boost at query time and
 881// have a higher chance of being searched before limits kick in.
 882func rank(d *Document, origIdx int) []float64 {
 883	skipped := 0.0
 884	if d.SkipReason != "" {
 885		skipped = 1.0
 886	}
 887
 888	generated := 0.0
 889	if enry.IsGenerated(d.Name, d.Content) {
 890		generated = 1.0
 891	}
 892
 893	vendor := 0.0
 894	if enry.IsVendor(d.Name) {
 895		vendor = 1.0
 896	}
 897
 898	test := 0.0
 899	if enry.IsTest(d.Name) {
 900		test = 1.0
 901	}
 902
 903	// Smaller is earlier (=better).
 904	return []float64{
 905		// Always place skipped docs last
 906		skipped,
 907
 908		// Prefer docs that are not generated
 909		generated,
 910
 911		// Prefer docs that are not vendored
 912		vendor,
 913
 914		// Prefer docs that are not tests
 915		test,
 916
 917		// With short names
 918		squashRange(len(d.Name)),
 919
 920		// With many symbols
 921		1.0 - squashRange(len(d.Symbols)),
 922
 923		// With short content
 924		squashRange(len(d.Content)),
 925
 926		// That is present is as many branches as possible
 927		1.0 - squashRange(len(d.Branches)),
 928
 929		// Preserve original ordering.
 930		squashRange(origIdx),
 931	}
 932}
 933
 934func sortDocuments(todo []*Document) {
 935	rs := make([]rankedDoc, 0, len(todo))
 936	for i, t := range todo {
 937		rd := rankedDoc{t, rank(t, i)}
 938		rs = append(rs, rd)
 939	}
 940	sort.Slice(rs, func(i, j int) bool {
 941		r1 := rs[i].rank
 942		r2 := rs[j].rank
 943		for i := range r1 {
 944			if r1[i] < r2[i] {
 945				return true
 946			}
 947			if r1[i] > r2[i] {
 948				return false
 949			}
 950		}
 951
 952		return false
 953	})
 954	for i := range todo {
 955		todo[i] = rs[i].Document
 956	}
 957}
 958
 959func (b *Builder) buildShard(todo []*Document, nextShardNum int) (*finishedShard, error) {
 960	if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
 961		err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins)
 962		if b.opts.CTagsMustSucceed && err != nil {
 963			return nil, err
 964		}
 965		if err != nil {
 966			log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err)
 967		}
 968	}
 969
 970	name := b.opts.shardName(nextShardNum)
 971
 972	shardBuilder, err := b.newShardBuilder()
 973	if err != nil {
 974		return nil, err
 975	}
 976
 977	sortDocuments(todo)
 978
 979	for idx, t := range todo {
 980		if err := shardBuilder.Add(*t); err != nil {
 981			return nil, err
 982		}
 983
 984		if idx%10_000 == 0 {
 985			b.CheckMemoryUsage()
 986		}
 987	}
 988
 989	return b.writeShard(name, shardBuilder)
 990}
 991
 992// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the
 993// configured threshold. NOTE: this method is expensive and should only be used for debugging.
 994func (b *Builder) CheckMemoryUsage() {
 995	// Don't check memory if heap profiling is disabled, or we've already written 10 profiles
 996	if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 {
 997		return
 998	}
 999
1000	var m runtime.MemStats
1001	runtime.ReadMemStats(&m)
1002
1003	if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() {
1004		defer b.heapProfileMu.Unlock()
1005
1006		log.Printf("writing memory profile, allocated heap: %s", humanize.Bytes(m.HeapAlloc))
1007		name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum))
1008		f, err := os.Create(name)
1009		if err != nil {
1010			log.Printf("failed to create memory profile file: %v", err)
1011			return
1012		}
1013
1014		err = pprof.WriteHeapProfile(f)
1015		if err != nil {
1016			log.Printf("failed to write memory profile: %v", err)
1017		}
1018
1019		b.heapProfileNum++
1020	}
1021}
1022
1023func (b *Builder) newShardBuilder() (*ShardBuilder, error) {
1024	desc := b.opts.RepositoryDescription
1025	desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1026	desc.SubRepoMap = b.opts.SubRepositories
1027	desc.IndexOptions = b.opts.GetHash()
1028
1029	shardBuilder, err := NewShardBuilder(&desc)
1030	if err != nil {
1031		return nil, err
1032	}
1033	shardBuilder.IndexTime = b.indexTime
1034	shardBuilder.ID = b.id
1035	return shardBuilder, nil
1036}
1037
1038func (b *Builder) writeShard(fn string, ib *ShardBuilder) (*finishedShard, error) {
1039	dir := filepath.Dir(fn)
1040	if err := os.MkdirAll(dir, 0o700); err != nil {
1041		return nil, err
1042	}
1043
1044	f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1045	if err != nil {
1046		return nil, err
1047	}
1048	if runtime.GOOS != "windows" {
1049		if err := f.Chmod(0o666 &^ umask); err != nil {
1050			return nil, err
1051		}
1052	}
1053
1054	defer f.Close()
1055	if err := ib.Write(f); err != nil {
1056		return nil, err
1057	}
1058	fi, err := f.Stat()
1059	if err != nil {
1060		return nil, err
1061	}
1062	if err := f.Close(); err != nil {
1063		return nil, err
1064	}
1065
1066	log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n",
1067		fn,
1068		fi.Size(),
1069		float64(fi.Size())/float64(ib.ContentSize()+1),
1070		ib.NumFiles())
1071
1072	return &finishedShard{f.Name(), fn}, nil
1073}
1074
1075type deltaBranchSetError struct {
1076	shardName string
1077	old, new  []zoekt.RepositoryBranch
1078}
1079
1080func (e deltaBranchSetError) Error() string {
1081	return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1082}
1083
1084type deltaIndexOptionsMismatchError struct {
1085	shardName  string
1086	newOptions HashOptions
1087}
1088
1089func (e *deltaIndexOptionsMismatchError) Error() string {
1090	return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1091}
1092
1093// umask holds the Umask of the current process
1094var umask os.FileMode
1095
1096// Document holds a document (file) to index.
1097type Document struct {
1098	Name              string
1099	Content           []byte
1100	Branches          []string
1101	SubRepositoryPath string
1102	Language          string
1103
1104	// If set, something is wrong with the file contents, and this
1105	// is the reason it wasn't indexed.
1106	SkipReason string
1107
1108	// Document sections for symbols. Offsets should use bytes.
1109	Symbols         []DocumentSection
1110	SymbolsMetaData []*zoekt.Symbol
1111}
1112
1113type DocumentSection struct {
1114	Start, End uint32
1115}
Configure Feed

Configure Feed