build/builder.go at ebb3ca24249dfa7d3be2bb48899b86e68bed2f6c · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / build / builder.go
at ebb3ca24249dfa7d3be2bb48899b86e68bed2f6c 30 kB View raw
Keegan Carruthers-Smith all: gofmt -s -w . (#730) 2y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15// package build implements a more convenient interface for building
  16// zoekt indices.
  17package build
  18
  19import (
  20	"crypto/sha1"
  21	"flag"
  22	"fmt"
  23	"io"
  24	"log"
  25	"net/url"
  26	"os"
  27	"os/exec"
  28	"path"
  29	"path/filepath"
  30	"reflect"
  31	"runtime"
  32	"runtime/pprof"
  33	"sort"
  34	"strconv"
  35	"strings"
  36	"sync"
  37	"time"
  38
  39	"github.com/bmatcuk/doublestar"
  40	"github.com/grafana/regexp"
  41	"github.com/rs/xid"
  42
  43	"github.com/sourcegraph/zoekt"
  44	"github.com/sourcegraph/zoekt/ctags"
  45)
  46
  47var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
  48
  49// Branch describes a single branch version.
  50type Branch struct {
  51	Name    string
  52	Version string
  53}
  54
  55// Options sets options for the index building.
  56type Options struct {
  57	// IndexDir is a directory that holds *.zoekt index files.
  58	IndexDir string
  59
  60	// SizeMax is the maximum file size
  61	SizeMax int
  62
  63	// Parallelism is the maximum number of shards to index in parallel
  64	Parallelism int
  65
  66	// ShardMax sets the maximum corpus size for a single shard
  67	ShardMax int
  68
  69	// TrigramMax sets the maximum number of distinct trigrams per document.
  70	TrigramMax int
  71
  72	// RepositoryDescription holds names and URLs for the repository.
  73	RepositoryDescription zoekt.Repository
  74
  75	// SubRepositories is a path => sub repository map.
  76	SubRepositories map[string]*zoekt.Repository
  77
  78	// DisableCTags disables the generation of ctags metadata.
  79	DisableCTags bool
  80
  81	// CtagsPath is the path to the ctags binary to run, or empty
  82	// if a valid binary couldn't be found.
  83	CTagsPath string
  84
  85	// Same as CTagsPath but for scip-ctags
  86	ScipCTagsPath string
  87
  88	// If set, ctags must succeed.
  89	CTagsMustSucceed bool
  90
  91	// Write memory profiles to this file.
  92	MemProfile string
  93
  94	// LargeFiles is a slice of glob patterns, including ** for any number
  95	// of directories, where matching file paths should be indexed
  96	// regardless of their size. The full pattern syntax is here:
  97	// https://github.com/bmatcuk/doublestar/tree/v1#patterns.
  98	LargeFiles []string
  99
 100	// IsDelta is true if this run contains only the changed documents since the
 101	// last run.
 102	IsDelta bool
 103
 104	// DocumentRanksPath is the path to the file with document ranks. If empty,
 105	// ranks will be computed on-the-fly.
 106	DocumentRanksPath string
 107
 108	// DocumentRanksVersion is a string which when changed will cause us to
 109	// reindex a shard. This field is used so that when the contents of
 110	// DocumentRanksPath changes, we can reindex.
 111	DocumentRanksVersion string
 112
 113	// changedOrRemovedFiles is a list of file paths that have been changed or removed
 114	// since the last indexing job for this repository. These files will be tombstoned
 115	// in the older shards for this repository.
 116	changedOrRemovedFiles []string
 117
 118	LanguageMap ctags.LanguageMap
 119}
 120
 121// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building.
 122type HashOptions struct {
 123	sizeMax          int
 124	disableCTags     bool
 125	ctagsPath        string
 126	cTagsMustSucceed bool
 127	largeFiles       []string
 128
 129	// documentRankVersion is an experimental field which will change when the
 130	// DocumentRanksPath content changes. If empty we ignore it.
 131	documentRankVersion string
 132}
 133
 134func (o *Options) HashOptions() HashOptions {
 135	return HashOptions{
 136		sizeMax:             o.SizeMax,
 137		disableCTags:        o.DisableCTags,
 138		ctagsPath:           o.CTagsPath,
 139		cTagsMustSucceed:    o.CTagsMustSucceed,
 140		largeFiles:          o.LargeFiles,
 141		documentRankVersion: o.DocumentRanksVersion,
 142	}
 143}
 144
 145func (o *Options) GetHash() string {
 146	h := o.HashOptions()
 147	hasher := sha1.New()
 148
 149	hasher.Write([]byte(h.ctagsPath))
 150	hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed)))
 151	hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax)))
 152	hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles)))
 153	hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags)))
 154
 155	if h.documentRankVersion != "" {
 156		hasher.Write([]byte{0})
 157		io.WriteString(hasher, h.documentRankVersion)
 158	}
 159
 160	return fmt.Sprintf("%x", hasher.Sum(nil))
 161}
 162
 163type largeFilesFlag struct{ *Options }
 164
 165func (f largeFilesFlag) String() string {
 166	// From flag.Value documentation:
 167	//
 168	// The flag package may call the String method with a zero-valued receiver,
 169	// such as a nil pointer.
 170	if f.Options == nil {
 171		return ""
 172	}
 173	s := append([]string{""}, f.LargeFiles...)
 174	return strings.Join(s, "-large_file ")
 175}
 176
 177func (f largeFilesFlag) Set(value string) error {
 178	f.LargeFiles = append(f.LargeFiles, value)
 179	return nil
 180}
 181
 182// Flags adds flags for build options to fs. It is the "inverse" of Args.
 183func (o *Options) Flags(fs *flag.FlagSet) {
 184	x := *o
 185	x.SetDefaults()
 186	fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
 187	fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
 188	fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
 189	fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
 190	fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
 191	fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
 192	fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
 193	fs.StringVar(&o.MemProfile, "memprofile", "", "write memory profile(s) to `file.shardnum`. Note: sets parallelism to 1.")
 194
 195	// Sourcegraph specific
 196	fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
 197}
 198
 199// Args generates command line arguments for o. It is the "inverse" of Flags.
 200func (o *Options) Args() []string {
 201	var args []string
 202
 203	if o.SizeMax != 0 {
 204		args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
 205	}
 206
 207	if o.TrigramMax != 0 {
 208		args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
 209	}
 210
 211	if o.ShardMax != 0 {
 212		args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
 213	}
 214
 215	if o.Parallelism != 0 {
 216		args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
 217	}
 218
 219	if o.IndexDir != "" {
 220		args = append(args, "-index", o.IndexDir)
 221	}
 222
 223	if o.CTagsMustSucceed {
 224		args = append(args, "-require_ctags")
 225	}
 226
 227	for _, a := range o.LargeFiles {
 228		args = append(args, "-large_file", a)
 229	}
 230
 231	// Sourcegraph specific
 232	if o.DisableCTags {
 233		args = append(args, "-disable_ctags")
 234	}
 235
 236	return args
 237}
 238
 239// Builder manages (parallel) creation of uniformly sized shards. The
 240// builder buffers up documents until it collects enough documents and
 241// then builds a shard and writes.
 242type Builder struct {
 243	opts     Options
 244	throttle chan int
 245
 246	nextShardNum int
 247	todo         []*zoekt.Document
 248	docChecker   zoekt.DocChecker
 249	size         int
 250
 251	parserBins ctags.ParserBinMap
 252	building   sync.WaitGroup
 253
 254	errMu      sync.Mutex
 255	buildError error
 256
 257	// temp name => final name for finished shards. We only rename
 258	// them once all shards succeed to avoid Frankstein corpuses.
 259	finishedShards map[string]string
 260
 261	// indexTime is set by tests for doing reproducible builds.
 262	indexTime time.Time
 263
 264	// a sortable 20 chars long id.
 265	id string
 266
 267	finishCalled bool
 268}
 269
 270type finishedShard struct {
 271	temp, final string
 272}
 273
 274func checkCTags() string {
 275	if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" {
 276		return ctags
 277	}
 278
 279	if ctags, err := exec.LookPath("universal-ctags"); err == nil {
 280		return ctags
 281	}
 282
 283	return ""
 284}
 285
 286func checkScipCTags() string {
 287	if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" {
 288		return ctags
 289	}
 290
 291	if ctags, err := exec.LookPath("scip-ctags"); err == nil {
 292		return ctags
 293	}
 294
 295	return ""
 296}
 297
 298// SetDefaults sets reasonable default options.
 299func (o *Options) SetDefaults() {
 300	if o.CTagsPath == "" && !o.DisableCTags {
 301		o.CTagsPath = checkCTags()
 302	}
 303
 304	if o.ScipCTagsPath == "" && !o.DisableCTags {
 305		o.ScipCTagsPath = checkScipCTags()
 306	}
 307
 308	if o.Parallelism == 0 {
 309		o.Parallelism = 4
 310	}
 311	if o.SizeMax == 0 {
 312		o.SizeMax = 2 << 20
 313	}
 314	if o.ShardMax == 0 {
 315		o.ShardMax = 100 << 20
 316	}
 317	if o.TrigramMax == 0 {
 318		o.TrigramMax = 20000
 319	}
 320
 321	if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
 322		parsed, _ := url.Parse(o.RepositoryDescription.URL)
 323		if parsed != nil {
 324			o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
 325		}
 326	}
 327}
 328
 329func hashString(s string) string {
 330	h := sha1.New()
 331	_, _ = io.WriteString(h, s)
 332	return fmt.Sprintf("%x", h.Sum(nil))
 333}
 334
 335// ShardName returns the name the given index shard.
 336func (o *Options) shardName(n int) string {
 337	return o.shardNameVersion(zoekt.IndexFormatVersion, n)
 338}
 339
 340func (o *Options) shardNameVersion(version, n int) string {
 341	abs := url.QueryEscape(o.RepositoryDescription.Name)
 342	if len(abs) > 200 {
 343		abs = abs[:200] + hashString(abs)[:8]
 344	}
 345	return filepath.Join(o.IndexDir,
 346		fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n))
 347}
 348
 349type IndexState string
 350
 351const (
 352	IndexStateMissing IndexState = "missing"
 353	IndexStateCorrupt IndexState = "corrupt"
 354	IndexStateVersion IndexState = "version-mismatch"
 355	IndexStateOption  IndexState = "option-mismatch"
 356	IndexStateMeta    IndexState = "meta-mismatch"
 357	IndexStateContent IndexState = "content-mismatch"
 358	IndexStateEqual   IndexState = "equal"
 359)
 360
 361var readVersions = []struct {
 362	IndexFormatVersion int
 363	FeatureVersion     int
 364}{{
 365	IndexFormatVersion: zoekt.IndexFormatVersion,
 366	FeatureVersion:     zoekt.FeatureVersion,
 367}, {
 368	IndexFormatVersion: zoekt.NextIndexFormatVersion,
 369	FeatureVersion:     zoekt.FeatureVersion,
 370}}
 371
 372// IncrementalSkipIndexing returns true if the index present on disk matches
 373// the build options.
 374func (o *Options) IncrementalSkipIndexing() bool {
 375	state, _ := o.IndexState()
 376	return state == IndexStateEqual
 377}
 378
 379// IndexState checks how the index present on disk compares to the build
 380// options and returns the IndexState and the name of the first shard.
 381func (o *Options) IndexState() (IndexState, string) {
 382	// Open the latest version we support that is on disk.
 383	fn := o.findShard()
 384	if fn == "" {
 385		return IndexStateMissing, fn
 386	}
 387
 388	repos, index, err := zoekt.ReadMetadataPathAlive(fn)
 389	if os.IsNotExist(err) {
 390		return IndexStateMissing, fn
 391	} else if err != nil {
 392		return IndexStateCorrupt, fn
 393	}
 394
 395	for _, v := range readVersions {
 396		if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
 397			return IndexStateVersion, fn
 398		}
 399	}
 400
 401	var repo *zoekt.Repository
 402	for _, cand := range repos {
 403		if cand.Name == o.RepositoryDescription.Name {
 404			repo = cand
 405			break
 406		}
 407	}
 408
 409	if repo == nil {
 410		return IndexStateCorrupt, fn
 411	}
 412
 413	if repo.IndexOptions != o.GetHash() {
 414		return IndexStateOption, fn
 415	}
 416
 417	if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
 418		return IndexStateContent, fn
 419	}
 420
 421	// We can mutate repo since it lives in the scope of this function call.
 422	if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
 423		// non-nil err means we are trying to update an immutable field =>
 424		// reindex content.
 425		log.Printf("warn: immutable field changed, requires re-index: %s", err)
 426		return IndexStateContent, fn
 427	} else if updated {
 428		return IndexStateMeta, fn
 429	}
 430
 431	return IndexStateEqual, fn
 432}
 433
 434// FindRepositoryMetadata returns the index metadata for the repository
 435// specified in the options. 'ok' is false if the repository's metadata
 436// couldn't be found or if an error occurred.
 437func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) {
 438	shard := o.findShard()
 439	if shard == "" {
 440		return nil, nil, false, nil
 441	}
 442
 443	repositories, metadata, err := zoekt.ReadMetadataPathAlive(shard)
 444	if err != nil {
 445		return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
 446	}
 447
 448	ID := o.RepositoryDescription.ID
 449	for _, r := range repositories {
 450		// compound shards contain multiple repositories, so we
 451		// have to pick only the one we're looking for
 452		if r.ID == ID {
 453			return r, metadata, true, nil
 454		}
 455	}
 456
 457	// If we're here, then we're somehow in a state where we found a matching
 458	// shard that's missing the repository metadata we're looking for. This
 459	// should never happen.
 460	name := o.RepositoryDescription.Name
 461	return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
 462}
 463
 464func (o *Options) findShard() string {
 465	for _, v := range readVersions {
 466		fn := o.shardNameVersion(v.IndexFormatVersion, 0)
 467		if _, err := os.Stat(fn); err == nil {
 468			return fn
 469		}
 470	}
 471
 472	// Brute force finding the shard in compound shards. We should only hit this
 473	// code path for repositories that are not already existing or are in
 474	// compound shards.
 475	//
 476	// TODO add an oracle which can speed this up in the case of repositories
 477	// already in compound shards.
 478	compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
 479	if err != nil {
 480		return ""
 481	}
 482	for _, fn := range compoundShards {
 483		repos, _, err := zoekt.ReadMetadataPathAlive(fn)
 484		if err != nil {
 485			continue
 486		}
 487		for _, repo := range repos {
 488			if repo.ID == o.RepositoryDescription.ID {
 489				return fn
 490			}
 491		}
 492	}
 493
 494	return ""
 495}
 496
 497func (o *Options) FindAllShards() []string {
 498	for _, v := range readVersions {
 499		fn := o.shardNameVersion(v.IndexFormatVersion, 0)
 500		if _, err := os.Stat(fn); err == nil {
 501			shards := []string{fn}
 502			for i := 1; ; i++ {
 503				fn := o.shardNameVersion(v.IndexFormatVersion, i)
 504				if _, err := os.Stat(fn); err != nil {
 505					return shards
 506				}
 507				shards = append(shards, fn)
 508			}
 509		}
 510	}
 511
 512	// lazily fallback to findShard which will look for a compound shard.
 513	if fn := o.findShard(); fn != "" {
 514		return []string{fn}
 515	}
 516
 517	return nil
 518}
 519
 520// IgnoreSizeMax determines whether the max size should be ignored.
 521func (o *Options) IgnoreSizeMax(name string) bool {
 522	// A pattern match will override preceding pattern matches.
 523	for i := len(o.LargeFiles) - 1; i >= 0; i-- {
 524		pattern := strings.TrimSpace(o.LargeFiles[i])
 525		negated, validatedPattern := checkIsNegatePattern(pattern)
 526
 527		if m, _ := doublestar.PathMatch(validatedPattern, name); m {
 528			if negated {
 529				return false
 530			} else {
 531				return true
 532			}
 533		}
 534	}
 535
 536	return false
 537}
 538
 539func checkIsNegatePattern(pattern string) (bool, string) {
 540	negate := "!"
 541
 542	// if negated then strip prefix meta character which identifies negated filter pattern
 543	if strings.HasPrefix(pattern, negate) {
 544		return true, pattern[len(negate):]
 545	}
 546
 547	return false, pattern
 548}
 549
 550// NewBuilder creates a new Builder instance.
 551func NewBuilder(opts Options) (*Builder, error) {
 552	opts.SetDefaults()
 553	if opts.RepositoryDescription.Name == "" {
 554		return nil, fmt.Errorf("builder: must set Name")
 555	}
 556
 557	b := &Builder{
 558		opts:           opts,
 559		throttle:       make(chan int, opts.Parallelism),
 560		finishedShards: map[string]string{},
 561	}
 562
 563	parserBins, err := ctags.NewParserBinMap(
 564		b.opts.CTagsPath,
 565		b.opts.ScipCTagsPath,
 566		opts.LanguageMap,
 567		b.opts.CTagsMustSucceed,
 568	)
 569	if err != nil {
 570		return nil, err
 571	}
 572
 573	b.parserBins = parserBins
 574
 575	if opts.IsDelta {
 576		// Delta shards build on top of previously existing shards.
 577		// As a consequence, the shardNum for delta shards starts from
 578		// the number following the most recently generated shard - not 0.
 579		//
 580		// Using this numbering scheme allows all the shards to be
 581		// discovered as a set.
 582		shards := b.opts.FindAllShards()
 583		b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
 584	}
 585
 586	if _, err := b.newShardBuilder(); err != nil {
 587		return nil, err
 588	}
 589
 590	now := time.Now()
 591	b.indexTime = now
 592	b.id = xid.NewWithTime(now).String()
 593
 594	return b, nil
 595}
 596
 597// AddFile is a convenience wrapper for the Add method
 598func (b *Builder) AddFile(name string, content []byte) error {
 599	return b.Add(zoekt.Document{Name: name, Content: content})
 600}
 601
 602func (b *Builder) Add(doc zoekt.Document) error {
 603	if b.finishCalled {
 604		return nil
 605	}
 606
 607	allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
 608	if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
 609		// We could pass the document on to the shardbuilder, but if
 610		// we pass through a part of the source tree with binary/large
 611		// files, the corresponding shard would be mostly empty, so
 612		// insert a reason here too.
 613		doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
 614	} else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
 615		doc.SkipReason = err.Error()
 616		doc.Language = "binary"
 617	}
 618
 619	b.todo = append(b.todo, &doc)
 620
 621	if doc.SkipReason == "" {
 622		b.size += len(doc.Name) + len(doc.Content)
 623	} else {
 624		b.size += len(doc.Name) + len(doc.SkipReason)
 625		// Drop the content if we are skipping the document. Skipped content is not counted towards the
 626		// shard size limit, so otherwise we might buffer too much data in memory before flushing.
 627		doc.Content = nil
 628	}
 629
 630	if b.size > b.opts.ShardMax {
 631		return b.flush()
 632	}
 633
 634	return nil
 635}
 636
 637// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
 638// has been changed or removed since the last indexing job for this repository.
 639//
 640// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
 641func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
 642	b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
 643}
 644
 645// Finish creates a last shard from the buffered documents, and clears
 646// stale shards from previous runs. This should always be called, also
 647// in failure cases, to ensure cleanup.
 648//
 649// It is safe to call Finish() multiple times.
 650func (b *Builder) Finish() error {
 651	if b.finishCalled {
 652		return b.buildError
 653	}
 654
 655	b.finishCalled = true
 656
 657	b.flush()
 658	b.building.Wait()
 659
 660	if b.buildError != nil {
 661		for tmp := range b.finishedShards {
 662			log.Printf("Builder.Finish %s", tmp)
 663			os.Remove(tmp)
 664		}
 665		b.finishedShards = map[string]string{}
 666		return b.buildError
 667	}
 668
 669	// map of temporary -> final names for all updated shards + shard metadata files
 670	artifactPaths := make(map[string]string)
 671	for tmp, final := range b.finishedShards {
 672		artifactPaths[tmp] = final
 673	}
 674
 675	oldShards := b.opts.FindAllShards()
 676
 677	if b.opts.IsDelta {
 678		// Delta shard builds need to update FileTombstone and branch commit information for all
 679		// existing shards
 680		for _, shard := range oldShards {
 681			repositories, _, err := zoekt.ReadMetadataPathAlive(shard)
 682			if err != nil {
 683				return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
 684			}
 685
 686			if len(repositories) > 1 {
 687				return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
 688			}
 689
 690			if len(repositories) == 0 {
 691				return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
 692			}
 693
 694			repository := repositories[0]
 695			if repository.ID != b.opts.RepositoryDescription.ID {
 696				return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
 697			}
 698
 699			if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
 700				repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
 701			}
 702
 703			for _, f := range b.opts.changedOrRemovedFiles {
 704				repository.FileTombstones[f] = struct{}{}
 705			}
 706
 707			if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
 708				return deltaBranchSetError{
 709					shardName: shard,
 710					old:       repository.Branches,
 711					new:       b.opts.RepositoryDescription.Branches,
 712				}
 713			}
 714
 715			if b.opts.GetHash() != repository.IndexOptions {
 716				return &deltaIndexOptionsMismatchError{
 717					shardName:  shard,
 718					newOptions: b.opts.HashOptions(),
 719				}
 720			}
 721
 722			repository.Branches = b.opts.RepositoryDescription.Branches
 723
 724			repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
 725
 726			tempPath, finalPath, err := zoekt.JsonMarshalRepoMetaTemp(shard, repository)
 727			if err != nil {
 728				return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
 729			}
 730
 731			artifactPaths[tempPath] = finalPath
 732		}
 733	}
 734
 735	// We mark finished shards as empty when we successfully finish. Return now
 736	// to allow call sites to call Finish idempotently.
 737	if len(artifactPaths) == 0 {
 738		return b.buildError
 739	}
 740
 741	// Collect a map of the old shards on disk. For each new shard we replace we
 742	// delete it from toDelete. Anything remaining in toDelete will be removed
 743	// after we have renamed everything into place.
 744
 745	var toDelete map[string]struct{}
 746	if !b.opts.IsDelta {
 747		// Non-delta shard builds delete all existing shards before they write out
 748		// new ones.
 749		// By contrast, delta shard builds work by stacking changes on top of existing shards.
 750		// So, we skip populating the toDelete map if we're building delta shards.
 751
 752		toDelete = make(map[string]struct{})
 753		for _, name := range oldShards {
 754			paths, err := zoekt.IndexFilePaths(name)
 755			if err != nil {
 756				b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
 757			}
 758			for _, p := range paths {
 759				toDelete[p] = struct{}{}
 760			}
 761		}
 762	}
 763
 764	for tmp, final := range artifactPaths {
 765		if err := os.Rename(tmp, final); err != nil {
 766			b.buildError = err
 767			continue
 768		}
 769
 770		delete(toDelete, final)
 771	}
 772
 773	b.finishedShards = map[string]string{}
 774
 775	for p := range toDelete {
 776		// Don't delete compound shards, set tombstones instead.
 777		if zoekt.ShardMergingEnabled() && strings.HasPrefix(filepath.Base(p), "compound-") {
 778			if !strings.HasSuffix(p, ".zoekt") {
 779				continue
 780			}
 781			err := zoekt.SetTombstone(p, b.opts.RepositoryDescription.ID)
 782			b.buildError = err
 783			continue
 784		}
 785		log.Printf("removing old shard file: %s", p)
 786		if err := os.Remove(p); err != nil {
 787			b.buildError = err
 788		}
 789	}
 790
 791	return b.buildError
 792}
 793
 794// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
 795// iff both slices specify the same set of branch names in the same order.
 796func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
 797	if len(a) != len(b) {
 798		return false
 799	}
 800
 801	for i := range a {
 802		x, y := a[i], b[i]
 803		if x.Name != y.Name {
 804			return false
 805		}
 806	}
 807
 808	return true
 809}
 810
 811func (b *Builder) flush() error {
 812	todo := b.todo
 813	b.todo = nil
 814	b.size = 0
 815	b.errMu.Lock()
 816	defer b.errMu.Unlock()
 817	if b.buildError != nil {
 818		return b.buildError
 819	}
 820
 821	hasShard := b.nextShardNum > 0
 822	if len(todo) == 0 && hasShard {
 823		return nil
 824	}
 825
 826	shard := b.nextShardNum
 827	b.nextShardNum++
 828
 829	if b.opts.Parallelism > 1 && b.opts.MemProfile == "" {
 830		b.building.Add(1)
 831		b.throttle <- 1
 832		go func() {
 833			done, err := b.buildShard(todo, shard)
 834			<-b.throttle
 835
 836			b.errMu.Lock()
 837			defer b.errMu.Unlock()
 838			if err != nil && b.buildError == nil {
 839				b.buildError = err
 840			}
 841			if err == nil {
 842				b.finishedShards[done.temp] = done.final
 843			}
 844			b.building.Done()
 845		}()
 846	} else {
 847		// No goroutines when we're not parallel. This
 848		// simplifies memory profiling.
 849		done, err := b.buildShard(todo, shard)
 850		b.buildError = err
 851		if err == nil {
 852			b.finishedShards[done.temp] = done.final
 853		}
 854		if b.opts.MemProfile != "" {
 855			// drop memory, and profile.
 856			todo = nil
 857			b.writeMemProfile(b.opts.MemProfile)
 858		}
 859
 860		return b.buildError
 861	}
 862
 863	return nil
 864}
 865
 866var profileNumber int
 867
 868func (b *Builder) writeMemProfile(name string) {
 869	nm := fmt.Sprintf("%s.%d", name, profileNumber)
 870	profileNumber++
 871	f, err := os.Create(nm)
 872	if err != nil {
 873		log.Fatal("could not create memory profile: ", err)
 874	}
 875	runtime.GC() // get up-to-date statistics
 876	if err := pprof.WriteHeapProfile(f); err != nil {
 877		log.Fatal("could not write memory profile: ", err)
 878	}
 879	f.Close()
 880	log.Printf("wrote mem profile %q", nm)
 881}
 882
 883// map [0,inf) to [0,1) monotonically
 884func squashRange(j int) float64 {
 885	x := float64(j)
 886	return x / (1 + x)
 887}
 888
 889// IsLowPriority takes a file name and makes an educated guess about its priority
 890// in search results. A file is considered low priority if it looks like a test,
 891// vendored, or generated file.
 892//
 893// These 'priority' criteria affects how documents are ordered within a shard. It's
 894// also used to help guess a file's rank when we're missing ranking information.
 895func IsLowPriority(file string) bool {
 896	return testRe.MatchString(file) || isGenerated(file) || isVendored(file)
 897}
 898
 899var testRe = regexp.MustCompile("[Tt]est")
 900
 901func isGenerated(file string) bool {
 902	return strings.HasSuffix(file, "min.js") || strings.HasSuffix(file, "js.map")
 903}
 904
 905func isVendored(file string) bool {
 906	return strings.Contains(file, "vendor/") || strings.Contains(file, "node_modules/")
 907}
 908
 909type rankedDoc struct {
 910	*zoekt.Document
 911	rank []float64
 912}
 913
 914// rank returns a vector of scores which is used at index-time to sort documents
 915// before writing them to disk. The order of documents in the shard is important
 916// at query time, because earlier documents receive a boost at query time and
 917// have a higher chance of being searched before limits kick in.
 918func rank(d *zoekt.Document, origIdx int) []float64 {
 919	skipped := 0.0
 920	if d.SkipReason != "" {
 921		skipped = 1.0
 922	}
 923
 924	generated := 0.0
 925	if isGenerated(d.Name) {
 926		generated = 1.0
 927	}
 928
 929	vendor := 0.0
 930	if isVendored(d.Name) {
 931		vendor = 1.0
 932	}
 933
 934	test := 0.0
 935	if testRe.MatchString(d.Name) {
 936		test = 1.0
 937	}
 938
 939	// Smaller is earlier (=better).
 940	return []float64{
 941		// Always place skipped docs last
 942		skipped,
 943
 944		// Prefer docs that are not generated
 945		generated,
 946
 947		// Prefer docs that are not vendored
 948		vendor,
 949
 950		// Prefer docs that are not tests
 951		test,
 952
 953		// With short names
 954		squashRange(len(d.Name)),
 955
 956		// With many symbols
 957		1.0 - squashRange(len(d.Symbols)),
 958
 959		// With short content
 960		squashRange(len(d.Content)),
 961
 962		// That is present is as many branches as possible
 963		1.0 - squashRange(len(d.Branches)),
 964
 965		// Preserve original ordering.
 966		squashRange(origIdx),
 967	}
 968}
 969
 970func sortDocuments(todo []*zoekt.Document) {
 971	rs := make([]rankedDoc, 0, len(todo))
 972	for i, t := range todo {
 973		rd := rankedDoc{t, rank(t, i)}
 974		rs = append(rs, rd)
 975	}
 976	sort.Slice(rs, func(i, j int) bool {
 977		r1 := rs[i].rank
 978		r2 := rs[j].rank
 979		for i := range r1 {
 980			if r1[i] < r2[i] {
 981				return true
 982			}
 983			if r1[i] > r2[i] {
 984				return false
 985			}
 986		}
 987
 988		return false
 989	})
 990	for i := range todo {
 991		todo[i] = rs[i].Document
 992	}
 993}
 994
 995func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) {
 996	if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
 997		err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins)
 998		if b.opts.CTagsMustSucceed && err != nil {
 999			return nil, err
1000		}
1001		if err != nil {
1002			log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err)
1003		}
1004	}
1005
1006	name := b.opts.shardName(nextShardNum)
1007
1008	shardBuilder, err := b.newShardBuilder()
1009	if err != nil {
1010		return nil, err
1011	}
1012
1013	sortDocuments(todo)
1014
1015	for _, t := range todo {
1016		if err := shardBuilder.Add(*t); err != nil {
1017			return nil, err
1018		}
1019	}
1020
1021	return b.writeShard(name, shardBuilder)
1022}
1023
1024func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) {
1025	desc := b.opts.RepositoryDescription
1026	desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1027	desc.SubRepoMap = b.opts.SubRepositories
1028	desc.IndexOptions = b.opts.GetHash()
1029
1030	shardBuilder, err := zoekt.NewIndexBuilder(&desc)
1031	if err != nil {
1032		return nil, err
1033	}
1034	shardBuilder.IndexTime = b.indexTime
1035	shardBuilder.ID = b.id
1036	return shardBuilder, nil
1037}
1038
1039func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) {
1040	dir := filepath.Dir(fn)
1041	if err := os.MkdirAll(dir, 0o700); err != nil {
1042		return nil, err
1043	}
1044
1045	f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1046	if err != nil {
1047		return nil, err
1048	}
1049	if runtime.GOOS != "windows" {
1050		if err := f.Chmod(0o666 &^ umask); err != nil {
1051			return nil, err
1052		}
1053	}
1054
1055	defer f.Close()
1056	if err := ib.Write(f); err != nil {
1057		return nil, err
1058	}
1059	fi, err := f.Stat()
1060	if err != nil {
1061		return nil, err
1062	}
1063	if err := f.Close(); err != nil {
1064		return nil, err
1065	}
1066
1067	log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n",
1068		fn,
1069		fi.Size(),
1070		float64(fi.Size())/float64(ib.ContentSize()+1),
1071		ib.NumFiles())
1072
1073	return &finishedShard{f.Name(), fn}, nil
1074}
1075
1076type deltaBranchSetError struct {
1077	shardName string
1078	old, new  []zoekt.RepositoryBranch
1079}
1080
1081func (e deltaBranchSetError) Error() string {
1082	return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1083}
1084
1085type deltaIndexOptionsMismatchError struct {
1086	shardName  string
1087	newOptions HashOptions
1088}
1089
1090func (e *deltaIndexOptionsMismatchError) Error() string {
1091	return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1092}
1093
1094// umask holds the Umask of the current process
1095var umask os.FileMode
Configure Feed

Configure Feed