build/builder.go at aa6050a078a549b4786b36a0a1a196145d92dec4 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / build / builder.go
at aa6050a078a549b4786b36a0a1a196145d92dec4 30 kB View raw
Stefan Hengl ranking: removing document ranks (#853) 2y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15// package build implements a more convenient interface for building
  16// zoekt indices.
  17package build
  18
  19import (
  20	"crypto/sha1"
  21	"flag"
  22	"fmt"
  23	"io"
  24	"log"
  25	"net/url"
  26	"os"
  27	"os/exec"
  28	"path"
  29	"path/filepath"
  30	"reflect"
  31	"runtime"
  32	"runtime/pprof"
  33	"sort"
  34	"strconv"
  35	"strings"
  36	"sync"
  37	"time"
  38
  39	"github.com/bmatcuk/doublestar"
  40	"github.com/dustin/go-humanize"
  41	"github.com/go-enry/go-enry/v2"
  42	"github.com/rs/xid"
  43
  44	"github.com/sourcegraph/zoekt"
  45	"github.com/sourcegraph/zoekt/ctags"
  46)
  47
  48var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
  49
  50// Branch describes a single branch version.
  51type Branch struct {
  52	Name    string
  53	Version string
  54}
  55
  56// Options sets options for the index building.
  57type Options struct {
  58	// IndexDir is a directory that holds *.zoekt index files.
  59	IndexDir string
  60
  61	// SizeMax is the maximum file size
  62	SizeMax int
  63
  64	// Parallelism is the maximum number of shards to index in parallel
  65	Parallelism int
  66
  67	// ShardMax sets the maximum corpus size for a single shard
  68	ShardMax int
  69
  70	// TrigramMax sets the maximum number of distinct trigrams per document.
  71	TrigramMax int
  72
  73	// RepositoryDescription holds names and URLs for the repository.
  74	RepositoryDescription zoekt.Repository
  75
  76	// SubRepositories is a path => sub repository map.
  77	SubRepositories map[string]*zoekt.Repository
  78
  79	// DisableCTags disables the generation of ctags metadata.
  80	DisableCTags bool
  81
  82	// CtagsPath is the path to the ctags binary to run, or empty
  83	// if a valid binary couldn't be found.
  84	CTagsPath string
  85
  86	// Same as CTagsPath but for scip-ctags
  87	ScipCTagsPath string
  88
  89	// If set, ctags must succeed.
  90	CTagsMustSucceed bool
  91
  92	// LargeFiles is a slice of glob patterns, including ** for any number
  93	// of directories, where matching file paths should be indexed
  94	// regardless of their size. The full pattern syntax is here:
  95	// https://github.com/bmatcuk/doublestar/tree/v1#patterns.
  96	LargeFiles []string
  97
  98	// IsDelta is true if this run contains only the changed documents since the
  99	// last run.
 100	IsDelta bool
 101
 102	// changedOrRemovedFiles is a list of file paths that have been changed or removed
 103	// since the last indexing job for this repository. These files will be tombstoned
 104	// in the older shards for this repository.
 105	changedOrRemovedFiles []string
 106
 107	LanguageMap ctags.LanguageMap
 108
 109	// ShardMerging is true if builder should respect compound shards. This is a
 110	// Sourcegraph specific option.
 111	ShardMerging bool
 112
 113	// HeapProfileTriggerBytes is the heap allocation in bytes that will trigger a memory profile. If 0, no memory profile
 114	// will be triggered. Note this trigger looks at total heap allocation (which includes both inuse and garbage objects).
 115	//
 116	// Profiles will be written to files named `index-memory.prof.n` in the index directory. No more than 10 files are written.
 117	//
 118	// Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile.
 119	HeapProfileTriggerBytes uint64
 120}
 121
 122// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building.
 123type HashOptions struct {
 124	sizeMax          int
 125	disableCTags     bool
 126	ctagsPath        string
 127	cTagsMustSucceed bool
 128	largeFiles       []string
 129}
 130
 131func (o *Options) HashOptions() HashOptions {
 132	return HashOptions{
 133		sizeMax:          o.SizeMax,
 134		disableCTags:     o.DisableCTags,
 135		ctagsPath:        o.CTagsPath,
 136		cTagsMustSucceed: o.CTagsMustSucceed,
 137		largeFiles:       o.LargeFiles,
 138	}
 139}
 140
 141func (o *Options) GetHash() string {
 142	h := o.HashOptions()
 143	hasher := sha1.New()
 144
 145	hasher.Write([]byte(h.ctagsPath))
 146	hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed)))
 147	hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax)))
 148	hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles)))
 149	hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags)))
 150
 151	return fmt.Sprintf("%x", hasher.Sum(nil))
 152}
 153
 154type largeFilesFlag struct{ *Options }
 155
 156func (f largeFilesFlag) String() string {
 157	// From flag.Value documentation:
 158	//
 159	// The flag package may call the String method with a zero-valued receiver,
 160	// such as a nil pointer.
 161	if f.Options == nil {
 162		return ""
 163	}
 164	s := append([]string{""}, f.LargeFiles...)
 165	return strings.Join(s, "-large_file ")
 166}
 167
 168func (f largeFilesFlag) Set(value string) error {
 169	f.LargeFiles = append(f.LargeFiles, value)
 170	return nil
 171}
 172
 173// Flags adds flags for build options to fs. It is the "inverse" of Args.
 174func (o *Options) Flags(fs *flag.FlagSet) {
 175	x := *o
 176	x.SetDefaults()
 177	fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
 178	fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
 179	fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
 180	fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
 181	fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
 182	fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
 183	fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
 184
 185	// Sourcegraph specific
 186	fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
 187	fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.")
 188}
 189
 190// Args generates command line arguments for o. It is the "inverse" of Flags.
 191func (o *Options) Args() []string {
 192	var args []string
 193
 194	if o.SizeMax != 0 {
 195		args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
 196	}
 197
 198	if o.TrigramMax != 0 {
 199		args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
 200	}
 201
 202	if o.ShardMax != 0 {
 203		args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
 204	}
 205
 206	if o.Parallelism != 0 {
 207		args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
 208	}
 209
 210	if o.IndexDir != "" {
 211		args = append(args, "-index", o.IndexDir)
 212	}
 213
 214	if o.CTagsMustSucceed {
 215		args = append(args, "-require_ctags")
 216	}
 217
 218	for _, a := range o.LargeFiles {
 219		args = append(args, "-large_file", a)
 220	}
 221
 222	// Sourcegraph specific
 223	if o.DisableCTags {
 224		args = append(args, "-disable_ctags")
 225	}
 226
 227	if o.ShardMerging {
 228		args = append(args, "-shard_merging")
 229	}
 230
 231	return args
 232}
 233
 234// Builder manages (parallel) creation of uniformly sized shards. The
 235// builder buffers up documents until it collects enough documents and
 236// then builds a shard and writes.
 237type Builder struct {
 238	opts     Options
 239	throttle chan int
 240
 241	nextShardNum int
 242	todo         []*zoekt.Document
 243	docChecker   zoekt.DocChecker
 244	size         int
 245
 246	parserBins ctags.ParserBinMap
 247	building   sync.WaitGroup
 248
 249	errMu      sync.Mutex
 250	buildError error
 251
 252	// temp name => final name for finished shards. We only rename
 253	// them once all shards succeed to avoid Frankstein corpuses.
 254	finishedShards map[string]string
 255
 256	// indexTime is set by tests for doing reproducible builds.
 257	indexTime time.Time
 258
 259	// heapProfileMu is used to ensure that only one memory profile is written at a time
 260	heapProfileMu  sync.Mutex
 261	heapProfileNum int
 262
 263	// a sortable 20 chars long id.
 264	id string
 265
 266	finishCalled bool
 267}
 268
 269type finishedShard struct {
 270	temp, final string
 271}
 272
 273func checkCTags() string {
 274	if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" {
 275		return ctags
 276	}
 277
 278	if ctags, err := exec.LookPath("universal-ctags"); err == nil {
 279		return ctags
 280	}
 281
 282	return ""
 283}
 284
 285func checkScipCTags() string {
 286	if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" {
 287		return ctags
 288	}
 289
 290	if ctags, err := exec.LookPath("scip-ctags"); err == nil {
 291		return ctags
 292	}
 293
 294	return ""
 295}
 296
 297// SetDefaults sets reasonable default options.
 298func (o *Options) SetDefaults() {
 299	if o.CTagsPath == "" && !o.DisableCTags {
 300		o.CTagsPath = checkCTags()
 301	}
 302
 303	if o.ScipCTagsPath == "" && !o.DisableCTags {
 304		o.ScipCTagsPath = checkScipCTags()
 305	}
 306
 307	if o.Parallelism == 0 {
 308		o.Parallelism = 4
 309	}
 310	if o.SizeMax == 0 {
 311		o.SizeMax = 2 << 20
 312	}
 313	if o.ShardMax == 0 {
 314		o.ShardMax = 100 << 20
 315	}
 316	if o.TrigramMax == 0 {
 317		o.TrigramMax = 20000
 318	}
 319
 320	if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
 321		parsed, _ := url.Parse(o.RepositoryDescription.URL)
 322		if parsed != nil {
 323			o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
 324		}
 325	}
 326}
 327
 328func hashString(s string) string {
 329	h := sha1.New()
 330	_, _ = io.WriteString(h, s)
 331	return fmt.Sprintf("%x", h.Sum(nil))
 332}
 333
 334// ShardName returns the name the given index shard.
 335func (o *Options) shardName(n int) string {
 336	return o.shardNameVersion(zoekt.IndexFormatVersion, n)
 337}
 338
 339func (o *Options) shardNameVersion(version, n int) string {
 340	abs := url.QueryEscape(o.RepositoryDescription.Name)
 341	if len(abs) > 200 {
 342		abs = abs[:200] + hashString(abs)[:8]
 343	}
 344	return filepath.Join(o.IndexDir,
 345		fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n))
 346}
 347
 348type IndexState string
 349
 350const (
 351	IndexStateMissing IndexState = "missing"
 352	IndexStateCorrupt IndexState = "corrupt"
 353	IndexStateVersion IndexState = "version-mismatch"
 354	IndexStateOption  IndexState = "option-mismatch"
 355	IndexStateMeta    IndexState = "meta-mismatch"
 356	IndexStateContent IndexState = "content-mismatch"
 357	IndexStateEqual   IndexState = "equal"
 358)
 359
 360var readVersions = []struct {
 361	IndexFormatVersion int
 362	FeatureVersion     int
 363}{{
 364	IndexFormatVersion: zoekt.IndexFormatVersion,
 365	FeatureVersion:     zoekt.FeatureVersion,
 366}, {
 367	IndexFormatVersion: zoekt.NextIndexFormatVersion,
 368	FeatureVersion:     zoekt.FeatureVersion,
 369}}
 370
 371// IncrementalSkipIndexing returns true if the index present on disk matches
 372// the build options.
 373func (o *Options) IncrementalSkipIndexing() bool {
 374	state, _ := o.IndexState()
 375	return state == IndexStateEqual
 376}
 377
 378// IndexState checks how the index present on disk compares to the build
 379// options and returns the IndexState and the name of the first shard.
 380func (o *Options) IndexState() (IndexState, string) {
 381	// Open the latest version we support that is on disk.
 382	fn := o.findShard()
 383	if fn == "" {
 384		return IndexStateMissing, fn
 385	}
 386
 387	repos, index, err := zoekt.ReadMetadataPathAlive(fn)
 388	if os.IsNotExist(err) {
 389		return IndexStateMissing, fn
 390	} else if err != nil {
 391		return IndexStateCorrupt, fn
 392	}
 393
 394	for _, v := range readVersions {
 395		if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
 396			return IndexStateVersion, fn
 397		}
 398	}
 399
 400	var repo *zoekt.Repository
 401	for _, cand := range repos {
 402		if cand.Name == o.RepositoryDescription.Name {
 403			repo = cand
 404			break
 405		}
 406	}
 407
 408	if repo == nil {
 409		return IndexStateCorrupt, fn
 410	}
 411
 412	if repo.IndexOptions != o.GetHash() {
 413		return IndexStateOption, fn
 414	}
 415
 416	if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
 417		return IndexStateContent, fn
 418	}
 419
 420	// We can mutate repo since it lives in the scope of this function call.
 421	if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
 422		// non-nil err means we are trying to update an immutable field =>
 423		// reindex content.
 424		log.Printf("warn: immutable field changed, requires re-index: %s", err)
 425		return IndexStateContent, fn
 426	} else if updated {
 427		return IndexStateMeta, fn
 428	}
 429
 430	return IndexStateEqual, fn
 431}
 432
 433// FindRepositoryMetadata returns the index metadata for the repository
 434// specified in the options. 'ok' is false if the repository's metadata
 435// couldn't be found or if an error occurred.
 436func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) {
 437	shard := o.findShard()
 438	if shard == "" {
 439		return nil, nil, false, nil
 440	}
 441
 442	repositories, metadata, err := zoekt.ReadMetadataPathAlive(shard)
 443	if err != nil {
 444		return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
 445	}
 446
 447	ID := o.RepositoryDescription.ID
 448	for _, r := range repositories {
 449		// compound shards contain multiple repositories, so we
 450		// have to pick only the one we're looking for
 451		if r.ID == ID {
 452			return r, metadata, true, nil
 453		}
 454	}
 455
 456	// If we're here, then we're somehow in a state where we found a matching
 457	// shard that's missing the repository metadata we're looking for. This
 458	// should never happen.
 459	name := o.RepositoryDescription.Name
 460	return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
 461}
 462
 463func (o *Options) findShard() string {
 464	for _, v := range readVersions {
 465		fn := o.shardNameVersion(v.IndexFormatVersion, 0)
 466		if _, err := os.Stat(fn); err == nil {
 467			return fn
 468		}
 469	}
 470
 471	// Brute force finding the shard in compound shards. We should only hit this
 472	// code path for repositories that are not already existing or are in
 473	// compound shards.
 474	//
 475	// TODO add an oracle which can speed this up in the case of repositories
 476	// already in compound shards.
 477	compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
 478	if err != nil {
 479		return ""
 480	}
 481	for _, fn := range compoundShards {
 482		repos, _, err := zoekt.ReadMetadataPathAlive(fn)
 483		if err != nil {
 484			continue
 485		}
 486		for _, repo := range repos {
 487			if repo.ID == o.RepositoryDescription.ID {
 488				return fn
 489			}
 490		}
 491	}
 492
 493	return ""
 494}
 495
 496func (o *Options) FindAllShards() []string {
 497	for _, v := range readVersions {
 498		fn := o.shardNameVersion(v.IndexFormatVersion, 0)
 499		if _, err := os.Stat(fn); err == nil {
 500			shards := []string{fn}
 501			for i := 1; ; i++ {
 502				fn := o.shardNameVersion(v.IndexFormatVersion, i)
 503				if _, err := os.Stat(fn); err != nil {
 504					return shards
 505				}
 506				shards = append(shards, fn)
 507			}
 508		}
 509	}
 510
 511	// lazily fallback to findShard which will look for a compound shard.
 512	if fn := o.findShard(); fn != "" {
 513		return []string{fn}
 514	}
 515
 516	return nil
 517}
 518
 519// IgnoreSizeMax determines whether the max size should be ignored.
 520func (o *Options) IgnoreSizeMax(name string) bool {
 521	// A pattern match will override preceding pattern matches.
 522	for i := len(o.LargeFiles) - 1; i >= 0; i-- {
 523		pattern := strings.TrimSpace(o.LargeFiles[i])
 524		negated, validatedPattern := checkIsNegatePattern(pattern)
 525
 526		if m, _ := doublestar.PathMatch(validatedPattern, name); m {
 527			if negated {
 528				return false
 529			} else {
 530				return true
 531			}
 532		}
 533	}
 534
 535	return false
 536}
 537
 538func checkIsNegatePattern(pattern string) (bool, string) {
 539	negate := "!"
 540
 541	// if negated then strip prefix meta character which identifies negated filter pattern
 542	if strings.HasPrefix(pattern, negate) {
 543		return true, pattern[len(negate):]
 544	}
 545
 546	return false, pattern
 547}
 548
 549// NewBuilder creates a new Builder instance.
 550func NewBuilder(opts Options) (*Builder, error) {
 551	opts.SetDefaults()
 552	if opts.RepositoryDescription.Name == "" {
 553		return nil, fmt.Errorf("builder: must set Name")
 554	}
 555
 556	b := &Builder{
 557		opts:           opts,
 558		throttle:       make(chan int, opts.Parallelism),
 559		finishedShards: map[string]string{},
 560	}
 561
 562	parserBins, err := ctags.NewParserBinMap(
 563		b.opts.CTagsPath,
 564		b.opts.ScipCTagsPath,
 565		opts.LanguageMap,
 566		b.opts.CTagsMustSucceed,
 567	)
 568	if err != nil {
 569		return nil, err
 570	}
 571
 572	b.parserBins = parserBins
 573
 574	if opts.IsDelta {
 575		// Delta shards build on top of previously existing shards.
 576		// As a consequence, the shardNum for delta shards starts from
 577		// the number following the most recently generated shard - not 0.
 578		//
 579		// Using this numbering scheme allows all the shards to be
 580		// discovered as a set.
 581		shards := b.opts.FindAllShards()
 582		b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
 583	}
 584
 585	if _, err := b.newShardBuilder(); err != nil {
 586		return nil, err
 587	}
 588
 589	now := time.Now()
 590	b.indexTime = now
 591	b.id = xid.NewWithTime(now).String()
 592
 593	return b, nil
 594}
 595
 596// AddFile is a convenience wrapper for the Add method
 597func (b *Builder) AddFile(name string, content []byte) error {
 598	return b.Add(zoekt.Document{Name: name, Content: content})
 599}
 600
 601func (b *Builder) Add(doc zoekt.Document) error {
 602	if b.finishCalled {
 603		return nil
 604	}
 605
 606	allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
 607	if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
 608		// We could pass the document on to the shardbuilder, but if
 609		// we pass through a part of the source tree with binary/large
 610		// files, the corresponding shard would be mostly empty, so
 611		// insert a reason here too.
 612		doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
 613	} else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
 614		doc.SkipReason = err.Error()
 615		doc.Language = "binary"
 616	}
 617
 618	b.todo = append(b.todo, &doc)
 619
 620	if doc.SkipReason == "" {
 621		b.size += len(doc.Name) + len(doc.Content)
 622	} else {
 623		b.size += len(doc.Name) + len(doc.SkipReason)
 624		// Drop the content if we are skipping the document. Skipped content is not counted towards the
 625		// shard size limit, so otherwise we might buffer too much data in memory before flushing.
 626		doc.Content = nil
 627	}
 628
 629	if b.size > b.opts.ShardMax {
 630		return b.flush()
 631	}
 632
 633	return nil
 634}
 635
 636// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
 637// has been changed or removed since the last indexing job for this repository.
 638//
 639// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
 640func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
 641	b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
 642}
 643
 644// Finish creates a last shard from the buffered documents, and clears
 645// stale shards from previous runs. This should always be called, also
 646// in failure cases, to ensure cleanup.
 647//
 648// It is safe to call Finish() multiple times.
 649func (b *Builder) Finish() error {
 650	if b.finishCalled {
 651		return b.buildError
 652	}
 653
 654	b.finishCalled = true
 655
 656	b.flush()
 657	b.building.Wait()
 658
 659	if b.buildError != nil {
 660		for tmp := range b.finishedShards {
 661			log.Printf("Builder.Finish %s", tmp)
 662			os.Remove(tmp)
 663		}
 664		b.finishedShards = map[string]string{}
 665		return b.buildError
 666	}
 667
 668	// map of temporary -> final names for all updated shards + shard metadata files
 669	artifactPaths := make(map[string]string)
 670	for tmp, final := range b.finishedShards {
 671		artifactPaths[tmp] = final
 672	}
 673
 674	oldShards := b.opts.FindAllShards()
 675
 676	if b.opts.IsDelta {
 677		// Delta shard builds need to update FileTombstone and branch commit information for all
 678		// existing shards
 679		for _, shard := range oldShards {
 680			repositories, _, err := zoekt.ReadMetadataPathAlive(shard)
 681			if err != nil {
 682				return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
 683			}
 684
 685			if len(repositories) > 1 {
 686				return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
 687			}
 688
 689			if len(repositories) == 0 {
 690				return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
 691			}
 692
 693			repository := repositories[0]
 694			if repository.ID != b.opts.RepositoryDescription.ID {
 695				return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
 696			}
 697
 698			if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
 699				repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
 700			}
 701
 702			for _, f := range b.opts.changedOrRemovedFiles {
 703				repository.FileTombstones[f] = struct{}{}
 704			}
 705
 706			if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
 707				return deltaBranchSetError{
 708					shardName: shard,
 709					old:       repository.Branches,
 710					new:       b.opts.RepositoryDescription.Branches,
 711				}
 712			}
 713
 714			if b.opts.GetHash() != repository.IndexOptions {
 715				return &deltaIndexOptionsMismatchError{
 716					shardName:  shard,
 717					newOptions: b.opts.HashOptions(),
 718				}
 719			}
 720
 721			repository.Branches = b.opts.RepositoryDescription.Branches
 722
 723			repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
 724
 725			tempPath, finalPath, err := zoekt.JsonMarshalRepoMetaTemp(shard, repository)
 726			if err != nil {
 727				return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
 728			}
 729
 730			artifactPaths[tempPath] = finalPath
 731		}
 732	}
 733
 734	// We mark finished shards as empty when we successfully finish. Return now
 735	// to allow call sites to call Finish idempotently.
 736	if len(artifactPaths) == 0 {
 737		return b.buildError
 738	}
 739
 740	// Collect a map of the old shards on disk. For each new shard we replace we
 741	// delete it from toDelete. Anything remaining in toDelete will be removed
 742	// after we have renamed everything into place.
 743
 744	var toDelete map[string]struct{}
 745	if !b.opts.IsDelta {
 746		// Non-delta shard builds delete all existing shards before they write out
 747		// new ones.
 748		// By contrast, delta shard builds work by stacking changes on top of existing shards.
 749		// So, we skip populating the toDelete map if we're building delta shards.
 750
 751		toDelete = make(map[string]struct{})
 752		for _, name := range oldShards {
 753			paths, err := zoekt.IndexFilePaths(name)
 754			if err != nil {
 755				b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
 756			}
 757			for _, p := range paths {
 758				toDelete[p] = struct{}{}
 759			}
 760		}
 761	}
 762
 763	for tmp, final := range artifactPaths {
 764		if err := os.Rename(tmp, final); err != nil {
 765			b.buildError = err
 766			continue
 767		}
 768
 769		delete(toDelete, final)
 770	}
 771
 772	b.finishedShards = map[string]string{}
 773
 774	for p := range toDelete {
 775		// Don't delete compound shards, set tombstones instead.
 776		if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") {
 777			if !strings.HasSuffix(p, ".zoekt") {
 778				continue
 779			}
 780			err := zoekt.SetTombstone(p, b.opts.RepositoryDescription.ID)
 781			b.buildError = err
 782			continue
 783		}
 784		log.Printf("removing old shard file: %s", p)
 785		if err := os.Remove(p); err != nil {
 786			b.buildError = err
 787		}
 788	}
 789
 790	return b.buildError
 791}
 792
 793// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
 794// iff both slices specify the same set of branch names in the same order.
 795func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
 796	if len(a) != len(b) {
 797		return false
 798	}
 799
 800	for i := range a {
 801		x, y := a[i], b[i]
 802		if x.Name != y.Name {
 803			return false
 804		}
 805	}
 806
 807	return true
 808}
 809
 810func (b *Builder) flush() error {
 811	todo := b.todo
 812	b.todo = nil
 813	b.size = 0
 814	b.errMu.Lock()
 815	defer b.errMu.Unlock()
 816	if b.buildError != nil {
 817		return b.buildError
 818	}
 819
 820	hasShard := b.nextShardNum > 0
 821	if len(todo) == 0 && hasShard {
 822		return nil
 823	}
 824
 825	shard := b.nextShardNum
 826	b.nextShardNum++
 827
 828	if b.opts.Parallelism > 1 {
 829		b.building.Add(1)
 830		b.throttle <- 1
 831		go func() {
 832			done, err := b.buildShard(todo, shard)
 833			<-b.throttle
 834
 835			b.errMu.Lock()
 836			defer b.errMu.Unlock()
 837			if err != nil && b.buildError == nil {
 838				b.buildError = err
 839			}
 840			if err == nil {
 841				b.finishedShards[done.temp] = done.final
 842			}
 843			b.building.Done()
 844		}()
 845	} else {
 846		// No goroutines when we're not parallel. This
 847		// simplifies memory profiling.
 848		done, err := b.buildShard(todo, shard)
 849		b.buildError = err
 850		if err == nil {
 851			b.finishedShards[done.temp] = done.final
 852		}
 853
 854		return b.buildError
 855	}
 856
 857	return nil
 858}
 859
 860// map [0,inf) to [0,1) monotonically
 861func squashRange(j int) float64 {
 862	x := float64(j)
 863	return x / (1 + x)
 864}
 865
 866// IsLowPriority takes a file name and makes an educated guess about its priority
 867// in search results. A file is considered low priority if it looks like a test,
 868// vendored, or generated file.
 869//
 870// These 'priority' criteria affects how documents are ordered within a shard. It's
 871// also used to help guess a file's rank when we're missing ranking information.
 872func IsLowPriority(path string, content []byte) bool {
 873	return enry.IsTest(path) || enry.IsVendor(path) || enry.IsGenerated(path, content)
 874}
 875
 876type rankedDoc struct {
 877	*zoekt.Document
 878	rank []float64
 879}
 880
 881// rank returns a vector of scores which is used at index-time to sort documents
 882// before writing them to disk. The order of documents in the shard is important
 883// at query time, because earlier documents receive a boost at query time and
 884// have a higher chance of being searched before limits kick in.
 885func rank(d *zoekt.Document, origIdx int) []float64 {
 886	skipped := 0.0
 887	if d.SkipReason != "" {
 888		skipped = 1.0
 889	}
 890
 891	generated := 0.0
 892	if enry.IsGenerated(d.Name, d.Content) {
 893		generated = 1.0
 894	}
 895
 896	vendor := 0.0
 897	if enry.IsVendor(d.Name) {
 898		vendor = 1.0
 899	}
 900
 901	test := 0.0
 902	if enry.IsTest(d.Name) {
 903		test = 1.0
 904	}
 905
 906	// Smaller is earlier (=better).
 907	return []float64{
 908		// Always place skipped docs last
 909		skipped,
 910
 911		// Prefer docs that are not generated
 912		generated,
 913
 914		// Prefer docs that are not vendored
 915		vendor,
 916
 917		// Prefer docs that are not tests
 918		test,
 919
 920		// With short names
 921		squashRange(len(d.Name)),
 922
 923		// With many symbols
 924		1.0 - squashRange(len(d.Symbols)),
 925
 926		// With short content
 927		squashRange(len(d.Content)),
 928
 929		// That is present is as many branches as possible
 930		1.0 - squashRange(len(d.Branches)),
 931
 932		// Preserve original ordering.
 933		squashRange(origIdx),
 934	}
 935}
 936
 937func sortDocuments(todo []*zoekt.Document) {
 938	rs := make([]rankedDoc, 0, len(todo))
 939	for i, t := range todo {
 940		rd := rankedDoc{t, rank(t, i)}
 941		rs = append(rs, rd)
 942	}
 943	sort.Slice(rs, func(i, j int) bool {
 944		r1 := rs[i].rank
 945		r2 := rs[j].rank
 946		for i := range r1 {
 947			if r1[i] < r2[i] {
 948				return true
 949			}
 950			if r1[i] > r2[i] {
 951				return false
 952			}
 953		}
 954
 955		return false
 956	})
 957	for i := range todo {
 958		todo[i] = rs[i].Document
 959	}
 960}
 961
 962func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) {
 963	if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
 964		err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins)
 965		if b.opts.CTagsMustSucceed && err != nil {
 966			return nil, err
 967		}
 968		if err != nil {
 969			log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err)
 970		}
 971	}
 972
 973	name := b.opts.shardName(nextShardNum)
 974
 975	shardBuilder, err := b.newShardBuilder()
 976	if err != nil {
 977		return nil, err
 978	}
 979
 980	sortDocuments(todo)
 981
 982	for idx, t := range todo {
 983		if err := shardBuilder.Add(*t); err != nil {
 984			return nil, err
 985		}
 986
 987		if idx%10_000 == 0 {
 988			b.CheckMemoryUsage()
 989		}
 990	}
 991
 992	return b.writeShard(name, shardBuilder)
 993}
 994
 995// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the
 996// configured threshold. NOTE: this method is expensive and should only be used for debugging.
 997func (b *Builder) CheckMemoryUsage() {
 998	// Don't check memory if heap profiling is disabled, or we've already written 10 profiles
 999	if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 {
1000		return
1001	}
1002
1003	var m runtime.MemStats
1004	runtime.ReadMemStats(&m)
1005
1006	if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() {
1007		defer b.heapProfileMu.Unlock()
1008
1009		log.Printf("writing memory profile, allocated heap: %s", humanize.Bytes(m.HeapAlloc))
1010		name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum))
1011		f, err := os.Create(name)
1012		if err != nil {
1013			log.Printf("failed to create memory profile file: %v", err)
1014			return
1015		}
1016
1017		err = pprof.WriteHeapProfile(f)
1018		if err != nil {
1019			log.Printf("failed to write memory profile: %v", err)
1020		}
1021
1022		b.heapProfileNum++
1023	}
1024}
1025
1026func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) {
1027	desc := b.opts.RepositoryDescription
1028	desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1029	desc.SubRepoMap = b.opts.SubRepositories
1030	desc.IndexOptions = b.opts.GetHash()
1031
1032	shardBuilder, err := zoekt.NewIndexBuilder(&desc)
1033	if err != nil {
1034		return nil, err
1035	}
1036	shardBuilder.IndexTime = b.indexTime
1037	shardBuilder.ID = b.id
1038	return shardBuilder, nil
1039}
1040
1041func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) {
1042	dir := filepath.Dir(fn)
1043	if err := os.MkdirAll(dir, 0o700); err != nil {
1044		return nil, err
1045	}
1046
1047	f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1048	if err != nil {
1049		return nil, err
1050	}
1051	if runtime.GOOS != "windows" {
1052		if err := f.Chmod(0o666 &^ umask); err != nil {
1053			return nil, err
1054		}
1055	}
1056
1057	defer f.Close()
1058	if err := ib.Write(f); err != nil {
1059		return nil, err
1060	}
1061	fi, err := f.Stat()
1062	if err != nil {
1063		return nil, err
1064	}
1065	if err := f.Close(); err != nil {
1066		return nil, err
1067	}
1068
1069	log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n",
1070		fn,
1071		fi.Size(),
1072		float64(fi.Size())/float64(ib.ContentSize()+1),
1073		ib.NumFiles())
1074
1075	return &finishedShard{f.Name(), fn}, nil
1076}
1077
1078type deltaBranchSetError struct {
1079	shardName string
1080	old, new  []zoekt.RepositoryBranch
1081}
1082
1083func (e deltaBranchSetError) Error() string {
1084	return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1085}
1086
1087type deltaIndexOptionsMismatchError struct {
1088	shardName  string
1089	newOptions HashOptions
1090}
1091
1092func (e *deltaIndexOptionsMismatchError) Error() string {
1093	return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1094}
1095
1096// umask holds the Umask of the current process
1097var umask os.FileMode
Configure Feed

Configure Feed