index/builder.go at b97d734760cec3ddde795683731e48a9e674ebb7 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / builder.go
at b97d734760cec3ddde795683731e48a9e674ebb7 30 kB View raw
Julie Tibshirani ranking: incorporate file signals into BM25F (#922) 1y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15// Package index contains logic for building Zoekt indexes. NOTE: this package is not considered
  16// part of the public API, and it is not recommended to rely on it in external code.
  17package index
  18
  19import (
  20	"cmp"
  21	"crypto/sha1"
  22	"flag"
  23	"fmt"
  24	"log"
  25	"net/url"
  26	"os"
  27	"os/exec"
  28	"path"
  29	"path/filepath"
  30	"reflect"
  31	"runtime"
  32	"runtime/pprof"
  33	"sort"
  34	"strconv"
  35	"strings"
  36	"sync"
  37	"time"
  38
  39	"github.com/bmatcuk/doublestar"
  40	"github.com/dustin/go-humanize"
  41	"github.com/go-enry/go-enry/v2"
  42	"github.com/rs/xid"
  43	"golang.org/x/sys/unix"
  44
  45	"github.com/sourcegraph/zoekt"
  46	"github.com/sourcegraph/zoekt/internal/ctags"
  47)
  48
  49var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
  50
  51// Branch describes a single branch version.
  52type Branch struct {
  53	Name    string
  54	Version string
  55}
  56
  57// Options sets options for the index building.
  58type Options struct {
  59	// IndexDir is a directory that holds *.zoekt index files.
  60	IndexDir string
  61
  62	// SizeMax is the maximum file size
  63	SizeMax int
  64
  65	// Parallelism is the maximum number of shards to index in parallel
  66	Parallelism int
  67
  68	// ShardMax sets the maximum corpus size for a single shard
  69	ShardMax int
  70
  71	// TrigramMax sets the maximum number of distinct trigrams per document.
  72	TrigramMax int
  73
  74	// RepositoryDescription holds names and URLs for the repository.
  75	RepositoryDescription zoekt.Repository
  76
  77	// SubRepositories is a path => sub repository map.
  78	SubRepositories map[string]*zoekt.Repository
  79
  80	// DisableCTags disables the generation of ctags metadata.
  81	DisableCTags bool
  82
  83	// CtagsPath is the path to the ctags binary to run, or empty
  84	// if a valid binary couldn't be found.
  85	CTagsPath string
  86
  87	// Same as CTagsPath but for scip-ctags
  88	ScipCTagsPath string
  89
  90	// If set, ctags must succeed.
  91	CTagsMustSucceed bool
  92
  93	// LargeFiles is a slice of glob patterns, including ** for any number
  94	// of directories, where matching file paths should be indexed
  95	// regardless of their size. The full pattern syntax is here:
  96	// https://github.com/bmatcuk/doublestar/tree/v1#patterns.
  97	LargeFiles []string
  98
  99	// IsDelta is true if this run contains only the changed documents since the
 100	// last run.
 101	IsDelta bool
 102
 103	// changedOrRemovedFiles is a list of file paths that have been changed or removed
 104	// since the last indexing job for this repository. These files will be tombstoned
 105	// in the older shards for this repository.
 106	changedOrRemovedFiles []string
 107
 108	LanguageMap ctags.LanguageMap
 109
 110	// ShardMerging is true if builder should respect compound shards. This is a
 111	// Sourcegraph specific option.
 112	ShardMerging bool
 113
 114	// HeapProfileTriggerBytes is the heap allocation in bytes that will trigger a memory profile. If 0, no memory profile
 115	// will be triggered. Note this trigger looks at total heap allocation (which includes both inuse and garbage objects).
 116	//
 117	// Profiles will be written to files named `index-memory.prof.n` in the index directory. No more than 10 files are written.
 118	//
 119	// Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile.
 120	HeapProfileTriggerBytes uint64
 121
 122	// ShardPrefix is the prefix of the shard. It defaults to the repository name.
 123	ShardPrefix string
 124}
 125
 126// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building.
 127type HashOptions struct {
 128	sizeMax          int
 129	disableCTags     bool
 130	ctagsPath        string
 131	cTagsMustSucceed bool
 132	largeFiles       []string
 133}
 134
 135func (o *Options) HashOptions() HashOptions {
 136	return HashOptions{
 137		sizeMax:          o.SizeMax,
 138		disableCTags:     o.DisableCTags,
 139		ctagsPath:        o.CTagsPath,
 140		cTagsMustSucceed: o.CTagsMustSucceed,
 141		largeFiles:       o.LargeFiles,
 142	}
 143}
 144
 145func (o *Options) GetHash() string {
 146	h := o.HashOptions()
 147	hasher := sha1.New()
 148
 149	hasher.Write([]byte(h.ctagsPath))
 150	hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed)))
 151	hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax)))
 152	hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles)))
 153	hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags)))
 154
 155	return fmt.Sprintf("%x", hasher.Sum(nil))
 156}
 157
 158type largeFilesFlag struct{ *Options }
 159
 160func (f largeFilesFlag) String() string {
 161	// From flag.Value documentation:
 162	//
 163	// The flag package may call the String method with a zero-valued receiver,
 164	// such as a nil pointer.
 165	if f.Options == nil {
 166		return ""
 167	}
 168	s := append([]string{""}, f.LargeFiles...)
 169	return strings.Join(s, "-large_file ")
 170}
 171
 172func (f largeFilesFlag) Set(value string) error {
 173	f.LargeFiles = append(f.LargeFiles, value)
 174	return nil
 175}
 176
 177// Flags adds flags for build options to fs. It is the "inverse" of Args.
 178func (o *Options) Flags(fs *flag.FlagSet) {
 179	x := *o
 180	x.SetDefaults()
 181	fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
 182	fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
 183	fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
 184	fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
 185	fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
 186	fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
 187	fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
 188	fs.StringVar(&o.ShardPrefix, "shard_prefix", x.ShardPrefix, "the prefix of the shard. Defaults to repository name")
 189
 190	// Sourcegraph specific
 191	fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
 192	fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.")
 193}
 194
 195// Args generates command line arguments for o. It is the "inverse" of Flags.
 196func (o *Options) Args() []string {
 197	var args []string
 198
 199	if o.SizeMax != 0 {
 200		args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
 201	}
 202
 203	if o.TrigramMax != 0 {
 204		args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
 205	}
 206
 207	if o.ShardMax != 0 {
 208		args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
 209	}
 210
 211	if o.Parallelism != 0 {
 212		args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
 213	}
 214
 215	if o.IndexDir != "" {
 216		args = append(args, "-index", o.IndexDir)
 217	}
 218
 219	if o.CTagsMustSucceed {
 220		args = append(args, "-require_ctags")
 221	}
 222
 223	for _, a := range o.LargeFiles {
 224		args = append(args, "-large_file", a)
 225	}
 226
 227	// Sourcegraph specific
 228	if o.DisableCTags {
 229		args = append(args, "-disable_ctags")
 230	}
 231
 232	if o.ShardMerging {
 233		args = append(args, "-shard_merging")
 234	}
 235
 236	if o.ShardPrefix != "" {
 237		args = append(args, "-shard_prefix", o.ShardPrefix)
 238	}
 239
 240	return args
 241}
 242
 243// Builder manages (parallel) creation of uniformly sized shards. The
 244// builder buffers up documents until it collects enough documents and
 245// then builds a shard and writes.
 246type Builder struct {
 247	opts     Options
 248	throttle chan int
 249
 250	nextShardNum int
 251	todo         []*Document
 252	docChecker   DocChecker
 253	size         int
 254
 255	parserBins ctags.ParserBinMap
 256	building   sync.WaitGroup
 257
 258	errMu      sync.Mutex
 259	buildError error
 260
 261	// temp name => final name for finished shards. We only rename
 262	// them once all shards succeed to avoid Frankstein corpuses.
 263	finishedShards map[string]string
 264
 265	// indexTime is set by tests for doing reproducible builds.
 266	indexTime time.Time
 267
 268	// heapProfileMu is used to ensure that only one memory profile is written at a time
 269	heapProfileMu  sync.Mutex
 270	heapProfileNum int
 271
 272	// a sortable 20 chars long id.
 273	id string
 274
 275	finishCalled bool
 276}
 277
 278type finishedShard struct {
 279	temp, final string
 280}
 281
 282func checkCTags() string {
 283	if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" {
 284		return ctags
 285	}
 286
 287	if ctags, err := exec.LookPath("universal-ctags"); err == nil {
 288		return ctags
 289	}
 290
 291	return ""
 292}
 293
 294func checkScipCTags() string {
 295	if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" {
 296		return ctags
 297	}
 298
 299	if ctags, err := exec.LookPath("scip-ctags"); err == nil {
 300		return ctags
 301	}
 302
 303	return ""
 304}
 305
 306// SetDefaults sets reasonable default options.
 307func (o *Options) SetDefaults() {
 308	if o.CTagsPath == "" && !o.DisableCTags {
 309		o.CTagsPath = checkCTags()
 310	}
 311
 312	if o.ScipCTagsPath == "" && !o.DisableCTags {
 313		o.ScipCTagsPath = checkScipCTags()
 314	}
 315
 316	if o.Parallelism == 0 {
 317		o.Parallelism = 4
 318	}
 319	if o.SizeMax == 0 {
 320		o.SizeMax = 2 << 20
 321	}
 322	if o.ShardMax == 0 {
 323		o.ShardMax = 100 << 20
 324	}
 325	if o.TrigramMax == 0 {
 326		o.TrigramMax = 20000
 327	}
 328
 329	if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
 330		parsed, _ := url.Parse(o.RepositoryDescription.URL)
 331		if parsed != nil {
 332			o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
 333		}
 334	}
 335}
 336
 337// ShardName returns the name the given index shard.
 338func (o *Options) shardName(n int) string {
 339	return o.shardNameVersion(IndexFormatVersion, n)
 340}
 341
 342func (o *Options) shardNameVersion(version, n int) string {
 343	return ShardName(o.IndexDir, cmp.Or(o.ShardPrefix, o.RepositoryDescription.Name), version, n)
 344}
 345
 346type IndexState string
 347
 348const (
 349	IndexStateMissing IndexState = "missing"
 350	IndexStateCorrupt IndexState = "corrupt"
 351	IndexStateVersion IndexState = "version-mismatch"
 352	IndexStateOption  IndexState = "option-mismatch"
 353	IndexStateMeta    IndexState = "meta-mismatch"
 354	IndexStateContent IndexState = "content-mismatch"
 355	IndexStateEqual   IndexState = "equal"
 356)
 357
 358var readVersions = []struct {
 359	IndexFormatVersion int
 360	FeatureVersion     int
 361}{{
 362	IndexFormatVersion: IndexFormatVersion,
 363	FeatureVersion:     FeatureVersion,
 364}, {
 365	IndexFormatVersion: NextIndexFormatVersion,
 366	FeatureVersion:     FeatureVersion,
 367}}
 368
 369// IncrementalSkipIndexing returns true if the index present on disk matches
 370// the build options.
 371func (o *Options) IncrementalSkipIndexing() bool {
 372	state, _ := o.IndexState()
 373	return state == IndexStateEqual
 374}
 375
 376// IndexState checks how the index present on disk compares to the build
 377// options and returns the IndexState and the name of the first shard.
 378func (o *Options) IndexState() (IndexState, string) {
 379	// Open the latest version we support that is on disk.
 380	fn := o.findShard()
 381	if fn == "" {
 382		return IndexStateMissing, fn
 383	}
 384
 385	repos, index, err := ReadMetadataPathAlive(fn)
 386	if os.IsNotExist(err) {
 387		return IndexStateMissing, fn
 388	} else if err != nil {
 389		return IndexStateCorrupt, fn
 390	}
 391
 392	for _, v := range readVersions {
 393		if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
 394			return IndexStateVersion, fn
 395		}
 396	}
 397
 398	var repo *zoekt.Repository
 399	for _, cand := range repos {
 400		if cand.Name == o.RepositoryDescription.Name {
 401			repo = cand
 402			break
 403		}
 404	}
 405
 406	if repo == nil {
 407		return IndexStateCorrupt, fn
 408	}
 409
 410	if repo.IndexOptions != o.GetHash() {
 411		return IndexStateOption, fn
 412	}
 413
 414	if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
 415		return IndexStateContent, fn
 416	}
 417
 418	// We can mutate repo since it lives in the scope of this function call.
 419	if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
 420		// non-nil err means we are trying to update an immutable field =>
 421		// reindex content.
 422		log.Printf("warn: immutable field changed, requires re-index: %s", err)
 423		return IndexStateContent, fn
 424	} else if updated {
 425		return IndexStateMeta, fn
 426	}
 427
 428	return IndexStateEqual, fn
 429}
 430
 431// FindRepositoryMetadata returns the index metadata for the repository
 432// specified in the options. 'ok' is false if the repository's metadata
 433// couldn't be found or if an error occurred.
 434func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) {
 435	shard := o.findShard()
 436	if shard == "" {
 437		return nil, nil, false, nil
 438	}
 439
 440	repositories, metadata, err := ReadMetadataPathAlive(shard)
 441	if err != nil {
 442		return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
 443	}
 444
 445	ID := o.RepositoryDescription.ID
 446	for _, r := range repositories {
 447		// compound shards contain multiple repositories, so we
 448		// have to pick only the one we're looking for
 449		if r.ID == ID {
 450			return r, metadata, true, nil
 451		}
 452	}
 453
 454	// If we're here, then we're somehow in a state where we found a matching
 455	// shard that's missing the repository metadata we're looking for. This
 456	// should never happen.
 457	name := o.RepositoryDescription.Name
 458	return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
 459}
 460
 461func (o *Options) findShard() string {
 462	for _, v := range readVersions {
 463		fn := o.shardNameVersion(v.IndexFormatVersion, 0)
 464		if _, err := os.Stat(fn); err == nil {
 465			return fn
 466		}
 467	}
 468
 469	// Brute force finding the shard in compound shards. We should only hit this
 470	// code path for repositories that don't exist yet or are in compound shards.
 471	return o.findCompoundShard()
 472}
 473
 474func (o *Options) findCompoundShard() string {
 475	compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
 476	if err != nil {
 477		return ""
 478	}
 479	for _, fn := range compoundShards {
 480		if containsRepo(fn, o.RepositoryDescription.ID) {
 481			return fn
 482		}
 483	}
 484
 485	return ""
 486}
 487
 488func (o *Options) FindAllShards() []string {
 489	for _, v := range readVersions {
 490		fn := o.shardNameVersion(v.IndexFormatVersion, 0)
 491		if _, err := os.Stat(fn); err == nil {
 492			shards := []string{fn}
 493			for i := 1; ; i++ {
 494				fn := o.shardNameVersion(v.IndexFormatVersion, i)
 495				if _, err := os.Stat(fn); err != nil {
 496					return shards
 497				}
 498				shards = append(shards, fn)
 499			}
 500		}
 501	}
 502
 503	// lazily fallback to findShard which will look for a compound shard.
 504	if fn := o.findShard(); fn != "" {
 505		return []string{fn}
 506	}
 507
 508	return nil
 509}
 510
 511// IgnoreSizeMax determines whether the max size should be ignored.
 512func (o *Options) IgnoreSizeMax(name string) bool {
 513	// A pattern match will override preceding pattern matches.
 514	for i := len(o.LargeFiles) - 1; i >= 0; i-- {
 515		pattern := strings.TrimSpace(o.LargeFiles[i])
 516		negated, validatedPattern := checkIsNegatePattern(pattern)
 517
 518		if m, _ := doublestar.PathMatch(validatedPattern, name); m {
 519			if negated {
 520				return false
 521			} else {
 522				return true
 523			}
 524		}
 525	}
 526
 527	return false
 528}
 529
 530func checkIsNegatePattern(pattern string) (bool, string) {
 531	negate := "!"
 532
 533	// if negated then strip prefix meta character which identifies negated filter pattern
 534	if strings.HasPrefix(pattern, negate) {
 535		return true, pattern[len(negate):]
 536	}
 537
 538	return false, pattern
 539}
 540
 541// NewBuilder creates a new Builder instance.
 542func NewBuilder(opts Options) (*Builder, error) {
 543	opts.SetDefaults()
 544	if opts.RepositoryDescription.Name == "" {
 545		return nil, fmt.Errorf("builder: must set Name")
 546	}
 547
 548	b := &Builder{
 549		opts:           opts,
 550		throttle:       make(chan int, opts.Parallelism),
 551		finishedShards: map[string]string{},
 552	}
 553
 554	parserBins, err := ctags.NewParserBinMap(
 555		b.opts.CTagsPath,
 556		b.opts.ScipCTagsPath,
 557		opts.LanguageMap,
 558		b.opts.CTagsMustSucceed,
 559	)
 560	if err != nil {
 561		return nil, err
 562	}
 563
 564	b.parserBins = parserBins
 565
 566	if opts.IsDelta {
 567		// Delta shards build on top of previously existing shards.
 568		// As a consequence, the shardNum for delta shards starts from
 569		// the number following the most recently generated shard - not 0.
 570		//
 571		// Using this numbering scheme allows all the shards to be
 572		// discovered as a set.
 573		shards := b.opts.FindAllShards()
 574		b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
 575	}
 576
 577	if _, err := b.newShardBuilder(); err != nil {
 578		return nil, err
 579	}
 580
 581	now := time.Now()
 582	b.indexTime = now
 583	b.id = xid.NewWithTime(now).String()
 584
 585	return b, nil
 586}
 587
 588// AddFile is a convenience wrapper for the Add method
 589func (b *Builder) AddFile(name string, content []byte) error {
 590	return b.Add(Document{Name: name, Content: content})
 591}
 592
 593func (b *Builder) Add(doc Document) error {
 594	if b.finishCalled {
 595		return nil
 596	}
 597
 598	allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
 599	if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
 600		// We could pass the document on to the shardbuilder, but if
 601		// we pass through a part of the source tree with binary/large
 602		// files, the corresponding shard would be mostly empty, so
 603		// insert a reason here too.
 604		doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
 605	} else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
 606		doc.SkipReason = err.Error()
 607	}
 608
 609	b.todo = append(b.todo, &doc)
 610
 611	if doc.SkipReason == "" {
 612		b.size += len(doc.Name) + len(doc.Content)
 613	} else {
 614		b.size += len(doc.Name) + len(doc.SkipReason)
 615		// Drop the content if we are skipping the document. Skipped content is not counted towards the
 616		// shard size limit, so otherwise we might buffer too much data in memory before flushing.
 617		doc.Content = nil
 618	}
 619
 620	if b.size > b.opts.ShardMax {
 621		return b.flush()
 622	}
 623
 624	return nil
 625}
 626
 627// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
 628// has been changed or removed since the last indexing job for this repository.
 629//
 630// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
 631func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
 632	b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
 633}
 634
 635// Finish creates a last shard from the buffered documents, and clears
 636// stale shards from previous runs. This should always be called, also
 637// in failure cases, to ensure cleanup.
 638//
 639// It is safe to call Finish() multiple times.
 640func (b *Builder) Finish() error {
 641	if b.finishCalled {
 642		return b.buildError
 643	}
 644
 645	b.finishCalled = true
 646
 647	b.flush()
 648	b.building.Wait()
 649
 650	if b.buildError != nil {
 651		for tmp := range b.finishedShards {
 652			log.Printf("Builder.Finish %s", tmp)
 653			os.Remove(tmp)
 654		}
 655		b.finishedShards = map[string]string{}
 656		return b.buildError
 657	}
 658
 659	// map of temporary -> final names for all updated shards + shard metadata files
 660	artifactPaths := make(map[string]string)
 661	for tmp, final := range b.finishedShards {
 662		artifactPaths[tmp] = final
 663	}
 664
 665	oldShards := b.opts.FindAllShards()
 666
 667	if b.opts.IsDelta {
 668		// Delta shard builds need to update FileTombstone and branch commit information for all
 669		// existing shards
 670		for _, shard := range oldShards {
 671			repositories, _, err := ReadMetadataPathAlive(shard)
 672			if err != nil {
 673				return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
 674			}
 675
 676			if len(repositories) > 1 {
 677				return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
 678			}
 679
 680			if len(repositories) == 0 {
 681				return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
 682			}
 683
 684			repository := repositories[0]
 685			if repository.ID != b.opts.RepositoryDescription.ID {
 686				return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
 687			}
 688
 689			if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
 690				repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
 691			}
 692
 693			for _, f := range b.opts.changedOrRemovedFiles {
 694				repository.FileTombstones[f] = struct{}{}
 695			}
 696
 697			if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
 698				return deltaBranchSetError{
 699					shardName: shard,
 700					old:       repository.Branches,
 701					new:       b.opts.RepositoryDescription.Branches,
 702				}
 703			}
 704
 705			if b.opts.GetHash() != repository.IndexOptions {
 706				return &deltaIndexOptionsMismatchError{
 707					shardName:  shard,
 708					newOptions: b.opts.HashOptions(),
 709				}
 710			}
 711
 712			repository.Branches = b.opts.RepositoryDescription.Branches
 713
 714			repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
 715
 716			tempPath, finalPath, err := JsonMarshalRepoMetaTemp(shard, repository)
 717			if err != nil {
 718				return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
 719			}
 720
 721			artifactPaths[tempPath] = finalPath
 722		}
 723	}
 724
 725	// We mark finished shards as empty when we successfully finish. Return now
 726	// to allow call sites to call Finish idempotently.
 727	if len(artifactPaths) == 0 {
 728		return b.buildError
 729	}
 730
 731	// Collect a map of the old shards on disk. For each new shard we replace we
 732	// delete it from toDelete. Anything remaining in toDelete will be removed
 733	// after we have renamed everything into place.
 734
 735	var toDelete map[string]struct{}
 736	if !b.opts.IsDelta {
 737		// Non-delta shard builds delete all existing shards before they write out
 738		// new ones.
 739		// By contrast, delta shard builds work by stacking changes on top of existing shards.
 740		// So, we skip populating the toDelete map if we're building delta shards.
 741
 742		toDelete = make(map[string]struct{})
 743		for _, name := range oldShards {
 744			paths, err := IndexFilePaths(name)
 745			if err != nil {
 746				b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
 747			}
 748			for _, p := range paths {
 749				toDelete[p] = struct{}{}
 750			}
 751		}
 752	}
 753
 754	for tmp, final := range artifactPaths {
 755		if err := os.Rename(tmp, final); err != nil {
 756			b.buildError = err
 757			continue
 758		}
 759
 760		delete(toDelete, final)
 761	}
 762
 763	b.finishedShards = map[string]string{}
 764
 765	for p := range toDelete {
 766		// Don't delete compound shards, set tombstones instead.
 767		if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") {
 768			if !strings.HasSuffix(p, ".zoekt") {
 769				continue
 770			}
 771			err := SetTombstone(p, b.opts.RepositoryDescription.ID)
 772			b.buildError = err
 773			continue
 774		}
 775		log.Printf("removing old shard file: %s", p)
 776		if err := os.Remove(p); err != nil {
 777			b.buildError = err
 778		}
 779	}
 780
 781	return b.buildError
 782}
 783
 784// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
 785// iff both slices specify the same set of branch names in the same order.
 786func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
 787	if len(a) != len(b) {
 788		return false
 789	}
 790
 791	for i := range a {
 792		x, y := a[i], b[i]
 793		if x.Name != y.Name {
 794			return false
 795		}
 796	}
 797
 798	return true
 799}
 800
 801func (b *Builder) flush() error {
 802	todo := b.todo
 803	b.todo = nil
 804	b.size = 0
 805	b.errMu.Lock()
 806	defer b.errMu.Unlock()
 807	if b.buildError != nil {
 808		return b.buildError
 809	}
 810
 811	hasShard := b.nextShardNum > 0
 812	if len(todo) == 0 && hasShard {
 813		return nil
 814	}
 815
 816	shard := b.nextShardNum
 817	b.nextShardNum++
 818
 819	if b.opts.Parallelism > 1 {
 820		b.building.Add(1)
 821		b.throttle <- 1
 822		go func() {
 823			done, err := b.buildShard(todo, shard)
 824			<-b.throttle
 825
 826			b.errMu.Lock()
 827			defer b.errMu.Unlock()
 828			if err != nil && b.buildError == nil {
 829				b.buildError = err
 830			}
 831			if err == nil {
 832				b.finishedShards[done.temp] = done.final
 833			}
 834			b.building.Done()
 835		}()
 836	} else {
 837		// No goroutines when we're not parallel. This
 838		// simplifies memory profiling.
 839		done, err := b.buildShard(todo, shard)
 840		b.buildError = err
 841		if err == nil {
 842			b.finishedShards[done.temp] = done.final
 843		}
 844
 845		return b.buildError
 846	}
 847
 848	return nil
 849}
 850
 851// map [0,inf) to [0,1) monotonically
 852func squashRange(j int) float64 {
 853	x := float64(j)
 854	return x / (1 + x)
 855}
 856
 857type rankedDoc struct {
 858	*Document
 859	rank []float64
 860}
 861
 862// rank returns a vector of scores which is used at index-time to sort documents
 863// before writing them to disk. The order of documents in the shard is important
 864// at query time, because earlier documents receive a boost at query time and
 865// have a higher chance of being searched before limits kick in.
 866func rank(d *Document, origIdx int) []float64 {
 867	skipped := 0.0
 868	if d.SkipReason != "" {
 869		skipped = 1.0
 870	}
 871
 872	generated := 0.0
 873	if enry.IsGenerated(d.Name, d.Content) {
 874		generated = 1.0
 875	}
 876
 877	vendor := 0.0
 878	if enry.IsVendor(d.Name) {
 879		vendor = 1.0
 880	}
 881
 882	test := 0.0
 883	if enry.IsTest(d.Name) {
 884		test = 1.0
 885	}
 886
 887	// Smaller is earlier (=better).
 888	return []float64{
 889		// Always place skipped docs last
 890		skipped,
 891
 892		// Prefer docs that are not generated
 893		generated,
 894
 895		// Prefer docs that are not vendored
 896		vendor,
 897
 898		// Prefer docs that are not tests
 899		test,
 900
 901		// With short names
 902		squashRange(len(d.Name)),
 903
 904		// With many symbols
 905		1.0 - squashRange(len(d.Symbols)),
 906
 907		// With short content
 908		squashRange(len(d.Content)),
 909
 910		// That is present is as many branches as possible
 911		1.0 - squashRange(len(d.Branches)),
 912
 913		// Preserve original ordering.
 914		squashRange(origIdx),
 915	}
 916}
 917
 918func sortDocuments(todo []*Document) {
 919	rs := make([]rankedDoc, 0, len(todo))
 920	for i, t := range todo {
 921		rd := rankedDoc{t, rank(t, i)}
 922		rs = append(rs, rd)
 923	}
 924	sort.Slice(rs, func(i, j int) bool {
 925		r1 := rs[i].rank
 926		r2 := rs[j].rank
 927		for i := range r1 {
 928			if r1[i] < r2[i] {
 929				return true
 930			}
 931			if r1[i] > r2[i] {
 932				return false
 933			}
 934		}
 935
 936		return false
 937	})
 938	for i := range todo {
 939		todo[i] = rs[i].Document
 940	}
 941}
 942
 943func (b *Builder) buildShard(todo []*Document, nextShardNum int) (*finishedShard, error) {
 944	if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
 945		err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins)
 946		if b.opts.CTagsMustSucceed && err != nil {
 947			return nil, err
 948		}
 949		if err != nil {
 950			log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err)
 951		}
 952	}
 953
 954	name := b.opts.shardName(nextShardNum)
 955
 956	shardBuilder, err := b.newShardBuilder()
 957	if err != nil {
 958		return nil, err
 959	}
 960
 961	sortDocuments(todo)
 962
 963	for idx, t := range todo {
 964		if err := shardBuilder.Add(*t); err != nil {
 965			return nil, err
 966		}
 967
 968		if idx%10_000 == 0 {
 969			b.CheckMemoryUsage()
 970		}
 971	}
 972
 973	return b.writeShard(name, shardBuilder)
 974}
 975
 976// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the
 977// configured threshold. NOTE: this method is expensive and should only be used for debugging.
 978func (b *Builder) CheckMemoryUsage() {
 979	// Don't check memory if heap profiling is disabled, or we've already written 10 profiles
 980	if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 {
 981		return
 982	}
 983
 984	var m runtime.MemStats
 985	runtime.ReadMemStats(&m)
 986
 987	if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() {
 988		defer b.heapProfileMu.Unlock()
 989
 990		log.Printf("writing memory profile, allocated heap: %s", humanize.Bytes(m.HeapAlloc))
 991		name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum))
 992		f, err := os.Create(name)
 993		if err != nil {
 994			log.Printf("failed to create memory profile file: %v", err)
 995			return
 996		}
 997
 998		err = pprof.WriteHeapProfile(f)
 999		if err != nil {
1000			log.Printf("failed to write memory profile: %v", err)
1001		}
1002
1003		b.heapProfileNum++
1004	}
1005}
1006
1007func (b *Builder) newShardBuilder() (*ShardBuilder, error) {
1008	desc := b.opts.RepositoryDescription
1009	desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1010	desc.SubRepoMap = b.opts.SubRepositories
1011	desc.IndexOptions = b.opts.GetHash()
1012
1013	shardBuilder, err := NewShardBuilder(&desc)
1014	if err != nil {
1015		return nil, err
1016	}
1017	shardBuilder.IndexTime = b.indexTime
1018	shardBuilder.ID = b.id
1019	return shardBuilder, nil
1020}
1021
1022func (b *Builder) writeShard(fn string, ib *ShardBuilder) (*finishedShard, error) {
1023	dir := filepath.Dir(fn)
1024	if err := os.MkdirAll(dir, 0o700); err != nil {
1025		return nil, err
1026	}
1027
1028	f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1029	if err != nil {
1030		return nil, err
1031	}
1032	if runtime.GOOS != "windows" {
1033		if err := f.Chmod(0o666 &^ umask); err != nil {
1034			return nil, err
1035		}
1036	}
1037
1038	defer f.Close()
1039	if err := ib.Write(f); err != nil {
1040		return nil, err
1041	}
1042	fi, err := f.Stat()
1043	if err != nil {
1044		return nil, err
1045	}
1046	if err := f.Close(); err != nil {
1047		return nil, err
1048	}
1049
1050	log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n",
1051		fn,
1052		fi.Size(),
1053		float64(fi.Size())/float64(ib.ContentSize()+1),
1054		ib.NumFiles())
1055
1056	return &finishedShard{f.Name(), fn}, nil
1057}
1058
1059type deltaBranchSetError struct {
1060	shardName string
1061	old, new  []zoekt.RepositoryBranch
1062}
1063
1064func (e deltaBranchSetError) Error() string {
1065	return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1066}
1067
1068type deltaIndexOptionsMismatchError struct {
1069	shardName  string
1070	newOptions HashOptions
1071}
1072
1073func (e *deltaIndexOptionsMismatchError) Error() string {
1074	return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1075}
1076
1077// Document holds a document (file) to index.
1078type Document struct {
1079	Name              string
1080	Content           []byte
1081	Branches          []string
1082	SubRepositoryPath string
1083	Language          string
1084	Category          FileCategory
1085
1086	// If set, something is wrong with the file contents, and this
1087	// is the reason it wasn't indexed.
1088	SkipReason string
1089
1090	// Document sections for symbols. Offsets should use bytes.
1091	Symbols         []DocumentSection
1092	SymbolsMetaData []*zoekt.Symbol
1093}
1094
// DocumentSection is a section of a document, given by a start and an end
// offset.
type DocumentSection struct {
	Start uint32
	End   uint32
}
1098
1099// umask holds the Umask of the current process
1100var umask os.FileMode
1101
1102func init() {
1103	umask = os.FileMode(unix.Umask(0))
1104	unix.Umask(int(umask))
1105}
Configure Feed

Configure Feed