fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package index contains logic for building Zoekt indexes. NOTE: this package is not considered
16// part of the public API, and it is not recommended to rely on it in external code.
17package index
18
19import (
20 "crypto/sha1"
21 "flag"
22 "fmt"
23 "log"
24 "net/url"
25 "os"
26 "os/exec"
27 "path"
28 "path/filepath"
29 "reflect"
30 "runtime"
31 "runtime/pprof"
32 "sort"
33 "strconv"
34 "strings"
35 "sync"
36 "time"
37
38 "github.com/bmatcuk/doublestar"
39 "github.com/dustin/go-humanize"
40 "github.com/go-enry/go-enry/v2"
41 "github.com/rs/xid"
42 "golang.org/x/sys/unix"
43
44 "maps"
45
46 "github.com/sourcegraph/zoekt"
47 "github.com/sourcegraph/zoekt/internal/ctags"
48 "github.com/sourcegraph/zoekt/internal/tenant"
49)
50
51var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
52
53// Branch describes a single branch version.
54type Branch struct {
55 Name string
56 Version string
57}
58
59// Options sets options for the index building.
60type Options struct {
61 // IndexDir is a directory that holds *.zoekt index files.
62 IndexDir string
63
64 // SizeMax is the maximum file size
65 SizeMax int
66
67 // Parallelism is the maximum number of shards to index in parallel
68 Parallelism int
69
70 // ShardMax sets the maximum corpus size for a single shard
71 ShardMax int
72
73 // TrigramMax sets the maximum number of distinct trigrams per document.
74 TrigramMax int
75
76 // RepositoryDescription holds names and URLs for the repository.
77 RepositoryDescription zoekt.Repository
78
79 // SubRepositories is a path => sub repository map.
80 SubRepositories map[string]*zoekt.Repository
81
82 // DisableCTags disables the generation of ctags metadata.
83 DisableCTags bool
84
85 // CtagsPath is the path to the ctags binary to run, or empty
86 // if a valid binary couldn't be found.
87 CTagsPath string
88
89 // Same as CTagsPath but for scip-ctags
90 ScipCTagsPath string
91
92 // If set, ctags must succeed.
93 CTagsMustSucceed bool
94
95 // LargeFiles is a slice of glob patterns, including ** for any number
96 // of directories, where matching file paths should be indexed
97 // regardless of their size. The full pattern syntax is here:
98 // https://github.com/bmatcuk/doublestar/tree/v1#patterns.
99 LargeFiles []string
100
101 // IsDelta is true if this run contains only the changed documents since the
102 // last run.
103 IsDelta bool
104
105 // changedOrRemovedFiles is a list of file paths that have been changed or removed
106 // since the last indexing job for this repository. These files will be tombstoned
107 // in the older shards for this repository.
108 changedOrRemovedFiles []string
109
110 LanguageMap ctags.LanguageMap
111
112 // ShardMerging is true if builder should respect compound shards. This is a
113 // Sourcegraph specific option.
114 ShardMerging bool
115
116 // HeapProfileTriggerBytes is the heap allocation in bytes that will trigger a memory profile. If 0, no memory profile
117 // will be triggered. Note this trigger looks at total heap allocation (which includes both inuse and garbage objects).
118 //
119 // Profiles will be written to files named `index-memory.prof.n` in the index directory. No more than 10 files are written.
120 //
121 // Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile.
122 HeapProfileTriggerBytes uint64
123
124 // TenantID is the ID of the tenant this shard belongs to.
125 TenantID int
126
127 // RepoID is the ID of the repository this shard belongs to.
128 RepoID uint32
129}
130
131// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building.
132type HashOptions struct {
133 sizeMax int
134 disableCTags bool
135 ctagsPath string
136 cTagsMustSucceed bool
137 largeFiles []string
138}
139
140func (o *Options) HashOptions() HashOptions {
141 return HashOptions{
142 sizeMax: o.SizeMax,
143 disableCTags: o.DisableCTags,
144 ctagsPath: o.CTagsPath,
145 cTagsMustSucceed: o.CTagsMustSucceed,
146 largeFiles: o.LargeFiles,
147 }
148}
149
150func (o *Options) GetHash() string {
151 h := o.HashOptions()
152 hasher := sha1.New()
153
154 hasher.Write([]byte(h.ctagsPath))
155 hasher.Write(fmt.Appendf(nil, "%t", h.cTagsMustSucceed))
156 hasher.Write(fmt.Appendf(nil, "%d", h.sizeMax))
157 hasher.Write(fmt.Appendf(nil, "%q", h.largeFiles))
158 hasher.Write(fmt.Appendf(nil, "%t", h.disableCTags))
159
160 return fmt.Sprintf("%x", hasher.Sum(nil))
161}
162
163type largeFilesFlag struct{ *Options }
164
165func (f largeFilesFlag) String() string {
166 // From flag.Value documentation:
167 //
168 // The flag package may call the String method with a zero-valued receiver,
169 // such as a nil pointer.
170 if f.Options == nil {
171 return ""
172 }
173 s := append([]string{""}, f.LargeFiles...)
174 return strings.Join(s, "-large_file ")
175}
176
177func (f largeFilesFlag) Set(value string) error {
178 f.LargeFiles = append(f.LargeFiles, value)
179 return nil
180}
181
182// Flags adds flags for build options to fs. It is the "inverse" of Args.
183func (o *Options) Flags(fs *flag.FlagSet) {
184 x := *o
185 x.SetDefaults()
186 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
187 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
188 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
189 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
190 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
191 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
192 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
193
194 // Sourcegraph specific
195 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
196 fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.")
197}
198
199// Args generates command line arguments for o. It is the "inverse" of Flags.
200func (o *Options) Args() []string {
201 var args []string
202
203 if o.SizeMax != 0 {
204 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
205 }
206
207 if o.TrigramMax != 0 {
208 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
209 }
210
211 if o.ShardMax != 0 {
212 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
213 }
214
215 if o.Parallelism != 0 {
216 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
217 }
218
219 if o.IndexDir != "" {
220 args = append(args, "-index", o.IndexDir)
221 }
222
223 if o.CTagsMustSucceed {
224 args = append(args, "-require_ctags")
225 }
226
227 for _, a := range o.LargeFiles {
228 args = append(args, "-large_file", a)
229 }
230
231 // Sourcegraph specific
232 if o.DisableCTags {
233 args = append(args, "-disable_ctags")
234 }
235
236 if o.ShardMerging {
237 args = append(args, "-shard_merging")
238 }
239
240 return args
241}
242
243// Builder manages (parallel) creation of uniformly sized shards. The
244// builder buffers up documents until it collects enough documents and
245// then builds a shard and writes.
246type Builder struct {
247 opts Options
248 throttle chan int
249
250 nextShardNum int
251 todo []*Document
252 docChecker DocChecker
253 size int
254
255 parserBins ctags.ParserBinMap
256 building sync.WaitGroup
257
258 errMu sync.Mutex
259 buildError error
260
261 // temp name => final name for finished shards. We only rename
262 // them once all shards succeed to avoid Frankstein corpuses.
263 finishedShards map[string]string
264
265 // indexTime is set by tests for doing reproducible builds.
266 indexTime time.Time
267
268 // heapProfileMu is used to ensure that only one memory profile is written at a time
269 heapProfileMu sync.Mutex
270 heapProfileNum int
271
272 // a sortable 20 chars long id.
273 id string
274
275 finishCalled bool
276}
277
278type finishedShard struct {
279 temp, final string
280}
281
282func checkCTags() string {
283 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" {
284 return ctags
285 }
286
287 if ctags, err := exec.LookPath("universal-ctags"); err == nil {
288 return ctags
289 }
290
291 return ""
292}
293
294func checkScipCTags() string {
295 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" {
296 return ctags
297 }
298
299 if ctags, err := exec.LookPath("scip-ctags"); err == nil {
300 return ctags
301 }
302
303 return ""
304}
305
306// SetDefaults sets reasonable default options.
307func (o *Options) SetDefaults() {
308 if o.CTagsPath == "" && !o.DisableCTags {
309 o.CTagsPath = checkCTags()
310 }
311
312 if o.ScipCTagsPath == "" && !o.DisableCTags {
313 o.ScipCTagsPath = checkScipCTags()
314 }
315
316 if o.Parallelism == 0 {
317 o.Parallelism = 4
318 }
319 if o.SizeMax == 0 {
320 o.SizeMax = 2 << 20
321 }
322 if o.ShardMax == 0 {
323 o.ShardMax = 100 << 20
324 }
325 if o.TrigramMax == 0 {
326 o.TrigramMax = 20000
327 }
328
329 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
330 parsed, _ := url.Parse(o.RepositoryDescription.URL)
331 if parsed != nil {
332 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
333 }
334 }
335}
336
337// ShardName returns the name the given index shard.
338func (o *Options) shardName(n int) string {
339 return o.shardNameVersion(IndexFormatVersion, n)
340}
341
342func (o *Options) shardNameVersion(version, n int) string {
343 var prefix string
344
345 // If tenant enforcement is enabled and we have tenant/repo IDs, use those to generate the prefix
346 if o.TenantID != 0 && o.RepoID != 0 && tenant.EnforceTenant() {
347 prefix = tenant.SrcPrefix(o.TenantID, o.RepoID)
348 } else {
349 prefix = o.RepositoryDescription.Name
350 }
351
352 return ShardName(o.IndexDir, prefix, version, n)
353}
354
355type IndexState string
356
357const (
358 IndexStateMissing IndexState = "missing"
359 IndexStateCorrupt IndexState = "corrupt"
360 IndexStateVersion IndexState = "version-mismatch"
361 IndexStateOption IndexState = "option-mismatch"
362 IndexStateMeta IndexState = "meta-mismatch"
363 IndexStateContent IndexState = "content-mismatch"
364 IndexStateEqual IndexState = "equal"
365)
366
367var readVersions = []struct {
368 IndexFormatVersion int
369 FeatureVersion int
370}{{
371 IndexFormatVersion: IndexFormatVersion,
372 FeatureVersion: FeatureVersion,
373}, {
374 IndexFormatVersion: NextIndexFormatVersion,
375 FeatureVersion: FeatureVersion,
376}}
377
378// IncrementalSkipIndexing returns true if the index present on disk matches
379// the build options.
380func (o *Options) IncrementalSkipIndexing() bool {
381 state, _ := o.IndexState()
382 return state == IndexStateEqual
383}
384
385// IndexState checks how the index present on disk compares to the build
386// options and returns the IndexState and the name of the first shard.
387func (o *Options) IndexState() (IndexState, string) {
388 // Open the latest version we support that is on disk.
389 fn := o.findShard()
390 if fn == "" {
391 return IndexStateMissing, fn
392 }
393
394 repos, index, err := ReadMetadataPathAlive(fn)
395 if os.IsNotExist(err) {
396 return IndexStateMissing, fn
397 } else if err != nil {
398 return IndexStateCorrupt, fn
399 }
400
401 for _, v := range readVersions {
402 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
403 return IndexStateVersion, fn
404 }
405 }
406
407 var repo *zoekt.Repository
408 for _, cand := range repos {
409 if cand.Name == o.RepositoryDescription.Name {
410 repo = cand
411 break
412 }
413 }
414
415 if repo == nil {
416 return IndexStateCorrupt, fn
417 }
418
419 if repo.IndexOptions != o.GetHash() {
420 return IndexStateOption, fn
421 }
422
423 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
424 return IndexStateContent, fn
425 }
426
427 // We can mutate repo since it lives in the scope of this function call.
428 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
429 // non-nil err means we are trying to update an immutable field =>
430 // reindex content.
431 log.Printf("warn: immutable field changed, requires re-index: %s", err)
432 return IndexStateContent, fn
433 } else if updated {
434 return IndexStateMeta, fn
435 }
436
437 return IndexStateEqual, fn
438}
439
440// FindRepositoryMetadata returns the index metadata for the repository
441// specified in the options. 'ok' is false if the repository's metadata
442// couldn't be found or if an error occurred.
443func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) {
444 shard := o.findShard()
445 if shard == "" {
446 return nil, nil, false, nil
447 }
448
449 repositories, metadata, err := ReadMetadataPathAlive(shard)
450 if err != nil {
451 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
452 }
453
454 ID := o.RepositoryDescription.ID
455 for _, r := range repositories {
456 // compound shards contain multiple repositories, so we
457 // have to pick only the one we're looking for
458 if r.ID == ID {
459 return r, metadata, true, nil
460 }
461 }
462
463 // If we're here, then we're somehow in a state where we found a matching
464 // shard that's missing the repository metadata we're looking for. This
465 // should never happen.
466 name := o.RepositoryDescription.Name
467 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
468}
469
470func (o *Options) findShard() string {
471 for _, v := range readVersions {
472 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
473 if _, err := os.Stat(fn); err == nil {
474 return fn
475 }
476 }
477
478 // Brute force finding the shard in compound shards. We should only hit this
479 // code path for repositories that don't exist yet or are in compound shards.
480 return o.findCompoundShard()
481}
482
483func (o *Options) findCompoundShard() string {
484 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
485 if err != nil {
486 return ""
487 }
488 for _, fn := range compoundShards {
489 if containsRepo(fn, o.RepositoryDescription.ID) {
490 return fn
491 }
492 }
493
494 return ""
495}
496
497func (o *Options) FindAllShards() []string {
498 for _, v := range readVersions {
499 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
500 if _, err := os.Stat(fn); err == nil {
501 shards := []string{fn}
502 for i := 1; ; i++ {
503 fn := o.shardNameVersion(v.IndexFormatVersion, i)
504 if _, err := os.Stat(fn); err != nil {
505 return shards
506 }
507 shards = append(shards, fn)
508 }
509 }
510 }
511
512 // lazily fallback to findShard which will look for a compound shard.
513 if fn := o.findShard(); fn != "" {
514 return []string{fn}
515 }
516
517 return nil
518}
519
520// IgnoreSizeMax determines whether the max size should be ignored.
521func (o *Options) IgnoreSizeMax(name string) bool {
522 // A pattern match will override preceding pattern matches.
523 for i := len(o.LargeFiles) - 1; i >= 0; i-- {
524 pattern := strings.TrimSpace(o.LargeFiles[i])
525 negated, validatedPattern := checkIsNegatePattern(pattern)
526
527 if m, _ := doublestar.PathMatch(validatedPattern, name); m {
528 if negated {
529 return false
530 } else {
531 return true
532 }
533 }
534 }
535
536 return false
537}
538
539func checkIsNegatePattern(pattern string) (bool, string) {
540 negate := "!"
541
542 // if negated then strip prefix meta character which identifies negated filter pattern
543 if strings.HasPrefix(pattern, negate) {
544 return true, pattern[len(negate):]
545 }
546
547 return false, pattern
548}
549
550// NewBuilder creates a new Builder instance.
551func NewBuilder(opts Options) (*Builder, error) {
552 opts.SetDefaults()
553 if opts.RepositoryDescription.Name == "" {
554 return nil, fmt.Errorf("builder: must set Name")
555 }
556
557 b := &Builder{
558 opts: opts,
559 throttle: make(chan int, opts.Parallelism),
560 finishedShards: map[string]string{},
561 }
562
563 parserBins, err := ctags.NewParserBinMap(
564 b.opts.CTagsPath,
565 b.opts.ScipCTagsPath,
566 opts.LanguageMap,
567 b.opts.CTagsMustSucceed,
568 )
569 if err != nil {
570 return nil, err
571 }
572
573 b.parserBins = parserBins
574
575 if opts.IsDelta {
576 // Delta shards build on top of previously existing shards.
577 // As a consequence, the shardNum for delta shards starts from
578 // the number following the most recently generated shard - not 0.
579 //
580 // Using this numbering scheme allows all the shards to be
581 // discovered as a set.
582 shards := b.opts.FindAllShards()
583 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
584 }
585
586 if _, err := b.newShardBuilder(); err != nil {
587 return nil, err
588 }
589
590 now := time.Now()
591 b.indexTime = now
592 b.id = xid.NewWithTime(now).String()
593
594 return b, nil
595}
596
597// AddFile is a convenience wrapper for the Add method
598func (b *Builder) AddFile(name string, content []byte) error {
599 return b.Add(Document{Name: name, Content: content})
600}
601
602func (b *Builder) Add(doc Document) error {
603 if b.finishCalled {
604 return nil
605 }
606
607 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
608 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
609 // We could pass the document on to the shardbuilder, but if
610 // we pass through a part of the source tree with binary/large
611 // files, the corresponding shard would be mostly empty, so
612 // insert a reason here too.
613 doc.SkipReason = SkipReasonTooLarge
614 } else if skip := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); skip != SkipReasonNone {
615 doc.SkipReason = skip
616 }
617
618 b.todo = append(b.todo, &doc)
619
620 if doc.SkipReason == SkipReasonNone {
621 b.size += len(doc.Name) + len(doc.Content)
622 } else {
623 b.size += len(doc.Name)
624 // Drop the content if we are skipping the document. Skipped content is not counted towards the
625 // shard size limit, so otherwise we might buffer too much data in memory before flushing.
626 doc.Content = nil
627 }
628
629 if b.size > b.opts.ShardMax {
630 return b.flush()
631 }
632
633 return nil
634}
635
636// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
637// has been changed or removed since the last indexing job for this repository.
638//
639// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
640func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
641 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
642}
643
644// Finish creates a last shard from the buffered documents, and clears
645// stale shards from previous runs. This should always be called, also
646// in failure cases, to ensure cleanup.
647//
648// It is safe to call Finish() multiple times.
649func (b *Builder) Finish() error {
650 if b.finishCalled {
651 return b.buildError
652 }
653
654 b.finishCalled = true
655
656 b.flush()
657 b.building.Wait()
658
659 if b.buildError != nil {
660 for tmp := range b.finishedShards {
661 log.Printf("Builder.Finish %s", tmp)
662 os.Remove(tmp)
663 }
664 b.finishedShards = map[string]string{}
665 return b.buildError
666 }
667
668 // map of temporary -> final names for all updated shards + shard metadata files
669 artifactPaths := make(map[string]string)
670 maps.Copy(artifactPaths, b.finishedShards)
671
672 oldShards := b.opts.FindAllShards()
673
674 if b.opts.IsDelta {
675 // Delta shard builds need to update FileTombstone and branch commit information for all
676 // existing shards
677 for _, shard := range oldShards {
678 repositories, _, err := ReadMetadataPathAlive(shard)
679 if err != nil {
680 return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
681 }
682
683 if len(repositories) > 1 {
684 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
685 }
686
687 if len(repositories) == 0 {
688 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
689 }
690
691 repository := repositories[0]
692 if repository.ID != b.opts.RepositoryDescription.ID {
693 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
694 }
695
696 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
697 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
698 }
699
700 for _, f := range b.opts.changedOrRemovedFiles {
701 repository.FileTombstones[f] = struct{}{}
702 }
703
704 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
705 return deltaBranchSetError{
706 shardName: shard,
707 old: repository.Branches,
708 new: b.opts.RepositoryDescription.Branches,
709 }
710 }
711
712 if b.opts.GetHash() != repository.IndexOptions {
713 return &deltaIndexOptionsMismatchError{
714 shardName: shard,
715 newOptions: b.opts.HashOptions(),
716 }
717 }
718
719 repository.Branches = b.opts.RepositoryDescription.Branches
720
721 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
722
723 tempPath, finalPath, err := JsonMarshalRepoMetaTemp(shard, repository)
724 if err != nil {
725 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
726 }
727
728 artifactPaths[tempPath] = finalPath
729 }
730 }
731
732 // We mark finished shards as empty when we successfully finish. Return now
733 // to allow call sites to call Finish idempotently.
734 if len(artifactPaths) == 0 {
735 return b.buildError
736 }
737
738 // Collect a map of the old shards on disk. For each new shard we replace we
739 // delete it from toDelete. Anything remaining in toDelete will be removed
740 // after we have renamed everything into place.
741
742 var toDelete map[string]struct{}
743 if !b.opts.IsDelta {
744 // Non-delta shard builds delete all existing shards before they write out
745 // new ones.
746 // By contrast, delta shard builds work by stacking changes on top of existing shards.
747 // So, we skip populating the toDelete map if we're building delta shards.
748
749 toDelete = make(map[string]struct{})
750 for _, name := range oldShards {
751 paths, err := IndexFilePaths(name)
752 if err != nil {
753 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
754 }
755 for _, p := range paths {
756 toDelete[p] = struct{}{}
757 }
758 }
759 }
760
761 for tmp, final := range artifactPaths {
762 if err := os.Rename(tmp, final); err != nil {
763 b.buildError = err
764 continue
765 }
766
767 delete(toDelete, final)
768 }
769
770 b.finishedShards = map[string]string{}
771
772 for p := range toDelete {
773 // Don't delete compound shards, set tombstones instead.
774 if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") {
775 if !strings.HasSuffix(p, ".zoekt") {
776 continue
777 }
778 err := SetTombstone(p, b.opts.RepositoryDescription.ID)
779 b.buildError = err
780 continue
781 }
782 log.Printf("removing old shard file: %s", p)
783 if err := os.Remove(p); err != nil {
784 b.buildError = err
785 }
786 }
787
788 return b.buildError
789}
790
791// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
792// iff both slices specify the same set of branch names in the same order.
793func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
794 if len(a) != len(b) {
795 return false
796 }
797
798 for i := range a {
799 x, y := a[i], b[i]
800 if x.Name != y.Name {
801 return false
802 }
803 }
804
805 return true
806}
807
808func (b *Builder) flush() error {
809 todo := b.todo
810 b.todo = nil
811 b.size = 0
812 b.errMu.Lock()
813 defer b.errMu.Unlock()
814 if b.buildError != nil {
815 return b.buildError
816 }
817
818 hasShard := b.nextShardNum > 0
819 if len(todo) == 0 && hasShard {
820 return nil
821 }
822
823 shard := b.nextShardNum
824 b.nextShardNum++
825
826 if b.opts.Parallelism > 1 {
827 b.building.Add(1)
828 b.throttle <- 1
829 go func() {
830 done, err := b.buildShard(todo, shard)
831 <-b.throttle
832
833 b.errMu.Lock()
834 defer b.errMu.Unlock()
835 if err != nil && b.buildError == nil {
836 b.buildError = err
837 }
838 if err == nil {
839 b.finishedShards[done.temp] = done.final
840 }
841 b.building.Done()
842 }()
843 } else {
844 // No goroutines when we're not parallel. This
845 // simplifies memory profiling.
846 done, err := b.buildShard(todo, shard)
847 b.buildError = err
848 if err == nil {
849 b.finishedShards[done.temp] = done.final
850 }
851
852 return b.buildError
853 }
854
855 return nil
856}
857
858// map [0,inf) to [0,1) monotonically
859func squashRange(j int) float64 {
860 x := float64(j)
861 return x / (1 + x)
862}
863
864type rankedDoc struct {
865 *Document
866 rank []float64
867}
868
869// rank returns a vector of scores which is used at index-time to sort documents
870// before writing them to disk. The order of documents in the shard is important
871// at query time, because earlier documents receive a boost at query time and
872// have a higher chance of being searched before limits kick in.
873func rank(d *Document, origIdx int) []float64 {
874 skipped := 0.0
875 if d.SkipReason != SkipReasonNone {
876 skipped = 1.0
877 }
878
879 generated := 0.0
880 if enry.IsGenerated(d.Name, d.Content) {
881 generated = 1.0
882 }
883
884 vendor := 0.0
885 if enry.IsVendor(d.Name) {
886 vendor = 1.0
887 }
888
889 test := 0.0
890 if enry.IsTest(d.Name) {
891 test = 1.0
892 }
893
894 // Smaller is earlier (=better).
895 return []float64{
896 // Always place skipped docs last
897 skipped,
898
899 // Prefer docs that are not generated
900 generated,
901
902 // Prefer docs that are not vendored
903 vendor,
904
905 // Prefer docs that are not tests
906 test,
907
908 // With short names
909 squashRange(len(d.Name)),
910
911 // With many symbols
912 1.0 - squashRange(len(d.Symbols)),
913
914 // With short content
915 squashRange(len(d.Content)),
916
917 // That is present is as many branches as possible
918 1.0 - squashRange(len(d.Branches)),
919
920 // Preserve original ordering.
921 squashRange(origIdx),
922 }
923}
924
925func sortDocuments(todo []*Document) {
926 rs := make([]rankedDoc, 0, len(todo))
927 for i, t := range todo {
928 rd := rankedDoc{t, rank(t, i)}
929 rs = append(rs, rd)
930 }
931 sort.Slice(rs, func(i, j int) bool {
932 r1 := rs[i].rank
933 r2 := rs[j].rank
934 for i := range r1 {
935 if r1[i] < r2[i] {
936 return true
937 }
938 if r1[i] > r2[i] {
939 return false
940 }
941 }
942
943 return false
944 })
945 for i := range todo {
946 todo[i] = rs[i].Document
947 }
948}
949
950func (b *Builder) buildShard(todo []*Document, nextShardNum int) (*finishedShard, error) {
951 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
952 err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins)
953 if b.opts.CTagsMustSucceed && err != nil {
954 return nil, err
955 }
956 if err != nil {
957 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err)
958 }
959 }
960
961 name := b.opts.shardName(nextShardNum)
962
963 shardBuilder, err := b.newShardBuilder()
964 if err != nil {
965 return nil, err
966 }
967
968 sortDocuments(todo)
969
970 for idx, t := range todo {
971 if err := shardBuilder.Add(*t); err != nil {
972 return nil, err
973 }
974
975 if idx%10_000 == 0 {
976 b.CheckMemoryUsage()
977 }
978 }
979
980 return b.writeShard(name, shardBuilder)
981}
982
983// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the
984// configured threshold. NOTE: this method is expensive and should only be used for debugging.
985func (b *Builder) CheckMemoryUsage() {
986 // Don't check memory if heap profiling is disabled, or we've already written 10 profiles
987 if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 {
988 return
989 }
990
991 var m runtime.MemStats
992 runtime.ReadMemStats(&m)
993
994 if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() {
995 defer b.heapProfileMu.Unlock()
996
997 log.Printf("writing memory profile, allocated heap: %s", humanize.Bytes(m.HeapAlloc))
998 name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum))
999 f, err := os.Create(name)
1000 if err != nil {
1001 log.Printf("failed to create memory profile file: %v", err)
1002 return
1003 }
1004
1005 err = pprof.WriteHeapProfile(f)
1006 if err != nil {
1007 log.Printf("failed to write memory profile: %v", err)
1008 }
1009
1010 b.heapProfileNum++
1011 }
1012}
1013
1014func (b *Builder) newShardBuilder() (*ShardBuilder, error) {
1015 desc := b.opts.RepositoryDescription
1016 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1017 desc.SubRepoMap = b.opts.SubRepositories
1018 desc.IndexOptions = b.opts.GetHash()
1019
1020 shardBuilder, err := NewShardBuilder(&desc)
1021 if err != nil {
1022 return nil, err
1023 }
1024 shardBuilder.IndexTime = b.indexTime
1025 shardBuilder.ID = b.id
1026 return shardBuilder, nil
1027}
1028
1029func (b *Builder) writeShard(fn string, ib *ShardBuilder) (*finishedShard, error) {
1030 dir := filepath.Dir(fn)
1031 if err := os.MkdirAll(dir, 0o700); err != nil {
1032 return nil, err
1033 }
1034
1035 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1036 if err != nil {
1037 return nil, err
1038 }
1039 if runtime.GOOS != "windows" {
1040 if err := f.Chmod(0o666 &^ umask); err != nil {
1041 return nil, err
1042 }
1043 }
1044
1045 defer f.Close()
1046 if err := ib.Write(f); err != nil {
1047 return nil, err
1048 }
1049 fi, err := f.Stat()
1050 if err != nil {
1051 return nil, err
1052 }
1053 if err := f.Close(); err != nil {
1054 return nil, err
1055 }
1056
1057 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n",
1058 fn,
1059 fi.Size(),
1060 float64(fi.Size())/float64(ib.ContentSize()+1),
1061 ib.NumFiles())
1062
1063 return &finishedShard{f.Name(), fn}, nil
1064}
1065
1066type deltaBranchSetError struct {
1067 shardName string
1068 old, new []zoekt.RepositoryBranch
1069}
1070
1071func (e deltaBranchSetError) Error() string {
1072 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1073}
1074
1075type deltaIndexOptionsMismatchError struct {
1076 shardName string
1077 newOptions HashOptions
1078}
1079
1080func (e *deltaIndexOptionsMismatchError) Error() string {
1081 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1082}
1083
1084// umask holds the Umask of the current process
1085var umask os.FileMode
1086
1087func init() {
1088 umask = os.FileMode(unix.Umask(0))
1089 unix.Umask(int(umask))
1090}