fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package index contains logic for building Zoekt indexes. NOTE: this package is not considered
16// part of the public API, and it is not recommended to rely on it in external code.
17package index
18
19import (
20 "cmp"
21 "crypto/sha1"
22 "flag"
23 "fmt"
24 "log"
25 "net/url"
26 "os"
27 "os/exec"
28 "path"
29 "path/filepath"
30 "reflect"
31 "runtime"
32 "runtime/pprof"
33 "sort"
34 "strconv"
35 "strings"
36 "sync"
37 "time"
38
39 "github.com/bmatcuk/doublestar"
40 "github.com/dustin/go-humanize"
41 "github.com/go-enry/go-enry/v2"
42 "github.com/rs/xid"
43 "golang.org/x/sys/unix"
44
45 "maps"
46
47 "github.com/sourcegraph/zoekt"
48 "github.com/sourcegraph/zoekt/internal/ctags"
49)
50
51var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
52
53// Branch describes a single branch version.
54type Branch struct {
55 Name string
56 Version string
57}
58
59// Options sets options for the index building.
60type Options struct {
61 // IndexDir is a directory that holds *.zoekt index files.
62 IndexDir string
63
64 // SizeMax is the maximum file size
65 SizeMax int
66
67 // Parallelism is the maximum number of shards to index in parallel
68 Parallelism int
69
70 // ShardMax sets the maximum corpus size for a single shard
71 ShardMax int
72
73 // TrigramMax sets the maximum number of distinct trigrams per document.
74 TrigramMax int
75
76 // RepositoryDescription holds names and URLs for the repository.
77 RepositoryDescription zoekt.Repository
78
79 // SubRepositories is a path => sub repository map.
80 SubRepositories map[string]*zoekt.Repository
81
82 // DisableCTags disables the generation of ctags metadata.
83 DisableCTags bool
84
85 // CtagsPath is the path to the ctags binary to run, or empty
86 // if a valid binary couldn't be found.
87 CTagsPath string
88
89 // Same as CTagsPath but for scip-ctags
90 ScipCTagsPath string
91
92 // If set, ctags must succeed.
93 CTagsMustSucceed bool
94
95 // LargeFiles is a slice of glob patterns, including ** for any number
96 // of directories, where matching file paths should be indexed
97 // regardless of their size. The full pattern syntax is here:
98 // https://github.com/bmatcuk/doublestar/tree/v1#patterns.
99 LargeFiles []string
100
101 // IsDelta is true if this run contains only the changed documents since the
102 // last run.
103 IsDelta bool
104
105 // changedOrRemovedFiles is a list of file paths that have been changed or removed
106 // since the last indexing job for this repository. These files will be tombstoned
107 // in the older shards for this repository.
108 changedOrRemovedFiles []string
109
110 LanguageMap ctags.LanguageMap
111
112 // ShardMerging is true if builder should respect compound shards. This is a
113 // Sourcegraph specific option.
114 ShardMerging bool
115
116 // HeapProfileTriggerBytes is the heap allocation in bytes that will trigger a memory profile. If 0, no memory profile
117 // will be triggered. Note this trigger looks at total heap allocation (which includes both inuse and garbage objects).
118 //
119 // Profiles will be written to files named `index-memory.prof.n` in the index directory. No more than 10 files are written.
120 //
121 // Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile.
122 HeapProfileTriggerBytes uint64
123
124 // ShardPrefix is the prefix of the shard. It defaults to the repository name.
125 ShardPrefix string
126}
127
128// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building.
129type HashOptions struct {
130 sizeMax int
131 disableCTags bool
132 ctagsPath string
133 cTagsMustSucceed bool
134 largeFiles []string
135}
136
137func (o *Options) HashOptions() HashOptions {
138 return HashOptions{
139 sizeMax: o.SizeMax,
140 disableCTags: o.DisableCTags,
141 ctagsPath: o.CTagsPath,
142 cTagsMustSucceed: o.CTagsMustSucceed,
143 largeFiles: o.LargeFiles,
144 }
145}
146
147func (o *Options) GetHash() string {
148 h := o.HashOptions()
149 hasher := sha1.New()
150
151 hasher.Write([]byte(h.ctagsPath))
152 hasher.Write(fmt.Appendf(nil, "%t", h.cTagsMustSucceed))
153 hasher.Write(fmt.Appendf(nil, "%d", h.sizeMax))
154 hasher.Write(fmt.Appendf(nil, "%q", h.largeFiles))
155 hasher.Write(fmt.Appendf(nil, "%t", h.disableCTags))
156
157 return fmt.Sprintf("%x", hasher.Sum(nil))
158}
159
160type largeFilesFlag struct{ *Options }
161
162func (f largeFilesFlag) String() string {
163 // From flag.Value documentation:
164 //
165 // The flag package may call the String method with a zero-valued receiver,
166 // such as a nil pointer.
167 if f.Options == nil {
168 return ""
169 }
170 s := append([]string{""}, f.LargeFiles...)
171 return strings.Join(s, "-large_file ")
172}
173
174func (f largeFilesFlag) Set(value string) error {
175 f.LargeFiles = append(f.LargeFiles, value)
176 return nil
177}
178
179// Flags adds flags for build options to fs. It is the "inverse" of Args.
180func (o *Options) Flags(fs *flag.FlagSet) {
181 x := *o
182 x.SetDefaults()
183 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
184 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
185 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
186 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
187 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
188 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
189 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
190 fs.StringVar(&o.ShardPrefix, "shard_prefix", x.ShardPrefix, "the prefix of the shard. Defaults to repository name")
191
192 // Sourcegraph specific
193 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
194 fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.")
195}
196
197// Args generates command line arguments for o. It is the "inverse" of Flags.
198func (o *Options) Args() []string {
199 var args []string
200
201 if o.SizeMax != 0 {
202 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
203 }
204
205 if o.TrigramMax != 0 {
206 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
207 }
208
209 if o.ShardMax != 0 {
210 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
211 }
212
213 if o.Parallelism != 0 {
214 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
215 }
216
217 if o.IndexDir != "" {
218 args = append(args, "-index", o.IndexDir)
219 }
220
221 if o.CTagsMustSucceed {
222 args = append(args, "-require_ctags")
223 }
224
225 for _, a := range o.LargeFiles {
226 args = append(args, "-large_file", a)
227 }
228
229 // Sourcegraph specific
230 if o.DisableCTags {
231 args = append(args, "-disable_ctags")
232 }
233
234 if o.ShardMerging {
235 args = append(args, "-shard_merging")
236 }
237
238 if o.ShardPrefix != "" {
239 args = append(args, "-shard_prefix", o.ShardPrefix)
240 }
241
242 return args
243}
244
245// Builder manages (parallel) creation of uniformly sized shards. The
246// builder buffers up documents until it collects enough documents and
247// then builds a shard and writes.
248type Builder struct {
249 opts Options
250 throttle chan int
251
252 nextShardNum int
253 todo []*Document
254 docChecker DocChecker
255 size int
256
257 parserBins ctags.ParserBinMap
258 building sync.WaitGroup
259
260 errMu sync.Mutex
261 buildError error
262
263 // temp name => final name for finished shards. We only rename
264 // them once all shards succeed to avoid Frankstein corpuses.
265 finishedShards map[string]string
266
267 // indexTime is set by tests for doing reproducible builds.
268 indexTime time.Time
269
270 // heapProfileMu is used to ensure that only one memory profile is written at a time
271 heapProfileMu sync.Mutex
272 heapProfileNum int
273
274 // a sortable 20 chars long id.
275 id string
276
277 finishCalled bool
278}
279
280type finishedShard struct {
281 temp, final string
282}
283
284func checkCTags() string {
285 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" {
286 return ctags
287 }
288
289 if ctags, err := exec.LookPath("universal-ctags"); err == nil {
290 return ctags
291 }
292
293 return ""
294}
295
296func checkScipCTags() string {
297 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" {
298 return ctags
299 }
300
301 if ctags, err := exec.LookPath("scip-ctags"); err == nil {
302 return ctags
303 }
304
305 return ""
306}
307
308// SetDefaults sets reasonable default options.
309func (o *Options) SetDefaults() {
310 if o.CTagsPath == "" && !o.DisableCTags {
311 o.CTagsPath = checkCTags()
312 }
313
314 if o.ScipCTagsPath == "" && !o.DisableCTags {
315 o.ScipCTagsPath = checkScipCTags()
316 }
317
318 if o.Parallelism == 0 {
319 o.Parallelism = 4
320 }
321 if o.SizeMax == 0 {
322 o.SizeMax = 2 << 20
323 }
324 if o.ShardMax == 0 {
325 o.ShardMax = 100 << 20
326 }
327 if o.TrigramMax == 0 {
328 o.TrigramMax = 20000
329 }
330
331 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
332 parsed, _ := url.Parse(o.RepositoryDescription.URL)
333 if parsed != nil {
334 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
335 }
336 }
337}
338
339// ShardName returns the name the given index shard.
340func (o *Options) shardName(n int) string {
341 return o.shardNameVersion(IndexFormatVersion, n)
342}
343
344func (o *Options) shardNameVersion(version, n int) string {
345 return ShardName(o.IndexDir, cmp.Or(o.ShardPrefix, o.RepositoryDescription.Name), version, n)
346}
347
348type IndexState string
349
350const (
351 IndexStateMissing IndexState = "missing"
352 IndexStateCorrupt IndexState = "corrupt"
353 IndexStateVersion IndexState = "version-mismatch"
354 IndexStateOption IndexState = "option-mismatch"
355 IndexStateMeta IndexState = "meta-mismatch"
356 IndexStateContent IndexState = "content-mismatch"
357 IndexStateEqual IndexState = "equal"
358)
359
360var readVersions = []struct {
361 IndexFormatVersion int
362 FeatureVersion int
363}{{
364 IndexFormatVersion: IndexFormatVersion,
365 FeatureVersion: FeatureVersion,
366}, {
367 IndexFormatVersion: NextIndexFormatVersion,
368 FeatureVersion: FeatureVersion,
369}}
370
371// IncrementalSkipIndexing returns true if the index present on disk matches
372// the build options.
373func (o *Options) IncrementalSkipIndexing() bool {
374 state, _ := o.IndexState()
375 return state == IndexStateEqual
376}
377
378// IndexState checks how the index present on disk compares to the build
379// options and returns the IndexState and the name of the first shard.
380func (o *Options) IndexState() (IndexState, string) {
381 // Open the latest version we support that is on disk.
382 fn := o.findShard()
383 if fn == "" {
384 return IndexStateMissing, fn
385 }
386
387 repos, index, err := ReadMetadataPathAlive(fn)
388 if os.IsNotExist(err) {
389 return IndexStateMissing, fn
390 } else if err != nil {
391 return IndexStateCorrupt, fn
392 }
393
394 for _, v := range readVersions {
395 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
396 return IndexStateVersion, fn
397 }
398 }
399
400 var repo *zoekt.Repository
401 for _, cand := range repos {
402 if cand.Name == o.RepositoryDescription.Name {
403 repo = cand
404 break
405 }
406 }
407
408 if repo == nil {
409 return IndexStateCorrupt, fn
410 }
411
412 if repo.IndexOptions != o.GetHash() {
413 return IndexStateOption, fn
414 }
415
416 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
417 return IndexStateContent, fn
418 }
419
420 // We can mutate repo since it lives in the scope of this function call.
421 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
422 // non-nil err means we are trying to update an immutable field =>
423 // reindex content.
424 log.Printf("warn: immutable field changed, requires re-index: %s", err)
425 return IndexStateContent, fn
426 } else if updated {
427 return IndexStateMeta, fn
428 }
429
430 return IndexStateEqual, fn
431}
432
433// FindRepositoryMetadata returns the index metadata for the repository
434// specified in the options. 'ok' is false if the repository's metadata
435// couldn't be found or if an error occurred.
436func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) {
437 shard := o.findShard()
438 if shard == "" {
439 return nil, nil, false, nil
440 }
441
442 repositories, metadata, err := ReadMetadataPathAlive(shard)
443 if err != nil {
444 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
445 }
446
447 ID := o.RepositoryDescription.ID
448 for _, r := range repositories {
449 // compound shards contain multiple repositories, so we
450 // have to pick only the one we're looking for
451 if r.ID == ID {
452 return r, metadata, true, nil
453 }
454 }
455
456 // If we're here, then we're somehow in a state where we found a matching
457 // shard that's missing the repository metadata we're looking for. This
458 // should never happen.
459 name := o.RepositoryDescription.Name
460 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
461}
462
463func (o *Options) findShard() string {
464 for _, v := range readVersions {
465 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
466 if _, err := os.Stat(fn); err == nil {
467 return fn
468 }
469 }
470
471 // Brute force finding the shard in compound shards. We should only hit this
472 // code path for repositories that don't exist yet or are in compound shards.
473 return o.findCompoundShard()
474}
475
476func (o *Options) findCompoundShard() string {
477 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
478 if err != nil {
479 return ""
480 }
481 for _, fn := range compoundShards {
482 if containsRepo(fn, o.RepositoryDescription.ID) {
483 return fn
484 }
485 }
486
487 return ""
488}
489
490func (o *Options) FindAllShards() []string {
491 for _, v := range readVersions {
492 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
493 if _, err := os.Stat(fn); err == nil {
494 shards := []string{fn}
495 for i := 1; ; i++ {
496 fn := o.shardNameVersion(v.IndexFormatVersion, i)
497 if _, err := os.Stat(fn); err != nil {
498 return shards
499 }
500 shards = append(shards, fn)
501 }
502 }
503 }
504
505 // lazily fallback to findShard which will look for a compound shard.
506 if fn := o.findShard(); fn != "" {
507 return []string{fn}
508 }
509
510 return nil
511}
512
513// IgnoreSizeMax determines whether the max size should be ignored.
514func (o *Options) IgnoreSizeMax(name string) bool {
515 // A pattern match will override preceding pattern matches.
516 for i := len(o.LargeFiles) - 1; i >= 0; i-- {
517 pattern := strings.TrimSpace(o.LargeFiles[i])
518 negated, validatedPattern := checkIsNegatePattern(pattern)
519
520 if m, _ := doublestar.PathMatch(validatedPattern, name); m {
521 if negated {
522 return false
523 } else {
524 return true
525 }
526 }
527 }
528
529 return false
530}
531
532func checkIsNegatePattern(pattern string) (bool, string) {
533 negate := "!"
534
535 // if negated then strip prefix meta character which identifies negated filter pattern
536 if strings.HasPrefix(pattern, negate) {
537 return true, pattern[len(negate):]
538 }
539
540 return false, pattern
541}
542
543// NewBuilder creates a new Builder instance.
544func NewBuilder(opts Options) (*Builder, error) {
545 opts.SetDefaults()
546 if opts.RepositoryDescription.Name == "" {
547 return nil, fmt.Errorf("builder: must set Name")
548 }
549
550 b := &Builder{
551 opts: opts,
552 throttle: make(chan int, opts.Parallelism),
553 finishedShards: map[string]string{},
554 }
555
556 parserBins, err := ctags.NewParserBinMap(
557 b.opts.CTagsPath,
558 b.opts.ScipCTagsPath,
559 opts.LanguageMap,
560 b.opts.CTagsMustSucceed,
561 )
562 if err != nil {
563 return nil, err
564 }
565
566 b.parserBins = parserBins
567
568 if opts.IsDelta {
569 // Delta shards build on top of previously existing shards.
570 // As a consequence, the shardNum for delta shards starts from
571 // the number following the most recently generated shard - not 0.
572 //
573 // Using this numbering scheme allows all the shards to be
574 // discovered as a set.
575 shards := b.opts.FindAllShards()
576 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
577 }
578
579 if _, err := b.newShardBuilder(); err != nil {
580 return nil, err
581 }
582
583 now := time.Now()
584 b.indexTime = now
585 b.id = xid.NewWithTime(now).String()
586
587 return b, nil
588}
589
590// AddFile is a convenience wrapper for the Add method
591func (b *Builder) AddFile(name string, content []byte) error {
592 return b.Add(Document{Name: name, Content: content})
593}
594
595func (b *Builder) Add(doc Document) error {
596 if b.finishCalled {
597 return nil
598 }
599
600 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
601 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
602 // We could pass the document on to the shardbuilder, but if
603 // we pass through a part of the source tree with binary/large
604 // files, the corresponding shard would be mostly empty, so
605 // insert a reason here too.
606 doc.SkipReason = SkipReasonTooLarge
607 } else if skip := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); skip != SkipReasonNone {
608 doc.SkipReason = skip
609 }
610
611 b.todo = append(b.todo, &doc)
612
613 if doc.SkipReason == SkipReasonNone {
614 b.size += len(doc.Name) + len(doc.Content)
615 } else {
616 b.size += len(doc.Name)
617 // Drop the content if we are skipping the document. Skipped content is not counted towards the
618 // shard size limit, so otherwise we might buffer too much data in memory before flushing.
619 doc.Content = nil
620 }
621
622 if b.size > b.opts.ShardMax {
623 return b.flush()
624 }
625
626 return nil
627}
628
629// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
630// has been changed or removed since the last indexing job for this repository.
631//
632// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
633func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
634 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
635}
636
637// Finish creates a last shard from the buffered documents, and clears
638// stale shards from previous runs. This should always be called, also
639// in failure cases, to ensure cleanup.
640//
641// It is safe to call Finish() multiple times.
642func (b *Builder) Finish() error {
643 if b.finishCalled {
644 return b.buildError
645 }
646
647 b.finishCalled = true
648
649 b.flush()
650 b.building.Wait()
651
652 if b.buildError != nil {
653 for tmp := range b.finishedShards {
654 log.Printf("Builder.Finish %s", tmp)
655 os.Remove(tmp)
656 }
657 b.finishedShards = map[string]string{}
658 return b.buildError
659 }
660
661 // map of temporary -> final names for all updated shards + shard metadata files
662 artifactPaths := make(map[string]string)
663 maps.Copy(artifactPaths, b.finishedShards)
664
665 oldShards := b.opts.FindAllShards()
666
667 if b.opts.IsDelta {
668 // Delta shard builds need to update FileTombstone and branch commit information for all
669 // existing shards
670 for _, shard := range oldShards {
671 repositories, _, err := ReadMetadataPathAlive(shard)
672 if err != nil {
673 return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
674 }
675
676 if len(repositories) > 1 {
677 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
678 }
679
680 if len(repositories) == 0 {
681 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
682 }
683
684 repository := repositories[0]
685 if repository.ID != b.opts.RepositoryDescription.ID {
686 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
687 }
688
689 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
690 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
691 }
692
693 for _, f := range b.opts.changedOrRemovedFiles {
694 repository.FileTombstones[f] = struct{}{}
695 }
696
697 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
698 return deltaBranchSetError{
699 shardName: shard,
700 old: repository.Branches,
701 new: b.opts.RepositoryDescription.Branches,
702 }
703 }
704
705 if b.opts.GetHash() != repository.IndexOptions {
706 return &deltaIndexOptionsMismatchError{
707 shardName: shard,
708 newOptions: b.opts.HashOptions(),
709 }
710 }
711
712 repository.Branches = b.opts.RepositoryDescription.Branches
713
714 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
715
716 tempPath, finalPath, err := JsonMarshalRepoMetaTemp(shard, repository)
717 if err != nil {
718 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
719 }
720
721 artifactPaths[tempPath] = finalPath
722 }
723 }
724
725 // We mark finished shards as empty when we successfully finish. Return now
726 // to allow call sites to call Finish idempotently.
727 if len(artifactPaths) == 0 {
728 return b.buildError
729 }
730
731 // Collect a map of the old shards on disk. For each new shard we replace we
732 // delete it from toDelete. Anything remaining in toDelete will be removed
733 // after we have renamed everything into place.
734
735 var toDelete map[string]struct{}
736 if !b.opts.IsDelta {
737 // Non-delta shard builds delete all existing shards before they write out
738 // new ones.
739 // By contrast, delta shard builds work by stacking changes on top of existing shards.
740 // So, we skip populating the toDelete map if we're building delta shards.
741
742 toDelete = make(map[string]struct{})
743 for _, name := range oldShards {
744 paths, err := IndexFilePaths(name)
745 if err != nil {
746 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
747 }
748 for _, p := range paths {
749 toDelete[p] = struct{}{}
750 }
751 }
752 }
753
754 for tmp, final := range artifactPaths {
755 if err := os.Rename(tmp, final); err != nil {
756 b.buildError = err
757 continue
758 }
759
760 delete(toDelete, final)
761 }
762
763 b.finishedShards = map[string]string{}
764
765 for p := range toDelete {
766 // Don't delete compound shards, set tombstones instead.
767 if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") {
768 if !strings.HasSuffix(p, ".zoekt") {
769 continue
770 }
771 err := SetTombstone(p, b.opts.RepositoryDescription.ID)
772 b.buildError = err
773 continue
774 }
775 log.Printf("removing old shard file: %s", p)
776 if err := os.Remove(p); err != nil {
777 b.buildError = err
778 }
779 }
780
781 return b.buildError
782}
783
784// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
785// iff both slices specify the same set of branch names in the same order.
786func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
787 if len(a) != len(b) {
788 return false
789 }
790
791 for i := range a {
792 x, y := a[i], b[i]
793 if x.Name != y.Name {
794 return false
795 }
796 }
797
798 return true
799}
800
801func (b *Builder) flush() error {
802 todo := b.todo
803 b.todo = nil
804 b.size = 0
805 b.errMu.Lock()
806 defer b.errMu.Unlock()
807 if b.buildError != nil {
808 return b.buildError
809 }
810
811 hasShard := b.nextShardNum > 0
812 if len(todo) == 0 && hasShard {
813 return nil
814 }
815
816 shard := b.nextShardNum
817 b.nextShardNum++
818
819 if b.opts.Parallelism > 1 {
820 b.building.Add(1)
821 b.throttle <- 1
822 go func() {
823 done, err := b.buildShard(todo, shard)
824 <-b.throttle
825
826 b.errMu.Lock()
827 defer b.errMu.Unlock()
828 if err != nil && b.buildError == nil {
829 b.buildError = err
830 }
831 if err == nil {
832 b.finishedShards[done.temp] = done.final
833 }
834 b.building.Done()
835 }()
836 } else {
837 // No goroutines when we're not parallel. This
838 // simplifies memory profiling.
839 done, err := b.buildShard(todo, shard)
840 b.buildError = err
841 if err == nil {
842 b.finishedShards[done.temp] = done.final
843 }
844
845 return b.buildError
846 }
847
848 return nil
849}
850
851// map [0,inf) to [0,1) monotonically
852func squashRange(j int) float64 {
853 x := float64(j)
854 return x / (1 + x)
855}
856
857type rankedDoc struct {
858 *Document
859 rank []float64
860}
861
862// rank returns a vector of scores which is used at index-time to sort documents
863// before writing them to disk. The order of documents in the shard is important
864// at query time, because earlier documents receive a boost at query time and
865// have a higher chance of being searched before limits kick in.
866func rank(d *Document, origIdx int) []float64 {
867 skipped := 0.0
868 if d.SkipReason != SkipReasonNone {
869 skipped = 1.0
870 }
871
872 generated := 0.0
873 if enry.IsGenerated(d.Name, d.Content) {
874 generated = 1.0
875 }
876
877 vendor := 0.0
878 if enry.IsVendor(d.Name) {
879 vendor = 1.0
880 }
881
882 test := 0.0
883 if enry.IsTest(d.Name) {
884 test = 1.0
885 }
886
887 // Smaller is earlier (=better).
888 return []float64{
889 // Always place skipped docs last
890 skipped,
891
892 // Prefer docs that are not generated
893 generated,
894
895 // Prefer docs that are not vendored
896 vendor,
897
898 // Prefer docs that are not tests
899 test,
900
901 // With short names
902 squashRange(len(d.Name)),
903
904 // With many symbols
905 1.0 - squashRange(len(d.Symbols)),
906
907 // With short content
908 squashRange(len(d.Content)),
909
910 // That is present is as many branches as possible
911 1.0 - squashRange(len(d.Branches)),
912
913 // Preserve original ordering.
914 squashRange(origIdx),
915 }
916}
917
918func sortDocuments(todo []*Document) {
919 rs := make([]rankedDoc, 0, len(todo))
920 for i, t := range todo {
921 rd := rankedDoc{t, rank(t, i)}
922 rs = append(rs, rd)
923 }
924 sort.Slice(rs, func(i, j int) bool {
925 r1 := rs[i].rank
926 r2 := rs[j].rank
927 for i := range r1 {
928 if r1[i] < r2[i] {
929 return true
930 }
931 if r1[i] > r2[i] {
932 return false
933 }
934 }
935
936 return false
937 })
938 for i := range todo {
939 todo[i] = rs[i].Document
940 }
941}
942
943func (b *Builder) buildShard(todo []*Document, nextShardNum int) (*finishedShard, error) {
944 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
945 err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins)
946 if b.opts.CTagsMustSucceed && err != nil {
947 return nil, err
948 }
949 if err != nil {
950 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err)
951 }
952 }
953
954 name := b.opts.shardName(nextShardNum)
955
956 shardBuilder, err := b.newShardBuilder()
957 if err != nil {
958 return nil, err
959 }
960
961 sortDocuments(todo)
962
963 for idx, t := range todo {
964 if err := shardBuilder.Add(*t); err != nil {
965 return nil, err
966 }
967
968 if idx%10_000 == 0 {
969 b.CheckMemoryUsage()
970 }
971 }
972
973 return b.writeShard(name, shardBuilder)
974}
975
976// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the
977// configured threshold. NOTE: this method is expensive and should only be used for debugging.
978func (b *Builder) CheckMemoryUsage() {
979 // Don't check memory if heap profiling is disabled, or we've already written 10 profiles
980 if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 {
981 return
982 }
983
984 var m runtime.MemStats
985 runtime.ReadMemStats(&m)
986
987 if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() {
988 defer b.heapProfileMu.Unlock()
989
990 log.Printf("writing memory profile, allocated heap: %s", humanize.Bytes(m.HeapAlloc))
991 name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum))
992 f, err := os.Create(name)
993 if err != nil {
994 log.Printf("failed to create memory profile file: %v", err)
995 return
996 }
997
998 err = pprof.WriteHeapProfile(f)
999 if err != nil {
1000 log.Printf("failed to write memory profile: %v", err)
1001 }
1002
1003 b.heapProfileNum++
1004 }
1005}
1006
1007func (b *Builder) newShardBuilder() (*ShardBuilder, error) {
1008 desc := b.opts.RepositoryDescription
1009 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1010 desc.SubRepoMap = b.opts.SubRepositories
1011 desc.IndexOptions = b.opts.GetHash()
1012
1013 shardBuilder, err := NewShardBuilder(&desc)
1014 if err != nil {
1015 return nil, err
1016 }
1017 shardBuilder.IndexTime = b.indexTime
1018 shardBuilder.ID = b.id
1019 return shardBuilder, nil
1020}
1021
1022func (b *Builder) writeShard(fn string, ib *ShardBuilder) (*finishedShard, error) {
1023 dir := filepath.Dir(fn)
1024 if err := os.MkdirAll(dir, 0o700); err != nil {
1025 return nil, err
1026 }
1027
1028 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1029 if err != nil {
1030 return nil, err
1031 }
1032 if runtime.GOOS != "windows" {
1033 if err := f.Chmod(0o666 &^ umask); err != nil {
1034 return nil, err
1035 }
1036 }
1037
1038 defer f.Close()
1039 if err := ib.Write(f); err != nil {
1040 return nil, err
1041 }
1042 fi, err := f.Stat()
1043 if err != nil {
1044 return nil, err
1045 }
1046 if err := f.Close(); err != nil {
1047 return nil, err
1048 }
1049
1050 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n",
1051 fn,
1052 fi.Size(),
1053 float64(fi.Size())/float64(ib.ContentSize()+1),
1054 ib.NumFiles())
1055
1056 return &finishedShard{f.Name(), fn}, nil
1057}
1058
1059type deltaBranchSetError struct {
1060 shardName string
1061 old, new []zoekt.RepositoryBranch
1062}
1063
1064func (e deltaBranchSetError) Error() string {
1065 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1066}
1067
1068type deltaIndexOptionsMismatchError struct {
1069 shardName string
1070 newOptions HashOptions
1071}
1072
1073func (e *deltaIndexOptionsMismatchError) Error() string {
1074 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1075}
1076
1077// umask holds the Umask of the current process
1078var umask os.FileMode
1079
1080func init() {
1081 umask = os.FileMode(unix.Umask(0))
1082 unix.Umask(int(umask))
1083}