fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package index contains logic for building Zoekt indexes. NOTE: this package is not considered
16// part of the public API, and it is not recommended to rely on it in external code.
17package index
18
19import (
20 "crypto/sha1"
21 "flag"
22 "fmt"
23 "log"
24 "net/url"
25 "os"
26 "os/exec"
27 "path"
28 "path/filepath"
29 "reflect"
30 "runtime"
31 "runtime/pprof"
32 "sort"
33 "strconv"
34 "strings"
35 "sync"
36 "time"
37
38 "github.com/bmatcuk/doublestar"
39 "github.com/dustin/go-humanize"
40 "github.com/go-enry/go-enry/v2"
41 "github.com/rs/xid"
42 "golang.org/x/sys/unix"
43
44 "maps"
45
46 "github.com/sourcegraph/zoekt"
47 "github.com/sourcegraph/zoekt/internal/ctags"
48 "github.com/sourcegraph/zoekt/internal/tenant"
49)
50
51var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
52
53// Branch describes a single branch version.
54type Branch struct {
55 Name string
56 Version string
57}
58
59// Options sets options for the index building.
60type Options struct {
61 // IndexDir is a directory that holds *.zoekt index files.
62 IndexDir string
63
64 // SizeMax is the maximum file size
65 SizeMax int
66
67 // Parallelism is the maximum number of shards to index in parallel
68 Parallelism int
69
70 // ShardMax sets the maximum corpus size for a single shard
71 ShardMax int
72
73 // TrigramMax sets the maximum number of distinct trigrams per document.
74 TrigramMax int
75
76 // RepositoryDescription holds names and URLs for the repository.
77 RepositoryDescription zoekt.Repository
78
79 // SubRepositories is a path => sub repository map.
80 SubRepositories map[string]*zoekt.Repository
81
82 // DisableCTags disables the generation of ctags metadata.
83 DisableCTags bool
84
85 // CtagsPath is the path to the ctags binary to run, or empty
86 // if a valid binary couldn't be found.
87 CTagsPath string
88
89 // Same as CTagsPath but for scip-ctags
90 ScipCTagsPath string
91
92 // If set, ctags must succeed.
93 CTagsMustSucceed bool
94
95 // LargeFiles is a slice of glob patterns, including ** for any number
96 // of directories, where matching file paths should be indexed
97 // regardless of their size. The full pattern syntax is here:
98 // https://github.com/bmatcuk/doublestar/tree/v1#patterns.
99 LargeFiles []string
100
101 // IsDelta is true if this run contains only the changed documents since the
102 // last run.
103 IsDelta bool
104
105 // changedOrRemovedFiles is a list of file paths that have been changed or removed
106 // since the last indexing job for this repository. These files will be tombstoned
107 // in the older shards for this repository.
108 changedOrRemovedFiles []string
109
110 LanguageMap ctags.LanguageMap
111
112 // ShardMerging is true if builder should respect compound shards. This is a
113 // Sourcegraph specific option.
114 ShardMerging bool
115
116 // HeapProfileTriggerBytes is the heap allocation in bytes that will trigger a memory profile. If 0, no memory profile
117 // will be triggered. Note this trigger looks at total heap allocation (which includes both inuse and garbage objects).
118 //
119 // Profiles will be written to files named `index-memory.prof.n` in the index directory. No more than 10 files are written.
120 //
121 // Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile.
122 HeapProfileTriggerBytes uint64
123}
124
125// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building.
126type HashOptions struct {
127 sizeMax int
128 disableCTags bool
129 ctagsPath string
130 cTagsMustSucceed bool
131 largeFiles []string
132}
133
134func (o *Options) HashOptions() HashOptions {
135 return HashOptions{
136 sizeMax: o.SizeMax,
137 disableCTags: o.DisableCTags,
138 ctagsPath: o.CTagsPath,
139 cTagsMustSucceed: o.CTagsMustSucceed,
140 largeFiles: o.LargeFiles,
141 }
142}
143
144func (o *Options) GetHash() string {
145 h := o.HashOptions()
146 hasher := sha1.New()
147
148 hasher.Write([]byte(h.ctagsPath))
149 hasher.Write(fmt.Appendf(nil, "%t", h.cTagsMustSucceed))
150 hasher.Write(fmt.Appendf(nil, "%d", h.sizeMax))
151 hasher.Write(fmt.Appendf(nil, "%q", h.largeFiles))
152 hasher.Write(fmt.Appendf(nil, "%t", h.disableCTags))
153
154 return fmt.Sprintf("%x", hasher.Sum(nil))
155}
156
157type largeFilesFlag struct{ *Options }
158
159func (f largeFilesFlag) String() string {
160 // From flag.Value documentation:
161 //
162 // The flag package may call the String method with a zero-valued receiver,
163 // such as a nil pointer.
164 if f.Options == nil {
165 return ""
166 }
167 s := append([]string{""}, f.LargeFiles...)
168 return strings.Join(s, "-large_file ")
169}
170
171func (f largeFilesFlag) Set(value string) error {
172 f.LargeFiles = append(f.LargeFiles, value)
173 return nil
174}
175
176// Flags adds flags for build options to fs. It is the "inverse" of Args.
177func (o *Options) Flags(fs *flag.FlagSet) {
178 x := *o
179 x.SetDefaults()
180 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
181 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
182 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
183 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
184 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
185 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
186 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
187
188 // Sourcegraph specific
189 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
190 fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.")
191}
192
193// Args generates command line arguments for o. It is the "inverse" of Flags.
194func (o *Options) Args() []string {
195 var args []string
196
197 if o.SizeMax != 0 {
198 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
199 }
200
201 if o.TrigramMax != 0 {
202 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
203 }
204
205 if o.ShardMax != 0 {
206 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
207 }
208
209 if o.Parallelism != 0 {
210 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
211 }
212
213 if o.IndexDir != "" {
214 args = append(args, "-index", o.IndexDir)
215 }
216
217 if o.CTagsMustSucceed {
218 args = append(args, "-require_ctags")
219 }
220
221 for _, a := range o.LargeFiles {
222 args = append(args, "-large_file", a)
223 }
224
225 // Sourcegraph specific
226 if o.DisableCTags {
227 args = append(args, "-disable_ctags")
228 }
229
230 if o.ShardMerging {
231 args = append(args, "-shard_merging")
232 }
233
234 return args
235}
236
237// Builder manages (parallel) creation of uniformly sized shards. The
238// builder buffers up documents until it collects enough documents and
239// then builds a shard and writes.
240type Builder struct {
241 opts Options
242 throttle chan int
243
244 nextShardNum int
245 todo []*Document
246 docChecker DocChecker
247 size int
248
249 parserBins ctags.ParserBinMap
250 building sync.WaitGroup
251
252 errMu sync.Mutex
253 buildError error
254
255 // temp name => final name for finished shards. We only rename
256 // them once all shards succeed to avoid Frankstein corpuses.
257 finishedShards map[string]string
258
259 // indexTime is set by tests for doing reproducible builds.
260 indexTime time.Time
261
262 // heapProfileMu is used to ensure that only one memory profile is written at a time
263 heapProfileMu sync.Mutex
264 heapProfileNum int
265
266 // a sortable 20 chars long id.
267 id string
268
269 finishCalled bool
270}
271
272type finishedShard struct {
273 temp, final string
274}
275
276func checkCTags() string {
277 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" {
278 return ctags
279 }
280
281 if ctags, err := exec.LookPath("universal-ctags"); err == nil {
282 return ctags
283 }
284
285 return ""
286}
287
288func checkScipCTags() string {
289 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" {
290 return ctags
291 }
292
293 if ctags, err := exec.LookPath("scip-ctags"); err == nil {
294 return ctags
295 }
296
297 return ""
298}
299
300// SetDefaults sets reasonable default options.
301func (o *Options) SetDefaults() {
302 if o.CTagsPath == "" && !o.DisableCTags {
303 o.CTagsPath = checkCTags()
304 }
305
306 if o.ScipCTagsPath == "" && !o.DisableCTags {
307 o.ScipCTagsPath = checkScipCTags()
308 }
309
310 if o.Parallelism == 0 {
311 o.Parallelism = 4
312 }
313 if o.SizeMax == 0 {
314 o.SizeMax = 2 << 20
315 }
316 if o.ShardMax == 0 {
317 o.ShardMax = 100 << 20
318 }
319 if o.TrigramMax == 0 {
320 o.TrigramMax = 20000
321 }
322
323 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
324 parsed, _ := url.Parse(o.RepositoryDescription.URL)
325 if parsed != nil {
326 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
327 }
328 }
329}
330
331// ShardName returns the name the given index shard.
332func (o *Options) shardName(n int) string {
333 return o.shardNameVersion(IndexFormatVersion, n)
334}
335
336func (o *Options) shardNameVersion(version, n int) string {
337 var prefix string
338
339 // If tenant enforcement is enabled and we have tenant/repo IDs, use those to generate the prefix
340 if o.RepositoryDescription.TenantID != 0 && o.RepositoryDescription.ID != 0 && tenant.EnforceTenant() {
341 prefix = tenant.SrcPrefix(o.RepositoryDescription.TenantID, o.RepositoryDescription.ID)
342 } else {
343 prefix = o.RepositoryDescription.Name
344 }
345
346 return shardName(o.IndexDir, prefix, version, n)
347}
348
349type IndexState string
350
351const (
352 IndexStateMissing IndexState = "missing"
353 IndexStateCorrupt IndexState = "corrupt"
354 IndexStateVersion IndexState = "version-mismatch"
355 IndexStateOption IndexState = "option-mismatch"
356 IndexStateMeta IndexState = "meta-mismatch"
357 IndexStateContent IndexState = "content-mismatch"
358 IndexStateEqual IndexState = "equal"
359)
360
361var readVersions = []struct {
362 IndexFormatVersion int
363 FeatureVersion int
364}{{
365 IndexFormatVersion: IndexFormatVersion,
366 FeatureVersion: FeatureVersion,
367}, {
368 IndexFormatVersion: NextIndexFormatVersion,
369 FeatureVersion: FeatureVersion,
370}}
371
372// IncrementalSkipIndexing returns true if the index present on disk matches
373// the build options.
374func (o *Options) IncrementalSkipIndexing() bool {
375 state, _ := o.IndexState()
376 return state == IndexStateEqual
377}
378
379// IndexState checks how the index present on disk compares to the build
380// options and returns the IndexState and the name of the first shard.
381func (o *Options) IndexState() (IndexState, string) {
382 // Open the latest version we support that is on disk.
383 fn := o.findShard()
384 if fn == "" {
385 return IndexStateMissing, fn
386 }
387
388 repos, index, err := ReadMetadataPathAlive(fn)
389 if os.IsNotExist(err) {
390 return IndexStateMissing, fn
391 } else if err != nil {
392 return IndexStateCorrupt, fn
393 }
394
395 for _, v := range readVersions {
396 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
397 return IndexStateVersion, fn
398 }
399 }
400
401 var repo *zoekt.Repository
402 for _, cand := range repos {
403 if cand.Name == o.RepositoryDescription.Name {
404 repo = cand
405 break
406 }
407 }
408
409 if repo == nil {
410 return IndexStateCorrupt, fn
411 }
412
413 if repo.IndexOptions != o.GetHash() {
414 return IndexStateOption, fn
415 }
416
417 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
418 return IndexStateContent, fn
419 }
420
421 // We can mutate repo since it lives in the scope of this function call.
422 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
423 // non-nil err means we are trying to update an immutable field =>
424 // reindex content.
425 log.Printf("warn: immutable field changed, requires re-index: %s", err)
426 return IndexStateContent, fn
427 } else if updated {
428 return IndexStateMeta, fn
429 }
430
431 return IndexStateEqual, fn
432}
433
434// FindRepositoryMetadata returns the index metadata for the repository
435// specified in the options. 'ok' is false if the repository's metadata
436// couldn't be found or if an error occurred.
437func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) {
438 shard := o.findShard()
439 if shard == "" {
440 return nil, nil, false, nil
441 }
442
443 repositories, metadata, err := ReadMetadataPathAlive(shard)
444 if err != nil {
445 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
446 }
447
448 ID := o.RepositoryDescription.ID
449 for _, r := range repositories {
450 // compound shards contain multiple repositories, so we
451 // have to pick only the one we're looking for
452 if r.ID == ID {
453 return r, metadata, true, nil
454 }
455 }
456
457 // If we're here, then we're somehow in a state where we found a matching
458 // shard that's missing the repository metadata we're looking for. This
459 // should never happen.
460 name := o.RepositoryDescription.Name
461 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
462}
463
464func (o *Options) findShard() string {
465 for _, v := range readVersions {
466 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
467 if _, err := os.Stat(fn); err == nil {
468 return fn
469 }
470 }
471
472 // Brute force finding the shard in compound shards. We should only hit this
473 // code path for repositories that don't exist yet or are in compound shards.
474 return o.findCompoundShard()
475}
476
477func (o *Options) findCompoundShard() string {
478 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
479 if err != nil {
480 return ""
481 }
482 for _, fn := range compoundShards {
483 if containsRepo(fn, o.RepositoryDescription.ID) {
484 return fn
485 }
486 }
487
488 return ""
489}
490
491func (o *Options) FindAllShards() []string {
492 for _, v := range readVersions {
493 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
494 if _, err := os.Stat(fn); err == nil {
495 shards := []string{fn}
496 for i := 1; ; i++ {
497 fn := o.shardNameVersion(v.IndexFormatVersion, i)
498 if _, err := os.Stat(fn); err != nil {
499 return shards
500 }
501 shards = append(shards, fn)
502 }
503 }
504 }
505
506 // lazily fallback to findShard which will look for a compound shard.
507 if fn := o.findShard(); fn != "" {
508 return []string{fn}
509 }
510
511 return nil
512}
513
514// IgnoreSizeMax determines whether the max size should be ignored.
515func (o *Options) IgnoreSizeMax(name string) bool {
516 // A pattern match will override preceding pattern matches.
517 for i := len(o.LargeFiles) - 1; i >= 0; i-- {
518 pattern := strings.TrimSpace(o.LargeFiles[i])
519 negated, validatedPattern := checkIsNegatePattern(pattern)
520
521 if m, _ := doublestar.PathMatch(validatedPattern, name); m {
522 if negated {
523 return false
524 } else {
525 return true
526 }
527 }
528 }
529
530 return false
531}
532
533func checkIsNegatePattern(pattern string) (bool, string) {
534 negate := "!"
535
536 // if negated then strip prefix meta character which identifies negated filter pattern
537 if strings.HasPrefix(pattern, negate) {
538 return true, pattern[len(negate):]
539 }
540
541 return false, pattern
542}
543
544// NewBuilder creates a new Builder instance.
545func NewBuilder(opts Options) (*Builder, error) {
546 opts.SetDefaults()
547 if opts.RepositoryDescription.Name == "" {
548 return nil, fmt.Errorf("builder: must set Name")
549 }
550
551 b := &Builder{
552 opts: opts,
553 throttle: make(chan int, opts.Parallelism),
554 finishedShards: map[string]string{},
555 }
556
557 parserBins, err := ctags.NewParserBinMap(
558 b.opts.CTagsPath,
559 b.opts.ScipCTagsPath,
560 opts.LanguageMap,
561 b.opts.CTagsMustSucceed,
562 )
563 if err != nil {
564 return nil, err
565 }
566
567 b.parserBins = parserBins
568
569 if opts.IsDelta {
570 // Delta shards build on top of previously existing shards.
571 // As a consequence, the shardNum for delta shards starts from
572 // the number following the most recently generated shard - not 0.
573 //
574 // Using this numbering scheme allows all the shards to be
575 // discovered as a set.
576 shards := b.opts.FindAllShards()
577 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
578 }
579
580 if _, err := b.newShardBuilder(); err != nil {
581 return nil, err
582 }
583
584 now := time.Now()
585 b.indexTime = now
586 b.id = xid.NewWithTime(now).String()
587
588 return b, nil
589}
590
591// AddFile is a convenience wrapper for the Add method
592func (b *Builder) AddFile(name string, content []byte) error {
593 return b.Add(Document{Name: name, Content: content})
594}
595
596func (b *Builder) Add(doc Document) error {
597 if b.finishCalled {
598 return nil
599 }
600
601 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
602 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
603 // We could pass the document on to the shardbuilder, but if
604 // we pass through a part of the source tree with binary/large
605 // files, the corresponding shard would be mostly empty, so
606 // insert a reason here too.
607 doc.SkipReason = SkipReasonTooLarge
608 } else if skip := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); skip != SkipReasonNone {
609 doc.SkipReason = skip
610 }
611
612 b.todo = append(b.todo, &doc)
613
614 if doc.SkipReason == SkipReasonNone {
615 b.size += len(doc.Name) + len(doc.Content)
616 } else {
617 b.size += len(doc.Name)
618 // Drop the content if we are skipping the document. Skipped content is not counted towards the
619 // shard size limit, so otherwise we might buffer too much data in memory before flushing.
620 doc.Content = nil
621 }
622
623 if b.size > b.opts.ShardMax {
624 return b.flush()
625 }
626
627 return nil
628}
629
630// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
631// has been changed or removed since the last indexing job for this repository.
632//
633// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
634func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
635 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
636}
637
638// Finish creates a last shard from the buffered documents, and clears
639// stale shards from previous runs. This should always be called, also
640// in failure cases, to ensure cleanup.
641//
642// It is safe to call Finish() multiple times.
643func (b *Builder) Finish() error {
644 if b.finishCalled {
645 return b.buildError
646 }
647
648 b.finishCalled = true
649
650 b.flush()
651 b.building.Wait()
652
653 if b.buildError != nil {
654 for tmp := range b.finishedShards {
655 log.Printf("Builder.Finish %s", tmp)
656 os.Remove(tmp)
657 }
658 b.finishedShards = map[string]string{}
659 return b.buildError
660 }
661
662 // map of temporary -> final names for all updated shards + shard metadata files
663 artifactPaths := make(map[string]string)
664 maps.Copy(artifactPaths, b.finishedShards)
665
666 oldShards := b.opts.FindAllShards()
667
668 if b.opts.IsDelta {
669 // Delta shard builds need to update FileTombstone and branch commit information for all
670 // existing shards
671 for _, shard := range oldShards {
672 repositories, _, err := ReadMetadataPathAlive(shard)
673 if err != nil {
674 return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
675 }
676
677 if len(repositories) > 1 {
678 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
679 }
680
681 if len(repositories) == 0 {
682 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
683 }
684
685 repository := repositories[0]
686 if repository.ID != b.opts.RepositoryDescription.ID {
687 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
688 }
689
690 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
691 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
692 }
693
694 for _, f := range b.opts.changedOrRemovedFiles {
695 repository.FileTombstones[f] = struct{}{}
696 }
697
698 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
699 return deltaBranchSetError{
700 shardName: shard,
701 old: repository.Branches,
702 new: b.opts.RepositoryDescription.Branches,
703 }
704 }
705
706 if b.opts.GetHash() != repository.IndexOptions {
707 return &deltaIndexOptionsMismatchError{
708 shardName: shard,
709 newOptions: b.opts.HashOptions(),
710 }
711 }
712
713 repository.Branches = b.opts.RepositoryDescription.Branches
714
715 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
716
717 tempPath, finalPath, err := JsonMarshalRepoMetaTemp(shard, repository)
718 if err != nil {
719 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
720 }
721
722 artifactPaths[tempPath] = finalPath
723 }
724 }
725
726 // We mark finished shards as empty when we successfully finish. Return now
727 // to allow call sites to call Finish idempotently.
728 if len(artifactPaths) == 0 {
729 return b.buildError
730 }
731
732 // Collect a map of the old shards on disk. For each new shard we replace we
733 // delete it from toDelete. Anything remaining in toDelete will be removed
734 // after we have renamed everything into place.
735
736 var toDelete map[string]struct{}
737 if !b.opts.IsDelta {
738 // Non-delta shard builds delete all existing shards before they write out
739 // new ones.
740 // By contrast, delta shard builds work by stacking changes on top of existing shards.
741 // So, we skip populating the toDelete map if we're building delta shards.
742
743 toDelete = make(map[string]struct{})
744 for _, name := range oldShards {
745 paths, err := IndexFilePaths(name)
746 if err != nil {
747 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
748 }
749 for _, p := range paths {
750 toDelete[p] = struct{}{}
751 }
752 }
753 }
754
755 for tmp, final := range artifactPaths {
756 if err := os.Rename(tmp, final); err != nil {
757 b.buildError = err
758 continue
759 }
760
761 delete(toDelete, final)
762 }
763
764 b.finishedShards = map[string]string{}
765
766 for p := range toDelete {
767 // Don't delete compound shards, set tombstones instead.
768 if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") {
769 if !strings.HasSuffix(p, ".zoekt") {
770 continue
771 }
772 err := SetTombstone(p, b.opts.RepositoryDescription.ID)
773 b.buildError = err
774 continue
775 }
776 log.Printf("removing old shard file: %s", p)
777 if err := os.Remove(p); err != nil {
778 b.buildError = err
779 }
780 }
781
782 return b.buildError
783}
784
785// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
786// iff both slices specify the same set of branch names in the same order.
787func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
788 if len(a) != len(b) {
789 return false
790 }
791
792 for i := range a {
793 x, y := a[i], b[i]
794 if x.Name != y.Name {
795 return false
796 }
797 }
798
799 return true
800}
801
802func (b *Builder) flush() error {
803 todo := b.todo
804 b.todo = nil
805 b.size = 0
806 b.errMu.Lock()
807 defer b.errMu.Unlock()
808 if b.buildError != nil {
809 return b.buildError
810 }
811
812 hasShard := b.nextShardNum > 0
813 if len(todo) == 0 && hasShard {
814 return nil
815 }
816
817 shard := b.nextShardNum
818 b.nextShardNum++
819
820 if b.opts.Parallelism > 1 {
821 b.building.Add(1)
822 b.throttle <- 1
823 go func() {
824 done, err := b.buildShard(todo, shard)
825 <-b.throttle
826
827 b.errMu.Lock()
828 defer b.errMu.Unlock()
829 if err != nil && b.buildError == nil {
830 b.buildError = err
831 }
832 if err == nil {
833 b.finishedShards[done.temp] = done.final
834 }
835 b.building.Done()
836 }()
837 } else {
838 // No goroutines when we're not parallel. This
839 // simplifies memory profiling.
840 done, err := b.buildShard(todo, shard)
841 b.buildError = err
842 if err == nil {
843 b.finishedShards[done.temp] = done.final
844 }
845
846 return b.buildError
847 }
848
849 return nil
850}
851
852// map [0,inf) to [0,1) monotonically
853func squashRange(j int) float64 {
854 x := float64(j)
855 return x / (1 + x)
856}
857
858type rankedDoc struct {
859 *Document
860 rank []float64
861}
862
863// rank returns a vector of scores which is used at index-time to sort documents
864// before writing them to disk. The order of documents in the shard is important
865// at query time, because earlier documents receive a boost at query time and
866// have a higher chance of being searched before limits kick in.
867func rank(d *Document, origIdx int) []float64 {
868 skipped := 0.0
869 if d.SkipReason != SkipReasonNone {
870 skipped = 1.0
871 }
872
873 generated := 0.0
874 if enry.IsGenerated(d.Name, d.Content) {
875 generated = 1.0
876 }
877
878 vendor := 0.0
879 if enry.IsVendor(d.Name) {
880 vendor = 1.0
881 }
882
883 test := 0.0
884 if enry.IsTest(d.Name) {
885 test = 1.0
886 }
887
888 // Smaller is earlier (=better).
889 return []float64{
890 // Always place skipped docs last
891 skipped,
892
893 // Prefer docs that are not generated
894 generated,
895
896 // Prefer docs that are not vendored
897 vendor,
898
899 // Prefer docs that are not tests
900 test,
901
902 // With short names
903 squashRange(len(d.Name)),
904
905 // With many symbols
906 1.0 - squashRange(len(d.Symbols)),
907
908 // With short content
909 squashRange(len(d.Content)),
910
911 // That is present is as many branches as possible
912 1.0 - squashRange(len(d.Branches)),
913
914 // Preserve original ordering.
915 squashRange(origIdx),
916 }
917}
918
919func sortDocuments(todo []*Document) {
920 rs := make([]rankedDoc, 0, len(todo))
921 for i, t := range todo {
922 rd := rankedDoc{t, rank(t, i)}
923 rs = append(rs, rd)
924 }
925 sort.Slice(rs, func(i, j int) bool {
926 r1 := rs[i].rank
927 r2 := rs[j].rank
928 for i := range r1 {
929 if r1[i] < r2[i] {
930 return true
931 }
932 if r1[i] > r2[i] {
933 return false
934 }
935 }
936
937 return false
938 })
939 for i := range todo {
940 todo[i] = rs[i].Document
941 }
942}
943
944func (b *Builder) buildShard(todo []*Document, nextShardNum int) (*finishedShard, error) {
945 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
946 err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins)
947 if b.opts.CTagsMustSucceed && err != nil {
948 return nil, err
949 }
950 if err != nil {
951 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err)
952 }
953 }
954
955 name := b.opts.shardName(nextShardNum)
956
957 shardBuilder, err := b.newShardBuilder()
958 if err != nil {
959 return nil, err
960 }
961
962 sortDocuments(todo)
963
964 for idx, t := range todo {
965 if err := shardBuilder.Add(*t); err != nil {
966 return nil, err
967 }
968
969 if idx%10_000 == 0 {
970 b.CheckMemoryUsage()
971 }
972 }
973
974 return b.writeShard(name, shardBuilder)
975}
976
977// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the
978// configured threshold. NOTE: this method is expensive and should only be used for debugging.
979func (b *Builder) CheckMemoryUsage() {
980 // Don't check memory if heap profiling is disabled, or we've already written 10 profiles
981 if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 {
982 return
983 }
984
985 var m runtime.MemStats
986 runtime.ReadMemStats(&m)
987
988 if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() {
989 defer b.heapProfileMu.Unlock()
990
991 log.Printf("writing memory profile, allocated heap: %s", humanize.Bytes(m.HeapAlloc))
992 name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum))
993 f, err := os.Create(name)
994 if err != nil {
995 log.Printf("failed to create memory profile file: %v", err)
996 return
997 }
998
999 err = pprof.WriteHeapProfile(f)
1000 if err != nil {
1001 log.Printf("failed to write memory profile: %v", err)
1002 }
1003
1004 b.heapProfileNum++
1005 }
1006}
1007
1008func (b *Builder) newShardBuilder() (*ShardBuilder, error) {
1009 desc := b.opts.RepositoryDescription
1010 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1011 desc.SubRepoMap = b.opts.SubRepositories
1012 desc.IndexOptions = b.opts.GetHash()
1013
1014 shardBuilder, err := NewShardBuilder(&desc)
1015 if err != nil {
1016 return nil, err
1017 }
1018 shardBuilder.IndexTime = b.indexTime
1019 shardBuilder.ID = b.id
1020 return shardBuilder, nil
1021}
1022
1023func (b *Builder) writeShard(fn string, ib *ShardBuilder) (*finishedShard, error) {
1024 dir := filepath.Dir(fn)
1025 if err := os.MkdirAll(dir, 0o700); err != nil {
1026 return nil, err
1027 }
1028
1029 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1030 if err != nil {
1031 return nil, err
1032 }
1033 if runtime.GOOS != "windows" {
1034 if err := f.Chmod(0o666 &^ umask); err != nil {
1035 return nil, err
1036 }
1037 }
1038
1039 defer f.Close()
1040 if err := ib.Write(f); err != nil {
1041 return nil, err
1042 }
1043 fi, err := f.Stat()
1044 if err != nil {
1045 return nil, err
1046 }
1047 if err := f.Close(); err != nil {
1048 return nil, err
1049 }
1050
1051 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n",
1052 fn,
1053 fi.Size(),
1054 float64(fi.Size())/float64(ib.ContentSize()+1),
1055 ib.NumFiles())
1056
1057 return &finishedShard{f.Name(), fn}, nil
1058}
1059
1060type deltaBranchSetError struct {
1061 shardName string
1062 old, new []zoekt.RepositoryBranch
1063}
1064
1065func (e deltaBranchSetError) Error() string {
1066 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1067}
1068
1069type deltaIndexOptionsMismatchError struct {
1070 shardName string
1071 newOptions HashOptions
1072}
1073
1074func (e *deltaIndexOptionsMismatchError) Error() string {
1075 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1076}
1077
1078// umask holds the Umask of the current process
1079var umask os.FileMode
1080
1081func init() {
1082 umask = os.FileMode(unix.Umask(0))
1083 unix.Umask(int(umask))
1084}