fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package index contains logic for building Zoekt indexes. NOTE: this package is not considered
16// part of the public API, and it is not recommended to rely on it in external code.
17package index
18
19import (
20 "cmp"
21 "crypto/sha1"
22 "flag"
23 "fmt"
24 "log"
25 "net/url"
26 "os"
27 "os/exec"
28 "path"
29 "path/filepath"
30 "reflect"
31 "runtime"
32 "runtime/pprof"
33 "sort"
34 "strconv"
35 "strings"
36 "sync"
37 "time"
38
39 "github.com/bmatcuk/doublestar"
40 "github.com/dustin/go-humanize"
41 "github.com/go-enry/go-enry/v2"
42 "github.com/rs/xid"
43
44 "github.com/sourcegraph/zoekt"
45 "github.com/sourcegraph/zoekt/internal/ctags"
46)
47
48var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
49
50// Branch describes a single branch version.
51type Branch struct {
52 Name string
53 Version string
54}
55
56// Options sets options for the index building.
57type Options struct {
58 // IndexDir is a directory that holds *.zoekt index files.
59 IndexDir string
60
61 // SizeMax is the maximum file size
62 SizeMax int
63
64 // Parallelism is the maximum number of shards to index in parallel
65 Parallelism int
66
67 // ShardMax sets the maximum corpus size for a single shard
68 ShardMax int
69
70 // TrigramMax sets the maximum number of distinct trigrams per document.
71 TrigramMax int
72
73 // RepositoryDescription holds names and URLs for the repository.
74 RepositoryDescription zoekt.Repository
75
76 // SubRepositories is a path => sub repository map.
77 SubRepositories map[string]*zoekt.Repository
78
79 // DisableCTags disables the generation of ctags metadata.
80 DisableCTags bool
81
82 // CtagsPath is the path to the ctags binary to run, or empty
83 // if a valid binary couldn't be found.
84 CTagsPath string
85
86 // Same as CTagsPath but for scip-ctags
87 ScipCTagsPath string
88
89 // If set, ctags must succeed.
90 CTagsMustSucceed bool
91
92 // LargeFiles is a slice of glob patterns, including ** for any number
93 // of directories, where matching file paths should be indexed
94 // regardless of their size. The full pattern syntax is here:
95 // https://github.com/bmatcuk/doublestar/tree/v1#patterns.
96 LargeFiles []string
97
98 // IsDelta is true if this run contains only the changed documents since the
99 // last run.
100 IsDelta bool
101
102 // changedOrRemovedFiles is a list of file paths that have been changed or removed
103 // since the last indexing job for this repository. These files will be tombstoned
104 // in the older shards for this repository.
105 changedOrRemovedFiles []string
106
107 LanguageMap ctags.LanguageMap
108
109 // ShardMerging is true if builder should respect compound shards. This is a
110 // Sourcegraph specific option.
111 ShardMerging bool
112
113 // HeapProfileTriggerBytes is the heap allocation in bytes that will trigger a memory profile. If 0, no memory profile
114 // will be triggered. Note this trigger looks at total heap allocation (which includes both inuse and garbage objects).
115 //
116 // Profiles will be written to files named `index-memory.prof.n` in the index directory. No more than 10 files are written.
117 //
118 // Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile.
119 HeapProfileTriggerBytes uint64
120
121 // ShardPrefix is the prefix of the shard. It defaults to the repository name.
122 ShardPrefix string
123}
124
125// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building.
126type HashOptions struct {
127 sizeMax int
128 disableCTags bool
129 ctagsPath string
130 cTagsMustSucceed bool
131 largeFiles []string
132}
133
134func (o *Options) HashOptions() HashOptions {
135 return HashOptions{
136 sizeMax: o.SizeMax,
137 disableCTags: o.DisableCTags,
138 ctagsPath: o.CTagsPath,
139 cTagsMustSucceed: o.CTagsMustSucceed,
140 largeFiles: o.LargeFiles,
141 }
142}
143
144func (o *Options) GetHash() string {
145 h := o.HashOptions()
146 hasher := sha1.New()
147
148 hasher.Write([]byte(h.ctagsPath))
149 hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed)))
150 hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax)))
151 hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles)))
152 hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags)))
153
154 return fmt.Sprintf("%x", hasher.Sum(nil))
155}
156
157type largeFilesFlag struct{ *Options }
158
159func (f largeFilesFlag) String() string {
160 // From flag.Value documentation:
161 //
162 // The flag package may call the String method with a zero-valued receiver,
163 // such as a nil pointer.
164 if f.Options == nil {
165 return ""
166 }
167 s := append([]string{""}, f.LargeFiles...)
168 return strings.Join(s, "-large_file ")
169}
170
171func (f largeFilesFlag) Set(value string) error {
172 f.LargeFiles = append(f.LargeFiles, value)
173 return nil
174}
175
176// Flags adds flags for build options to fs. It is the "inverse" of Args.
177func (o *Options) Flags(fs *flag.FlagSet) {
178 x := *o
179 x.SetDefaults()
180 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
181 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
182 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
183 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
184 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
185 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
186 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
187 fs.StringVar(&o.ShardPrefix, "shard_prefix", x.ShardPrefix, "the prefix of the shard. Defaults to repository name")
188
189 // Sourcegraph specific
190 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
191 fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.")
192}
193
194// Args generates command line arguments for o. It is the "inverse" of Flags.
195func (o *Options) Args() []string {
196 var args []string
197
198 if o.SizeMax != 0 {
199 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
200 }
201
202 if o.TrigramMax != 0 {
203 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
204 }
205
206 if o.ShardMax != 0 {
207 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
208 }
209
210 if o.Parallelism != 0 {
211 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
212 }
213
214 if o.IndexDir != "" {
215 args = append(args, "-index", o.IndexDir)
216 }
217
218 if o.CTagsMustSucceed {
219 args = append(args, "-require_ctags")
220 }
221
222 for _, a := range o.LargeFiles {
223 args = append(args, "-large_file", a)
224 }
225
226 // Sourcegraph specific
227 if o.DisableCTags {
228 args = append(args, "-disable_ctags")
229 }
230
231 if o.ShardMerging {
232 args = append(args, "-shard_merging")
233 }
234
235 if o.ShardPrefix != "" {
236 args = append(args, "-shard_prefix", o.ShardPrefix)
237 }
238
239 return args
240}
241
242// Builder manages (parallel) creation of uniformly sized shards. The
243// builder buffers up documents until it collects enough documents and
244// then builds a shard and writes.
245type Builder struct {
246 opts Options
247 throttle chan int
248
249 nextShardNum int
250 todo []*Document
251 docChecker DocChecker
252 size int
253
254 parserBins ctags.ParserBinMap
255 building sync.WaitGroup
256
257 errMu sync.Mutex
258 buildError error
259
260 // temp name => final name for finished shards. We only rename
261 // them once all shards succeed to avoid Frankstein corpuses.
262 finishedShards map[string]string
263
264 // indexTime is set by tests for doing reproducible builds.
265 indexTime time.Time
266
267 // heapProfileMu is used to ensure that only one memory profile is written at a time
268 heapProfileMu sync.Mutex
269 heapProfileNum int
270
271 // a sortable 20 chars long id.
272 id string
273
274 finishCalled bool
275}
276
277type finishedShard struct {
278 temp, final string
279}
280
281func checkCTags() string {
282 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" {
283 return ctags
284 }
285
286 if ctags, err := exec.LookPath("universal-ctags"); err == nil {
287 return ctags
288 }
289
290 return ""
291}
292
293func checkScipCTags() string {
294 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" {
295 return ctags
296 }
297
298 if ctags, err := exec.LookPath("scip-ctags"); err == nil {
299 return ctags
300 }
301
302 return ""
303}
304
305// SetDefaults sets reasonable default options.
306func (o *Options) SetDefaults() {
307 if o.CTagsPath == "" && !o.DisableCTags {
308 o.CTagsPath = checkCTags()
309 }
310
311 if o.ScipCTagsPath == "" && !o.DisableCTags {
312 o.ScipCTagsPath = checkScipCTags()
313 }
314
315 if o.Parallelism == 0 {
316 o.Parallelism = 4
317 }
318 if o.SizeMax == 0 {
319 o.SizeMax = 2 << 20
320 }
321 if o.ShardMax == 0 {
322 o.ShardMax = 100 << 20
323 }
324 if o.TrigramMax == 0 {
325 o.TrigramMax = 20000
326 }
327
328 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
329 parsed, _ := url.Parse(o.RepositoryDescription.URL)
330 if parsed != nil {
331 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
332 }
333 }
334}
335
336// ShardName returns the name the given index shard.
337func (o *Options) shardName(n int) string {
338 return o.shardNameVersion(IndexFormatVersion, n)
339}
340
341func (o *Options) shardNameVersion(version, n int) string {
342 return ShardName(o.IndexDir, cmp.Or(o.ShardPrefix, o.RepositoryDescription.Name), version, n)
343}
344
345type IndexState string
346
347const (
348 IndexStateMissing IndexState = "missing"
349 IndexStateCorrupt IndexState = "corrupt"
350 IndexStateVersion IndexState = "version-mismatch"
351 IndexStateOption IndexState = "option-mismatch"
352 IndexStateMeta IndexState = "meta-mismatch"
353 IndexStateContent IndexState = "content-mismatch"
354 IndexStateEqual IndexState = "equal"
355)
356
357var readVersions = []struct {
358 IndexFormatVersion int
359 FeatureVersion int
360}{{
361 IndexFormatVersion: IndexFormatVersion,
362 FeatureVersion: FeatureVersion,
363}, {
364 IndexFormatVersion: NextIndexFormatVersion,
365 FeatureVersion: FeatureVersion,
366}}
367
368// IncrementalSkipIndexing returns true if the index present on disk matches
369// the build options.
370func (o *Options) IncrementalSkipIndexing() bool {
371 state, _ := o.IndexState()
372 return state == IndexStateEqual
373}
374
375// IndexState checks how the index present on disk compares to the build
376// options and returns the IndexState and the name of the first shard.
377func (o *Options) IndexState() (IndexState, string) {
378 // Open the latest version we support that is on disk.
379 fn := o.findShard()
380 if fn == "" {
381 return IndexStateMissing, fn
382 }
383
384 repos, index, err := ReadMetadataPathAlive(fn)
385 if os.IsNotExist(err) {
386 return IndexStateMissing, fn
387 } else if err != nil {
388 return IndexStateCorrupt, fn
389 }
390
391 for _, v := range readVersions {
392 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
393 return IndexStateVersion, fn
394 }
395 }
396
397 var repo *zoekt.Repository
398 for _, cand := range repos {
399 if cand.Name == o.RepositoryDescription.Name {
400 repo = cand
401 break
402 }
403 }
404
405 if repo == nil {
406 return IndexStateCorrupt, fn
407 }
408
409 if repo.IndexOptions != o.GetHash() {
410 return IndexStateOption, fn
411 }
412
413 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
414 return IndexStateContent, fn
415 }
416
417 // We can mutate repo since it lives in the scope of this function call.
418 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
419 // non-nil err means we are trying to update an immutable field =>
420 // reindex content.
421 log.Printf("warn: immutable field changed, requires re-index: %s", err)
422 return IndexStateContent, fn
423 } else if updated {
424 return IndexStateMeta, fn
425 }
426
427 return IndexStateEqual, fn
428}
429
430// FindRepositoryMetadata returns the index metadata for the repository
431// specified in the options. 'ok' is false if the repository's metadata
432// couldn't be found or if an error occurred.
433func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) {
434 shard := o.findShard()
435 if shard == "" {
436 return nil, nil, false, nil
437 }
438
439 repositories, metadata, err := ReadMetadataPathAlive(shard)
440 if err != nil {
441 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
442 }
443
444 ID := o.RepositoryDescription.ID
445 for _, r := range repositories {
446 // compound shards contain multiple repositories, so we
447 // have to pick only the one we're looking for
448 if r.ID == ID {
449 return r, metadata, true, nil
450 }
451 }
452
453 // If we're here, then we're somehow in a state where we found a matching
454 // shard that's missing the repository metadata we're looking for. This
455 // should never happen.
456 name := o.RepositoryDescription.Name
457 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
458}
459
460func (o *Options) findShard() string {
461 for _, v := range readVersions {
462 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
463 if _, err := os.Stat(fn); err == nil {
464 return fn
465 }
466 }
467
468 // Brute force finding the shard in compound shards. We should only hit this
469 // code path for repositories that are not already existing or are in
470 // compound shards.
471 //
472 // TODO add an oracle which can speed this up in the case of repositories
473 // already in compound shards.
474 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
475 if err != nil {
476 return ""
477 }
478 for _, fn := range compoundShards {
479 repos, _, err := ReadMetadataPathAlive(fn)
480 if err != nil {
481 continue
482 }
483 for _, repo := range repos {
484 if repo.ID == o.RepositoryDescription.ID {
485 return fn
486 }
487 }
488 }
489
490 return ""
491}
492
493func (o *Options) FindAllShards() []string {
494 for _, v := range readVersions {
495 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
496 if _, err := os.Stat(fn); err == nil {
497 shards := []string{fn}
498 for i := 1; ; i++ {
499 fn := o.shardNameVersion(v.IndexFormatVersion, i)
500 if _, err := os.Stat(fn); err != nil {
501 return shards
502 }
503 shards = append(shards, fn)
504 }
505 }
506 }
507
508 // lazily fallback to findShard which will look for a compound shard.
509 if fn := o.findShard(); fn != "" {
510 return []string{fn}
511 }
512
513 return nil
514}
515
516// IgnoreSizeMax determines whether the max size should be ignored.
517func (o *Options) IgnoreSizeMax(name string) bool {
518 // A pattern match will override preceding pattern matches.
519 for i := len(o.LargeFiles) - 1; i >= 0; i-- {
520 pattern := strings.TrimSpace(o.LargeFiles[i])
521 negated, validatedPattern := checkIsNegatePattern(pattern)
522
523 if m, _ := doublestar.PathMatch(validatedPattern, name); m {
524 if negated {
525 return false
526 } else {
527 return true
528 }
529 }
530 }
531
532 return false
533}
534
535func checkIsNegatePattern(pattern string) (bool, string) {
536 negate := "!"
537
538 // if negated then strip prefix meta character which identifies negated filter pattern
539 if strings.HasPrefix(pattern, negate) {
540 return true, pattern[len(negate):]
541 }
542
543 return false, pattern
544}
545
546// NewBuilder creates a new Builder instance.
547func NewBuilder(opts Options) (*Builder, error) {
548 opts.SetDefaults()
549 if opts.RepositoryDescription.Name == "" {
550 return nil, fmt.Errorf("builder: must set Name")
551 }
552
553 b := &Builder{
554 opts: opts,
555 throttle: make(chan int, opts.Parallelism),
556 finishedShards: map[string]string{},
557 }
558
559 parserBins, err := ctags.NewParserBinMap(
560 b.opts.CTagsPath,
561 b.opts.ScipCTagsPath,
562 opts.LanguageMap,
563 b.opts.CTagsMustSucceed,
564 )
565 if err != nil {
566 return nil, err
567 }
568
569 b.parserBins = parserBins
570
571 if opts.IsDelta {
572 // Delta shards build on top of previously existing shards.
573 // As a consequence, the shardNum for delta shards starts from
574 // the number following the most recently generated shard - not 0.
575 //
576 // Using this numbering scheme allows all the shards to be
577 // discovered as a set.
578 shards := b.opts.FindAllShards()
579 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
580 }
581
582 if _, err := b.newShardBuilder(); err != nil {
583 return nil, err
584 }
585
586 now := time.Now()
587 b.indexTime = now
588 b.id = xid.NewWithTime(now).String()
589
590 return b, nil
591}
592
593// AddFile is a convenience wrapper for the Add method
594func (b *Builder) AddFile(name string, content []byte) error {
595 return b.Add(Document{Name: name, Content: content})
596}
597
598func (b *Builder) Add(doc Document) error {
599 if b.finishCalled {
600 return nil
601 }
602
603 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
604 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
605 // We could pass the document on to the shardbuilder, but if
606 // we pass through a part of the source tree with binary/large
607 // files, the corresponding shard would be mostly empty, so
608 // insert a reason here too.
609 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
610 } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
611 doc.SkipReason = err.Error()
612 doc.Language = "binary"
613 }
614
615 b.todo = append(b.todo, &doc)
616
617 if doc.SkipReason == "" {
618 b.size += len(doc.Name) + len(doc.Content)
619 } else {
620 b.size += len(doc.Name) + len(doc.SkipReason)
621 // Drop the content if we are skipping the document. Skipped content is not counted towards the
622 // shard size limit, so otherwise we might buffer too much data in memory before flushing.
623 doc.Content = nil
624 }
625
626 if b.size > b.opts.ShardMax {
627 return b.flush()
628 }
629
630 return nil
631}
632
633// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
634// has been changed or removed since the last indexing job for this repository.
635//
636// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
637func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
638 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
639}
640
641// Finish creates a last shard from the buffered documents, and clears
642// stale shards from previous runs. This should always be called, also
643// in failure cases, to ensure cleanup.
644//
645// It is safe to call Finish() multiple times.
646func (b *Builder) Finish() error {
647 if b.finishCalled {
648 return b.buildError
649 }
650
651 b.finishCalled = true
652
653 b.flush()
654 b.building.Wait()
655
656 if b.buildError != nil {
657 for tmp := range b.finishedShards {
658 log.Printf("Builder.Finish %s", tmp)
659 os.Remove(tmp)
660 }
661 b.finishedShards = map[string]string{}
662 return b.buildError
663 }
664
665 // map of temporary -> final names for all updated shards + shard metadata files
666 artifactPaths := make(map[string]string)
667 for tmp, final := range b.finishedShards {
668 artifactPaths[tmp] = final
669 }
670
671 oldShards := b.opts.FindAllShards()
672
673 if b.opts.IsDelta {
674 // Delta shard builds need to update FileTombstone and branch commit information for all
675 // existing shards
676 for _, shard := range oldShards {
677 repositories, _, err := ReadMetadataPathAlive(shard)
678 if err != nil {
679 return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
680 }
681
682 if len(repositories) > 1 {
683 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
684 }
685
686 if len(repositories) == 0 {
687 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
688 }
689
690 repository := repositories[0]
691 if repository.ID != b.opts.RepositoryDescription.ID {
692 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
693 }
694
695 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
696 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
697 }
698
699 for _, f := range b.opts.changedOrRemovedFiles {
700 repository.FileTombstones[f] = struct{}{}
701 }
702
703 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
704 return deltaBranchSetError{
705 shardName: shard,
706 old: repository.Branches,
707 new: b.opts.RepositoryDescription.Branches,
708 }
709 }
710
711 if b.opts.GetHash() != repository.IndexOptions {
712 return &deltaIndexOptionsMismatchError{
713 shardName: shard,
714 newOptions: b.opts.HashOptions(),
715 }
716 }
717
718 repository.Branches = b.opts.RepositoryDescription.Branches
719
720 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
721
722 tempPath, finalPath, err := JsonMarshalRepoMetaTemp(shard, repository)
723 if err != nil {
724 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
725 }
726
727 artifactPaths[tempPath] = finalPath
728 }
729 }
730
731 // We mark finished shards as empty when we successfully finish. Return now
732 // to allow call sites to call Finish idempotently.
733 if len(artifactPaths) == 0 {
734 return b.buildError
735 }
736
737 // Collect a map of the old shards on disk. For each new shard we replace we
738 // delete it from toDelete. Anything remaining in toDelete will be removed
739 // after we have renamed everything into place.
740
741 var toDelete map[string]struct{}
742 if !b.opts.IsDelta {
743 // Non-delta shard builds delete all existing shards before they write out
744 // new ones.
745 // By contrast, delta shard builds work by stacking changes on top of existing shards.
746 // So, we skip populating the toDelete map if we're building delta shards.
747
748 toDelete = make(map[string]struct{})
749 for _, name := range oldShards {
750 paths, err := IndexFilePaths(name)
751 if err != nil {
752 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
753 }
754 for _, p := range paths {
755 toDelete[p] = struct{}{}
756 }
757 }
758 }
759
760 for tmp, final := range artifactPaths {
761 if err := os.Rename(tmp, final); err != nil {
762 b.buildError = err
763 continue
764 }
765
766 delete(toDelete, final)
767 }
768
769 b.finishedShards = map[string]string{}
770
771 for p := range toDelete {
772 // Don't delete compound shards, set tombstones instead.
773 if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") {
774 if !strings.HasSuffix(p, ".zoekt") {
775 continue
776 }
777 err := SetTombstone(p, b.opts.RepositoryDescription.ID)
778 b.buildError = err
779 continue
780 }
781 log.Printf("removing old shard file: %s", p)
782 if err := os.Remove(p); err != nil {
783 b.buildError = err
784 }
785 }
786
787 return b.buildError
788}
789
790// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
791// iff both slices specify the same set of branch names in the same order.
792func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
793 if len(a) != len(b) {
794 return false
795 }
796
797 for i := range a {
798 x, y := a[i], b[i]
799 if x.Name != y.Name {
800 return false
801 }
802 }
803
804 return true
805}
806
807func (b *Builder) flush() error {
808 todo := b.todo
809 b.todo = nil
810 b.size = 0
811 b.errMu.Lock()
812 defer b.errMu.Unlock()
813 if b.buildError != nil {
814 return b.buildError
815 }
816
817 hasShard := b.nextShardNum > 0
818 if len(todo) == 0 && hasShard {
819 return nil
820 }
821
822 shard := b.nextShardNum
823 b.nextShardNum++
824
825 if b.opts.Parallelism > 1 {
826 b.building.Add(1)
827 b.throttle <- 1
828 go func() {
829 done, err := b.buildShard(todo, shard)
830 <-b.throttle
831
832 b.errMu.Lock()
833 defer b.errMu.Unlock()
834 if err != nil && b.buildError == nil {
835 b.buildError = err
836 }
837 if err == nil {
838 b.finishedShards[done.temp] = done.final
839 }
840 b.building.Done()
841 }()
842 } else {
843 // No goroutines when we're not parallel. This
844 // simplifies memory profiling.
845 done, err := b.buildShard(todo, shard)
846 b.buildError = err
847 if err == nil {
848 b.finishedShards[done.temp] = done.final
849 }
850
851 return b.buildError
852 }
853
854 return nil
855}
856
857// map [0,inf) to [0,1) monotonically
858func squashRange(j int) float64 {
859 x := float64(j)
860 return x / (1 + x)
861}
862
863// IsLowPriority takes a file name and makes an educated guess about its priority
864// in search results. A file is considered low priority if it looks like a test,
865// vendored, or generated file.
866//
867// These 'priority' criteria affects how documents are ordered within a shard. It's
868// also used to help guess a file's rank when we're missing ranking information.
869func IsLowPriority(path string, content []byte) bool {
870 return enry.IsTest(path) || enry.IsVendor(path) || enry.IsGenerated(path, content)
871}
872
873type rankedDoc struct {
874 *Document
875 rank []float64
876}
877
878// rank returns a vector of scores which is used at index-time to sort documents
879// before writing them to disk. The order of documents in the shard is important
880// at query time, because earlier documents receive a boost at query time and
881// have a higher chance of being searched before limits kick in.
882func rank(d *Document, origIdx int) []float64 {
883 skipped := 0.0
884 if d.SkipReason != "" {
885 skipped = 1.0
886 }
887
888 generated := 0.0
889 if enry.IsGenerated(d.Name, d.Content) {
890 generated = 1.0
891 }
892
893 vendor := 0.0
894 if enry.IsVendor(d.Name) {
895 vendor = 1.0
896 }
897
898 test := 0.0
899 if enry.IsTest(d.Name) {
900 test = 1.0
901 }
902
903 // Smaller is earlier (=better).
904 return []float64{
905 // Always place skipped docs last
906 skipped,
907
908 // Prefer docs that are not generated
909 generated,
910
911 // Prefer docs that are not vendored
912 vendor,
913
914 // Prefer docs that are not tests
915 test,
916
917 // With short names
918 squashRange(len(d.Name)),
919
920 // With many symbols
921 1.0 - squashRange(len(d.Symbols)),
922
923 // With short content
924 squashRange(len(d.Content)),
925
926 // That is present is as many branches as possible
927 1.0 - squashRange(len(d.Branches)),
928
929 // Preserve original ordering.
930 squashRange(origIdx),
931 }
932}
933
934func sortDocuments(todo []*Document) {
935 rs := make([]rankedDoc, 0, len(todo))
936 for i, t := range todo {
937 rd := rankedDoc{t, rank(t, i)}
938 rs = append(rs, rd)
939 }
940 sort.Slice(rs, func(i, j int) bool {
941 r1 := rs[i].rank
942 r2 := rs[j].rank
943 for i := range r1 {
944 if r1[i] < r2[i] {
945 return true
946 }
947 if r1[i] > r2[i] {
948 return false
949 }
950 }
951
952 return false
953 })
954 for i := range todo {
955 todo[i] = rs[i].Document
956 }
957}
958
959func (b *Builder) buildShard(todo []*Document, nextShardNum int) (*finishedShard, error) {
960 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
961 err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins)
962 if b.opts.CTagsMustSucceed && err != nil {
963 return nil, err
964 }
965 if err != nil {
966 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err)
967 }
968 }
969
970 name := b.opts.shardName(nextShardNum)
971
972 shardBuilder, err := b.newShardBuilder()
973 if err != nil {
974 return nil, err
975 }
976
977 sortDocuments(todo)
978
979 for idx, t := range todo {
980 if err := shardBuilder.Add(*t); err != nil {
981 return nil, err
982 }
983
984 if idx%10_000 == 0 {
985 b.CheckMemoryUsage()
986 }
987 }
988
989 return b.writeShard(name, shardBuilder)
990}
991
992// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the
993// configured threshold. NOTE: this method is expensive and should only be used for debugging.
994func (b *Builder) CheckMemoryUsage() {
995 // Don't check memory if heap profiling is disabled, or we've already written 10 profiles
996 if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 {
997 return
998 }
999
1000 var m runtime.MemStats
1001 runtime.ReadMemStats(&m)
1002
1003 if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() {
1004 defer b.heapProfileMu.Unlock()
1005
1006 log.Printf("writing memory profile, allocated heap: %s", humanize.Bytes(m.HeapAlloc))
1007 name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum))
1008 f, err := os.Create(name)
1009 if err != nil {
1010 log.Printf("failed to create memory profile file: %v", err)
1011 return
1012 }
1013
1014 err = pprof.WriteHeapProfile(f)
1015 if err != nil {
1016 log.Printf("failed to write memory profile: %v", err)
1017 }
1018
1019 b.heapProfileNum++
1020 }
1021}
1022
1023func (b *Builder) newShardBuilder() (*ShardBuilder, error) {
1024 desc := b.opts.RepositoryDescription
1025 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1026 desc.SubRepoMap = b.opts.SubRepositories
1027 desc.IndexOptions = b.opts.GetHash()
1028
1029 shardBuilder, err := NewShardBuilder(&desc)
1030 if err != nil {
1031 return nil, err
1032 }
1033 shardBuilder.IndexTime = b.indexTime
1034 shardBuilder.ID = b.id
1035 return shardBuilder, nil
1036}
1037
1038func (b *Builder) writeShard(fn string, ib *ShardBuilder) (*finishedShard, error) {
1039 dir := filepath.Dir(fn)
1040 if err := os.MkdirAll(dir, 0o700); err != nil {
1041 return nil, err
1042 }
1043
1044 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1045 if err != nil {
1046 return nil, err
1047 }
1048 if runtime.GOOS != "windows" {
1049 if err := f.Chmod(0o666 &^ umask); err != nil {
1050 return nil, err
1051 }
1052 }
1053
1054 defer f.Close()
1055 if err := ib.Write(f); err != nil {
1056 return nil, err
1057 }
1058 fi, err := f.Stat()
1059 if err != nil {
1060 return nil, err
1061 }
1062 if err := f.Close(); err != nil {
1063 return nil, err
1064 }
1065
1066 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n",
1067 fn,
1068 fi.Size(),
1069 float64(fi.Size())/float64(ib.ContentSize()+1),
1070 ib.NumFiles())
1071
1072 return &finishedShard{f.Name(), fn}, nil
1073}
1074
1075type deltaBranchSetError struct {
1076 shardName string
1077 old, new []zoekt.RepositoryBranch
1078}
1079
1080func (e deltaBranchSetError) Error() string {
1081 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1082}
1083
1084type deltaIndexOptionsMismatchError struct {
1085 shardName string
1086 newOptions HashOptions
1087}
1088
1089func (e *deltaIndexOptionsMismatchError) Error() string {
1090 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1091}
1092
1093// umask holds the Umask of the current process
1094var umask os.FileMode
1095
1096// Document holds a document (file) to index.
1097type Document struct {
1098 Name string
1099 Content []byte
1100 Branches []string
1101 SubRepositoryPath string
1102 Language string
1103
1104 // If set, something is wrong with the file contents, and this
1105 // is the reason it wasn't indexed.
1106 SkipReason string
1107
1108 // Document sections for symbols. Offsets should use bytes.
1109 Symbols []DocumentSection
1110 SymbolsMetaData []*zoekt.Symbol
1111}
1112
1113type DocumentSection struct {
1114 Start, End uint32
1115}