fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// package build implements a more convenient interface for building
16// zoekt indices.
17package build
18
19import (
20 "cmp"
21 "crypto/sha1"
22 "flag"
23 "fmt"
24 "io"
25 "log"
26 "net/url"
27 "os"
28 "os/exec"
29 "path"
30 "path/filepath"
31 "reflect"
32 "runtime"
33 "runtime/pprof"
34 "sort"
35 "strconv"
36 "strings"
37 "sync"
38 "time"
39
40 "github.com/bmatcuk/doublestar"
41 "github.com/dustin/go-humanize"
42 "github.com/go-enry/go-enry/v2"
43 "github.com/rs/xid"
44
45 "github.com/sourcegraph/zoekt"
46 "github.com/sourcegraph/zoekt/ctags"
47)
48
49var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
50
51// Branch describes a single branch version.
52type Branch struct {
53 Name string
54 Version string
55}
56
57// Options sets options for the index building.
58type Options struct {
59 // IndexDir is a directory that holds *.zoekt index files.
60 IndexDir string
61
62 // SizeMax is the maximum file size
63 SizeMax int
64
65 // Parallelism is the maximum number of shards to index in parallel
66 Parallelism int
67
68 // ShardMax sets the maximum corpus size for a single shard
69 ShardMax int
70
71 // TrigramMax sets the maximum number of distinct trigrams per document.
72 TrigramMax int
73
74 // RepositoryDescription holds names and URLs for the repository.
75 RepositoryDescription zoekt.Repository
76
77 // SubRepositories is a path => sub repository map.
78 SubRepositories map[string]*zoekt.Repository
79
80 // DisableCTags disables the generation of ctags metadata.
81 DisableCTags bool
82
83 // CtagsPath is the path to the ctags binary to run, or empty
84 // if a valid binary couldn't be found.
85 CTagsPath string
86
87 // Same as CTagsPath but for scip-ctags
88 ScipCTagsPath string
89
90 // If set, ctags must succeed.
91 CTagsMustSucceed bool
92
93 // LargeFiles is a slice of glob patterns, including ** for any number
94 // of directories, where matching file paths should be indexed
95 // regardless of their size. The full pattern syntax is here:
96 // https://github.com/bmatcuk/doublestar/tree/v1#patterns.
97 LargeFiles []string
98
99 // IsDelta is true if this run contains only the changed documents since the
100 // last run.
101 IsDelta bool
102
103 // changedOrRemovedFiles is a list of file paths that have been changed or removed
104 // since the last indexing job for this repository. These files will be tombstoned
105 // in the older shards for this repository.
106 changedOrRemovedFiles []string
107
108 LanguageMap ctags.LanguageMap
109
110 // ShardMerging is true if builder should respect compound shards. This is a
111 // Sourcegraph specific option.
112 ShardMerging bool
113
114 // HeapProfileTriggerBytes is the heap allocation in bytes that will trigger a memory profile. If 0, no memory profile
115 // will be triggered. Note this trigger looks at total heap allocation (which includes both inuse and garbage objects).
116 //
117 // Profiles will be written to files named `index-memory.prof.n` in the index directory. No more than 10 files are written.
118 //
119 // Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile.
120 HeapProfileTriggerBytes uint64
121
122 // ShardPrefix is the prefix of the shard. It defaults to the repository name.
123 ShardPrefix string
124}
125
126// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building.
127type HashOptions struct {
128 sizeMax int
129 disableCTags bool
130 ctagsPath string
131 cTagsMustSucceed bool
132 largeFiles []string
133}
134
135func (o *Options) HashOptions() HashOptions {
136 return HashOptions{
137 sizeMax: o.SizeMax,
138 disableCTags: o.DisableCTags,
139 ctagsPath: o.CTagsPath,
140 cTagsMustSucceed: o.CTagsMustSucceed,
141 largeFiles: o.LargeFiles,
142 }
143}
144
145func (o *Options) GetHash() string {
146 h := o.HashOptions()
147 hasher := sha1.New()
148
149 hasher.Write([]byte(h.ctagsPath))
150 hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed)))
151 hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax)))
152 hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles)))
153 hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags)))
154
155 return fmt.Sprintf("%x", hasher.Sum(nil))
156}
157
158type largeFilesFlag struct{ *Options }
159
160func (f largeFilesFlag) String() string {
161 // From flag.Value documentation:
162 //
163 // The flag package may call the String method with a zero-valued receiver,
164 // such as a nil pointer.
165 if f.Options == nil {
166 return ""
167 }
168 s := append([]string{""}, f.LargeFiles...)
169 return strings.Join(s, "-large_file ")
170}
171
172func (f largeFilesFlag) Set(value string) error {
173 f.LargeFiles = append(f.LargeFiles, value)
174 return nil
175}
176
177// Flags adds flags for build options to fs. It is the "inverse" of Args.
178func (o *Options) Flags(fs *flag.FlagSet) {
179 x := *o
180 x.SetDefaults()
181 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
182 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
183 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
184 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
185 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
186 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
187 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
188 fs.StringVar(&o.ShardPrefix, "shard_prefix", x.ShardPrefix, "the prefix of the shard. Defaults to repository name")
189
190 // Sourcegraph specific
191 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
192 fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.")
193}
194
195// Args generates command line arguments for o. It is the "inverse" of Flags.
196func (o *Options) Args() []string {
197 var args []string
198
199 if o.SizeMax != 0 {
200 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
201 }
202
203 if o.TrigramMax != 0 {
204 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
205 }
206
207 if o.ShardMax != 0 {
208 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
209 }
210
211 if o.Parallelism != 0 {
212 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
213 }
214
215 if o.IndexDir != "" {
216 args = append(args, "-index", o.IndexDir)
217 }
218
219 if o.CTagsMustSucceed {
220 args = append(args, "-require_ctags")
221 }
222
223 for _, a := range o.LargeFiles {
224 args = append(args, "-large_file", a)
225 }
226
227 // Sourcegraph specific
228 if o.DisableCTags {
229 args = append(args, "-disable_ctags")
230 }
231
232 if o.ShardMerging {
233 args = append(args, "-shard_merging")
234 }
235
236 if o.ShardPrefix != "" {
237 args = append(args, "-shard_prefix", o.ShardPrefix)
238 }
239
240 return args
241}
242
243// Builder manages (parallel) creation of uniformly sized shards. The
244// builder buffers up documents until it collects enough documents and
245// then builds a shard and writes.
246type Builder struct {
247 opts Options
248 throttle chan int
249
250 nextShardNum int
251 todo []*zoekt.Document
252 docChecker zoekt.DocChecker
253 size int
254
255 parserBins ctags.ParserBinMap
256 building sync.WaitGroup
257
258 errMu sync.Mutex
259 buildError error
260
261 // temp name => final name for finished shards. We only rename
262 // them once all shards succeed to avoid Frankstein corpuses.
263 finishedShards map[string]string
264
265 // indexTime is set by tests for doing reproducible builds.
266 indexTime time.Time
267
268 // heapProfileMu is used to ensure that only one memory profile is written at a time
269 heapProfileMu sync.Mutex
270 heapProfileNum int
271
272 // a sortable 20 chars long id.
273 id string
274
275 finishCalled bool
276}
277
278type finishedShard struct {
279 temp, final string
280}
281
282func checkCTags() string {
283 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" {
284 return ctags
285 }
286
287 if ctags, err := exec.LookPath("universal-ctags"); err == nil {
288 return ctags
289 }
290
291 return ""
292}
293
294func checkScipCTags() string {
295 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" {
296 return ctags
297 }
298
299 if ctags, err := exec.LookPath("scip-ctags"); err == nil {
300 return ctags
301 }
302
303 return ""
304}
305
306// SetDefaults sets reasonable default options.
307func (o *Options) SetDefaults() {
308 if o.CTagsPath == "" && !o.DisableCTags {
309 o.CTagsPath = checkCTags()
310 }
311
312 if o.ScipCTagsPath == "" && !o.DisableCTags {
313 o.ScipCTagsPath = checkScipCTags()
314 }
315
316 if o.Parallelism == 0 {
317 o.Parallelism = 4
318 }
319 if o.SizeMax == 0 {
320 o.SizeMax = 2 << 20
321 }
322 if o.ShardMax == 0 {
323 o.ShardMax = 100 << 20
324 }
325 if o.TrigramMax == 0 {
326 o.TrigramMax = 20000
327 }
328
329 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
330 parsed, _ := url.Parse(o.RepositoryDescription.URL)
331 if parsed != nil {
332 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
333 }
334 }
335}
336
337func hashString(s string) string {
338 h := sha1.New()
339 _, _ = io.WriteString(h, s)
340 return fmt.Sprintf("%x", h.Sum(nil))
341}
342
343// ShardName returns the name the given index shard.
344func (o *Options) shardName(n int) string {
345 return o.shardNameVersion(zoekt.IndexFormatVersion, n)
346}
347
348func (o *Options) shardNameVersion(version, n int) string {
349 prefix := url.QueryEscape(cmp.Or(o.ShardPrefix, o.RepositoryDescription.Name))
350 if len(prefix) > 200 {
351 prefix = prefix[:200] + hashString(prefix)[:8]
352 }
353 shardName := filepath.Join(o.IndexDir,
354 fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, version, n))
355 return shardName
356}
357
358type IndexState string
359
360const (
361 IndexStateMissing IndexState = "missing"
362 IndexStateCorrupt IndexState = "corrupt"
363 IndexStateVersion IndexState = "version-mismatch"
364 IndexStateOption IndexState = "option-mismatch"
365 IndexStateMeta IndexState = "meta-mismatch"
366 IndexStateContent IndexState = "content-mismatch"
367 IndexStateEqual IndexState = "equal"
368)
369
370var readVersions = []struct {
371 IndexFormatVersion int
372 FeatureVersion int
373}{{
374 IndexFormatVersion: zoekt.IndexFormatVersion,
375 FeatureVersion: zoekt.FeatureVersion,
376}, {
377 IndexFormatVersion: zoekt.NextIndexFormatVersion,
378 FeatureVersion: zoekt.FeatureVersion,
379}}
380
381// IncrementalSkipIndexing returns true if the index present on disk matches
382// the build options.
383func (o *Options) IncrementalSkipIndexing() bool {
384 state, _ := o.IndexState()
385 return state == IndexStateEqual
386}
387
388// IndexState checks how the index present on disk compares to the build
389// options and returns the IndexState and the name of the first shard.
390func (o *Options) IndexState() (IndexState, string) {
391 // Open the latest version we support that is on disk.
392 fn := o.findShard()
393 if fn == "" {
394 return IndexStateMissing, fn
395 }
396
397 repos, index, err := zoekt.ReadMetadataPathAlive(fn)
398 if os.IsNotExist(err) {
399 return IndexStateMissing, fn
400 } else if err != nil {
401 return IndexStateCorrupt, fn
402 }
403
404 for _, v := range readVersions {
405 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
406 return IndexStateVersion, fn
407 }
408 }
409
410 var repo *zoekt.Repository
411 for _, cand := range repos {
412 if cand.Name == o.RepositoryDescription.Name {
413 repo = cand
414 break
415 }
416 }
417
418 if repo == nil {
419 return IndexStateCorrupt, fn
420 }
421
422 if repo.IndexOptions != o.GetHash() {
423 return IndexStateOption, fn
424 }
425
426 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
427 return IndexStateContent, fn
428 }
429
430 // We can mutate repo since it lives in the scope of this function call.
431 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
432 // non-nil err means we are trying to update an immutable field =>
433 // reindex content.
434 log.Printf("warn: immutable field changed, requires re-index: %s", err)
435 return IndexStateContent, fn
436 } else if updated {
437 return IndexStateMeta, fn
438 }
439
440 return IndexStateEqual, fn
441}
442
443// FindRepositoryMetadata returns the index metadata for the repository
444// specified in the options. 'ok' is false if the repository's metadata
445// couldn't be found or if an error occurred.
446func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) {
447 shard := o.findShard()
448 if shard == "" {
449 return nil, nil, false, nil
450 }
451
452 repositories, metadata, err := zoekt.ReadMetadataPathAlive(shard)
453 if err != nil {
454 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
455 }
456
457 ID := o.RepositoryDescription.ID
458 for _, r := range repositories {
459 // compound shards contain multiple repositories, so we
460 // have to pick only the one we're looking for
461 if r.ID == ID {
462 return r, metadata, true, nil
463 }
464 }
465
466 // If we're here, then we're somehow in a state where we found a matching
467 // shard that's missing the repository metadata we're looking for. This
468 // should never happen.
469 name := o.RepositoryDescription.Name
470 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
471}
472
473func (o *Options) findShard() string {
474 for _, v := range readVersions {
475 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
476 if _, err := os.Stat(fn); err == nil {
477 return fn
478 }
479 }
480
481 // Brute force finding the shard in compound shards. We should only hit this
482 // code path for repositories that are not already existing or are in
483 // compound shards.
484 //
485 // TODO add an oracle which can speed this up in the case of repositories
486 // already in compound shards.
487 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
488 if err != nil {
489 return ""
490 }
491 for _, fn := range compoundShards {
492 repos, _, err := zoekt.ReadMetadataPathAlive(fn)
493 if err != nil {
494 continue
495 }
496 for _, repo := range repos {
497 if repo.ID == o.RepositoryDescription.ID {
498 return fn
499 }
500 }
501 }
502
503 return ""
504}
505
506func (o *Options) FindAllShards() []string {
507 for _, v := range readVersions {
508 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
509 if _, err := os.Stat(fn); err == nil {
510 shards := []string{fn}
511 for i := 1; ; i++ {
512 fn := o.shardNameVersion(v.IndexFormatVersion, i)
513 if _, err := os.Stat(fn); err != nil {
514 return shards
515 }
516 shards = append(shards, fn)
517 }
518 }
519 }
520
521 // lazily fallback to findShard which will look for a compound shard.
522 if fn := o.findShard(); fn != "" {
523 return []string{fn}
524 }
525
526 return nil
527}
528
529// IgnoreSizeMax determines whether the max size should be ignored.
530func (o *Options) IgnoreSizeMax(name string) bool {
531 // A pattern match will override preceding pattern matches.
532 for i := len(o.LargeFiles) - 1; i >= 0; i-- {
533 pattern := strings.TrimSpace(o.LargeFiles[i])
534 negated, validatedPattern := checkIsNegatePattern(pattern)
535
536 if m, _ := doublestar.PathMatch(validatedPattern, name); m {
537 if negated {
538 return false
539 } else {
540 return true
541 }
542 }
543 }
544
545 return false
546}
547
548func checkIsNegatePattern(pattern string) (bool, string) {
549 negate := "!"
550
551 // if negated then strip prefix meta character which identifies negated filter pattern
552 if strings.HasPrefix(pattern, negate) {
553 return true, pattern[len(negate):]
554 }
555
556 return false, pattern
557}
558
559// NewBuilder creates a new Builder instance.
560func NewBuilder(opts Options) (*Builder, error) {
561 opts.SetDefaults()
562 if opts.RepositoryDescription.Name == "" {
563 return nil, fmt.Errorf("builder: must set Name")
564 }
565
566 b := &Builder{
567 opts: opts,
568 throttle: make(chan int, opts.Parallelism),
569 finishedShards: map[string]string{},
570 }
571
572 parserBins, err := ctags.NewParserBinMap(
573 b.opts.CTagsPath,
574 b.opts.ScipCTagsPath,
575 opts.LanguageMap,
576 b.opts.CTagsMustSucceed,
577 )
578 if err != nil {
579 return nil, err
580 }
581
582 b.parserBins = parserBins
583
584 if opts.IsDelta {
585 // Delta shards build on top of previously existing shards.
586 // As a consequence, the shardNum for delta shards starts from
587 // the number following the most recently generated shard - not 0.
588 //
589 // Using this numbering scheme allows all the shards to be
590 // discovered as a set.
591 shards := b.opts.FindAllShards()
592 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
593 }
594
595 if _, err := b.newShardBuilder(); err != nil {
596 return nil, err
597 }
598
599 now := time.Now()
600 b.indexTime = now
601 b.id = xid.NewWithTime(now).String()
602
603 return b, nil
604}
605
606// AddFile is a convenience wrapper for the Add method
607func (b *Builder) AddFile(name string, content []byte) error {
608 return b.Add(zoekt.Document{Name: name, Content: content})
609}
610
611func (b *Builder) Add(doc zoekt.Document) error {
612 if b.finishCalled {
613 return nil
614 }
615
616 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
617 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
618 // We could pass the document on to the shardbuilder, but if
619 // we pass through a part of the source tree with binary/large
620 // files, the corresponding shard would be mostly empty, so
621 // insert a reason here too.
622 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
623 } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
624 doc.SkipReason = err.Error()
625 doc.Language = "binary"
626 }
627
628 b.todo = append(b.todo, &doc)
629
630 if doc.SkipReason == "" {
631 b.size += len(doc.Name) + len(doc.Content)
632 } else {
633 b.size += len(doc.Name) + len(doc.SkipReason)
634 // Drop the content if we are skipping the document. Skipped content is not counted towards the
635 // shard size limit, so otherwise we might buffer too much data in memory before flushing.
636 doc.Content = nil
637 }
638
639 if b.size > b.opts.ShardMax {
640 return b.flush()
641 }
642
643 return nil
644}
645
646// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
647// has been changed or removed since the last indexing job for this repository.
648//
649// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
650func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
651 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
652}
653
654// Finish creates a last shard from the buffered documents, and clears
655// stale shards from previous runs. This should always be called, also
656// in failure cases, to ensure cleanup.
657//
658// It is safe to call Finish() multiple times.
659func (b *Builder) Finish() error {
660 if b.finishCalled {
661 return b.buildError
662 }
663
664 b.finishCalled = true
665
666 b.flush()
667 b.building.Wait()
668
669 if b.buildError != nil {
670 for tmp := range b.finishedShards {
671 log.Printf("Builder.Finish %s", tmp)
672 os.Remove(tmp)
673 }
674 b.finishedShards = map[string]string{}
675 return b.buildError
676 }
677
678 // map of temporary -> final names for all updated shards + shard metadata files
679 artifactPaths := make(map[string]string)
680 for tmp, final := range b.finishedShards {
681 artifactPaths[tmp] = final
682 }
683
684 oldShards := b.opts.FindAllShards()
685
686 if b.opts.IsDelta {
687 // Delta shard builds need to update FileTombstone and branch commit information for all
688 // existing shards
689 for _, shard := range oldShards {
690 repositories, _, err := zoekt.ReadMetadataPathAlive(shard)
691 if err != nil {
692 return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
693 }
694
695 if len(repositories) > 1 {
696 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
697 }
698
699 if len(repositories) == 0 {
700 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
701 }
702
703 repository := repositories[0]
704 if repository.ID != b.opts.RepositoryDescription.ID {
705 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
706 }
707
708 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
709 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
710 }
711
712 for _, f := range b.opts.changedOrRemovedFiles {
713 repository.FileTombstones[f] = struct{}{}
714 }
715
716 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
717 return deltaBranchSetError{
718 shardName: shard,
719 old: repository.Branches,
720 new: b.opts.RepositoryDescription.Branches,
721 }
722 }
723
724 if b.opts.GetHash() != repository.IndexOptions {
725 return &deltaIndexOptionsMismatchError{
726 shardName: shard,
727 newOptions: b.opts.HashOptions(),
728 }
729 }
730
731 repository.Branches = b.opts.RepositoryDescription.Branches
732
733 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
734
735 tempPath, finalPath, err := zoekt.JsonMarshalRepoMetaTemp(shard, repository)
736 if err != nil {
737 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
738 }
739
740 artifactPaths[tempPath] = finalPath
741 }
742 }
743
744 // We mark finished shards as empty when we successfully finish. Return now
745 // to allow call sites to call Finish idempotently.
746 if len(artifactPaths) == 0 {
747 return b.buildError
748 }
749
750 // Collect a map of the old shards on disk. For each new shard we replace we
751 // delete it from toDelete. Anything remaining in toDelete will be removed
752 // after we have renamed everything into place.
753
754 var toDelete map[string]struct{}
755 if !b.opts.IsDelta {
756 // Non-delta shard builds delete all existing shards before they write out
757 // new ones.
758 // By contrast, delta shard builds work by stacking changes on top of existing shards.
759 // So, we skip populating the toDelete map if we're building delta shards.
760
761 toDelete = make(map[string]struct{})
762 for _, name := range oldShards {
763 paths, err := zoekt.IndexFilePaths(name)
764 if err != nil {
765 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
766 }
767 for _, p := range paths {
768 toDelete[p] = struct{}{}
769 }
770 }
771 }
772
773 for tmp, final := range artifactPaths {
774 if err := os.Rename(tmp, final); err != nil {
775 b.buildError = err
776 continue
777 }
778
779 delete(toDelete, final)
780 }
781
782 b.finishedShards = map[string]string{}
783
784 for p := range toDelete {
785 // Don't delete compound shards, set tombstones instead.
786 if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") {
787 if !strings.HasSuffix(p, ".zoekt") {
788 continue
789 }
790 err := zoekt.SetTombstone(p, b.opts.RepositoryDescription.ID)
791 b.buildError = err
792 continue
793 }
794 log.Printf("removing old shard file: %s", p)
795 if err := os.Remove(p); err != nil {
796 b.buildError = err
797 }
798 }
799
800 return b.buildError
801}
802
803// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
804// iff both slices specify the same set of branch names in the same order.
805func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
806 if len(a) != len(b) {
807 return false
808 }
809
810 for i := range a {
811 x, y := a[i], b[i]
812 if x.Name != y.Name {
813 return false
814 }
815 }
816
817 return true
818}
819
820func (b *Builder) flush() error {
821 todo := b.todo
822 b.todo = nil
823 b.size = 0
824 b.errMu.Lock()
825 defer b.errMu.Unlock()
826 if b.buildError != nil {
827 return b.buildError
828 }
829
830 hasShard := b.nextShardNum > 0
831 if len(todo) == 0 && hasShard {
832 return nil
833 }
834
835 shard := b.nextShardNum
836 b.nextShardNum++
837
838 if b.opts.Parallelism > 1 {
839 b.building.Add(1)
840 b.throttle <- 1
841 go func() {
842 done, err := b.buildShard(todo, shard)
843 <-b.throttle
844
845 b.errMu.Lock()
846 defer b.errMu.Unlock()
847 if err != nil && b.buildError == nil {
848 b.buildError = err
849 }
850 if err == nil {
851 b.finishedShards[done.temp] = done.final
852 }
853 b.building.Done()
854 }()
855 } else {
856 // No goroutines when we're not parallel. This
857 // simplifies memory profiling.
858 done, err := b.buildShard(todo, shard)
859 b.buildError = err
860 if err == nil {
861 b.finishedShards[done.temp] = done.final
862 }
863
864 return b.buildError
865 }
866
867 return nil
868}
869
870// map [0,inf) to [0,1) monotonically
871func squashRange(j int) float64 {
872 x := float64(j)
873 return x / (1 + x)
874}
875
876// IsLowPriority takes a file name and makes an educated guess about its priority
877// in search results. A file is considered low priority if it looks like a test,
878// vendored, or generated file.
879//
880// These 'priority' criteria affects how documents are ordered within a shard. It's
881// also used to help guess a file's rank when we're missing ranking information.
882func IsLowPriority(path string, content []byte) bool {
883 return enry.IsTest(path) || enry.IsVendor(path) || enry.IsGenerated(path, content)
884}
885
886type rankedDoc struct {
887 *zoekt.Document
888 rank []float64
889}
890
891// rank returns a vector of scores which is used at index-time to sort documents
892// before writing them to disk. The order of documents in the shard is important
893// at query time, because earlier documents receive a boost at query time and
894// have a higher chance of being searched before limits kick in.
895func rank(d *zoekt.Document, origIdx int) []float64 {
896 skipped := 0.0
897 if d.SkipReason != "" {
898 skipped = 1.0
899 }
900
901 generated := 0.0
902 if enry.IsGenerated(d.Name, d.Content) {
903 generated = 1.0
904 }
905
906 vendor := 0.0
907 if enry.IsVendor(d.Name) {
908 vendor = 1.0
909 }
910
911 test := 0.0
912 if enry.IsTest(d.Name) {
913 test = 1.0
914 }
915
916 // Smaller is earlier (=better).
917 return []float64{
918 // Always place skipped docs last
919 skipped,
920
921 // Prefer docs that are not generated
922 generated,
923
924 // Prefer docs that are not vendored
925 vendor,
926
927 // Prefer docs that are not tests
928 test,
929
930 // With short names
931 squashRange(len(d.Name)),
932
933 // With many symbols
934 1.0 - squashRange(len(d.Symbols)),
935
936 // With short content
937 squashRange(len(d.Content)),
938
939 // That is present is as many branches as possible
940 1.0 - squashRange(len(d.Branches)),
941
942 // Preserve original ordering.
943 squashRange(origIdx),
944 }
945}
946
947func sortDocuments(todo []*zoekt.Document) {
948 rs := make([]rankedDoc, 0, len(todo))
949 for i, t := range todo {
950 rd := rankedDoc{t, rank(t, i)}
951 rs = append(rs, rd)
952 }
953 sort.Slice(rs, func(i, j int) bool {
954 r1 := rs[i].rank
955 r2 := rs[j].rank
956 for i := range r1 {
957 if r1[i] < r2[i] {
958 return true
959 }
960 if r1[i] > r2[i] {
961 return false
962 }
963 }
964
965 return false
966 })
967 for i := range todo {
968 todo[i] = rs[i].Document
969 }
970}
971
972func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) {
973 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
974 err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins)
975 if b.opts.CTagsMustSucceed && err != nil {
976 return nil, err
977 }
978 if err != nil {
979 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err)
980 }
981 }
982
983 name := b.opts.shardName(nextShardNum)
984
985 shardBuilder, err := b.newShardBuilder()
986 if err != nil {
987 return nil, err
988 }
989
990 sortDocuments(todo)
991
992 for idx, t := range todo {
993 if err := shardBuilder.Add(*t); err != nil {
994 return nil, err
995 }
996
997 if idx%10_000 == 0 {
998 b.CheckMemoryUsage()
999 }
1000 }
1001
1002 return b.writeShard(name, shardBuilder)
1003}
1004
1005// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the
1006// configured threshold. NOTE: this method is expensive and should only be used for debugging.
1007func (b *Builder) CheckMemoryUsage() {
1008 // Don't check memory if heap profiling is disabled, or we've already written 10 profiles
1009 if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 {
1010 return
1011 }
1012
1013 var m runtime.MemStats
1014 runtime.ReadMemStats(&m)
1015
1016 if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() {
1017 defer b.heapProfileMu.Unlock()
1018
1019 log.Printf("writing memory profile, allocated heap: %s", humanize.Bytes(m.HeapAlloc))
1020 name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum))
1021 f, err := os.Create(name)
1022 if err != nil {
1023 log.Printf("failed to create memory profile file: %v", err)
1024 return
1025 }
1026
1027 err = pprof.WriteHeapProfile(f)
1028 if err != nil {
1029 log.Printf("failed to write memory profile: %v", err)
1030 }
1031
1032 b.heapProfileNum++
1033 }
1034}
1035
1036func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) {
1037 desc := b.opts.RepositoryDescription
1038 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1039 desc.SubRepoMap = b.opts.SubRepositories
1040 desc.IndexOptions = b.opts.GetHash()
1041
1042 shardBuilder, err := zoekt.NewIndexBuilder(&desc)
1043 if err != nil {
1044 return nil, err
1045 }
1046 shardBuilder.IndexTime = b.indexTime
1047 shardBuilder.ID = b.id
1048 return shardBuilder, nil
1049}
1050
1051func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) {
1052 dir := filepath.Dir(fn)
1053 if err := os.MkdirAll(dir, 0o700); err != nil {
1054 return nil, err
1055 }
1056
1057 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1058 if err != nil {
1059 return nil, err
1060 }
1061 if runtime.GOOS != "windows" {
1062 if err := f.Chmod(0o666 &^ umask); err != nil {
1063 return nil, err
1064 }
1065 }
1066
1067 defer f.Close()
1068 if err := ib.Write(f); err != nil {
1069 return nil, err
1070 }
1071 fi, err := f.Stat()
1072 if err != nil {
1073 return nil, err
1074 }
1075 if err := f.Close(); err != nil {
1076 return nil, err
1077 }
1078
1079 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n",
1080 fn,
1081 fi.Size(),
1082 float64(fi.Size())/float64(ib.ContentSize()+1),
1083 ib.NumFiles())
1084
1085 return &finishedShard{f.Name(), fn}, nil
1086}
1087
1088type deltaBranchSetError struct {
1089 shardName string
1090 old, new []zoekt.RepositoryBranch
1091}
1092
1093func (e deltaBranchSetError) Error() string {
1094 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1095}
1096
1097type deltaIndexOptionsMismatchError struct {
1098 shardName string
1099 newOptions HashOptions
1100}
1101
1102func (e *deltaIndexOptionsMismatchError) Error() string {
1103 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1104}
1105
1106// umask holds the Umask of the current process
1107var umask os.FileMode