fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// package build implements a more convenient interface for building
16// zoekt indices.
17package build
18
19import (
20 "crypto/sha1"
21 "flag"
22 "fmt"
23 "io"
24 "log"
25 "net/url"
26 "os"
27 "os/exec"
28 "path"
29 "path/filepath"
30 "reflect"
31 "runtime"
32 "runtime/pprof"
33 "sort"
34 "strconv"
35 "strings"
36 "sync"
37 "time"
38
39 "github.com/bmatcuk/doublestar"
40 "github.com/dustin/go-humanize"
41 "github.com/go-enry/go-enry/v2"
42 "github.com/rs/xid"
43
44 "github.com/sourcegraph/zoekt"
45 "github.com/sourcegraph/zoekt/ctags"
46)
47
48var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
49
50// Branch describes a single branch version.
51type Branch struct {
52 Name string
53 Version string
54}
55
56// Options sets options for the index building.
57type Options struct {
58 // IndexDir is a directory that holds *.zoekt index files.
59 IndexDir string
60
61 // SizeMax is the maximum file size
62 SizeMax int
63
64 // Parallelism is the maximum number of shards to index in parallel
65 Parallelism int
66
67 // ShardMax sets the maximum corpus size for a single shard
68 ShardMax int
69
70 // TrigramMax sets the maximum number of distinct trigrams per document.
71 TrigramMax int
72
73 // RepositoryDescription holds names and URLs for the repository.
74 RepositoryDescription zoekt.Repository
75
76 // SubRepositories is a path => sub repository map.
77 SubRepositories map[string]*zoekt.Repository
78
79 // DisableCTags disables the generation of ctags metadata.
80 DisableCTags bool
81
82 // CtagsPath is the path to the ctags binary to run, or empty
83 // if a valid binary couldn't be found.
84 CTagsPath string
85
86 // Same as CTagsPath but for scip-ctags
87 ScipCTagsPath string
88
89 // If set, ctags must succeed.
90 CTagsMustSucceed bool
91
92 // LargeFiles is a slice of glob patterns, including ** for any number
93 // of directories, where matching file paths should be indexed
94 // regardless of their size. The full pattern syntax is here:
95 // https://github.com/bmatcuk/doublestar/tree/v1#patterns.
96 LargeFiles []string
97
98 // IsDelta is true if this run contains only the changed documents since the
99 // last run.
100 IsDelta bool
101
102 // changedOrRemovedFiles is a list of file paths that have been changed or removed
103 // since the last indexing job for this repository. These files will be tombstoned
104 // in the older shards for this repository.
105 changedOrRemovedFiles []string
106
107 LanguageMap ctags.LanguageMap
108
109 // ShardMerging is true if builder should respect compound shards. This is a
110 // Sourcegraph specific option.
111 ShardMerging bool
112
113 // HeapProfileTriggerBytes is the heap allocation in bytes that will trigger a memory profile. If 0, no memory profile
114 // will be triggered. Note this trigger looks at total heap allocation (which includes both inuse and garbage objects).
115 //
116 // Profiles will be written to files named `index-memory.prof.n` in the index directory. No more than 10 files are written.
117 //
118 // Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile.
119 HeapProfileTriggerBytes uint64
120}
121
122// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building.
123type HashOptions struct {
124 sizeMax int
125 disableCTags bool
126 ctagsPath string
127 cTagsMustSucceed bool
128 largeFiles []string
129}
130
131func (o *Options) HashOptions() HashOptions {
132 return HashOptions{
133 sizeMax: o.SizeMax,
134 disableCTags: o.DisableCTags,
135 ctagsPath: o.CTagsPath,
136 cTagsMustSucceed: o.CTagsMustSucceed,
137 largeFiles: o.LargeFiles,
138 }
139}
140
141func (o *Options) GetHash() string {
142 h := o.HashOptions()
143 hasher := sha1.New()
144
145 hasher.Write([]byte(h.ctagsPath))
146 hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed)))
147 hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax)))
148 hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles)))
149 hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags)))
150
151 return fmt.Sprintf("%x", hasher.Sum(nil))
152}
153
154type largeFilesFlag struct{ *Options }
155
156func (f largeFilesFlag) String() string {
157 // From flag.Value documentation:
158 //
159 // The flag package may call the String method with a zero-valued receiver,
160 // such as a nil pointer.
161 if f.Options == nil {
162 return ""
163 }
164 s := append([]string{""}, f.LargeFiles...)
165 return strings.Join(s, "-large_file ")
166}
167
168func (f largeFilesFlag) Set(value string) error {
169 f.LargeFiles = append(f.LargeFiles, value)
170 return nil
171}
172
173// Flags adds flags for build options to fs. It is the "inverse" of Args.
174func (o *Options) Flags(fs *flag.FlagSet) {
175 x := *o
176 x.SetDefaults()
177 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
178 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
179 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
180 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
181 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
182 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
183 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
184
185 // Sourcegraph specific
186 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
187 fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.")
188}
189
190// Args generates command line arguments for o. It is the "inverse" of Flags.
191func (o *Options) Args() []string {
192 var args []string
193
194 if o.SizeMax != 0 {
195 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
196 }
197
198 if o.TrigramMax != 0 {
199 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
200 }
201
202 if o.ShardMax != 0 {
203 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
204 }
205
206 if o.Parallelism != 0 {
207 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
208 }
209
210 if o.IndexDir != "" {
211 args = append(args, "-index", o.IndexDir)
212 }
213
214 if o.CTagsMustSucceed {
215 args = append(args, "-require_ctags")
216 }
217
218 for _, a := range o.LargeFiles {
219 args = append(args, "-large_file", a)
220 }
221
222 // Sourcegraph specific
223 if o.DisableCTags {
224 args = append(args, "-disable_ctags")
225 }
226
227 if o.ShardMerging {
228 args = append(args, "-shard_merging")
229 }
230
231 return args
232}
233
234// Builder manages (parallel) creation of uniformly sized shards. The
235// builder buffers up documents until it collects enough documents and
236// then builds a shard and writes.
237type Builder struct {
238 opts Options
239 throttle chan int
240
241 nextShardNum int
242 todo []*zoekt.Document
243 docChecker zoekt.DocChecker
244 size int
245
246 parserBins ctags.ParserBinMap
247 building sync.WaitGroup
248
249 errMu sync.Mutex
250 buildError error
251
252 // temp name => final name for finished shards. We only rename
253 // them once all shards succeed to avoid Frankstein corpuses.
254 finishedShards map[string]string
255
256 // indexTime is set by tests for doing reproducible builds.
257 indexTime time.Time
258
259 // heapProfileMu is used to ensure that only one memory profile is written at a time
260 heapProfileMu sync.Mutex
261 heapProfileNum int
262
263 // a sortable 20 chars long id.
264 id string
265
266 finishCalled bool
267}
268
269type finishedShard struct {
270 temp, final string
271}
272
273func checkCTags() string {
274 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" {
275 return ctags
276 }
277
278 if ctags, err := exec.LookPath("universal-ctags"); err == nil {
279 return ctags
280 }
281
282 return ""
283}
284
285func checkScipCTags() string {
286 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" {
287 return ctags
288 }
289
290 if ctags, err := exec.LookPath("scip-ctags"); err == nil {
291 return ctags
292 }
293
294 return ""
295}
296
297// SetDefaults sets reasonable default options.
298func (o *Options) SetDefaults() {
299 if o.CTagsPath == "" && !o.DisableCTags {
300 o.CTagsPath = checkCTags()
301 }
302
303 if o.ScipCTagsPath == "" && !o.DisableCTags {
304 o.ScipCTagsPath = checkScipCTags()
305 }
306
307 if o.Parallelism == 0 {
308 o.Parallelism = 4
309 }
310 if o.SizeMax == 0 {
311 o.SizeMax = 2 << 20
312 }
313 if o.ShardMax == 0 {
314 o.ShardMax = 100 << 20
315 }
316 if o.TrigramMax == 0 {
317 o.TrigramMax = 20000
318 }
319
320 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
321 parsed, _ := url.Parse(o.RepositoryDescription.URL)
322 if parsed != nil {
323 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
324 }
325 }
326}
327
328func hashString(s string) string {
329 h := sha1.New()
330 _, _ = io.WriteString(h, s)
331 return fmt.Sprintf("%x", h.Sum(nil))
332}
333
334// ShardName returns the name the given index shard.
335func (o *Options) shardName(n int) string {
336 return o.shardNameVersion(zoekt.IndexFormatVersion, n)
337}
338
339func (o *Options) shardNameVersion(version, n int) string {
340 abs := url.QueryEscape(o.RepositoryDescription.Name)
341 if len(abs) > 200 {
342 abs = abs[:200] + hashString(abs)[:8]
343 }
344 return filepath.Join(o.IndexDir,
345 fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n))
346}
347
348type IndexState string
349
350const (
351 IndexStateMissing IndexState = "missing"
352 IndexStateCorrupt IndexState = "corrupt"
353 IndexStateVersion IndexState = "version-mismatch"
354 IndexStateOption IndexState = "option-mismatch"
355 IndexStateMeta IndexState = "meta-mismatch"
356 IndexStateContent IndexState = "content-mismatch"
357 IndexStateEqual IndexState = "equal"
358)
359
360var readVersions = []struct {
361 IndexFormatVersion int
362 FeatureVersion int
363}{{
364 IndexFormatVersion: zoekt.IndexFormatVersion,
365 FeatureVersion: zoekt.FeatureVersion,
366}, {
367 IndexFormatVersion: zoekt.NextIndexFormatVersion,
368 FeatureVersion: zoekt.FeatureVersion,
369}}
370
371// IncrementalSkipIndexing returns true if the index present on disk matches
372// the build options.
373func (o *Options) IncrementalSkipIndexing() bool {
374 state, _ := o.IndexState()
375 return state == IndexStateEqual
376}
377
378// IndexState checks how the index present on disk compares to the build
379// options and returns the IndexState and the name of the first shard.
380func (o *Options) IndexState() (IndexState, string) {
381 // Open the latest version we support that is on disk.
382 fn := o.findShard()
383 if fn == "" {
384 return IndexStateMissing, fn
385 }
386
387 repos, index, err := zoekt.ReadMetadataPathAlive(fn)
388 if os.IsNotExist(err) {
389 return IndexStateMissing, fn
390 } else if err != nil {
391 return IndexStateCorrupt, fn
392 }
393
394 for _, v := range readVersions {
395 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
396 return IndexStateVersion, fn
397 }
398 }
399
400 var repo *zoekt.Repository
401 for _, cand := range repos {
402 if cand.Name == o.RepositoryDescription.Name {
403 repo = cand
404 break
405 }
406 }
407
408 if repo == nil {
409 return IndexStateCorrupt, fn
410 }
411
412 if repo.IndexOptions != o.GetHash() {
413 return IndexStateOption, fn
414 }
415
416 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
417 return IndexStateContent, fn
418 }
419
420 // We can mutate repo since it lives in the scope of this function call.
421 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
422 // non-nil err means we are trying to update an immutable field =>
423 // reindex content.
424 log.Printf("warn: immutable field changed, requires re-index: %s", err)
425 return IndexStateContent, fn
426 } else if updated {
427 return IndexStateMeta, fn
428 }
429
430 return IndexStateEqual, fn
431}
432
433// FindRepositoryMetadata returns the index metadata for the repository
434// specified in the options. 'ok' is false if the repository's metadata
435// couldn't be found or if an error occurred.
436func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) {
437 shard := o.findShard()
438 if shard == "" {
439 return nil, nil, false, nil
440 }
441
442 repositories, metadata, err := zoekt.ReadMetadataPathAlive(shard)
443 if err != nil {
444 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
445 }
446
447 ID := o.RepositoryDescription.ID
448 for _, r := range repositories {
449 // compound shards contain multiple repositories, so we
450 // have to pick only the one we're looking for
451 if r.ID == ID {
452 return r, metadata, true, nil
453 }
454 }
455
456 // If we're here, then we're somehow in a state where we found a matching
457 // shard that's missing the repository metadata we're looking for. This
458 // should never happen.
459 name := o.RepositoryDescription.Name
460 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
461}
462
463func (o *Options) findShard() string {
464 for _, v := range readVersions {
465 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
466 if _, err := os.Stat(fn); err == nil {
467 return fn
468 }
469 }
470
471 // Brute force finding the shard in compound shards. We should only hit this
472 // code path for repositories that are not already existing or are in
473 // compound shards.
474 //
475 // TODO add an oracle which can speed this up in the case of repositories
476 // already in compound shards.
477 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
478 if err != nil {
479 return ""
480 }
481 for _, fn := range compoundShards {
482 repos, _, err := zoekt.ReadMetadataPathAlive(fn)
483 if err != nil {
484 continue
485 }
486 for _, repo := range repos {
487 if repo.ID == o.RepositoryDescription.ID {
488 return fn
489 }
490 }
491 }
492
493 return ""
494}
495
496func (o *Options) FindAllShards() []string {
497 for _, v := range readVersions {
498 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
499 if _, err := os.Stat(fn); err == nil {
500 shards := []string{fn}
501 for i := 1; ; i++ {
502 fn := o.shardNameVersion(v.IndexFormatVersion, i)
503 if _, err := os.Stat(fn); err != nil {
504 return shards
505 }
506 shards = append(shards, fn)
507 }
508 }
509 }
510
511 // lazily fallback to findShard which will look for a compound shard.
512 if fn := o.findShard(); fn != "" {
513 return []string{fn}
514 }
515
516 return nil
517}
518
519// IgnoreSizeMax determines whether the max size should be ignored.
520func (o *Options) IgnoreSizeMax(name string) bool {
521 // A pattern match will override preceding pattern matches.
522 for i := len(o.LargeFiles) - 1; i >= 0; i-- {
523 pattern := strings.TrimSpace(o.LargeFiles[i])
524 negated, validatedPattern := checkIsNegatePattern(pattern)
525
526 if m, _ := doublestar.PathMatch(validatedPattern, name); m {
527 if negated {
528 return false
529 } else {
530 return true
531 }
532 }
533 }
534
535 return false
536}
537
538func checkIsNegatePattern(pattern string) (bool, string) {
539 negate := "!"
540
541 // if negated then strip prefix meta character which identifies negated filter pattern
542 if strings.HasPrefix(pattern, negate) {
543 return true, pattern[len(negate):]
544 }
545
546 return false, pattern
547}
548
549// NewBuilder creates a new Builder instance.
550func NewBuilder(opts Options) (*Builder, error) {
551 opts.SetDefaults()
552 if opts.RepositoryDescription.Name == "" {
553 return nil, fmt.Errorf("builder: must set Name")
554 }
555
556 b := &Builder{
557 opts: opts,
558 throttle: make(chan int, opts.Parallelism),
559 finishedShards: map[string]string{},
560 }
561
562 parserBins, err := ctags.NewParserBinMap(
563 b.opts.CTagsPath,
564 b.opts.ScipCTagsPath,
565 opts.LanguageMap,
566 b.opts.CTagsMustSucceed,
567 )
568 if err != nil {
569 return nil, err
570 }
571
572 b.parserBins = parserBins
573
574 if opts.IsDelta {
575 // Delta shards build on top of previously existing shards.
576 // As a consequence, the shardNum for delta shards starts from
577 // the number following the most recently generated shard - not 0.
578 //
579 // Using this numbering scheme allows all the shards to be
580 // discovered as a set.
581 shards := b.opts.FindAllShards()
582 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
583 }
584
585 if _, err := b.newShardBuilder(); err != nil {
586 return nil, err
587 }
588
589 now := time.Now()
590 b.indexTime = now
591 b.id = xid.NewWithTime(now).String()
592
593 return b, nil
594}
595
596// AddFile is a convenience wrapper for the Add method
597func (b *Builder) AddFile(name string, content []byte) error {
598 return b.Add(zoekt.Document{Name: name, Content: content})
599}
600
601func (b *Builder) Add(doc zoekt.Document) error {
602 if b.finishCalled {
603 return nil
604 }
605
606 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
607 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
608 // We could pass the document on to the shardbuilder, but if
609 // we pass through a part of the source tree with binary/large
610 // files, the corresponding shard would be mostly empty, so
611 // insert a reason here too.
612 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
613 } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
614 doc.SkipReason = err.Error()
615 doc.Language = "binary"
616 }
617
618 b.todo = append(b.todo, &doc)
619
620 if doc.SkipReason == "" {
621 b.size += len(doc.Name) + len(doc.Content)
622 } else {
623 b.size += len(doc.Name) + len(doc.SkipReason)
624 // Drop the content if we are skipping the document. Skipped content is not counted towards the
625 // shard size limit, so otherwise we might buffer too much data in memory before flushing.
626 doc.Content = nil
627 }
628
629 if b.size > b.opts.ShardMax {
630 return b.flush()
631 }
632
633 return nil
634}
635
636// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
637// has been changed or removed since the last indexing job for this repository.
638//
639// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
640func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
641 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
642}
643
644// Finish creates a last shard from the buffered documents, and clears
645// stale shards from previous runs. This should always be called, also
646// in failure cases, to ensure cleanup.
647//
648// It is safe to call Finish() multiple times.
649func (b *Builder) Finish() error {
650 if b.finishCalled {
651 return b.buildError
652 }
653
654 b.finishCalled = true
655
656 b.flush()
657 b.building.Wait()
658
659 if b.buildError != nil {
660 for tmp := range b.finishedShards {
661 log.Printf("Builder.Finish %s", tmp)
662 os.Remove(tmp)
663 }
664 b.finishedShards = map[string]string{}
665 return b.buildError
666 }
667
668 // map of temporary -> final names for all updated shards + shard metadata files
669 artifactPaths := make(map[string]string)
670 for tmp, final := range b.finishedShards {
671 artifactPaths[tmp] = final
672 }
673
674 oldShards := b.opts.FindAllShards()
675
676 if b.opts.IsDelta {
677 // Delta shard builds need to update FileTombstone and branch commit information for all
678 // existing shards
679 for _, shard := range oldShards {
680 repositories, _, err := zoekt.ReadMetadataPathAlive(shard)
681 if err != nil {
682 return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
683 }
684
685 if len(repositories) > 1 {
686 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
687 }
688
689 if len(repositories) == 0 {
690 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
691 }
692
693 repository := repositories[0]
694 if repository.ID != b.opts.RepositoryDescription.ID {
695 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
696 }
697
698 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
699 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
700 }
701
702 for _, f := range b.opts.changedOrRemovedFiles {
703 repository.FileTombstones[f] = struct{}{}
704 }
705
706 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
707 return deltaBranchSetError{
708 shardName: shard,
709 old: repository.Branches,
710 new: b.opts.RepositoryDescription.Branches,
711 }
712 }
713
714 if b.opts.GetHash() != repository.IndexOptions {
715 return &deltaIndexOptionsMismatchError{
716 shardName: shard,
717 newOptions: b.opts.HashOptions(),
718 }
719 }
720
721 repository.Branches = b.opts.RepositoryDescription.Branches
722
723 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
724
725 tempPath, finalPath, err := zoekt.JsonMarshalRepoMetaTemp(shard, repository)
726 if err != nil {
727 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
728 }
729
730 artifactPaths[tempPath] = finalPath
731 }
732 }
733
734 // We mark finished shards as empty when we successfully finish. Return now
735 // to allow call sites to call Finish idempotently.
736 if len(artifactPaths) == 0 {
737 return b.buildError
738 }
739
740 // Collect a map of the old shards on disk. For each new shard we replace we
741 // delete it from toDelete. Anything remaining in toDelete will be removed
742 // after we have renamed everything into place.
743
744 var toDelete map[string]struct{}
745 if !b.opts.IsDelta {
746 // Non-delta shard builds delete all existing shards before they write out
747 // new ones.
748 // By contrast, delta shard builds work by stacking changes on top of existing shards.
749 // So, we skip populating the toDelete map if we're building delta shards.
750
751 toDelete = make(map[string]struct{})
752 for _, name := range oldShards {
753 paths, err := zoekt.IndexFilePaths(name)
754 if err != nil {
755 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
756 }
757 for _, p := range paths {
758 toDelete[p] = struct{}{}
759 }
760 }
761 }
762
763 for tmp, final := range artifactPaths {
764 if err := os.Rename(tmp, final); err != nil {
765 b.buildError = err
766 continue
767 }
768
769 delete(toDelete, final)
770 }
771
772 b.finishedShards = map[string]string{}
773
774 for p := range toDelete {
775 // Don't delete compound shards, set tombstones instead.
776 if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") {
777 if !strings.HasSuffix(p, ".zoekt") {
778 continue
779 }
780 err := zoekt.SetTombstone(p, b.opts.RepositoryDescription.ID)
781 b.buildError = err
782 continue
783 }
784 log.Printf("removing old shard file: %s", p)
785 if err := os.Remove(p); err != nil {
786 b.buildError = err
787 }
788 }
789
790 return b.buildError
791}
792
793// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
794// iff both slices specify the same set of branch names in the same order.
795func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
796 if len(a) != len(b) {
797 return false
798 }
799
800 for i := range a {
801 x, y := a[i], b[i]
802 if x.Name != y.Name {
803 return false
804 }
805 }
806
807 return true
808}
809
810func (b *Builder) flush() error {
811 todo := b.todo
812 b.todo = nil
813 b.size = 0
814 b.errMu.Lock()
815 defer b.errMu.Unlock()
816 if b.buildError != nil {
817 return b.buildError
818 }
819
820 hasShard := b.nextShardNum > 0
821 if len(todo) == 0 && hasShard {
822 return nil
823 }
824
825 shard := b.nextShardNum
826 b.nextShardNum++
827
828 if b.opts.Parallelism > 1 {
829 b.building.Add(1)
830 b.throttle <- 1
831 go func() {
832 done, err := b.buildShard(todo, shard)
833 <-b.throttle
834
835 b.errMu.Lock()
836 defer b.errMu.Unlock()
837 if err != nil && b.buildError == nil {
838 b.buildError = err
839 }
840 if err == nil {
841 b.finishedShards[done.temp] = done.final
842 }
843 b.building.Done()
844 }()
845 } else {
846 // No goroutines when we're not parallel. This
847 // simplifies memory profiling.
848 done, err := b.buildShard(todo, shard)
849 b.buildError = err
850 if err == nil {
851 b.finishedShards[done.temp] = done.final
852 }
853
854 return b.buildError
855 }
856
857 return nil
858}
859
860// map [0,inf) to [0,1) monotonically
861func squashRange(j int) float64 {
862 x := float64(j)
863 return x / (1 + x)
864}
865
866// IsLowPriority takes a file name and makes an educated guess about its priority
867// in search results. A file is considered low priority if it looks like a test,
868// vendored, or generated file.
869//
870// These 'priority' criteria affects how documents are ordered within a shard. It's
871// also used to help guess a file's rank when we're missing ranking information.
872func IsLowPriority(path string, content []byte) bool {
873 return enry.IsTest(path) || enry.IsVendor(path) || enry.IsGenerated(path, content)
874}
875
876type rankedDoc struct {
877 *zoekt.Document
878 rank []float64
879}
880
881// rank returns a vector of scores which is used at index-time to sort documents
882// before writing them to disk. The order of documents in the shard is important
883// at query time, because earlier documents receive a boost at query time and
884// have a higher chance of being searched before limits kick in.
885func rank(d *zoekt.Document, origIdx int) []float64 {
886 skipped := 0.0
887 if d.SkipReason != "" {
888 skipped = 1.0
889 }
890
891 generated := 0.0
892 if enry.IsGenerated(d.Name, d.Content) {
893 generated = 1.0
894 }
895
896 vendor := 0.0
897 if enry.IsVendor(d.Name) {
898 vendor = 1.0
899 }
900
901 test := 0.0
902 if enry.IsTest(d.Name) {
903 test = 1.0
904 }
905
906 // Smaller is earlier (=better).
907 return []float64{
908 // Always place skipped docs last
909 skipped,
910
911 // Prefer docs that are not generated
912 generated,
913
914 // Prefer docs that are not vendored
915 vendor,
916
917 // Prefer docs that are not tests
918 test,
919
920 // With short names
921 squashRange(len(d.Name)),
922
923 // With many symbols
924 1.0 - squashRange(len(d.Symbols)),
925
926 // With short content
927 squashRange(len(d.Content)),
928
929 // That is present is as many branches as possible
930 1.0 - squashRange(len(d.Branches)),
931
932 // Preserve original ordering.
933 squashRange(origIdx),
934 }
935}
936
937func sortDocuments(todo []*zoekt.Document) {
938 rs := make([]rankedDoc, 0, len(todo))
939 for i, t := range todo {
940 rd := rankedDoc{t, rank(t, i)}
941 rs = append(rs, rd)
942 }
943 sort.Slice(rs, func(i, j int) bool {
944 r1 := rs[i].rank
945 r2 := rs[j].rank
946 for i := range r1 {
947 if r1[i] < r2[i] {
948 return true
949 }
950 if r1[i] > r2[i] {
951 return false
952 }
953 }
954
955 return false
956 })
957 for i := range todo {
958 todo[i] = rs[i].Document
959 }
960}
961
962func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) {
963 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
964 err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins)
965 if b.opts.CTagsMustSucceed && err != nil {
966 return nil, err
967 }
968 if err != nil {
969 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err)
970 }
971 }
972
973 name := b.opts.shardName(nextShardNum)
974
975 shardBuilder, err := b.newShardBuilder()
976 if err != nil {
977 return nil, err
978 }
979
980 sortDocuments(todo)
981
982 for idx, t := range todo {
983 if err := shardBuilder.Add(*t); err != nil {
984 return nil, err
985 }
986
987 if idx%10_000 == 0 {
988 b.CheckMemoryUsage()
989 }
990 }
991
992 return b.writeShard(name, shardBuilder)
993}
994
995// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the
996// configured threshold. NOTE: this method is expensive and should only be used for debugging.
997func (b *Builder) CheckMemoryUsage() {
998 // Don't check memory if heap profiling is disabled, or we've already written 10 profiles
999 if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 {
1000 return
1001 }
1002
1003 var m runtime.MemStats
1004 runtime.ReadMemStats(&m)
1005
1006 if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() {
1007 defer b.heapProfileMu.Unlock()
1008
1009 log.Printf("writing memory profile, allocated heap: %s", humanize.Bytes(m.HeapAlloc))
1010 name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum))
1011 f, err := os.Create(name)
1012 if err != nil {
1013 log.Printf("failed to create memory profile file: %v", err)
1014 return
1015 }
1016
1017 err = pprof.WriteHeapProfile(f)
1018 if err != nil {
1019 log.Printf("failed to write memory profile: %v", err)
1020 }
1021
1022 b.heapProfileNum++
1023 }
1024}
1025
1026func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) {
1027 desc := b.opts.RepositoryDescription
1028 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1029 desc.SubRepoMap = b.opts.SubRepositories
1030 desc.IndexOptions = b.opts.GetHash()
1031
1032 shardBuilder, err := zoekt.NewIndexBuilder(&desc)
1033 if err != nil {
1034 return nil, err
1035 }
1036 shardBuilder.IndexTime = b.indexTime
1037 shardBuilder.ID = b.id
1038 return shardBuilder, nil
1039}
1040
1041func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) {
1042 dir := filepath.Dir(fn)
1043 if err := os.MkdirAll(dir, 0o700); err != nil {
1044 return nil, err
1045 }
1046
1047 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1048 if err != nil {
1049 return nil, err
1050 }
1051 if runtime.GOOS != "windows" {
1052 if err := f.Chmod(0o666 &^ umask); err != nil {
1053 return nil, err
1054 }
1055 }
1056
1057 defer f.Close()
1058 if err := ib.Write(f); err != nil {
1059 return nil, err
1060 }
1061 fi, err := f.Stat()
1062 if err != nil {
1063 return nil, err
1064 }
1065 if err := f.Close(); err != nil {
1066 return nil, err
1067 }
1068
1069 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n",
1070 fn,
1071 fi.Size(),
1072 float64(fi.Size())/float64(ib.ContentSize()+1),
1073 ib.NumFiles())
1074
1075 return &finishedShard{f.Name(), fn}, nil
1076}
1077
1078type deltaBranchSetError struct {
1079 shardName string
1080 old, new []zoekt.RepositoryBranch
1081}
1082
1083func (e deltaBranchSetError) Error() string {
1084 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1085}
1086
1087type deltaIndexOptionsMismatchError struct {
1088 shardName string
1089 newOptions HashOptions
1090}
1091
1092func (e *deltaIndexOptionsMismatchError) Error() string {
1093 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1094}
1095
1096// umask holds the Umask of the current process
1097var umask os.FileMode