fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// package build implements a more convenient interface for building
16// zoekt indices.
17package build
18
19import (
20 "crypto/sha1"
21 "flag"
22 "fmt"
23 "io"
24 "log"
25 "net/url"
26 "os"
27 "os/exec"
28 "path"
29 "path/filepath"
30 "reflect"
31 "runtime"
32 "runtime/pprof"
33 "sort"
34 "strconv"
35 "strings"
36 "sync"
37 "time"
38
39 "github.com/bmatcuk/doublestar"
40 "github.com/dustin/go-humanize"
41 "github.com/go-enry/go-enry/v2"
42 "github.com/rs/xid"
43
44 "github.com/sourcegraph/zoekt"
45 "github.com/sourcegraph/zoekt/ctags"
46)
47
48var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
49
50// Branch describes a single branch version.
51type Branch struct {
52 Name string
53 Version string
54}
55
56// Options sets options for the index building.
57type Options struct {
58 // IndexDir is a directory that holds *.zoekt index files.
59 IndexDir string
60
61 // SizeMax is the maximum file size
62 SizeMax int
63
64 // Parallelism is the maximum number of shards to index in parallel
65 Parallelism int
66
67 // ShardMax sets the maximum corpus size for a single shard
68 ShardMax int
69
70 // TrigramMax sets the maximum number of distinct trigrams per document.
71 TrigramMax int
72
73 // RepositoryDescription holds names and URLs for the repository.
74 RepositoryDescription zoekt.Repository
75
76 // SubRepositories is a path => sub repository map.
77 SubRepositories map[string]*zoekt.Repository
78
79 // DisableCTags disables the generation of ctags metadata.
80 DisableCTags bool
81
82 // CtagsPath is the path to the ctags binary to run, or empty
83 // if a valid binary couldn't be found.
84 CTagsPath string
85
86 // Same as CTagsPath but for scip-ctags
87 ScipCTagsPath string
88
89 // If set, ctags must succeed.
90 CTagsMustSucceed bool
91
92 // LargeFiles is a slice of glob patterns, including ** for any number
93 // of directories, where matching file paths should be indexed
94 // regardless of their size. The full pattern syntax is here:
95 // https://github.com/bmatcuk/doublestar/tree/v1#patterns.
96 LargeFiles []string
97
98 // IsDelta is true if this run contains only the changed documents since the
99 // last run.
100 IsDelta bool
101
102 // DocumentRanksPath is the path to the file with document ranks. If empty,
103 // ranks will be computed on-the-fly.
104 DocumentRanksPath string
105
106 // DocumentRanksVersion is a string which when changed will cause us to
107 // reindex a shard. This field is used so that when the contents of
108 // DocumentRanksPath changes, we can reindex.
109 DocumentRanksVersion string
110
111 // changedOrRemovedFiles is a list of file paths that have been changed or removed
112 // since the last indexing job for this repository. These files will be tombstoned
113 // in the older shards for this repository.
114 changedOrRemovedFiles []string
115
116 LanguageMap ctags.LanguageMap
117
118 // ShardMerging is true if builder should respect compound shards. This is a
119 // Sourcegraph specific option.
120 ShardMerging bool
121
122 // HeapProfileTriggerBytes is the heap usage in bytes that will trigger a memory profile. If 0, no memory profile will be triggered.
123 // Profiles will be written to files named `index-memory.prof.n` in the index directory. No more than 10 files are written.
124 //
125 // Note: heap checking is "best effort", and it's possible for the process to OOM without triggering the heap profile.
126 HeapProfileTriggerBytes uint64
127}
128
129// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building.
130type HashOptions struct {
131 sizeMax int
132 disableCTags bool
133 ctagsPath string
134 cTagsMustSucceed bool
135 largeFiles []string
136
137 // documentRankVersion is an experimental field which will change when the
138 // DocumentRanksPath content changes. If empty we ignore it.
139 documentRankVersion string
140}
141
142func (o *Options) HashOptions() HashOptions {
143 return HashOptions{
144 sizeMax: o.SizeMax,
145 disableCTags: o.DisableCTags,
146 ctagsPath: o.CTagsPath,
147 cTagsMustSucceed: o.CTagsMustSucceed,
148 largeFiles: o.LargeFiles,
149 documentRankVersion: o.DocumentRanksVersion,
150 }
151}
152
153func (o *Options) GetHash() string {
154 h := o.HashOptions()
155 hasher := sha1.New()
156
157 hasher.Write([]byte(h.ctagsPath))
158 hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed)))
159 hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax)))
160 hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles)))
161 hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags)))
162
163 if h.documentRankVersion != "" {
164 hasher.Write([]byte{0})
165 io.WriteString(hasher, h.documentRankVersion)
166 }
167
168 return fmt.Sprintf("%x", hasher.Sum(nil))
169}
170
171type largeFilesFlag struct{ *Options }
172
173func (f largeFilesFlag) String() string {
174 // From flag.Value documentation:
175 //
176 // The flag package may call the String method with a zero-valued receiver,
177 // such as a nil pointer.
178 if f.Options == nil {
179 return ""
180 }
181 s := append([]string{""}, f.LargeFiles...)
182 return strings.Join(s, "-large_file ")
183}
184
185func (f largeFilesFlag) Set(value string) error {
186 f.LargeFiles = append(f.LargeFiles, value)
187 return nil
188}
189
190// Flags adds flags for build options to fs. It is the "inverse" of Args.
191func (o *Options) Flags(fs *flag.FlagSet) {
192 x := *o
193 x.SetDefaults()
194 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
195 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
196 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
197 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
198 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
199 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
200 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
201
202 // Sourcegraph specific
203 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
204 fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.")
205}
206
207// Args generates command line arguments for o. It is the "inverse" of Flags.
208func (o *Options) Args() []string {
209 var args []string
210
211 if o.SizeMax != 0 {
212 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
213 }
214
215 if o.TrigramMax != 0 {
216 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
217 }
218
219 if o.ShardMax != 0 {
220 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
221 }
222
223 if o.Parallelism != 0 {
224 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
225 }
226
227 if o.IndexDir != "" {
228 args = append(args, "-index", o.IndexDir)
229 }
230
231 if o.CTagsMustSucceed {
232 args = append(args, "-require_ctags")
233 }
234
235 for _, a := range o.LargeFiles {
236 args = append(args, "-large_file", a)
237 }
238
239 // Sourcegraph specific
240 if o.DisableCTags {
241 args = append(args, "-disable_ctags")
242 }
243
244 if o.ShardMerging {
245 args = append(args, "-shard_merging")
246 }
247
248 return args
249}
250
251// Builder manages (parallel) creation of uniformly sized shards. The
252// builder buffers up documents until it collects enough documents and
253// then builds a shard and writes.
254type Builder struct {
255 opts Options
256 throttle chan int
257
258 nextShardNum int
259 todo []*zoekt.Document
260 docChecker zoekt.DocChecker
261 size int
262
263 parserBins ctags.ParserBinMap
264 building sync.WaitGroup
265
266 errMu sync.Mutex
267 buildError error
268
269 // temp name => final name for finished shards. We only rename
270 // them once all shards succeed to avoid Frankstein corpuses.
271 finishedShards map[string]string
272
273 // indexTime is set by tests for doing reproducible builds.
274 indexTime time.Time
275
276 // heapProfileMu is used to ensure that only one memory profile is written at a time
277 heapProfileMu sync.Mutex
278 heapProfileNum int
279
280 // a sortable 20 chars long id.
281 id string
282
283 finishCalled bool
284}
285
286type finishedShard struct {
287 temp, final string
288}
289
290func checkCTags() string {
291 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" {
292 return ctags
293 }
294
295 if ctags, err := exec.LookPath("universal-ctags"); err == nil {
296 return ctags
297 }
298
299 return ""
300}
301
302func checkScipCTags() string {
303 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" {
304 return ctags
305 }
306
307 if ctags, err := exec.LookPath("scip-ctags"); err == nil {
308 return ctags
309 }
310
311 return ""
312}
313
314// SetDefaults sets reasonable default options.
315func (o *Options) SetDefaults() {
316 if o.CTagsPath == "" && !o.DisableCTags {
317 o.CTagsPath = checkCTags()
318 }
319
320 if o.ScipCTagsPath == "" && !o.DisableCTags {
321 o.ScipCTagsPath = checkScipCTags()
322 }
323
324 if o.Parallelism == 0 {
325 o.Parallelism = 4
326 }
327 if o.SizeMax == 0 {
328 o.SizeMax = 2 << 20
329 }
330 if o.ShardMax == 0 {
331 o.ShardMax = 100 << 20
332 }
333 if o.TrigramMax == 0 {
334 o.TrigramMax = 20000
335 }
336
337 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
338 parsed, _ := url.Parse(o.RepositoryDescription.URL)
339 if parsed != nil {
340 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
341 }
342 }
343}
344
345func hashString(s string) string {
346 h := sha1.New()
347 _, _ = io.WriteString(h, s)
348 return fmt.Sprintf("%x", h.Sum(nil))
349}
350
351// ShardName returns the name the given index shard.
352func (o *Options) shardName(n int) string {
353 return o.shardNameVersion(zoekt.IndexFormatVersion, n)
354}
355
356func (o *Options) shardNameVersion(version, n int) string {
357 abs := url.QueryEscape(o.RepositoryDescription.Name)
358 if len(abs) > 200 {
359 abs = abs[:200] + hashString(abs)[:8]
360 }
361 return filepath.Join(o.IndexDir,
362 fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n))
363}
364
365type IndexState string
366
367const (
368 IndexStateMissing IndexState = "missing"
369 IndexStateCorrupt IndexState = "corrupt"
370 IndexStateVersion IndexState = "version-mismatch"
371 IndexStateOption IndexState = "option-mismatch"
372 IndexStateMeta IndexState = "meta-mismatch"
373 IndexStateContent IndexState = "content-mismatch"
374 IndexStateEqual IndexState = "equal"
375)
376
377var readVersions = []struct {
378 IndexFormatVersion int
379 FeatureVersion int
380}{{
381 IndexFormatVersion: zoekt.IndexFormatVersion,
382 FeatureVersion: zoekt.FeatureVersion,
383}, {
384 IndexFormatVersion: zoekt.NextIndexFormatVersion,
385 FeatureVersion: zoekt.FeatureVersion,
386}}
387
388// IncrementalSkipIndexing returns true if the index present on disk matches
389// the build options.
390func (o *Options) IncrementalSkipIndexing() bool {
391 state, _ := o.IndexState()
392 return state == IndexStateEqual
393}
394
395// IndexState checks how the index present on disk compares to the build
396// options and returns the IndexState and the name of the first shard.
397func (o *Options) IndexState() (IndexState, string) {
398 // Open the latest version we support that is on disk.
399 fn := o.findShard()
400 if fn == "" {
401 return IndexStateMissing, fn
402 }
403
404 repos, index, err := zoekt.ReadMetadataPathAlive(fn)
405 if os.IsNotExist(err) {
406 return IndexStateMissing, fn
407 } else if err != nil {
408 return IndexStateCorrupt, fn
409 }
410
411 for _, v := range readVersions {
412 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
413 return IndexStateVersion, fn
414 }
415 }
416
417 var repo *zoekt.Repository
418 for _, cand := range repos {
419 if cand.Name == o.RepositoryDescription.Name {
420 repo = cand
421 break
422 }
423 }
424
425 if repo == nil {
426 return IndexStateCorrupt, fn
427 }
428
429 if repo.IndexOptions != o.GetHash() {
430 return IndexStateOption, fn
431 }
432
433 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
434 return IndexStateContent, fn
435 }
436
437 // We can mutate repo since it lives in the scope of this function call.
438 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
439 // non-nil err means we are trying to update an immutable field =>
440 // reindex content.
441 log.Printf("warn: immutable field changed, requires re-index: %s", err)
442 return IndexStateContent, fn
443 } else if updated {
444 return IndexStateMeta, fn
445 }
446
447 return IndexStateEqual, fn
448}
449
450// FindRepositoryMetadata returns the index metadata for the repository
451// specified in the options. 'ok' is false if the repository's metadata
452// couldn't be found or if an error occurred.
453func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) {
454 shard := o.findShard()
455 if shard == "" {
456 return nil, nil, false, nil
457 }
458
459 repositories, metadata, err := zoekt.ReadMetadataPathAlive(shard)
460 if err != nil {
461 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
462 }
463
464 ID := o.RepositoryDescription.ID
465 for _, r := range repositories {
466 // compound shards contain multiple repositories, so we
467 // have to pick only the one we're looking for
468 if r.ID == ID {
469 return r, metadata, true, nil
470 }
471 }
472
473 // If we're here, then we're somehow in a state where we found a matching
474 // shard that's missing the repository metadata we're looking for. This
475 // should never happen.
476 name := o.RepositoryDescription.Name
477 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
478}
479
480func (o *Options) findShard() string {
481 for _, v := range readVersions {
482 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
483 if _, err := os.Stat(fn); err == nil {
484 return fn
485 }
486 }
487
488 // Brute force finding the shard in compound shards. We should only hit this
489 // code path for repositories that are not already existing or are in
490 // compound shards.
491 //
492 // TODO add an oracle which can speed this up in the case of repositories
493 // already in compound shards.
494 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
495 if err != nil {
496 return ""
497 }
498 for _, fn := range compoundShards {
499 repos, _, err := zoekt.ReadMetadataPathAlive(fn)
500 if err != nil {
501 continue
502 }
503 for _, repo := range repos {
504 if repo.ID == o.RepositoryDescription.ID {
505 return fn
506 }
507 }
508 }
509
510 return ""
511}
512
513func (o *Options) FindAllShards() []string {
514 for _, v := range readVersions {
515 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
516 if _, err := os.Stat(fn); err == nil {
517 shards := []string{fn}
518 for i := 1; ; i++ {
519 fn := o.shardNameVersion(v.IndexFormatVersion, i)
520 if _, err := os.Stat(fn); err != nil {
521 return shards
522 }
523 shards = append(shards, fn)
524 }
525 }
526 }
527
528 // lazily fallback to findShard which will look for a compound shard.
529 if fn := o.findShard(); fn != "" {
530 return []string{fn}
531 }
532
533 return nil
534}
535
536// IgnoreSizeMax determines whether the max size should be ignored.
537func (o *Options) IgnoreSizeMax(name string) bool {
538 // A pattern match will override preceding pattern matches.
539 for i := len(o.LargeFiles) - 1; i >= 0; i-- {
540 pattern := strings.TrimSpace(o.LargeFiles[i])
541 negated, validatedPattern := checkIsNegatePattern(pattern)
542
543 if m, _ := doublestar.PathMatch(validatedPattern, name); m {
544 if negated {
545 return false
546 } else {
547 return true
548 }
549 }
550 }
551
552 return false
553}
554
555func checkIsNegatePattern(pattern string) (bool, string) {
556 negate := "!"
557
558 // if negated then strip prefix meta character which identifies negated filter pattern
559 if strings.HasPrefix(pattern, negate) {
560 return true, pattern[len(negate):]
561 }
562
563 return false, pattern
564}
565
566// NewBuilder creates a new Builder instance.
567func NewBuilder(opts Options) (*Builder, error) {
568 opts.SetDefaults()
569 if opts.RepositoryDescription.Name == "" {
570 return nil, fmt.Errorf("builder: must set Name")
571 }
572
573 b := &Builder{
574 opts: opts,
575 throttle: make(chan int, opts.Parallelism),
576 finishedShards: map[string]string{},
577 }
578
579 parserBins, err := ctags.NewParserBinMap(
580 b.opts.CTagsPath,
581 b.opts.ScipCTagsPath,
582 opts.LanguageMap,
583 b.opts.CTagsMustSucceed,
584 )
585 if err != nil {
586 return nil, err
587 }
588
589 b.parserBins = parserBins
590
591 if opts.IsDelta {
592 // Delta shards build on top of previously existing shards.
593 // As a consequence, the shardNum for delta shards starts from
594 // the number following the most recently generated shard - not 0.
595 //
596 // Using this numbering scheme allows all the shards to be
597 // discovered as a set.
598 shards := b.opts.FindAllShards()
599 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
600 }
601
602 if _, err := b.newShardBuilder(); err != nil {
603 return nil, err
604 }
605
606 now := time.Now()
607 b.indexTime = now
608 b.id = xid.NewWithTime(now).String()
609
610 return b, nil
611}
612
613// AddFile is a convenience wrapper for the Add method
614func (b *Builder) AddFile(name string, content []byte) error {
615 return b.Add(zoekt.Document{Name: name, Content: content})
616}
617
618func (b *Builder) Add(doc zoekt.Document) error {
619 if b.finishCalled {
620 return nil
621 }
622
623 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
624 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
625 // We could pass the document on to the shardbuilder, but if
626 // we pass through a part of the source tree with binary/large
627 // files, the corresponding shard would be mostly empty, so
628 // insert a reason here too.
629 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
630 } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
631 doc.SkipReason = err.Error()
632 doc.Language = "binary"
633 }
634
635 b.todo = append(b.todo, &doc)
636
637 if doc.SkipReason == "" {
638 b.size += len(doc.Name) + len(doc.Content)
639 } else {
640 b.size += len(doc.Name) + len(doc.SkipReason)
641 // Drop the content if we are skipping the document. Skipped content is not counted towards the
642 // shard size limit, so otherwise we might buffer too much data in memory before flushing.
643 doc.Content = nil
644 }
645
646 if b.size > b.opts.ShardMax {
647 return b.flush()
648 }
649
650 return nil
651}
652
653// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
654// has been changed or removed since the last indexing job for this repository.
655//
656// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
657func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
658 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
659}
660
661// Finish creates a last shard from the buffered documents, and clears
662// stale shards from previous runs. This should always be called, also
663// in failure cases, to ensure cleanup.
664//
665// It is safe to call Finish() multiple times.
666func (b *Builder) Finish() error {
667 if b.finishCalled {
668 return b.buildError
669 }
670
671 b.finishCalled = true
672
673 b.flush()
674 b.building.Wait()
675
676 if b.buildError != nil {
677 for tmp := range b.finishedShards {
678 log.Printf("Builder.Finish %s", tmp)
679 os.Remove(tmp)
680 }
681 b.finishedShards = map[string]string{}
682 return b.buildError
683 }
684
685 // map of temporary -> final names for all updated shards + shard metadata files
686 artifactPaths := make(map[string]string)
687 for tmp, final := range b.finishedShards {
688 artifactPaths[tmp] = final
689 }
690
691 oldShards := b.opts.FindAllShards()
692
693 if b.opts.IsDelta {
694 // Delta shard builds need to update FileTombstone and branch commit information for all
695 // existing shards
696 for _, shard := range oldShards {
697 repositories, _, err := zoekt.ReadMetadataPathAlive(shard)
698 if err != nil {
699 return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
700 }
701
702 if len(repositories) > 1 {
703 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
704 }
705
706 if len(repositories) == 0 {
707 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
708 }
709
710 repository := repositories[0]
711 if repository.ID != b.opts.RepositoryDescription.ID {
712 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
713 }
714
715 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
716 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
717 }
718
719 for _, f := range b.opts.changedOrRemovedFiles {
720 repository.FileTombstones[f] = struct{}{}
721 }
722
723 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
724 return deltaBranchSetError{
725 shardName: shard,
726 old: repository.Branches,
727 new: b.opts.RepositoryDescription.Branches,
728 }
729 }
730
731 if b.opts.GetHash() != repository.IndexOptions {
732 return &deltaIndexOptionsMismatchError{
733 shardName: shard,
734 newOptions: b.opts.HashOptions(),
735 }
736 }
737
738 repository.Branches = b.opts.RepositoryDescription.Branches
739
740 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
741
742 tempPath, finalPath, err := zoekt.JsonMarshalRepoMetaTemp(shard, repository)
743 if err != nil {
744 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
745 }
746
747 artifactPaths[tempPath] = finalPath
748 }
749 }
750
751 // We mark finished shards as empty when we successfully finish. Return now
752 // to allow call sites to call Finish idempotently.
753 if len(artifactPaths) == 0 {
754 return b.buildError
755 }
756
757 // Collect a map of the old shards on disk. For each new shard we replace we
758 // delete it from toDelete. Anything remaining in toDelete will be removed
759 // after we have renamed everything into place.
760
761 var toDelete map[string]struct{}
762 if !b.opts.IsDelta {
763 // Non-delta shard builds delete all existing shards before they write out
764 // new ones.
765 // By contrast, delta shard builds work by stacking changes on top of existing shards.
766 // So, we skip populating the toDelete map if we're building delta shards.
767
768 toDelete = make(map[string]struct{})
769 for _, name := range oldShards {
770 paths, err := zoekt.IndexFilePaths(name)
771 if err != nil {
772 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
773 }
774 for _, p := range paths {
775 toDelete[p] = struct{}{}
776 }
777 }
778 }
779
780 for tmp, final := range artifactPaths {
781 if err := os.Rename(tmp, final); err != nil {
782 b.buildError = err
783 continue
784 }
785
786 delete(toDelete, final)
787 }
788
789 b.finishedShards = map[string]string{}
790
791 for p := range toDelete {
792 // Don't delete compound shards, set tombstones instead.
793 if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") {
794 if !strings.HasSuffix(p, ".zoekt") {
795 continue
796 }
797 err := zoekt.SetTombstone(p, b.opts.RepositoryDescription.ID)
798 b.buildError = err
799 continue
800 }
801 log.Printf("removing old shard file: %s", p)
802 if err := os.Remove(p); err != nil {
803 b.buildError = err
804 }
805 }
806
807 return b.buildError
808}
809
810// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
811// iff both slices specify the same set of branch names in the same order.
812func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
813 if len(a) != len(b) {
814 return false
815 }
816
817 for i := range a {
818 x, y := a[i], b[i]
819 if x.Name != y.Name {
820 return false
821 }
822 }
823
824 return true
825}
826
827func (b *Builder) flush() error {
828 todo := b.todo
829 b.todo = nil
830 b.size = 0
831 b.errMu.Lock()
832 defer b.errMu.Unlock()
833 if b.buildError != nil {
834 return b.buildError
835 }
836
837 hasShard := b.nextShardNum > 0
838 if len(todo) == 0 && hasShard {
839 return nil
840 }
841
842 shard := b.nextShardNum
843 b.nextShardNum++
844
845 if b.opts.Parallelism > 1 {
846 b.building.Add(1)
847 b.throttle <- 1
848 go func() {
849 done, err := b.buildShard(todo, shard)
850 <-b.throttle
851
852 b.errMu.Lock()
853 defer b.errMu.Unlock()
854 if err != nil && b.buildError == nil {
855 b.buildError = err
856 }
857 if err == nil {
858 b.finishedShards[done.temp] = done.final
859 }
860 b.building.Done()
861 }()
862 } else {
863 // No goroutines when we're not parallel. This
864 // simplifies memory profiling.
865 done, err := b.buildShard(todo, shard)
866 b.buildError = err
867 if err == nil {
868 b.finishedShards[done.temp] = done.final
869 }
870
871 return b.buildError
872 }
873
874 return nil
875}
876
877// map [0,inf) to [0,1) monotonically
878func squashRange(j int) float64 {
879 x := float64(j)
880 return x / (1 + x)
881}
882
883// IsLowPriority takes a file name and makes an educated guess about its priority
884// in search results. A file is considered low priority if it looks like a test,
885// vendored, or generated file.
886//
887// These 'priority' criteria affects how documents are ordered within a shard. It's
888// also used to help guess a file's rank when we're missing ranking information.
889func IsLowPriority(path string, content []byte) bool {
890 return enry.IsTest(path) || enry.IsVendor(path) || enry.IsGenerated(path, content)
891}
892
893type rankedDoc struct {
894 *zoekt.Document
895 rank []float64
896}
897
898// rank returns a vector of scores which is used at index-time to sort documents
899// before writing them to disk. The order of documents in the shard is important
900// at query time, because earlier documents receive a boost at query time and
901// have a higher chance of being searched before limits kick in.
902func rank(d *zoekt.Document, origIdx int) []float64 {
903 skipped := 0.0
904 if d.SkipReason != "" {
905 skipped = 1.0
906 }
907
908 generated := 0.0
909 if enry.IsGenerated(d.Name, d.Content) {
910 generated = 1.0
911 }
912
913 vendor := 0.0
914 if enry.IsVendor(d.Name) {
915 vendor = 1.0
916 }
917
918 test := 0.0
919 if enry.IsTest(d.Name) {
920 test = 1.0
921 }
922
923 // Smaller is earlier (=better).
924 return []float64{
925 // Always place skipped docs last
926 skipped,
927
928 // Prefer docs that are not generated
929 generated,
930
931 // Prefer docs that are not vendored
932 vendor,
933
934 // Prefer docs that are not tests
935 test,
936
937 // With short names
938 squashRange(len(d.Name)),
939
940 // With many symbols
941 1.0 - squashRange(len(d.Symbols)),
942
943 // With short content
944 squashRange(len(d.Content)),
945
946 // That is present is as many branches as possible
947 1.0 - squashRange(len(d.Branches)),
948
949 // Preserve original ordering.
950 squashRange(origIdx),
951 }
952}
953
954func sortDocuments(todo []*zoekt.Document) {
955 rs := make([]rankedDoc, 0, len(todo))
956 for i, t := range todo {
957 rd := rankedDoc{t, rank(t, i)}
958 rs = append(rs, rd)
959 }
960 sort.Slice(rs, func(i, j int) bool {
961 r1 := rs[i].rank
962 r2 := rs[j].rank
963 for i := range r1 {
964 if r1[i] < r2[i] {
965 return true
966 }
967 if r1[i] > r2[i] {
968 return false
969 }
970 }
971
972 return false
973 })
974 for i := range todo {
975 todo[i] = rs[i].Document
976 }
977}
978
979func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) {
980 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
981 err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins)
982 if b.opts.CTagsMustSucceed && err != nil {
983 return nil, err
984 }
985 if err != nil {
986 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err)
987 }
988 }
989
990 name := b.opts.shardName(nextShardNum)
991
992 shardBuilder, err := b.newShardBuilder()
993 if err != nil {
994 return nil, err
995 }
996
997 sortDocuments(todo)
998
999 for idx, t := range todo {
1000 if err := shardBuilder.Add(*t); err != nil {
1001 return nil, err
1002 }
1003
1004 if idx%10_000 == 0 {
1005 b.CheckMemoryUsage()
1006 }
1007 }
1008
1009 return b.writeShard(name, shardBuilder)
1010}
1011
1012// CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the
1013// configured threshold. NOTE: this method is expensive and should only be used for debugging.
1014func (b *Builder) CheckMemoryUsage() {
1015 // Don't check memory if heap profiling is disabled, or we've already written 10 profiles
1016 if b.opts.HeapProfileTriggerBytes <= 0 || b.heapProfileNum >= 10 {
1017 return
1018 }
1019
1020 var m runtime.MemStats
1021 runtime.ReadMemStats(&m)
1022
1023 if m.HeapAlloc > b.opts.HeapProfileTriggerBytes && b.heapProfileMu.TryLock() {
1024 defer b.heapProfileMu.Unlock()
1025
1026 log.Printf("writing memory profile, heap usage: %s", humanize.Bytes(m.HeapAlloc))
1027 name := filepath.Join(b.opts.IndexDir, fmt.Sprintf("indexmemory.prof.%d", b.heapProfileNum))
1028 f, err := os.Create(name)
1029 if err != nil {
1030 log.Printf("failed to create memory profile file: %v", err)
1031 return
1032 }
1033
1034 err = pprof.WriteHeapProfile(f)
1035 if err != nil {
1036 log.Printf("failed to write memory profile: %v", err)
1037 }
1038
1039 b.heapProfileNum++
1040 }
1041}
1042
1043func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) {
1044 desc := b.opts.RepositoryDescription
1045 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1046 desc.SubRepoMap = b.opts.SubRepositories
1047 desc.IndexOptions = b.opts.GetHash()
1048
1049 shardBuilder, err := zoekt.NewIndexBuilder(&desc)
1050 if err != nil {
1051 return nil, err
1052 }
1053 shardBuilder.IndexTime = b.indexTime
1054 shardBuilder.ID = b.id
1055 return shardBuilder, nil
1056}
1057
1058func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) {
1059 dir := filepath.Dir(fn)
1060 if err := os.MkdirAll(dir, 0o700); err != nil {
1061 return nil, err
1062 }
1063
1064 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1065 if err != nil {
1066 return nil, err
1067 }
1068 if runtime.GOOS != "windows" {
1069 if err := f.Chmod(0o666 &^ umask); err != nil {
1070 return nil, err
1071 }
1072 }
1073
1074 defer f.Close()
1075 if err := ib.Write(f); err != nil {
1076 return nil, err
1077 }
1078 fi, err := f.Stat()
1079 if err != nil {
1080 return nil, err
1081 }
1082 if err := f.Close(); err != nil {
1083 return nil, err
1084 }
1085
1086 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n",
1087 fn,
1088 fi.Size(),
1089 float64(fi.Size())/float64(ib.ContentSize()+1),
1090 ib.NumFiles())
1091
1092 return &finishedShard{f.Name(), fn}, nil
1093}
1094
1095type deltaBranchSetError struct {
1096 shardName string
1097 old, new []zoekt.RepositoryBranch
1098}
1099
1100func (e deltaBranchSetError) Error() string {
1101 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1102}
1103
1104type deltaIndexOptionsMismatchError struct {
1105 shardName string
1106 newOptions HashOptions
1107}
1108
1109func (e *deltaIndexOptionsMismatchError) Error() string {
1110 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1111}
1112
1113// umask holds the Umask of the current process
1114var umask os.FileMode