fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// package build implements a more convenient interface for building
16// zoekt indices.
17package build
18
19import (
20 "crypto/sha1"
21 "flag"
22 "fmt"
23 "io"
24 "log"
25 "net/url"
26 "os"
27 "os/exec"
28 "path"
29 "path/filepath"
30 "reflect"
31 "runtime"
32 "runtime/pprof"
33 "sort"
34 "strconv"
35 "strings"
36 "sync"
37 "time"
38
39 "github.com/bmatcuk/doublestar"
40 "github.com/go-enry/go-enry/v2"
41 "github.com/rs/xid"
42
43 "github.com/sourcegraph/zoekt"
44 "github.com/sourcegraph/zoekt/ctags"
45)
46
47var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
48
49// Branch describes a single branch version.
50type Branch struct {
51 Name string
52 Version string
53}
54
55// Options sets options for the index building.
56type Options struct {
57 // IndexDir is a directory that holds *.zoekt index files.
58 IndexDir string
59
60 // SizeMax is the maximum file size
61 SizeMax int
62
63 // Parallelism is the maximum number of shards to index in parallel
64 Parallelism int
65
66 // ShardMax sets the maximum corpus size for a single shard
67 ShardMax int
68
69 // TrigramMax sets the maximum number of distinct trigrams per document.
70 TrigramMax int
71
72 // RepositoryDescription holds names and URLs for the repository.
73 RepositoryDescription zoekt.Repository
74
75 // SubRepositories is a path => sub repository map.
76 SubRepositories map[string]*zoekt.Repository
77
78 // DisableCTags disables the generation of ctags metadata.
79 DisableCTags bool
80
81 // CtagsPath is the path to the ctags binary to run, or empty
82 // if a valid binary couldn't be found.
83 CTagsPath string
84
85 // Same as CTagsPath but for scip-ctags
86 ScipCTagsPath string
87
88 // If set, ctags must succeed.
89 CTagsMustSucceed bool
90
91 // Write memory profiles to this file.
92 MemProfile string
93
94 // LargeFiles is a slice of glob patterns, including ** for any number
95 // of directories, where matching file paths should be indexed
96 // regardless of their size. The full pattern syntax is here:
97 // https://github.com/bmatcuk/doublestar/tree/v1#patterns.
98 LargeFiles []string
99
100 // IsDelta is true if this run contains only the changed documents since the
101 // last run.
102 IsDelta bool
103
104 // DocumentRanksPath is the path to the file with document ranks. If empty,
105 // ranks will be computed on-the-fly.
106 DocumentRanksPath string
107
108 // DocumentRanksVersion is a string which when changed will cause us to
109 // reindex a shard. This field is used so that when the contents of
110 // DocumentRanksPath changes, we can reindex.
111 DocumentRanksVersion string
112
113 // changedOrRemovedFiles is a list of file paths that have been changed or removed
114 // since the last indexing job for this repository. These files will be tombstoned
115 // in the older shards for this repository.
116 changedOrRemovedFiles []string
117
118 LanguageMap ctags.LanguageMap
119
120 // ShardMerging is true if builder should respect compound shards. This is a
121 // Sourcegraph specific option.
122 ShardMerging bool
123}
124
125// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building.
126type HashOptions struct {
127 sizeMax int
128 disableCTags bool
129 ctagsPath string
130 cTagsMustSucceed bool
131 largeFiles []string
132
133 // documentRankVersion is an experimental field which will change when the
134 // DocumentRanksPath content changes. If empty we ignore it.
135 documentRankVersion string
136}
137
138func (o *Options) HashOptions() HashOptions {
139 return HashOptions{
140 sizeMax: o.SizeMax,
141 disableCTags: o.DisableCTags,
142 ctagsPath: o.CTagsPath,
143 cTagsMustSucceed: o.CTagsMustSucceed,
144 largeFiles: o.LargeFiles,
145 documentRankVersion: o.DocumentRanksVersion,
146 }
147}
148
149func (o *Options) GetHash() string {
150 h := o.HashOptions()
151 hasher := sha1.New()
152
153 hasher.Write([]byte(h.ctagsPath))
154 hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed)))
155 hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax)))
156 hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles)))
157 hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags)))
158
159 if h.documentRankVersion != "" {
160 hasher.Write([]byte{0})
161 io.WriteString(hasher, h.documentRankVersion)
162 }
163
164 return fmt.Sprintf("%x", hasher.Sum(nil))
165}
166
167type largeFilesFlag struct{ *Options }
168
169func (f largeFilesFlag) String() string {
170 // From flag.Value documentation:
171 //
172 // The flag package may call the String method with a zero-valued receiver,
173 // such as a nil pointer.
174 if f.Options == nil {
175 return ""
176 }
177 s := append([]string{""}, f.LargeFiles...)
178 return strings.Join(s, "-large_file ")
179}
180
181func (f largeFilesFlag) Set(value string) error {
182 f.LargeFiles = append(f.LargeFiles, value)
183 return nil
184}
185
186// Flags adds flags for build options to fs. It is the "inverse" of Args.
187func (o *Options) Flags(fs *flag.FlagSet) {
188 x := *o
189 x.SetDefaults()
190 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
191 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
192 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
193 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
194 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
195 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
196 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
197 fs.StringVar(&o.MemProfile, "memprofile", "", "write memory profile(s) to `file.shardnum`. Note: sets parallelism to 1.")
198
199 // Sourcegraph specific
200 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
201 fs.BoolVar(&o.ShardMerging, "shard_merging", x.ShardMerging, "If set, builder will respect compound shards.")
202}
203
204// Args generates command line arguments for o. It is the "inverse" of Flags.
205func (o *Options) Args() []string {
206 var args []string
207
208 if o.SizeMax != 0 {
209 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
210 }
211
212 if o.TrigramMax != 0 {
213 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
214 }
215
216 if o.ShardMax != 0 {
217 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
218 }
219
220 if o.Parallelism != 0 {
221 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
222 }
223
224 if o.IndexDir != "" {
225 args = append(args, "-index", o.IndexDir)
226 }
227
228 if o.CTagsMustSucceed {
229 args = append(args, "-require_ctags")
230 }
231
232 for _, a := range o.LargeFiles {
233 args = append(args, "-large_file", a)
234 }
235
236 // Sourcegraph specific
237 if o.DisableCTags {
238 args = append(args, "-disable_ctags")
239 }
240
241 if o.ShardMerging {
242 args = append(args, "-shard_merging")
243 }
244
245 return args
246}
247
248// Builder manages (parallel) creation of uniformly sized shards. The
249// builder buffers up documents until it collects enough documents and
250// then builds a shard and writes.
251type Builder struct {
252 opts Options
253 throttle chan int
254
255 nextShardNum int
256 todo []*zoekt.Document
257 docChecker zoekt.DocChecker
258 size int
259
260 parserBins ctags.ParserBinMap
261 building sync.WaitGroup
262
263 errMu sync.Mutex
264 buildError error
265
266 // temp name => final name for finished shards. We only rename
267 // them once all shards succeed to avoid Frankstein corpuses.
268 finishedShards map[string]string
269
270 // indexTime is set by tests for doing reproducible builds.
271 indexTime time.Time
272
273 // a sortable 20 chars long id.
274 id string
275
276 finishCalled bool
277}
278
279type finishedShard struct {
280 temp, final string
281}
282
283func checkCTags() string {
284 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" {
285 return ctags
286 }
287
288 if ctags, err := exec.LookPath("universal-ctags"); err == nil {
289 return ctags
290 }
291
292 return ""
293}
294
295func checkScipCTags() string {
296 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" {
297 return ctags
298 }
299
300 if ctags, err := exec.LookPath("scip-ctags"); err == nil {
301 return ctags
302 }
303
304 return ""
305}
306
307// SetDefaults sets reasonable default options.
308func (o *Options) SetDefaults() {
309 if o.CTagsPath == "" && !o.DisableCTags {
310 o.CTagsPath = checkCTags()
311 }
312
313 if o.ScipCTagsPath == "" && !o.DisableCTags {
314 o.ScipCTagsPath = checkScipCTags()
315 }
316
317 if o.Parallelism == 0 {
318 o.Parallelism = 4
319 }
320 if o.SizeMax == 0 {
321 o.SizeMax = 2 << 20
322 }
323 if o.ShardMax == 0 {
324 o.ShardMax = 100 << 20
325 }
326 if o.TrigramMax == 0 {
327 o.TrigramMax = 20000
328 }
329
330 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
331 parsed, _ := url.Parse(o.RepositoryDescription.URL)
332 if parsed != nil {
333 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
334 }
335 }
336}
337
338func hashString(s string) string {
339 h := sha1.New()
340 _, _ = io.WriteString(h, s)
341 return fmt.Sprintf("%x", h.Sum(nil))
342}
343
344// ShardName returns the name the given index shard.
345func (o *Options) shardName(n int) string {
346 return o.shardNameVersion(zoekt.IndexFormatVersion, n)
347}
348
349func (o *Options) shardNameVersion(version, n int) string {
350 abs := url.QueryEscape(o.RepositoryDescription.Name)
351 if len(abs) > 200 {
352 abs = abs[:200] + hashString(abs)[:8]
353 }
354 return filepath.Join(o.IndexDir,
355 fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n))
356}
357
358type IndexState string
359
360const (
361 IndexStateMissing IndexState = "missing"
362 IndexStateCorrupt IndexState = "corrupt"
363 IndexStateVersion IndexState = "version-mismatch"
364 IndexStateOption IndexState = "option-mismatch"
365 IndexStateMeta IndexState = "meta-mismatch"
366 IndexStateContent IndexState = "content-mismatch"
367 IndexStateEqual IndexState = "equal"
368)
369
370var readVersions = []struct {
371 IndexFormatVersion int
372 FeatureVersion int
373}{{
374 IndexFormatVersion: zoekt.IndexFormatVersion,
375 FeatureVersion: zoekt.FeatureVersion,
376}, {
377 IndexFormatVersion: zoekt.NextIndexFormatVersion,
378 FeatureVersion: zoekt.FeatureVersion,
379}}
380
381// IncrementalSkipIndexing returns true if the index present on disk matches
382// the build options.
383func (o *Options) IncrementalSkipIndexing() bool {
384 state, _ := o.IndexState()
385 return state == IndexStateEqual
386}
387
388// IndexState checks how the index present on disk compares to the build
389// options and returns the IndexState and the name of the first shard.
390func (o *Options) IndexState() (IndexState, string) {
391 // Open the latest version we support that is on disk.
392 fn := o.findShard()
393 if fn == "" {
394 return IndexStateMissing, fn
395 }
396
397 repos, index, err := zoekt.ReadMetadataPathAlive(fn)
398 if os.IsNotExist(err) {
399 return IndexStateMissing, fn
400 } else if err != nil {
401 return IndexStateCorrupt, fn
402 }
403
404 for _, v := range readVersions {
405 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
406 return IndexStateVersion, fn
407 }
408 }
409
410 var repo *zoekt.Repository
411 for _, cand := range repos {
412 if cand.Name == o.RepositoryDescription.Name {
413 repo = cand
414 break
415 }
416 }
417
418 if repo == nil {
419 return IndexStateCorrupt, fn
420 }
421
422 if repo.IndexOptions != o.GetHash() {
423 return IndexStateOption, fn
424 }
425
426 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
427 return IndexStateContent, fn
428 }
429
430 // We can mutate repo since it lives in the scope of this function call.
431 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
432 // non-nil err means we are trying to update an immutable field =>
433 // reindex content.
434 log.Printf("warn: immutable field changed, requires re-index: %s", err)
435 return IndexStateContent, fn
436 } else if updated {
437 return IndexStateMeta, fn
438 }
439
440 return IndexStateEqual, fn
441}
442
443// FindRepositoryMetadata returns the index metadata for the repository
444// specified in the options. 'ok' is false if the repository's metadata
445// couldn't be found or if an error occurred.
446func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) {
447 shard := o.findShard()
448 if shard == "" {
449 return nil, nil, false, nil
450 }
451
452 repositories, metadata, err := zoekt.ReadMetadataPathAlive(shard)
453 if err != nil {
454 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
455 }
456
457 ID := o.RepositoryDescription.ID
458 for _, r := range repositories {
459 // compound shards contain multiple repositories, so we
460 // have to pick only the one we're looking for
461 if r.ID == ID {
462 return r, metadata, true, nil
463 }
464 }
465
466 // If we're here, then we're somehow in a state where we found a matching
467 // shard that's missing the repository metadata we're looking for. This
468 // should never happen.
469 name := o.RepositoryDescription.Name
470 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
471}
472
473func (o *Options) findShard() string {
474 for _, v := range readVersions {
475 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
476 if _, err := os.Stat(fn); err == nil {
477 return fn
478 }
479 }
480
481 // Brute force finding the shard in compound shards. We should only hit this
482 // code path for repositories that are not already existing or are in
483 // compound shards.
484 //
485 // TODO add an oracle which can speed this up in the case of repositories
486 // already in compound shards.
487 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
488 if err != nil {
489 return ""
490 }
491 for _, fn := range compoundShards {
492 repos, _, err := zoekt.ReadMetadataPathAlive(fn)
493 if err != nil {
494 continue
495 }
496 for _, repo := range repos {
497 if repo.ID == o.RepositoryDescription.ID {
498 return fn
499 }
500 }
501 }
502
503 return ""
504}
505
506func (o *Options) FindAllShards() []string {
507 for _, v := range readVersions {
508 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
509 if _, err := os.Stat(fn); err == nil {
510 shards := []string{fn}
511 for i := 1; ; i++ {
512 fn := o.shardNameVersion(v.IndexFormatVersion, i)
513 if _, err := os.Stat(fn); err != nil {
514 return shards
515 }
516 shards = append(shards, fn)
517 }
518 }
519 }
520
521 // lazily fallback to findShard which will look for a compound shard.
522 if fn := o.findShard(); fn != "" {
523 return []string{fn}
524 }
525
526 return nil
527}
528
529// IgnoreSizeMax determines whether the max size should be ignored.
530func (o *Options) IgnoreSizeMax(name string) bool {
531 // A pattern match will override preceding pattern matches.
532 for i := len(o.LargeFiles) - 1; i >= 0; i-- {
533 pattern := strings.TrimSpace(o.LargeFiles[i])
534 negated, validatedPattern := checkIsNegatePattern(pattern)
535
536 if m, _ := doublestar.PathMatch(validatedPattern, name); m {
537 if negated {
538 return false
539 } else {
540 return true
541 }
542 }
543 }
544
545 return false
546}
547
548func checkIsNegatePattern(pattern string) (bool, string) {
549 negate := "!"
550
551 // if negated then strip prefix meta character which identifies negated filter pattern
552 if strings.HasPrefix(pattern, negate) {
553 return true, pattern[len(negate):]
554 }
555
556 return false, pattern
557}
558
559// NewBuilder creates a new Builder instance.
560func NewBuilder(opts Options) (*Builder, error) {
561 opts.SetDefaults()
562 if opts.RepositoryDescription.Name == "" {
563 return nil, fmt.Errorf("builder: must set Name")
564 }
565
566 b := &Builder{
567 opts: opts,
568 throttle: make(chan int, opts.Parallelism),
569 finishedShards: map[string]string{},
570 }
571
572 parserBins, err := ctags.NewParserBinMap(
573 b.opts.CTagsPath,
574 b.opts.ScipCTagsPath,
575 opts.LanguageMap,
576 b.opts.CTagsMustSucceed,
577 )
578 if err != nil {
579 return nil, err
580 }
581
582 b.parserBins = parserBins
583
584 if opts.IsDelta {
585 // Delta shards build on top of previously existing shards.
586 // As a consequence, the shardNum for delta shards starts from
587 // the number following the most recently generated shard - not 0.
588 //
589 // Using this numbering scheme allows all the shards to be
590 // discovered as a set.
591 shards := b.opts.FindAllShards()
592 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
593 }
594
595 if _, err := b.newShardBuilder(); err != nil {
596 return nil, err
597 }
598
599 now := time.Now()
600 b.indexTime = now
601 b.id = xid.NewWithTime(now).String()
602
603 return b, nil
604}
605
606// AddFile is a convenience wrapper for the Add method
607func (b *Builder) AddFile(name string, content []byte) error {
608 return b.Add(zoekt.Document{Name: name, Content: content})
609}
610
611func (b *Builder) Add(doc zoekt.Document) error {
612 if b.finishCalled {
613 return nil
614 }
615
616 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
617 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
618 // We could pass the document on to the shardbuilder, but if
619 // we pass through a part of the source tree with binary/large
620 // files, the corresponding shard would be mostly empty, so
621 // insert a reason here too.
622 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
623 } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
624 doc.SkipReason = err.Error()
625 doc.Language = "binary"
626 }
627
628 b.todo = append(b.todo, &doc)
629
630 if doc.SkipReason == "" {
631 b.size += len(doc.Name) + len(doc.Content)
632 } else {
633 b.size += len(doc.Name) + len(doc.SkipReason)
634 // Drop the content if we are skipping the document. Skipped content is not counted towards the
635 // shard size limit, so otherwise we might buffer too much data in memory before flushing.
636 doc.Content = nil
637 }
638
639 if b.size > b.opts.ShardMax {
640 return b.flush()
641 }
642
643 return nil
644}
645
646// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
647// has been changed or removed since the last indexing job for this repository.
648//
649// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
650func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
651 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
652}
653
654// Finish creates a last shard from the buffered documents, and clears
655// stale shards from previous runs. This should always be called, also
656// in failure cases, to ensure cleanup.
657//
658// It is safe to call Finish() multiple times.
659func (b *Builder) Finish() error {
660 if b.finishCalled {
661 return b.buildError
662 }
663
664 b.finishCalled = true
665
666 b.flush()
667 b.building.Wait()
668
669 if b.buildError != nil {
670 for tmp := range b.finishedShards {
671 log.Printf("Builder.Finish %s", tmp)
672 os.Remove(tmp)
673 }
674 b.finishedShards = map[string]string{}
675 return b.buildError
676 }
677
678 // map of temporary -> final names for all updated shards + shard metadata files
679 artifactPaths := make(map[string]string)
680 for tmp, final := range b.finishedShards {
681 artifactPaths[tmp] = final
682 }
683
684 oldShards := b.opts.FindAllShards()
685
686 if b.opts.IsDelta {
687 // Delta shard builds need to update FileTombstone and branch commit information for all
688 // existing shards
689 for _, shard := range oldShards {
690 repositories, _, err := zoekt.ReadMetadataPathAlive(shard)
691 if err != nil {
692 return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
693 }
694
695 if len(repositories) > 1 {
696 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
697 }
698
699 if len(repositories) == 0 {
700 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
701 }
702
703 repository := repositories[0]
704 if repository.ID != b.opts.RepositoryDescription.ID {
705 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
706 }
707
708 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
709 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
710 }
711
712 for _, f := range b.opts.changedOrRemovedFiles {
713 repository.FileTombstones[f] = struct{}{}
714 }
715
716 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
717 return deltaBranchSetError{
718 shardName: shard,
719 old: repository.Branches,
720 new: b.opts.RepositoryDescription.Branches,
721 }
722 }
723
724 if b.opts.GetHash() != repository.IndexOptions {
725 return &deltaIndexOptionsMismatchError{
726 shardName: shard,
727 newOptions: b.opts.HashOptions(),
728 }
729 }
730
731 repository.Branches = b.opts.RepositoryDescription.Branches
732
733 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
734
735 tempPath, finalPath, err := zoekt.JsonMarshalRepoMetaTemp(shard, repository)
736 if err != nil {
737 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
738 }
739
740 artifactPaths[tempPath] = finalPath
741 }
742 }
743
744 // We mark finished shards as empty when we successfully finish. Return now
745 // to allow call sites to call Finish idempotently.
746 if len(artifactPaths) == 0 {
747 return b.buildError
748 }
749
750 // Collect a map of the old shards on disk. For each new shard we replace we
751 // delete it from toDelete. Anything remaining in toDelete will be removed
752 // after we have renamed everything into place.
753
754 var toDelete map[string]struct{}
755 if !b.opts.IsDelta {
756 // Non-delta shard builds delete all existing shards before they write out
757 // new ones.
758 // By contrast, delta shard builds work by stacking changes on top of existing shards.
759 // So, we skip populating the toDelete map if we're building delta shards.
760
761 toDelete = make(map[string]struct{})
762 for _, name := range oldShards {
763 paths, err := zoekt.IndexFilePaths(name)
764 if err != nil {
765 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
766 }
767 for _, p := range paths {
768 toDelete[p] = struct{}{}
769 }
770 }
771 }
772
773 for tmp, final := range artifactPaths {
774 if err := os.Rename(tmp, final); err != nil {
775 b.buildError = err
776 continue
777 }
778
779 delete(toDelete, final)
780 }
781
782 b.finishedShards = map[string]string{}
783
784 for p := range toDelete {
785 // Don't delete compound shards, set tombstones instead.
786 if b.opts.ShardMerging && strings.HasPrefix(filepath.Base(p), "compound-") {
787 if !strings.HasSuffix(p, ".zoekt") {
788 continue
789 }
790 err := zoekt.SetTombstone(p, b.opts.RepositoryDescription.ID)
791 b.buildError = err
792 continue
793 }
794 log.Printf("removing old shard file: %s", p)
795 if err := os.Remove(p); err != nil {
796 b.buildError = err
797 }
798 }
799
800 return b.buildError
801}
802
803// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
804// iff both slices specify the same set of branch names in the same order.
805func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
806 if len(a) != len(b) {
807 return false
808 }
809
810 for i := range a {
811 x, y := a[i], b[i]
812 if x.Name != y.Name {
813 return false
814 }
815 }
816
817 return true
818}
819
820func (b *Builder) flush() error {
821 todo := b.todo
822 b.todo = nil
823 b.size = 0
824 b.errMu.Lock()
825 defer b.errMu.Unlock()
826 if b.buildError != nil {
827 return b.buildError
828 }
829
830 hasShard := b.nextShardNum > 0
831 if len(todo) == 0 && hasShard {
832 return nil
833 }
834
835 shard := b.nextShardNum
836 b.nextShardNum++
837
838 if b.opts.Parallelism > 1 && b.opts.MemProfile == "" {
839 b.building.Add(1)
840 b.throttle <- 1
841 go func() {
842 done, err := b.buildShard(todo, shard)
843 <-b.throttle
844
845 b.errMu.Lock()
846 defer b.errMu.Unlock()
847 if err != nil && b.buildError == nil {
848 b.buildError = err
849 }
850 if err == nil {
851 b.finishedShards[done.temp] = done.final
852 }
853 b.building.Done()
854 }()
855 } else {
856 // No goroutines when we're not parallel. This
857 // simplifies memory profiling.
858 done, err := b.buildShard(todo, shard)
859 b.buildError = err
860 if err == nil {
861 b.finishedShards[done.temp] = done.final
862 }
863 if b.opts.MemProfile != "" {
864 // drop memory, and profile.
865 todo = nil
866 b.writeMemProfile(b.opts.MemProfile)
867 }
868
869 return b.buildError
870 }
871
872 return nil
873}
874
875var profileNumber int
876
877func (b *Builder) writeMemProfile(name string) {
878 nm := fmt.Sprintf("%s.%d", name, profileNumber)
879 profileNumber++
880 f, err := os.Create(nm)
881 if err != nil {
882 log.Fatal("could not create memory profile: ", err)
883 }
884 runtime.GC() // get up-to-date statistics
885 if err := pprof.WriteHeapProfile(f); err != nil {
886 log.Fatal("could not write memory profile: ", err)
887 }
888 f.Close()
889 log.Printf("wrote mem profile %q", nm)
890}
891
892// map [0,inf) to [0,1) monotonically
893func squashRange(j int) float64 {
894 x := float64(j)
895 return x / (1 + x)
896}
897
898// IsLowPriority takes a file name and makes an educated guess about its priority
899// in search results. A file is considered low priority if it looks like a test,
900// vendored, or generated file.
901//
902// These 'priority' criteria affects how documents are ordered within a shard. It's
903// also used to help guess a file's rank when we're missing ranking information.
904func IsLowPriority(path string, content []byte) bool {
905 return enry.IsTest(path) || enry.IsVendor(path) || enry.IsGenerated(path, content)
906}
907
908type rankedDoc struct {
909 *zoekt.Document
910 rank []float64
911}
912
913// rank returns a vector of scores which is used at index-time to sort documents
914// before writing them to disk. The order of documents in the shard is important
915// at query time, because earlier documents receive a boost at query time and
916// have a higher chance of being searched before limits kick in.
917func rank(d *zoekt.Document, origIdx int) []float64 {
918 skipped := 0.0
919 if d.SkipReason != "" {
920 skipped = 1.0
921 }
922
923 generated := 0.0
924 if enry.IsGenerated(d.Name, d.Content) {
925 generated = 1.0
926 }
927
928 vendor := 0.0
929 if enry.IsVendor(d.Name) {
930 vendor = 1.0
931 }
932
933 test := 0.0
934 if enry.IsTest(d.Name) {
935 test = 1.0
936 }
937
938 // Smaller is earlier (=better).
939 return []float64{
940 // Always place skipped docs last
941 skipped,
942
943 // Prefer docs that are not generated
944 generated,
945
946 // Prefer docs that are not vendored
947 vendor,
948
949 // Prefer docs that are not tests
950 test,
951
952 // With short names
953 squashRange(len(d.Name)),
954
955 // With many symbols
956 1.0 - squashRange(len(d.Symbols)),
957
958 // With short content
959 squashRange(len(d.Content)),
960
961 // That is present is as many branches as possible
962 1.0 - squashRange(len(d.Branches)),
963
964 // Preserve original ordering.
965 squashRange(origIdx),
966 }
967}
968
969func sortDocuments(todo []*zoekt.Document) {
970 rs := make([]rankedDoc, 0, len(todo))
971 for i, t := range todo {
972 rd := rankedDoc{t, rank(t, i)}
973 rs = append(rs, rd)
974 }
975 sort.Slice(rs, func(i, j int) bool {
976 r1 := rs[i].rank
977 r2 := rs[j].rank
978 for i := range r1 {
979 if r1[i] < r2[i] {
980 return true
981 }
982 if r1[i] > r2[i] {
983 return false
984 }
985 }
986
987 return false
988 })
989 for i := range todo {
990 todo[i] = rs[i].Document
991 }
992}
993
994func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) {
995 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
996 err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins)
997 if b.opts.CTagsMustSucceed && err != nil {
998 return nil, err
999 }
1000 if err != nil {
1001 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err)
1002 }
1003 }
1004
1005 name := b.opts.shardName(nextShardNum)
1006
1007 shardBuilder, err := b.newShardBuilder()
1008 if err != nil {
1009 return nil, err
1010 }
1011
1012 sortDocuments(todo)
1013
1014 for _, t := range todo {
1015 if err := shardBuilder.Add(*t); err != nil {
1016 return nil, err
1017 }
1018 }
1019
1020 return b.writeShard(name, shardBuilder)
1021}
1022
1023func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) {
1024 desc := b.opts.RepositoryDescription
1025 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1026 desc.SubRepoMap = b.opts.SubRepositories
1027 desc.IndexOptions = b.opts.GetHash()
1028
1029 shardBuilder, err := zoekt.NewIndexBuilder(&desc)
1030 if err != nil {
1031 return nil, err
1032 }
1033 shardBuilder.IndexTime = b.indexTime
1034 shardBuilder.ID = b.id
1035 return shardBuilder, nil
1036}
1037
1038func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) {
1039 dir := filepath.Dir(fn)
1040 if err := os.MkdirAll(dir, 0o700); err != nil {
1041 return nil, err
1042 }
1043
1044 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1045 if err != nil {
1046 return nil, err
1047 }
1048 if runtime.GOOS != "windows" {
1049 if err := f.Chmod(0o666 &^ umask); err != nil {
1050 return nil, err
1051 }
1052 }
1053
1054 defer f.Close()
1055 if err := ib.Write(f); err != nil {
1056 return nil, err
1057 }
1058 fi, err := f.Stat()
1059 if err != nil {
1060 return nil, err
1061 }
1062 if err := f.Close(); err != nil {
1063 return nil, err
1064 }
1065
1066 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n",
1067 fn,
1068 fi.Size(),
1069 float64(fi.Size())/float64(ib.ContentSize()+1),
1070 ib.NumFiles())
1071
1072 return &finishedShard{f.Name(), fn}, nil
1073}
1074
1075type deltaBranchSetError struct {
1076 shardName string
1077 old, new []zoekt.RepositoryBranch
1078}
1079
1080func (e deltaBranchSetError) Error() string {
1081 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1082}
1083
1084type deltaIndexOptionsMismatchError struct {
1085 shardName string
1086 newOptions HashOptions
1087}
1088
1089func (e *deltaIndexOptionsMismatchError) Error() string {
1090 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1091}
1092
1093// umask holds the Umask of the current process
1094var umask os.FileMode