fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// package build implements a more convenient interface for building
16// zoekt indices.
17package build
18
19import (
20 "crypto/sha1"
21 "flag"
22 "fmt"
23 "io"
24 "log"
25 "net/url"
26 "os"
27 "os/exec"
28 "path"
29 "path/filepath"
30 "reflect"
31 "runtime"
32 "runtime/pprof"
33 "sort"
34 "strconv"
35 "strings"
36 "sync"
37 "time"
38
39 "github.com/bmatcuk/doublestar"
40 "github.com/grafana/regexp"
41 "github.com/rs/xid"
42
43 "github.com/sourcegraph/zoekt"
44 "github.com/sourcegraph/zoekt/ctags"
45)
46
47var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
48
49// Branch describes a single branch version.
50type Branch struct {
51 Name string
52 Version string
53}
54
55// Options sets options for the index building.
56type Options struct {
57 // IndexDir is a directory that holds *.zoekt index files.
58 IndexDir string
59
60 // SizeMax is the maximum file size
61 SizeMax int
62
63 // Parallelism is the maximum number of shards to index in parallel
64 Parallelism int
65
66 // ShardMax sets the maximum corpus size for a single shard
67 ShardMax int
68
69 // TrigramMax sets the maximum number of distinct trigrams per document.
70 TrigramMax int
71
72 // RepositoryDescription holds names and URLs for the repository.
73 RepositoryDescription zoekt.Repository
74
75 // SubRepositories is a path => sub repository map.
76 SubRepositories map[string]*zoekt.Repository
77
78 // DisableCTags disables the generation of ctags metadata.
79 DisableCTags bool
80
81 // CtagsPath is the path to the ctags binary to run, or empty
82 // if a valid binary couldn't be found.
83 CTagsPath string
84
85 // Same as CTagsPath but for scip-ctags
86 ScipCTagsPath string
87
88 // If set, ctags must succeed.
89 CTagsMustSucceed bool
90
91 // Write memory profiles to this file.
92 MemProfile string
93
94 // LargeFiles is a slice of glob patterns, including ** for any number
95 // of directories, where matching file paths should be indexed
96 // regardless of their size. The full pattern syntax is here:
97 // https://github.com/bmatcuk/doublestar/tree/v1#patterns.
98 LargeFiles []string
99
100 // IsDelta is true if this run contains only the changed documents since the
101 // last run.
102 IsDelta bool
103
104 // DocumentRanksPath is the path to the file with document ranks. If empty,
105 // ranks will be computed on-the-fly.
106 DocumentRanksPath string
107
108 // DocumentRanksVersion is a string which when changed will cause us to
109 // reindex a shard. This field is used so that when the contents of
110 // DocumentRanksPath changes, we can reindex.
111 DocumentRanksVersion string
112
113 // changedOrRemovedFiles is a list of file paths that have been changed or removed
114 // since the last indexing job for this repository. These files will be tombstoned
115 // in the older shards for this repository.
116 changedOrRemovedFiles []string
117
118 LanguageMap ctags.LanguageMap
119}
120
121// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building.
122type HashOptions struct {
123 sizeMax int
124 disableCTags bool
125 ctagsPath string
126 cTagsMustSucceed bool
127 largeFiles []string
128
129 // documentRankVersion is an experimental field which will change when the
130 // DocumentRanksPath content changes. If empty we ignore it.
131 documentRankVersion string
132}
133
134func (o *Options) HashOptions() HashOptions {
135 return HashOptions{
136 sizeMax: o.SizeMax,
137 disableCTags: o.DisableCTags,
138 ctagsPath: o.CTagsPath,
139 cTagsMustSucceed: o.CTagsMustSucceed,
140 largeFiles: o.LargeFiles,
141 documentRankVersion: o.DocumentRanksVersion,
142 }
143}
144
145func (o *Options) GetHash() string {
146 h := o.HashOptions()
147 hasher := sha1.New()
148
149 hasher.Write([]byte(h.ctagsPath))
150 hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed)))
151 hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax)))
152 hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles)))
153 hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags)))
154
155 if h.documentRankVersion != "" {
156 hasher.Write([]byte{0})
157 io.WriteString(hasher, h.documentRankVersion)
158 }
159
160 return fmt.Sprintf("%x", hasher.Sum(nil))
161}
162
163type largeFilesFlag struct{ *Options }
164
165func (f largeFilesFlag) String() string {
166 // From flag.Value documentation:
167 //
168 // The flag package may call the String method with a zero-valued receiver,
169 // such as a nil pointer.
170 if f.Options == nil {
171 return ""
172 }
173 s := append([]string{""}, f.LargeFiles...)
174 return strings.Join(s, "-large_file ")
175}
176
177func (f largeFilesFlag) Set(value string) error {
178 f.LargeFiles = append(f.LargeFiles, value)
179 return nil
180}
181
182// Flags adds flags for build options to fs. It is the "inverse" of Args.
183func (o *Options) Flags(fs *flag.FlagSet) {
184 x := *o
185 x.SetDefaults()
186 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
187 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
188 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
189 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
190 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
191 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
192 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
193 fs.StringVar(&o.MemProfile, "memprofile", "", "write memory profile(s) to `file.shardnum`. Note: sets parallelism to 1.")
194
195 // Sourcegraph specific
196 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
197}
198
199// Args generates command line arguments for o. It is the "inverse" of Flags.
200func (o *Options) Args() []string {
201 var args []string
202
203 if o.SizeMax != 0 {
204 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
205 }
206
207 if o.TrigramMax != 0 {
208 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
209 }
210
211 if o.ShardMax != 0 {
212 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
213 }
214
215 if o.Parallelism != 0 {
216 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
217 }
218
219 if o.IndexDir != "" {
220 args = append(args, "-index", o.IndexDir)
221 }
222
223 if o.CTagsMustSucceed {
224 args = append(args, "-require_ctags")
225 }
226
227 for _, a := range o.LargeFiles {
228 args = append(args, "-large_file", a)
229 }
230
231 // Sourcegraph specific
232 if o.DisableCTags {
233 args = append(args, "-disable_ctags")
234 }
235
236 return args
237}
238
239// Builder manages (parallel) creation of uniformly sized shards. The
240// builder buffers up documents until it collects enough documents and
241// then builds a shard and writes.
242type Builder struct {
243 opts Options
244 throttle chan int
245
246 nextShardNum int
247 todo []*zoekt.Document
248 docChecker zoekt.DocChecker
249 size int
250
251 parserBins ctags.ParserBinMap
252 building sync.WaitGroup
253
254 errMu sync.Mutex
255 buildError error
256
257 // temp name => final name for finished shards. We only rename
258 // them once all shards succeed to avoid Frankstein corpuses.
259 finishedShards map[string]string
260
261 // indexTime is set by tests for doing reproducible builds.
262 indexTime time.Time
263
264 // a sortable 20 chars long id.
265 id string
266
267 finishCalled bool
268}
269
270type finishedShard struct {
271 temp, final string
272}
273
274func checkCTags() string {
275 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" {
276 return ctags
277 }
278
279 if ctags, err := exec.LookPath("universal-ctags"); err == nil {
280 return ctags
281 }
282
283 return ""
284}
285
286func checkScipCTags() string {
287 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" {
288 return ctags
289 }
290
291 if ctags, err := exec.LookPath("scip-ctags"); err == nil {
292 return ctags
293 }
294
295 return ""
296}
297
298// SetDefaults sets reasonable default options.
299func (o *Options) SetDefaults() {
300 if o.CTagsPath == "" && !o.DisableCTags {
301 o.CTagsPath = checkCTags()
302 }
303
304 if o.ScipCTagsPath == "" && !o.DisableCTags {
305 o.ScipCTagsPath = checkScipCTags()
306 }
307
308 if o.Parallelism == 0 {
309 o.Parallelism = 4
310 }
311 if o.SizeMax == 0 {
312 o.SizeMax = 2 << 20
313 }
314 if o.ShardMax == 0 {
315 o.ShardMax = 100 << 20
316 }
317 if o.TrigramMax == 0 {
318 o.TrigramMax = 20000
319 }
320
321 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
322 parsed, _ := url.Parse(o.RepositoryDescription.URL)
323 if parsed != nil {
324 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
325 }
326 }
327}
328
329func hashString(s string) string {
330 h := sha1.New()
331 _, _ = io.WriteString(h, s)
332 return fmt.Sprintf("%x", h.Sum(nil))
333}
334
335// ShardName returns the name the given index shard.
336func (o *Options) shardName(n int) string {
337 return o.shardNameVersion(zoekt.IndexFormatVersion, n)
338}
339
340func (o *Options) shardNameVersion(version, n int) string {
341 abs := url.QueryEscape(o.RepositoryDescription.Name)
342 if len(abs) > 200 {
343 abs = abs[:200] + hashString(abs)[:8]
344 }
345 return filepath.Join(o.IndexDir,
346 fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n))
347}
348
349type IndexState string
350
351const (
352 IndexStateMissing IndexState = "missing"
353 IndexStateCorrupt IndexState = "corrupt"
354 IndexStateVersion IndexState = "version-mismatch"
355 IndexStateOption IndexState = "option-mismatch"
356 IndexStateMeta IndexState = "meta-mismatch"
357 IndexStateContent IndexState = "content-mismatch"
358 IndexStateEqual IndexState = "equal"
359)
360
361var readVersions = []struct {
362 IndexFormatVersion int
363 FeatureVersion int
364}{{
365 IndexFormatVersion: zoekt.IndexFormatVersion,
366 FeatureVersion: zoekt.FeatureVersion,
367}, {
368 IndexFormatVersion: zoekt.NextIndexFormatVersion,
369 FeatureVersion: zoekt.FeatureVersion,
370}}
371
372// IncrementalSkipIndexing returns true if the index present on disk matches
373// the build options.
374func (o *Options) IncrementalSkipIndexing() bool {
375 state, _ := o.IndexState()
376 return state == IndexStateEqual
377}
378
379// IndexState checks how the index present on disk compares to the build
380// options and returns the IndexState and the name of the first shard.
381func (o *Options) IndexState() (IndexState, string) {
382 // Open the latest version we support that is on disk.
383 fn := o.findShard()
384 if fn == "" {
385 return IndexStateMissing, fn
386 }
387
388 repos, index, err := zoekt.ReadMetadataPathAlive(fn)
389 if os.IsNotExist(err) {
390 return IndexStateMissing, fn
391 } else if err != nil {
392 return IndexStateCorrupt, fn
393 }
394
395 for _, v := range readVersions {
396 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
397 return IndexStateVersion, fn
398 }
399 }
400
401 var repo *zoekt.Repository
402 for _, cand := range repos {
403 if cand.Name == o.RepositoryDescription.Name {
404 repo = cand
405 break
406 }
407 }
408
409 if repo == nil {
410 return IndexStateCorrupt, fn
411 }
412
413 if repo.IndexOptions != o.GetHash() {
414 return IndexStateOption, fn
415 }
416
417 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
418 return IndexStateContent, fn
419 }
420
421 // We can mutate repo since it lives in the scope of this function call.
422 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
423 // non-nil err means we are trying to update an immutable field =>
424 // reindex content.
425 log.Printf("warn: immutable field changed, requires re-index: %s", err)
426 return IndexStateContent, fn
427 } else if updated {
428 return IndexStateMeta, fn
429 }
430
431 return IndexStateEqual, fn
432}
433
434// FindRepositoryMetadata returns the index metadata for the repository
435// specified in the options. 'ok' is false if the repository's metadata
436// couldn't be found or if an error occurred.
437func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) {
438 shard := o.findShard()
439 if shard == "" {
440 return nil, nil, false, nil
441 }
442
443 repositories, metadata, err := zoekt.ReadMetadataPathAlive(shard)
444 if err != nil {
445 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
446 }
447
448 ID := o.RepositoryDescription.ID
449 for _, r := range repositories {
450 // compound shards contain multiple repositories, so we
451 // have to pick only the one we're looking for
452 if r.ID == ID {
453 return r, metadata, true, nil
454 }
455 }
456
457 // If we're here, then we're somehow in a state where we found a matching
458 // shard that's missing the repository metadata we're looking for. This
459 // should never happen.
460 name := o.RepositoryDescription.Name
461 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
462}
463
464func (o *Options) findShard() string {
465 for _, v := range readVersions {
466 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
467 if _, err := os.Stat(fn); err == nil {
468 return fn
469 }
470 }
471
472 // Brute force finding the shard in compound shards. We should only hit this
473 // code path for repositories that are not already existing or are in
474 // compound shards.
475 //
476 // TODO add an oracle which can speed this up in the case of repositories
477 // already in compound shards.
478 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
479 if err != nil {
480 return ""
481 }
482 for _, fn := range compoundShards {
483 repos, _, err := zoekt.ReadMetadataPathAlive(fn)
484 if err != nil {
485 continue
486 }
487 for _, repo := range repos {
488 if repo.ID == o.RepositoryDescription.ID {
489 return fn
490 }
491 }
492 }
493
494 return ""
495}
496
497func (o *Options) FindAllShards() []string {
498 for _, v := range readVersions {
499 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
500 if _, err := os.Stat(fn); err == nil {
501 shards := []string{fn}
502 for i := 1; ; i++ {
503 fn := o.shardNameVersion(v.IndexFormatVersion, i)
504 if _, err := os.Stat(fn); err != nil {
505 return shards
506 }
507 shards = append(shards, fn)
508 }
509 }
510 }
511
512 // lazily fallback to findShard which will look for a compound shard.
513 if fn := o.findShard(); fn != "" {
514 return []string{fn}
515 }
516
517 return nil
518}
519
520// IgnoreSizeMax determines whether the max size should be ignored.
521func (o *Options) IgnoreSizeMax(name string) bool {
522 // A pattern match will override preceding pattern matches.
523 for i := len(o.LargeFiles) - 1; i >= 0; i-- {
524 pattern := strings.TrimSpace(o.LargeFiles[i])
525 negated, validatedPattern := checkIsNegatePattern(pattern)
526
527 if m, _ := doublestar.PathMatch(validatedPattern, name); m {
528 if negated {
529 return false
530 } else {
531 return true
532 }
533 }
534 }
535
536 return false
537}
538
539func checkIsNegatePattern(pattern string) (bool, string) {
540 negate := "!"
541
542 // if negated then strip prefix meta character which identifies negated filter pattern
543 if strings.HasPrefix(pattern, negate) {
544 return true, pattern[len(negate):]
545 }
546
547 return false, pattern
548}
549
550// NewBuilder creates a new Builder instance.
551func NewBuilder(opts Options) (*Builder, error) {
552 opts.SetDefaults()
553 if opts.RepositoryDescription.Name == "" {
554 return nil, fmt.Errorf("builder: must set Name")
555 }
556
557 b := &Builder{
558 opts: opts,
559 throttle: make(chan int, opts.Parallelism),
560 finishedShards: map[string]string{},
561 }
562
563 parserBins, err := ctags.NewParserBinMap(
564 b.opts.CTagsPath,
565 b.opts.ScipCTagsPath,
566 opts.LanguageMap,
567 b.opts.CTagsMustSucceed,
568 )
569 if err != nil {
570 return nil, err
571 }
572
573 b.parserBins = parserBins
574
575 if opts.IsDelta {
576 // Delta shards build on top of previously existing shards.
577 // As a consequence, the shardNum for delta shards starts from
578 // the number following the most recently generated shard - not 0.
579 //
580 // Using this numbering scheme allows all the shards to be
581 // discovered as a set.
582 shards := b.opts.FindAllShards()
583 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
584 }
585
586 if _, err := b.newShardBuilder(); err != nil {
587 return nil, err
588 }
589
590 now := time.Now()
591 b.indexTime = now
592 b.id = xid.NewWithTime(now).String()
593
594 return b, nil
595}
596
597// AddFile is a convenience wrapper for the Add method
598func (b *Builder) AddFile(name string, content []byte) error {
599 return b.Add(zoekt.Document{Name: name, Content: content})
600}
601
602func (b *Builder) Add(doc zoekt.Document) error {
603 if b.finishCalled {
604 return nil
605 }
606
607 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
608 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
609 // We could pass the document on to the shardbuilder, but if
610 // we pass through a part of the source tree with binary/large
611 // files, the corresponding shard would be mostly empty, so
612 // insert a reason here too.
613 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
614 } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
615 doc.SkipReason = err.Error()
616 doc.Language = "binary"
617 }
618
619 b.todo = append(b.todo, &doc)
620
621 if doc.SkipReason == "" {
622 b.size += len(doc.Name) + len(doc.Content)
623 } else {
624 b.size += len(doc.Name) + len(doc.SkipReason)
625 // Drop the content if we are skipping the document. Skipped content is not counted towards the
626 // shard size limit, so otherwise we might buffer too much data in memory before flushing.
627 doc.Content = nil
628 }
629
630 if b.size > b.opts.ShardMax {
631 return b.flush()
632 }
633
634 return nil
635}
636
637// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
638// has been changed or removed since the last indexing job for this repository.
639//
640// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
641func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
642 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
643}
644
645// Finish creates a last shard from the buffered documents, and clears
646// stale shards from previous runs. This should always be called, also
647// in failure cases, to ensure cleanup.
648//
649// It is safe to call Finish() multiple times.
650func (b *Builder) Finish() error {
651 if b.finishCalled {
652 return b.buildError
653 }
654
655 b.finishCalled = true
656
657 b.flush()
658 b.building.Wait()
659
660 if b.buildError != nil {
661 for tmp := range b.finishedShards {
662 log.Printf("Builder.Finish %s", tmp)
663 os.Remove(tmp)
664 }
665 b.finishedShards = map[string]string{}
666 return b.buildError
667 }
668
669 // map of temporary -> final names for all updated shards + shard metadata files
670 artifactPaths := make(map[string]string)
671 for tmp, final := range b.finishedShards {
672 artifactPaths[tmp] = final
673 }
674
675 oldShards := b.opts.FindAllShards()
676
677 if b.opts.IsDelta {
678 // Delta shard builds need to update FileTombstone and branch commit information for all
679 // existing shards
680 for _, shard := range oldShards {
681 repositories, _, err := zoekt.ReadMetadataPathAlive(shard)
682 if err != nil {
683 return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
684 }
685
686 if len(repositories) > 1 {
687 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
688 }
689
690 if len(repositories) == 0 {
691 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
692 }
693
694 repository := repositories[0]
695 if repository.ID != b.opts.RepositoryDescription.ID {
696 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
697 }
698
699 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
700 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
701 }
702
703 for _, f := range b.opts.changedOrRemovedFiles {
704 repository.FileTombstones[f] = struct{}{}
705 }
706
707 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
708 return deltaBranchSetError{
709 shardName: shard,
710 old: repository.Branches,
711 new: b.opts.RepositoryDescription.Branches,
712 }
713 }
714
715 if b.opts.GetHash() != repository.IndexOptions {
716 return &deltaIndexOptionsMismatchError{
717 shardName: shard,
718 newOptions: b.opts.HashOptions(),
719 }
720 }
721
722 repository.Branches = b.opts.RepositoryDescription.Branches
723
724 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
725
726 tempPath, finalPath, err := zoekt.JsonMarshalRepoMetaTemp(shard, repository)
727 if err != nil {
728 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
729 }
730
731 artifactPaths[tempPath] = finalPath
732 }
733 }
734
735 // We mark finished shards as empty when we successfully finish. Return now
736 // to allow call sites to call Finish idempotently.
737 if len(artifactPaths) == 0 {
738 return b.buildError
739 }
740
741 // Collect a map of the old shards on disk. For each new shard we replace we
742 // delete it from toDelete. Anything remaining in toDelete will be removed
743 // after we have renamed everything into place.
744
745 var toDelete map[string]struct{}
746 if !b.opts.IsDelta {
747 // Non-delta shard builds delete all existing shards before they write out
748 // new ones.
749 // By contrast, delta shard builds work by stacking changes on top of existing shards.
750 // So, we skip populating the toDelete map if we're building delta shards.
751
752 toDelete = make(map[string]struct{})
753 for _, name := range oldShards {
754 paths, err := zoekt.IndexFilePaths(name)
755 if err != nil {
756 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
757 }
758 for _, p := range paths {
759 toDelete[p] = struct{}{}
760 }
761 }
762 }
763
764 for tmp, final := range artifactPaths {
765 if err := os.Rename(tmp, final); err != nil {
766 b.buildError = err
767 continue
768 }
769
770 delete(toDelete, final)
771 }
772
773 b.finishedShards = map[string]string{}
774
775 for p := range toDelete {
776 // Don't delete compound shards, set tombstones instead.
777 if zoekt.ShardMergingEnabled() && strings.HasPrefix(filepath.Base(p), "compound-") {
778 if !strings.HasSuffix(p, ".zoekt") {
779 continue
780 }
781 err := zoekt.SetTombstone(p, b.opts.RepositoryDescription.ID)
782 b.buildError = err
783 continue
784 }
785 log.Printf("removing old shard file: %s", p)
786 if err := os.Remove(p); err != nil {
787 b.buildError = err
788 }
789 }
790
791 return b.buildError
792}
793
794// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
795// iff both slices specify the same set of branch names in the same order.
796func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
797 if len(a) != len(b) {
798 return false
799 }
800
801 for i := range a {
802 x, y := a[i], b[i]
803 if x.Name != y.Name {
804 return false
805 }
806 }
807
808 return true
809}
810
811func (b *Builder) flush() error {
812 todo := b.todo
813 b.todo = nil
814 b.size = 0
815 b.errMu.Lock()
816 defer b.errMu.Unlock()
817 if b.buildError != nil {
818 return b.buildError
819 }
820
821 hasShard := b.nextShardNum > 0
822 if len(todo) == 0 && hasShard {
823 return nil
824 }
825
826 shard := b.nextShardNum
827 b.nextShardNum++
828
829 if b.opts.Parallelism > 1 && b.opts.MemProfile == "" {
830 b.building.Add(1)
831 b.throttle <- 1
832 go func() {
833 done, err := b.buildShard(todo, shard)
834 <-b.throttle
835
836 b.errMu.Lock()
837 defer b.errMu.Unlock()
838 if err != nil && b.buildError == nil {
839 b.buildError = err
840 }
841 if err == nil {
842 b.finishedShards[done.temp] = done.final
843 }
844 b.building.Done()
845 }()
846 } else {
847 // No goroutines when we're not parallel. This
848 // simplifies memory profiling.
849 done, err := b.buildShard(todo, shard)
850 b.buildError = err
851 if err == nil {
852 b.finishedShards[done.temp] = done.final
853 }
854 if b.opts.MemProfile != "" {
855 // drop memory, and profile.
856 todo = nil
857 b.writeMemProfile(b.opts.MemProfile)
858 }
859
860 return b.buildError
861 }
862
863 return nil
864}
865
866var profileNumber int
867
868func (b *Builder) writeMemProfile(name string) {
869 nm := fmt.Sprintf("%s.%d", name, profileNumber)
870 profileNumber++
871 f, err := os.Create(nm)
872 if err != nil {
873 log.Fatal("could not create memory profile: ", err)
874 }
875 runtime.GC() // get up-to-date statistics
876 if err := pprof.WriteHeapProfile(f); err != nil {
877 log.Fatal("could not write memory profile: ", err)
878 }
879 f.Close()
880 log.Printf("wrote mem profile %q", nm)
881}
882
883// map [0,inf) to [0,1) monotonically
884func squashRange(j int) float64 {
885 x := float64(j)
886 return x / (1 + x)
887}
888
889// IsLowPriority takes a file name and makes an educated guess about its priority
890// in search results. A file is considered low priority if it looks like a test,
891// vendored, or generated file.
892//
893// These 'priority' criteria affects how documents are ordered within a shard. It's
894// also used to help guess a file's rank when we're missing ranking information.
895func IsLowPriority(file string) bool {
896 return testRe.MatchString(file) || isGenerated(file) || isVendored(file)
897}
898
899var testRe = regexp.MustCompile("[Tt]est")
900
901func isGenerated(file string) bool {
902 return strings.HasSuffix(file, "min.js") || strings.HasSuffix(file, "js.map")
903}
904
905func isVendored(file string) bool {
906 return strings.Contains(file, "vendor/") || strings.Contains(file, "node_modules/")
907}
908
909type rankedDoc struct {
910 *zoekt.Document
911 rank []float64
912}
913
914// rank returns a vector of scores which is used at index-time to sort documents
915// before writing them to disk. The order of documents in the shard is important
916// at query time, because earlier documents receive a boost at query time and
917// have a higher chance of being searched before limits kick in.
918func rank(d *zoekt.Document, origIdx int) []float64 {
919 skipped := 0.0
920 if d.SkipReason != "" {
921 skipped = 1.0
922 }
923
924 generated := 0.0
925 if isGenerated(d.Name) {
926 generated = 1.0
927 }
928
929 vendor := 0.0
930 if isVendored(d.Name) {
931 vendor = 1.0
932 }
933
934 test := 0.0
935 if testRe.MatchString(d.Name) {
936 test = 1.0
937 }
938
939 // Smaller is earlier (=better).
940 return []float64{
941 // Always place skipped docs last
942 skipped,
943
944 // Prefer docs that are not generated
945 generated,
946
947 // Prefer docs that are not vendored
948 vendor,
949
950 // Prefer docs that are not tests
951 test,
952
953 // With short names
954 squashRange(len(d.Name)),
955
956 // With many symbols
957 1.0 - squashRange(len(d.Symbols)),
958
959 // With short content
960 squashRange(len(d.Content)),
961
962 // That is present is as many branches as possible
963 1.0 - squashRange(len(d.Branches)),
964
965 // Preserve original ordering.
966 squashRange(origIdx),
967 }
968}
969
970func sortDocuments(todo []*zoekt.Document) {
971 rs := make([]rankedDoc, 0, len(todo))
972 for i, t := range todo {
973 rd := rankedDoc{t, rank(t, i)}
974 rs = append(rs, rd)
975 }
976 sort.Slice(rs, func(i, j int) bool {
977 r1 := rs[i].rank
978 r2 := rs[j].rank
979 for i := range r1 {
980 if r1[i] < r2[i] {
981 return true
982 }
983 if r1[i] > r2[i] {
984 return false
985 }
986 }
987
988 return false
989 })
990 for i := range todo {
991 todo[i] = rs[i].Document
992 }
993}
994
995func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) {
996 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
997 err := parseSymbols(todo, b.opts.LanguageMap, b.parserBins)
998 if b.opts.CTagsMustSucceed && err != nil {
999 return nil, err
1000 }
1001 if err != nil {
1002 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err)
1003 }
1004 }
1005
1006 name := b.opts.shardName(nextShardNum)
1007
1008 shardBuilder, err := b.newShardBuilder()
1009 if err != nil {
1010 return nil, err
1011 }
1012
1013 sortDocuments(todo)
1014
1015 for _, t := range todo {
1016 if err := shardBuilder.Add(*t); err != nil {
1017 return nil, err
1018 }
1019 }
1020
1021 return b.writeShard(name, shardBuilder)
1022}
1023
1024func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) {
1025 desc := b.opts.RepositoryDescription
1026 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1027 desc.SubRepoMap = b.opts.SubRepositories
1028 desc.IndexOptions = b.opts.GetHash()
1029
1030 shardBuilder, err := zoekt.NewIndexBuilder(&desc)
1031 if err != nil {
1032 return nil, err
1033 }
1034 shardBuilder.IndexTime = b.indexTime
1035 shardBuilder.ID = b.id
1036 return shardBuilder, nil
1037}
1038
1039func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) {
1040 dir := filepath.Dir(fn)
1041 if err := os.MkdirAll(dir, 0o700); err != nil {
1042 return nil, err
1043 }
1044
1045 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1046 if err != nil {
1047 return nil, err
1048 }
1049 if runtime.GOOS != "windows" {
1050 if err := f.Chmod(0o666 &^ umask); err != nil {
1051 return nil, err
1052 }
1053 }
1054
1055 defer f.Close()
1056 if err := ib.Write(f); err != nil {
1057 return nil, err
1058 }
1059 fi, err := f.Stat()
1060 if err != nil {
1061 return nil, err
1062 }
1063 if err := f.Close(); err != nil {
1064 return nil, err
1065 }
1066
1067 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n",
1068 fn,
1069 fi.Size(),
1070 float64(fi.Size())/float64(ib.ContentSize()+1),
1071 ib.NumFiles())
1072
1073 return &finishedShard{f.Name(), fn}, nil
1074}
1075
1076type deltaBranchSetError struct {
1077 shardName string
1078 old, new []zoekt.RepositoryBranch
1079}
1080
1081func (e deltaBranchSetError) Error() string {
1082 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1083}
1084
1085type deltaIndexOptionsMismatchError struct {
1086 shardName string
1087 newOptions HashOptions
1088}
1089
1090func (e *deltaIndexOptionsMismatchError) Error() string {
1091 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1092}
1093
1094// umask holds the Umask of the current process
1095var umask os.FileMode