fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
// Package build implements a more convenient interface for building
// zoekt indices.
17package build
18
19import (
20 "crypto/sha1"
21 "flag"
22 "fmt"
23 "io"
24 "log"
25 "math"
26 "net/url"
27 "os"
28 "os/exec"
29 "path"
30 "path/filepath"
31 "reflect"
32 "runtime"
33 "runtime/pprof"
34 "sort"
35 "strconv"
36 "strings"
37 "sync"
38 "time"
39
40 "github.com/bmatcuk/doublestar"
41 "github.com/grafana/regexp"
42 "github.com/rs/xid"
43 "gopkg.in/natefinch/lumberjack.v2"
44
45 "github.com/sourcegraph/zoekt"
46 "github.com/sourcegraph/zoekt/ctags"
47)
48
// DefaultDir is the ".zoekt" directory inside the directory named by the
// HOME environment variable, evaluated once at package initialization.
// NOTE(review): nothing in this file reads it; presumably callers use it as
// the default value for Options.IndexDir — confirm.
var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
50
// Branch describes a single branch version.
type Branch struct {
	// Name is the name of the branch.
	Name string
	// Version identifies the state of the branch.
	// NOTE(review): presumably a commit hash — confirm with callers.
	Version string
}
56
// Options sets options for the index building.
type Options struct {
	// IndexDir is a directory that holds *.zoekt index files.
	IndexDir string

	// SizeMax is the maximum file size. Builder.Add compares it against the
	// byte length of a document's content. SetDefaults replaces 0 with 2 MiB.
	SizeMax int

	// Parallelism is the maximum number of shards to index in parallel.
	// SetDefaults replaces 0 with 4.
	Parallelism int

	// ShardMax sets the maximum corpus size for a single shard.
	// SetDefaults replaces 0 with 100 MiB.
	ShardMax int

	// TrigramMax sets the maximum number of distinct trigrams per document.
	// SetDefaults replaces 0 with 20000.
	TrigramMax int

	// RepositoryDescription holds names and URLs for the repository.
	RepositoryDescription zoekt.Repository

	// SubRepositories is a path => sub repository map.
	SubRepositories map[string]*zoekt.Repository

	// DisableCTags disables the generation of ctags metadata.
	DisableCTags bool

	// CTagsPath is the path to the ctags binary to run, or empty
	// if a valid binary couldn't be found.
	CTagsPath string

	// ScipCTagsPath is the same as CTagsPath but for scip-ctags.
	ScipCTagsPath string

	// If set, ctags must succeed.
	CTagsMustSucceed bool

	// Write memory profiles to this file. When set, shards are built
	// sequentially (see Builder.flush).
	MemProfile string

	// LargeFiles is a slice of glob patterns, including ** for any number
	// of directories, where matching file paths should be indexed
	// regardless of their size. A pattern prefixed with "!" negates the
	// match, and later patterns override earlier ones (see IgnoreSizeMax).
	// The full pattern syntax is here:
	// https://github.com/bmatcuk/doublestar/tree/v1#patterns.
	LargeFiles []string

	// IsDelta is true if this run contains only the changed documents since the
	// last run.
	IsDelta bool

	// DocumentRanksPath is the path to the file with document ranks. If empty,
	// ranks will be computed on-the-fly.
	DocumentRanksPath string

	// DocumentRanksVersion is a string which when changed will cause us to
	// reindex a shard. This field is used so that when the contents of
	// DocumentRanksPath changes, we can reindex.
	DocumentRanksVersion string

	// changedOrRemovedFiles is a list of file paths that have been changed or removed
	// since the last indexing job for this repository. These files will be tombstoned
	// in the older shards for this repository.
	changedOrRemovedFiles []string

	// LanguageMap is handed to ctagsAddSymbolsParserMap together with the
	// builder's parser map when symbols are generated for a shard.
	// NOTE(review): presumably selects which ctags implementation handles
	// each language — confirm against the ctags package.
	LanguageMap ctags.LanguageMap
}
122
// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building.
// Its fields are copied from Options by (*Options).HashOptions and hashed
// by (*Options).GetHash.
type HashOptions struct {
	// sizeMax mirrors Options.SizeMax.
	sizeMax int
	// disableCTags mirrors Options.DisableCTags.
	disableCTags bool
	// ctagsPath mirrors Options.CTagsPath.
	ctagsPath string
	// cTagsMustSucceed mirrors Options.CTagsMustSucceed.
	cTagsMustSucceed bool
	// largeFiles mirrors Options.LargeFiles (the slice is shared, not copied).
	largeFiles []string

	// documentRankVersion is an experimental field which will change when the
	// DocumentRanksPath content changes. If empty we ignore it.
	documentRankVersion string
}
135
136func (o *Options) HashOptions() HashOptions {
137 return HashOptions{
138 sizeMax: o.SizeMax,
139 disableCTags: o.DisableCTags,
140 ctagsPath: o.CTagsPath,
141 cTagsMustSucceed: o.CTagsMustSucceed,
142 largeFiles: o.LargeFiles,
143 documentRankVersion: o.DocumentRanksVersion,
144 }
145}
146
147func (o *Options) GetHash() string {
148 h := o.HashOptions()
149 hasher := sha1.New()
150
151 hasher.Write([]byte(h.ctagsPath))
152 hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed)))
153 hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax)))
154 hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles)))
155 hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags)))
156
157 if h.documentRankVersion != "" {
158 hasher.Write([]byte{0})
159 io.WriteString(hasher, h.documentRankVersion)
160 }
161
162 return fmt.Sprintf("%x", hasher.Sum(nil))
163}
164
// largeFilesFlag adapts an *Options so that its LargeFiles slice can be
// registered with fs.Var in Flags; it provides the String and Set methods
// for that purpose.
type largeFilesFlag struct{ *Options }
166
167func (f largeFilesFlag) String() string {
168 // From flag.Value documentation:
169 //
170 // The flag package may call the String method with a zero-valued receiver,
171 // such as a nil pointer.
172 if f.Options == nil {
173 return ""
174 }
175 s := append([]string{""}, f.LargeFiles...)
176 return strings.Join(s, "-large_file ")
177}
178
179func (f largeFilesFlag) Set(value string) error {
180 f.LargeFiles = append(f.LargeFiles, value)
181 return nil
182}
183
184// Flags adds flags for build options to fs. It is the "inverse" of Args.
185func (o *Options) Flags(fs *flag.FlagSet) {
186 x := *o
187 x.SetDefaults()
188 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
189 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
190 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
191 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
192 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
193 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
194 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
195 fs.StringVar(&o.MemProfile, "memprofile", "", "write memory profile(s) to `file.shardnum`. Note: sets parallelism to 1.")
196
197 // Sourcegraph specific
198 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
199}
200
201// Args generates command line arguments for o. It is the "inverse" of Flags.
202func (o *Options) Args() []string {
203 var args []string
204
205 if o.SizeMax != 0 {
206 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
207 }
208
209 if o.TrigramMax != 0 {
210 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
211 }
212
213 if o.ShardMax != 0 {
214 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
215 }
216
217 if o.Parallelism != 0 {
218 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
219 }
220
221 if o.IndexDir != "" {
222 args = append(args, "-index", o.IndexDir)
223 }
224
225 if o.CTagsMustSucceed {
226 args = append(args, "-require_ctags")
227 }
228
229 for _, a := range o.LargeFiles {
230 args = append(args, "-large_file", a)
231 }
232
233 // Sourcegraph specific
234 if o.DisableCTags {
235 args = append(args, "-disable_ctags")
236 }
237
238 return args
239}
240
// Builder manages (parallel) creation of uniformly sized shards. The
// builder buffers up documents until it collects enough documents and
// then builds a shard and writes.
type Builder struct {
	// opts holds the build options after SetDefaults has been applied.
	opts Options
	// throttle is a channel with capacity opts.Parallelism; flush's
	// goroutines send to it before building a shard and receive afterwards,
	// which bounds the number of concurrent buildShard calls.
	throttle chan int

	// nextShardNum is the number assigned to the next shard flush builds.
	nextShardNum int
	// todo buffers the documents added since the last flush.
	todo []*zoekt.Document
	// size is the accumulated size of the documents in todo, as counted by Add.
	size int

	// parserMap is passed to ctagsAddSymbolsParserMap when building a shard.
	parserMap ctags.ParserMap

	// building tracks the shard-building goroutines started by flush.
	building sync.WaitGroup

	// errMu is held by flush and its goroutines while they access
	// buildError and finishedShards.
	errMu sync.Mutex
	// buildError is the error reported by Finish.
	buildError error

	// temp name => final name for finished shards. We only rename
	// them once all shards succeed to avoid Frankenstein corpuses.
	finishedShards map[string]string

	// shardLogger receives one tab-separated line per shard action
	// (see shardLog).
	shardLogger io.WriteCloser

	// indexTime is set by tests for doing reproducible builds.
	indexTime time.Time

	// a sortable 20 chars long id.
	id string

	// finishCalled is set by Finish; afterwards Add ignores documents and
	// Finish only returns buildError.
	finishCalled bool
}
273
// finishedShard is the result of writeShard: temp is the path of the
// temporary file that was written and final is the path it should be renamed
// to once the whole build has succeeded.
type finishedShard struct {
	temp, final string
}
277
// checkCTags locates the universal-ctags binary. A non-empty CTAGS_COMMAND
// environment variable wins; otherwise "universal-ctags" is looked up on the
// PATH. It returns "" when neither yields a result.
func checkCTags() string {
	if fromEnv := os.Getenv("CTAGS_COMMAND"); fromEnv != "" {
		return fromEnv
	}

	found, err := exec.LookPath("universal-ctags")
	if err != nil {
		return ""
	}
	return found
}
289
// checkScipCTags locates the scip-ctags binary. A non-empty
// SCIP_CTAGS_COMMAND environment variable wins; otherwise "scip-ctags" is
// looked up on the PATH. It returns "" when neither yields a result.
func checkScipCTags() string {
	if fromEnv := os.Getenv("SCIP_CTAGS_COMMAND"); fromEnv != "" {
		return fromEnv
	}

	found, err := exec.LookPath("scip-ctags")
	if err != nil {
		return ""
	}
	return found
}
301
302// SetDefaults sets reasonable default options.
303func (o *Options) SetDefaults() {
304 if o.CTagsPath == "" && !o.DisableCTags {
305 o.CTagsPath = checkCTags()
306 }
307
308 if o.ScipCTagsPath == "" && !o.DisableCTags {
309 o.ScipCTagsPath = checkScipCTags()
310 }
311
312 if o.Parallelism == 0 {
313 o.Parallelism = 4
314 }
315 if o.SizeMax == 0 {
316 o.SizeMax = 2 << 20
317 }
318 if o.ShardMax == 0 {
319 o.ShardMax = 100 << 20
320 }
321 if o.TrigramMax == 0 {
322 o.TrigramMax = 20000
323 }
324
325 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
326 parsed, _ := url.Parse(o.RepositoryDescription.URL)
327 if parsed != nil {
328 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
329 }
330 }
331}
332
// hashString returns the SHA-1 digest of s as a lowercase hex string.
func hashString(s string) string {
	sum := sha1.Sum([]byte(s))
	return fmt.Sprintf("%x", sum)
}
338
339// ShardName returns the name the given index shard.
340func (o *Options) shardName(n int) string {
341 return o.shardNameVersion(zoekt.IndexFormatVersion, n)
342}
343
344func (o *Options) shardNameVersion(version, n int) string {
345 abs := url.QueryEscape(o.RepositoryDescription.Name)
346 if len(abs) > 200 {
347 abs = abs[:200] + hashString(abs)[:8]
348 }
349 return filepath.Join(o.IndexDir,
350 fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n))
351}
352
// IndexState describes how the index on disk compares to the build options,
// as determined by (*Options).IndexState.
type IndexState string

const (
	// IndexStateMissing: no shard was found, or its metadata does not exist.
	IndexStateMissing IndexState = "missing"
	// IndexStateCorrupt: the metadata could not be read, or the shard does
	// not contain the repository named in the options.
	IndexStateCorrupt IndexState = "corrupt"
	// IndexStateVersion: the shard's feature version differs from the one
	// expected for its index format version.
	IndexStateVersion IndexState = "version-mismatch"
	// IndexStateOption: the stored index-options hash differs from GetHash.
	IndexStateOption IndexState = "option-mismatch"
	// IndexStateMeta: only mutable repository metadata differs.
	IndexStateMeta IndexState = "meta-mismatch"
	// IndexStateContent: the branches differ, or an immutable metadata field
	// changed.
	IndexStateContent IndexState = "content-mismatch"
	// IndexStateEqual: the index on disk matches the build options.
	IndexStateEqual IndexState = "equal"
)
364
// readVersions lists the index format versions, each paired with a feature
// version, that this package looks for on disk. findShard and FindAllShards
// probe shard names for each IndexFormatVersion in this order, and IndexState
// reports a version mismatch when a shard with a listed format version
// carries a different feature version.
var readVersions = []struct {
	IndexFormatVersion int
	FeatureVersion     int
}{{
	IndexFormatVersion: zoekt.IndexFormatVersion,
	FeatureVersion:     zoekt.FeatureVersion,
}, {
	IndexFormatVersion: zoekt.NextIndexFormatVersion,
	FeatureVersion:     zoekt.FeatureVersion,
}}
375
376// IncrementalSkipIndexing returns true if the index present on disk matches
377// the build options.
378func (o *Options) IncrementalSkipIndexing() bool {
379 state, _ := o.IndexState()
380 return state == IndexStateEqual
381}
382
383// IndexState checks how the index present on disk compares to the build
384// options and returns the IndexState and the name of the first shard.
385func (o *Options) IndexState() (IndexState, string) {
386 // Open the latest version we support that is on disk.
387 fn := o.findShard()
388 if fn == "" {
389 return IndexStateMissing, fn
390 }
391
392 repos, index, err := zoekt.ReadMetadataPathAlive(fn)
393 if os.IsNotExist(err) {
394 return IndexStateMissing, fn
395 } else if err != nil {
396 return IndexStateCorrupt, fn
397 }
398
399 for _, v := range readVersions {
400 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
401 return IndexStateVersion, fn
402 }
403 }
404
405 var repo *zoekt.Repository
406 for _, cand := range repos {
407 if cand.Name == o.RepositoryDescription.Name {
408 repo = cand
409 break
410 }
411 }
412
413 if repo == nil {
414 return IndexStateCorrupt, fn
415 }
416
417 if repo.IndexOptions != o.GetHash() {
418 return IndexStateOption, fn
419 }
420
421 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
422 return IndexStateContent, fn
423 }
424
425 // We can mutate repo since it lives in the scope of this function call.
426 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
427 // non-nil err means we are trying to update an immutable field =>
428 // reindex content.
429 log.Printf("warn: immutable field changed, requires re-index: %s", err)
430 return IndexStateContent, fn
431 } else if updated {
432 return IndexStateMeta, fn
433 }
434
435 return IndexStateEqual, fn
436}
437
438// FindRepositoryMetadata returns the index metadata for the repository
439// specified in the options. 'ok' is false if the repository's metadata
440// couldn't be found or if an error occurred.
441func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) {
442 shard := o.findShard()
443 if shard == "" {
444 return nil, nil, false, nil
445 }
446
447 repositories, metadata, err := zoekt.ReadMetadataPathAlive(shard)
448 if err != nil {
449 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
450 }
451
452 ID := o.RepositoryDescription.ID
453 for _, r := range repositories {
454 // compound shards contain multiple repositories, so we
455 // have to pick only the one we're looking for
456 if r.ID == ID {
457 return r, metadata, true, nil
458 }
459 }
460
461 // If we're here, then we're somehow in a state where we found a matching
462 // shard that's missing the repository metadata we're looking for. This
463 // should never happen.
464 name := o.RepositoryDescription.Name
465 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
466}
467
468func (o *Options) findShard() string {
469 for _, v := range readVersions {
470 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
471 if _, err := os.Stat(fn); err == nil {
472 return fn
473 }
474 }
475
476 // Brute force finding the shard in compound shards. We should only hit this
477 // code path for repositories that are not already existing or are in
478 // compound shards.
479 //
480 // TODO add an oracle which can speed this up in the case of repositories
481 // already in compound shards.
482 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
483 if err != nil {
484 return ""
485 }
486 for _, fn := range compoundShards {
487 repos, _, err := zoekt.ReadMetadataPathAlive(fn)
488 if err != nil {
489 continue
490 }
491 for _, repo := range repos {
492 if repo.ID == o.RepositoryDescription.ID {
493 return fn
494 }
495 }
496 }
497
498 return ""
499}
500
501func (o *Options) FindAllShards() []string {
502 for _, v := range readVersions {
503 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
504 if _, err := os.Stat(fn); err == nil {
505 shards := []string{fn}
506 for i := 1; ; i++ {
507 fn := o.shardNameVersion(v.IndexFormatVersion, i)
508 if _, err := os.Stat(fn); err != nil {
509 return shards
510 }
511 shards = append(shards, fn)
512 }
513 }
514 }
515
516 // lazily fallback to findShard which will look for a compound shard.
517 if fn := o.findShard(); fn != "" {
518 return []string{fn}
519 }
520
521 return nil
522}
523
524// IgnoreSizeMax determines whether the max size should be ignored.
525func (o *Options) IgnoreSizeMax(name string) bool {
526 // A pattern match will override preceding pattern matches.
527 for i := len(o.LargeFiles) - 1; i >= 0; i-- {
528 pattern := strings.TrimSpace(o.LargeFiles[i])
529 negated, validatedPattern := checkIsNegatePattern(pattern)
530
531 if m, _ := doublestar.PathMatch(validatedPattern, name); m {
532 if negated {
533 return false
534 } else {
535 return true
536 }
537 }
538 }
539
540 return false
541}
542
// checkIsNegatePattern reports whether pattern starts with the negation
// meta character "!" and returns the pattern with a single leading "!"
// removed. A pattern without that prefix is returned unchanged.
func checkIsNegatePattern(pattern string) (bool, string) {
	const negationPrefix = "!"

	// TrimPrefix strips at most one occurrence, so a change in length tells
	// us the prefix was present.
	stripped := strings.TrimPrefix(pattern, negationPrefix)
	return len(stripped) != len(pattern), stripped
}
553
554// NewBuilder creates a new Builder instance.
555func NewBuilder(opts Options) (*Builder, error) {
556 opts.SetDefaults()
557 if opts.RepositoryDescription.Name == "" {
558 return nil, fmt.Errorf("builder: must set Name")
559 }
560
561 b := &Builder{
562 opts: opts,
563 throttle: make(chan int, opts.Parallelism),
564 finishedShards: map[string]string{},
565 }
566
567 if b.opts.CTagsPath == "" && b.opts.CTagsMustSucceed {
568 return nil, fmt.Errorf("ctags binary not found, but CTagsMustSucceed set")
569 }
570
571 parserMap, err := ctags.NewParserMap(ctags.ParserBinMap{
572 ctags.UniversalCTags: b.opts.CTagsPath,
573 ctags.ScipCTags: b.opts.ScipCTagsPath,
574 }, b.opts.CTagsMustSucceed)
575
576 if err != nil {
577 return nil, err
578 }
579
580 b.parserMap = parserMap
581
582 b.shardLogger = &lumberjack.Logger{
583 Filename: filepath.Join(opts.IndexDir, "zoekt-builder-shard-log.tsv"),
584 MaxSize: 100, // Megabyte
585 MaxBackups: 5,
586 }
587
588 if opts.IsDelta {
589 // Delta shards build on top of previously existing shards.
590 // As a consequence, the shardNum for delta shards starts from
591 // the number following the most recently generated shard - not 0.
592 //
593 // Using this numbering scheme allows all the shards to be
594 // discovered as a set.
595 shards := b.opts.FindAllShards()
596 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
597 }
598
599 if _, err := b.newShardBuilder(); err != nil {
600 return nil, err
601 }
602
603 now := time.Now()
604 b.indexTime = now
605 b.id = xid.NewWithTime(now).String()
606
607 return b, nil
608}
609
610// AddFile is a convenience wrapper for the Add method
611func (b *Builder) AddFile(name string, content []byte) error {
612 return b.Add(zoekt.Document{Name: name, Content: content})
613}
614
615func (b *Builder) Add(doc zoekt.Document) error {
616 if b.finishCalled {
617 return nil
618 }
619
620 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
621
622 // Adjust trigramMax for allowed large files so we don't exclude them.
623 trigramMax := b.opts.TrigramMax
624 if allowLargeFile {
625 trigramMax = math.MaxInt64
626 }
627
628 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
629 // We could pass the document on to the shardbuilder, but if
630 // we pass through a part of the source tree with binary/large
631 // files, the corresponding shard would be mostly empty, so
632 // insert a reason here too.
633 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
634 } else if err := zoekt.CheckText(doc.Content, trigramMax); err != nil {
635 doc.SkipReason = err.Error()
636 doc.Language = "binary"
637 }
638
639 b.todo = append(b.todo, &doc)
640
641 if doc.SkipReason == "" {
642 b.size += len(doc.Name) + len(doc.Content)
643 } else {
644 b.size += len(doc.Name) + len(doc.SkipReason)
645 }
646
647 if b.size > b.opts.ShardMax {
648 return b.flush()
649 }
650
651 return nil
652}
653
654// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
655// has been changed or removed since the last indexing job for this repository.
656//
657// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
658func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
659 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
660}
661
662// Finish creates a last shard from the buffered documents, and clears
663// stale shards from previous runs. This should always be called, also
664// in failure cases, to ensure cleanup.
665//
666// It is safe to call Finish() multiple times.
667func (b *Builder) Finish() error {
668 if b.finishCalled {
669 return b.buildError
670 }
671
672 b.finishCalled = true
673
674 b.flush()
675 b.building.Wait()
676
677 if b.buildError != nil {
678 for tmp := range b.finishedShards {
679 log.Printf("Builder.Finish %s", tmp)
680 os.Remove(tmp)
681 }
682 b.finishedShards = map[string]string{}
683 return b.buildError
684 }
685
686 // map of temporary -> final names for all updated shards + shard metadata files
687 artifactPaths := make(map[string]string)
688 for tmp, final := range b.finishedShards {
689 artifactPaths[tmp] = final
690 }
691
692 oldShards := b.opts.FindAllShards()
693
694 if b.opts.IsDelta {
695 // Delta shard builds need to update FileTombstone and branch commit information for all
696 // existing shards
697 for _, shard := range oldShards {
698 repositories, _, err := zoekt.ReadMetadataPathAlive(shard)
699 if err != nil {
700 return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
701 }
702
703 if len(repositories) > 1 {
704 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
705 }
706
707 if len(repositories) == 0 {
708 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
709 }
710
711 repository := repositories[0]
712 if repository.ID != b.opts.RepositoryDescription.ID {
713 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
714 }
715
716 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
717 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
718 }
719
720 for _, f := range b.opts.changedOrRemovedFiles {
721 repository.FileTombstones[f] = struct{}{}
722 }
723
724 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
725 return deltaBranchSetError{
726 shardName: shard,
727 old: repository.Branches,
728 new: b.opts.RepositoryDescription.Branches,
729 }
730 }
731
732 if b.opts.GetHash() != repository.IndexOptions {
733 return &deltaIndexOptionsMismatchError{
734 shardName: shard,
735 newOptions: b.opts.HashOptions(),
736 }
737 }
738
739 repository.Branches = b.opts.RepositoryDescription.Branches
740
741 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
742
743 tempPath, finalPath, err := zoekt.JsonMarshalRepoMetaTemp(shard, repository)
744 if err != nil {
745 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
746 }
747
748 artifactPaths[tempPath] = finalPath
749 }
750 }
751
752 // We mark finished shards as empty when we successfully finish. Return now
753 // to allow call sites to call Finish idempotently.
754 if len(artifactPaths) == 0 {
755 return b.buildError
756 }
757
758 defer b.shardLogger.Close()
759
760 // Collect a map of the old shards on disk. For each new shard we replace we
761 // delete it from toDelete. Anything remaining in toDelete will be removed
762 // after we have renamed everything into place.
763
764 var toDelete map[string]struct{}
765 if !b.opts.IsDelta {
766 // Non-delta shard builds delete all existing shards before they write out
767 // new ones.
768 // By contrast, delta shard builds work by stacking changes on top of existing shards.
769 // So, we skip populating the toDelete map if we're building delta shards.
770
771 toDelete = make(map[string]struct{})
772 for _, name := range oldShards {
773 paths, err := zoekt.IndexFilePaths(name)
774 if err != nil {
775 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
776 }
777 for _, p := range paths {
778 toDelete[p] = struct{}{}
779 }
780 }
781 }
782
783 for tmp, final := range artifactPaths {
784 if err := os.Rename(tmp, final); err != nil {
785 b.buildError = err
786 continue
787 }
788
789 delete(toDelete, final)
790
791 b.shardLog("upsert", final, b.opts.RepositoryDescription.Name)
792 }
793
794 b.finishedShards = map[string]string{}
795
796 for p := range toDelete {
797 // Don't delete compound shards, set tombstones instead.
798 if zoekt.ShardMergingEnabled() && strings.HasPrefix(filepath.Base(p), "compound-") {
799 if !strings.HasSuffix(p, ".zoekt") {
800 continue
801 }
802 b.shardLog("tomb", p, b.opts.RepositoryDescription.Name)
803 err := zoekt.SetTombstone(p, b.opts.RepositoryDescription.ID)
804 b.buildError = err
805 continue
806 }
807 log.Printf("removing old shard file: %s", p)
808 b.shardLog("remove", p, b.opts.RepositoryDescription.Name)
809 if err := os.Remove(p); err != nil {
810 b.buildError = err
811 }
812 }
813
814 return b.buildError
815}
816
817// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
818// iff both slices specify the same set of branch names in the same order.
819func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
820 if len(a) != len(b) {
821 return false
822 }
823
824 for i := range a {
825 x, y := a[i], b[i]
826 if x.Name != y.Name {
827 return false
828 }
829 }
830
831 return true
832}
833
// flush turns the buffered documents into a shard and resets the buffer.
//
// It returns the recorded build error if there already is one. With nothing
// buffered it still builds a shard unless a shard number has already been
// handed out (nextShardNum > 0). With Parallelism > 1 and no MemProfile the
// shard is built in a goroutine and flush returns nil immediately; failures
// then only surface through b.buildError. Otherwise the shard is built
// synchronously and its error is returned.
func (b *Builder) flush() error {
	// Take ownership of the buffered documents and reset the buffer.
	todo := b.todo
	b.todo = nil
	b.size = 0
	b.errMu.Lock()
	defer b.errMu.Unlock()
	if b.buildError != nil {
		return b.buildError
	}

	hasShard := b.nextShardNum > 0
	if len(todo) == 0 && hasShard {
		return nil
	}

	shard := b.nextShardNum
	b.nextShardNum++

	if b.opts.Parallelism > 1 && b.opts.MemProfile == "" {
		b.building.Add(1)
		go func() {
			// The throttle channel has capacity Parallelism, so the send
			// blocks while that many shards are being built.
			b.throttle <- 1
			done, err := b.buildShard(todo, shard)
			<-b.throttle

			b.errMu.Lock()
			defer b.errMu.Unlock()
			// Keep the first error that was recorded.
			if err != nil && b.buildError == nil {
				b.buildError = err
			}
			if err == nil {
				b.finishedShards[done.temp] = done.final
			}
			b.building.Done()
		}()
	} else {
		// No goroutines when we're not parallel. This
		// simplifies memory profiling.
		done, err := b.buildShard(todo, shard)
		b.buildError = err
		if err == nil {
			b.finishedShards[done.temp] = done.final
		}
		if b.opts.MemProfile != "" {
			// drop memory, and profile.
			todo = nil
			b.writeMemProfile(b.opts.MemProfile)
		}

		return b.buildError
	}

	return nil
}
888
889func (b *Builder) shardLog(action, shard string, repoName string) {
890 shard = filepath.Base(shard)
891 var shardSize int64
892 if fi, err := os.Stat(filepath.Join(b.opts.IndexDir, shard)); err == nil {
893 shardSize = fi.Size()
894 }
895 _, _ = fmt.Fprintf(b.shardLogger, "%s\t%s\t%s\t%d\t%s\n", time.Now().UTC().Format(time.RFC3339), action, shard, shardSize, repoName)
896}
897
898var profileNumber int
899
900func (b *Builder) writeMemProfile(name string) {
901 nm := fmt.Sprintf("%s.%d", name, profileNumber)
902 profileNumber++
903 f, err := os.Create(nm)
904 if err != nil {
905 log.Fatal("could not create memory profile: ", err)
906 }
907 runtime.GC() // get up-to-date statistics
908 if err := pprof.WriteHeapProfile(f); err != nil {
909 log.Fatal("could not write memory profile: ", err)
910 }
911 f.Close()
912 log.Printf("wrote mem profile %q", nm)
913}
914
// squashRange maps [0,inf) to [0,1) monotonically by computing j/(1+j) in
// floating point.
func squashRange(j int) float64 {
	value := float64(j)
	denominator := 1 + value
	return value / denominator
}
920
921// IsLowPriority takes a file name and makes an educated guess about its priority
922// in search results. A file is considered low priority if it looks like a test,
923// vendored, or generated file.
924//
925// These 'priority' criteria affects how documents are ordered within a shard. It's
926// also used to help guess a file's rank when we're missing ranking information.
927func IsLowPriority(file string) bool {
928 return testRe.MatchString(file) || isGenerated(file) || isVendored(file)
929}
930
// testRe matches file names containing "Test" or "test" anywhere; it is the
// heuristic used to recognize test files.
var testRe = regexp.MustCompile("[Tt]est")
932
// isGenerated reports whether file looks like generated JavaScript output: a
// minified script ("*.min.js") or a source map ("*.js.map"). A file whose
// base name is exactly "min.js" or "js.map" also counts.
//
// The marker has to be preceded by a dot or make up the whole base name, so
// that ordinary files such as "admin.js" are not mistaken for minified code.
func isGenerated(file string) bool {
	base := file
	if i := strings.LastIndexByte(file, '/'); i >= 0 {
		base = file[i+1:]
	}

	for _, marker := range [...]string{"min.js", "js.map"} {
		if base == marker || strings.HasSuffix(base, "."+marker) {
			return true
		}
	}
	return false
}
936
// isVendored reports whether file lies inside a directory named "vendor" or
// "node_modules" at any depth of its slash-separated path.
//
// The directory name has to be a complete path segment, so that directories
// which merely end in one of those names (for example "myvendor/") are not
// treated as vendored code.
func isVendored(file string) bool {
	for _, dir := range [...]string{"vendor/", "node_modules/"} {
		if strings.HasPrefix(file, dir) || strings.Contains(file, "/"+dir) {
			return true
		}
	}
	return false
}
940
// rankedDoc pairs a document with the score vector computed for it by rank;
// sortDocuments orders documents by comparing these vectors.
type rankedDoc struct {
	*zoekt.Document
	// rank is compared element by element; smaller values sort earlier.
	rank []float64
}
945
946// rank returns a vector of scores which is used at index-time to sort documents
947// before writing them to disk. The order of documents in the shard is important
948// at query time, because earlier documents receive a boost at query time and
949// have a higher chance of being searched before limits kick in.
950func rank(d *zoekt.Document, origIdx int) []float64 {
951 generated := 0.0
952 if isGenerated(d.Name) {
953 generated = 1.0
954 }
955
956 vendor := 0.0
957 if isVendored(d.Name) {
958 vendor = 1.0
959 }
960
961 test := 0.0
962 if testRe.MatchString(d.Name) {
963 test = 1.0
964 }
965
966 // Smaller is earlier (=better).
967 return []float64{
968 // Prefer docs that are not generated
969 generated,
970
971 // Prefer docs that are not vendored
972 vendor,
973
974 // Prefer docs that are not tests
975 test,
976
977 // With short names
978 squashRange(len(d.Name)),
979
980 // With many symbols
981 1.0 - squashRange(len(d.Symbols)),
982
983 // With short content
984 squashRange(len(d.Content)),
985
986 // That is present is as many branches as possible
987 1.0 - squashRange(len(d.Branches)),
988
989 // Preserve original ordering.
990 squashRange(origIdx),
991 }
992}
993
994func sortDocuments(todo []*zoekt.Document) {
995 rs := make([]rankedDoc, 0, len(todo))
996 for i, t := range todo {
997 rd := rankedDoc{t, rank(t, i)}
998 rs = append(rs, rd)
999 }
1000 sort.Slice(rs, func(i, j int) bool {
1001 r1 := rs[i].rank
1002 r2 := rs[j].rank
1003 for i := range r1 {
1004 if r1[i] < r2[i] {
1005 return true
1006 }
1007 if r1[i] > r2[i] {
1008 return false
1009 }
1010 }
1011
1012 return false
1013 })
1014 for i := range todo {
1015 todo[i] = rs[i].Document
1016 }
1017}
1018
1019func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) {
1020 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
1021 err := ctagsAddSymbolsParserMap(todo, b.opts.LanguageMap, b.parserMap)
1022 if b.opts.CTagsMustSucceed && err != nil {
1023 return nil, err
1024 }
1025 if err != nil {
1026 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err)
1027 }
1028 }
1029
1030 name := b.opts.shardName(nextShardNum)
1031
1032 shardBuilder, err := b.newShardBuilder()
1033 if err != nil {
1034 return nil, err
1035 }
1036
1037 sortDocuments(todo)
1038
1039 for _, t := range todo {
1040 if err := shardBuilder.Add(*t); err != nil {
1041 return nil, err
1042 }
1043 }
1044
1045 return b.writeShard(name, shardBuilder)
1046}
1047
1048func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) {
1049 desc := b.opts.RepositoryDescription
1050 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1051 desc.SubRepoMap = b.opts.SubRepositories
1052 desc.IndexOptions = b.opts.GetHash()
1053
1054 shardBuilder, err := zoekt.NewIndexBuilder(&desc)
1055 if err != nil {
1056 return nil, err
1057 }
1058 shardBuilder.IndexTime = b.indexTime
1059 shardBuilder.ID = b.id
1060 return shardBuilder, nil
1061}
1062
1063func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) {
1064 dir := filepath.Dir(fn)
1065 if err := os.MkdirAll(dir, 0o700); err != nil {
1066 return nil, err
1067 }
1068
1069 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1070 if err != nil {
1071 return nil, err
1072 }
1073 if runtime.GOOS != "windows" {
1074 if err := f.Chmod(0o666 &^ umask); err != nil {
1075 return nil, err
1076 }
1077 }
1078
1079 defer f.Close()
1080 if err := ib.Write(f); err != nil {
1081 return nil, err
1082 }
1083 fi, err := f.Stat()
1084 if err != nil {
1085 return nil, err
1086 }
1087 if err := f.Close(); err != nil {
1088 return nil, err
1089 }
1090
1091 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n",
1092 fn,
1093 fi.Size(),
1094 float64(fi.Size())/float64(ib.ContentSize()+1),
1095 ib.NumFiles())
1096
1097 return &finishedShard{f.Name(), fn}, nil
1098}
1099
1100type deltaBranchSetError struct {
1101 shardName string
1102 old, new []zoekt.RepositoryBranch
1103}
1104
1105func (e deltaBranchSetError) Error() string {
1106 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1107}
1108
1109type deltaIndexOptionsMismatchError struct {
1110 shardName string
1111 newOptions HashOptions
1112}
1113
1114func (e *deltaIndexOptionsMismatchError) Error() string {
1115 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1116}
1117
// umask holds the Umask of the current process. writeShard clears these bits
// from the 0666 mode it applies to newly written shard files.
// NOTE(review): it is not assigned anywhere in this file; presumably a
// platform-specific file initializes it — confirm.
var umask os.FileMode