fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// package build implements a more convenient interface for building
16// zoekt indices.
17package build
18
19import (
20 "crypto/sha1"
21 "flag"
22 "fmt"
23 "io"
24 "log"
25 "math"
26 "net/url"
27 "os"
28 "os/exec"
29 "path"
30 "path/filepath"
31 "reflect"
32 "runtime"
33 "runtime/pprof"
34 "sort"
35 "strconv"
36 "strings"
37 "sync"
38 "time"
39
40 "github.com/bmatcuk/doublestar"
41 "github.com/grafana/regexp"
42 "github.com/rs/xid"
43 "gopkg.in/natefinch/lumberjack.v2"
44
45 "github.com/sourcegraph/zoekt"
46 "github.com/sourcegraph/zoekt/ctags"
47)
48
49var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
50
51// Branch describes a single branch version.
52type Branch struct {
53 Name string
54 Version string
55}
56
57// Options sets options for the index building.
58type Options struct {
59 // IndexDir is a directory that holds *.zoekt index files.
60 IndexDir string
61
62 // SizeMax is the maximum file size
63 SizeMax int
64
65 // Parallelism is the maximum number of shards to index in parallel
66 Parallelism int
67
68 // ShardMax sets the maximum corpus size for a single shard
69 ShardMax int
70
71 // TrigramMax sets the maximum number of distinct trigrams per document.
72 TrigramMax int
73
74 // RepositoryDescription holds names and URLs for the repository.
75 RepositoryDescription zoekt.Repository
76
77 // SubRepositories is a path => sub repository map.
78 SubRepositories map[string]*zoekt.Repository
79
80 // DisableCTags disables the generation of ctags metadata.
81 DisableCTags bool
82
83 // CtagsPath is the path to the ctags binary to run, or empty
84 // if a valid binary couldn't be found.
85 CTagsPath string
86
87 // Same as CTagsPath but for scip-ctags
88 ScipCTagsPath string
89
90 // If set, ctags must succeed.
91 CTagsMustSucceed bool
92
93 // Write memory profiles to this file.
94 MemProfile string
95
96 // LargeFiles is a slice of glob patterns, including ** for any number
97 // of directories, where matching file paths should be indexed
98 // regardless of their size. The full pattern syntax is here:
99 // https://github.com/bmatcuk/doublestar/tree/v1#patterns.
100 LargeFiles []string
101
102 // IsDelta is true if this run contains only the changed documents since the
103 // last run.
104 IsDelta bool
105
106 // DocumentRanksPath is the path to the file with document ranks. If empty,
107 // ranks will be computed on-the-fly.
108 DocumentRanksPath string
109
110 // DocumentRanksVersion is a string which when changed will cause us to
111 // reindex a shard. This field is used so that when the contents of
112 // DocumentRanksPath changes, we can reindex.
113 DocumentRanksVersion string
114
115 // changedOrRemovedFiles is a list of file paths that have been changed or removed
116 // since the last indexing job for this repository. These files will be tombstoned
117 // in the older shards for this repository.
118 changedOrRemovedFiles []string
119
120 LanguageMap ctags.LanguageMap
121}
122
123// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building.
124type HashOptions struct {
125 sizeMax int
126 disableCTags bool
127 ctagsPath string
128 cTagsMustSucceed bool
129 largeFiles []string
130
131 // documentRankVersion is an experimental field which will change when the
132 // DocumentRanksPath content changes. If empty we ignore it.
133 documentRankVersion string
134}
135
136func (o *Options) HashOptions() HashOptions {
137 return HashOptions{
138 sizeMax: o.SizeMax,
139 disableCTags: o.DisableCTags,
140 ctagsPath: o.CTagsPath,
141 cTagsMustSucceed: o.CTagsMustSucceed,
142 largeFiles: o.LargeFiles,
143 documentRankVersion: o.DocumentRanksVersion,
144 }
145}
146
147func (o *Options) GetHash() string {
148 h := o.HashOptions()
149 hasher := sha1.New()
150
151 hasher.Write([]byte(h.ctagsPath))
152 hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed)))
153 hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax)))
154 hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles)))
155 hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags)))
156
157 if h.documentRankVersion != "" {
158 hasher.Write([]byte{0})
159 io.WriteString(hasher, h.documentRankVersion)
160 }
161
162 return fmt.Sprintf("%x", hasher.Sum(nil))
163}
164
165type largeFilesFlag struct{ *Options }
166
167func (f largeFilesFlag) String() string {
168 // From flag.Value documentation:
169 //
170 // The flag package may call the String method with a zero-valued receiver,
171 // such as a nil pointer.
172 if f.Options == nil {
173 return ""
174 }
175 s := append([]string{""}, f.LargeFiles...)
176 return strings.Join(s, "-large_file ")
177}
178
179func (f largeFilesFlag) Set(value string) error {
180 f.LargeFiles = append(f.LargeFiles, value)
181 return nil
182}
183
184// Flags adds flags for build options to fs. It is the "inverse" of Args.
185func (o *Options) Flags(fs *flag.FlagSet) {
186 x := *o
187 x.SetDefaults()
188 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
189 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
190 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
191 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
192 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
193 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
194 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
195 fs.StringVar(&o.MemProfile, "memprofile", "", "write memory profile(s) to `file.shardnum`. Note: sets parallelism to 1.")
196
197 // Sourcegraph specific
198 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
199}
200
201// Args generates command line arguments for o. It is the "inverse" of Flags.
202func (o *Options) Args() []string {
203 var args []string
204
205 if o.SizeMax != 0 {
206 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
207 }
208
209 if o.TrigramMax != 0 {
210 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
211 }
212
213 if o.ShardMax != 0 {
214 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
215 }
216
217 if o.Parallelism != 0 {
218 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
219 }
220
221 if o.IndexDir != "" {
222 args = append(args, "-index", o.IndexDir)
223 }
224
225 if o.CTagsMustSucceed {
226 args = append(args, "-require_ctags")
227 }
228
229 for _, a := range o.LargeFiles {
230 args = append(args, "-large_file", a)
231 }
232
233 // Sourcegraph specific
234 if o.DisableCTags {
235 args = append(args, "-disable_ctags")
236 }
237
238 return args
239}
240
241// Builder manages (parallel) creation of uniformly sized shards. The
242// builder buffers up documents until it collects enough documents and
243// then builds a shard and writes.
244type Builder struct {
245 opts Options
246 throttle chan int
247
248 nextShardNum int
249 todo []*zoekt.Document
250 size int
251
252 parserMap ctags.ParserMap
253
254 building sync.WaitGroup
255
256 errMu sync.Mutex
257 buildError error
258
259 // temp name => final name for finished shards. We only rename
260 // them once all shards succeed to avoid Frankstein corpuses.
261 finishedShards map[string]string
262
263 shardLogger io.WriteCloser
264
265 // indexTime is set by tests for doing reproducible builds.
266 indexTime time.Time
267
268 // a sortable 20 chars long id.
269 id string
270
271 finishCalled bool
272}
273
274type finishedShard struct {
275 temp, final string
276}
277
278func checkCTags() string {
279 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" {
280 return ctags
281 }
282
283 if ctags, err := exec.LookPath("universal-ctags"); err == nil {
284 return ctags
285 }
286
287 return ""
288}
289
290func checkScipCTags() string {
291 if ctags := os.Getenv("SCIP_CTAGS_COMMAND"); ctags != "" {
292 return ctags
293 }
294
295 if ctags, err := exec.LookPath("scip-ctags"); err == nil {
296 return ctags
297 }
298
299 return ""
300}
301
302// SetDefaults sets reasonable default options.
303func (o *Options) SetDefaults() {
304 if o.CTagsPath == "" && !o.DisableCTags {
305 o.CTagsPath = checkCTags()
306 }
307
308 if o.ScipCTagsPath == "" && !o.DisableCTags {
309 o.ScipCTagsPath = checkScipCTags()
310 }
311
312 if o.Parallelism == 0 {
313 o.Parallelism = 4
314 }
315 if o.SizeMax == 0 {
316 o.SizeMax = 2 << 20
317 }
318 if o.ShardMax == 0 {
319 o.ShardMax = 100 << 20
320 }
321 if o.TrigramMax == 0 {
322 o.TrigramMax = 20000
323 }
324
325 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
326 parsed, _ := url.Parse(o.RepositoryDescription.URL)
327 if parsed != nil {
328 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
329 }
330 }
331}
332
333func hashString(s string) string {
334 h := sha1.New()
335 _, _ = io.WriteString(h, s)
336 return fmt.Sprintf("%x", h.Sum(nil))
337}
338
339// ShardName returns the name the given index shard.
340func (o *Options) shardName(n int) string {
341 return o.shardNameVersion(zoekt.IndexFormatVersion, n)
342}
343
344func (o *Options) shardNameVersion(version, n int) string {
345 abs := url.QueryEscape(o.RepositoryDescription.Name)
346 if len(abs) > 200 {
347 abs = abs[:200] + hashString(abs)[:8]
348 }
349 return filepath.Join(o.IndexDir,
350 fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n))
351}
352
353type IndexState string
354
355const (
356 IndexStateMissing IndexState = "missing"
357 IndexStateCorrupt IndexState = "corrupt"
358 IndexStateVersion IndexState = "version-mismatch"
359 IndexStateOption IndexState = "option-mismatch"
360 IndexStateMeta IndexState = "meta-mismatch"
361 IndexStateContent IndexState = "content-mismatch"
362 IndexStateEqual IndexState = "equal"
363)
364
365var readVersions = []struct {
366 IndexFormatVersion int
367 FeatureVersion int
368}{{
369 IndexFormatVersion: zoekt.IndexFormatVersion,
370 FeatureVersion: zoekt.FeatureVersion,
371}, {
372 IndexFormatVersion: zoekt.NextIndexFormatVersion,
373 FeatureVersion: zoekt.FeatureVersion,
374}}
375
376// IncrementalSkipIndexing returns true if the index present on disk matches
377// the build options.
378func (o *Options) IncrementalSkipIndexing() bool {
379 state, _ := o.IndexState()
380 return state == IndexStateEqual
381}
382
383// IndexState checks how the index present on disk compares to the build
384// options and returns the IndexState and the name of the first shard.
385func (o *Options) IndexState() (IndexState, string) {
386 // Open the latest version we support that is on disk.
387 fn := o.findShard()
388 if fn == "" {
389 return IndexStateMissing, fn
390 }
391
392 repos, index, err := zoekt.ReadMetadataPathAlive(fn)
393 if os.IsNotExist(err) {
394 return IndexStateMissing, fn
395 } else if err != nil {
396 return IndexStateCorrupt, fn
397 }
398
399 for _, v := range readVersions {
400 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
401 return IndexStateVersion, fn
402 }
403 }
404
405 var repo *zoekt.Repository
406 for _, cand := range repos {
407 if cand.Name == o.RepositoryDescription.Name {
408 repo = cand
409 break
410 }
411 }
412
413 if repo == nil {
414 return IndexStateCorrupt, fn
415 }
416
417 if repo.IndexOptions != o.GetHash() {
418 return IndexStateOption, fn
419 }
420
421 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
422 return IndexStateContent, fn
423 }
424
425 // We can mutate repo since it lives in the scope of this function call.
426 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
427 // non-nil err means we are trying to update an immutable field =>
428 // reindex content.
429 log.Printf("warn: immutable field changed, requires re-index: %s", err)
430 return IndexStateContent, fn
431 } else if updated {
432 return IndexStateMeta, fn
433 }
434
435 return IndexStateEqual, fn
436}
437
438// FindRepositoryMetadata returns the index metadata for the repository
439// specified in the options. 'ok' is false if the repository's metadata
440// couldn't be found or if an error occurred.
441func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) {
442 shard := o.findShard()
443 if shard == "" {
444 return nil, nil, false, nil
445 }
446
447 repositories, metadata, err := zoekt.ReadMetadataPathAlive(shard)
448 if err != nil {
449 return nil, nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
450 }
451
452 ID := o.RepositoryDescription.ID
453 for _, r := range repositories {
454 // compound shards contain multiple repositories, so we
455 // have to pick only the one we're looking for
456 if r.ID == ID {
457 return r, metadata, true, nil
458 }
459 }
460
461 // If we're here, then we're somehow in a state where we found a matching
462 // shard that's missing the repository metadata we're looking for. This
463 // should never happen.
464 name := o.RepositoryDescription.Name
465 return nil, nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
466}
467
468func (o *Options) findShard() string {
469 for _, v := range readVersions {
470 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
471 if _, err := os.Stat(fn); err == nil {
472 return fn
473 }
474 }
475
476 // Brute force finding the shard in compound shards. We should only hit this
477 // code path for repositories that are not already existing or are in
478 // compound shards.
479 //
480 // TODO add an oracle which can speed this up in the case of repositories
481 // already in compound shards.
482 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
483 if err != nil {
484 return ""
485 }
486 for _, fn := range compoundShards {
487 repos, _, err := zoekt.ReadMetadataPathAlive(fn)
488 if err != nil {
489 continue
490 }
491 for _, repo := range repos {
492 if repo.ID == o.RepositoryDescription.ID {
493 return fn
494 }
495 }
496 }
497
498 return ""
499}
500
501func (o *Options) FindAllShards() []string {
502 for _, v := range readVersions {
503 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
504 if _, err := os.Stat(fn); err == nil {
505 shards := []string{fn}
506 for i := 1; ; i++ {
507 fn := o.shardNameVersion(v.IndexFormatVersion, i)
508 if _, err := os.Stat(fn); err != nil {
509 return shards
510 }
511 shards = append(shards, fn)
512 }
513 }
514 }
515
516 // lazily fallback to findShard which will look for a compound shard.
517 if fn := o.findShard(); fn != "" {
518 return []string{fn}
519 }
520
521 return nil
522}
523
524// IgnoreSizeMax determines whether the max size should be ignored.
525func (o *Options) IgnoreSizeMax(name string) bool {
526 // A pattern match will override preceding pattern matches.
527 for i := len(o.LargeFiles) - 1; i >= 0; i-- {
528 pattern := strings.TrimSpace(o.LargeFiles[i])
529 negated, validatedPattern := checkIsNegatePattern(pattern)
530
531 if m, _ := doublestar.PathMatch(validatedPattern, name); m {
532 if negated {
533 return false
534 } else {
535 return true
536 }
537 }
538 }
539
540 return false
541}
542
543func checkIsNegatePattern(pattern string) (bool, string) {
544 negate := "!"
545
546 // if negated then strip prefix meta character which identifies negated filter pattern
547 if strings.HasPrefix(pattern, negate) {
548 return true, pattern[len(negate):]
549 }
550
551 return false, pattern
552}
553
554// NewBuilder creates a new Builder instance.
555func NewBuilder(opts Options) (*Builder, error) {
556 opts.SetDefaults()
557 if opts.RepositoryDescription.Name == "" {
558 return nil, fmt.Errorf("builder: must set Name")
559 }
560
561 b := &Builder{
562 opts: opts,
563 throttle: make(chan int, opts.Parallelism),
564 finishedShards: map[string]string{},
565 }
566
567 if b.opts.CTagsPath == "" && b.opts.CTagsMustSucceed {
568 return nil, fmt.Errorf("ctags binary not found, but CTagsMustSucceed set")
569 }
570
571 parserMap, err := ctags.NewParserMap(ctags.ParserBinMap{
572 ctags.UniversalCTags: b.opts.CTagsPath,
573 ctags.ScipCTags: b.opts.ScipCTagsPath,
574 }, b.opts.CTagsMustSucceed)
575
576 if err != nil {
577 return nil, err
578 }
579
580 b.parserMap = parserMap
581
582 b.shardLogger = &lumberjack.Logger{
583 Filename: filepath.Join(opts.IndexDir, "zoekt-builder-shard-log.tsv"),
584 MaxSize: 100, // Megabyte
585 MaxBackups: 5,
586 }
587
588 if opts.IsDelta {
589 // Delta shards build on top of previously existing shards.
590 // As a consequence, the shardNum for delta shards starts from
591 // the number following the most recently generated shard - not 0.
592 //
593 // Using this numbering scheme allows all the shards to be
594 // discovered as a set.
595 shards := b.opts.FindAllShards()
596 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
597 }
598
599 if _, err := b.newShardBuilder(); err != nil {
600 return nil, err
601 }
602
603 now := time.Now()
604 b.indexTime = now
605 b.id = xid.NewWithTime(now).String()
606
607 return b, nil
608}
609
610// AddFile is a convenience wrapper for the Add method
611func (b *Builder) AddFile(name string, content []byte) error {
612 return b.Add(zoekt.Document{Name: name, Content: content})
613}
614
615func (b *Builder) Add(doc zoekt.Document) error {
616 if b.finishCalled {
617 return nil
618 }
619
620 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
621
622 // Adjust trigramMax for allowed large files so we don't exclude them.
623 trigramMax := b.opts.TrigramMax
624 if allowLargeFile {
625 trigramMax = math.MaxInt64
626 }
627
628 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
629 // We could pass the document on to the shardbuilder, but if
630 // we pass through a part of the source tree with binary/large
631 // files, the corresponding shard would be mostly empty, so
632 // insert a reason here too.
633 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
634 } else if err := zoekt.CheckText(doc.Content, trigramMax); err != nil {
635 doc.SkipReason = err.Error()
636 doc.Language = "binary"
637 }
638
639 b.todo = append(b.todo, &doc)
640
641 if doc.SkipReason == "" {
642 b.size += len(doc.Name) + len(doc.Content)
643 } else {
644 b.size += len(doc.Name) + len(doc.SkipReason)
645 // Drop the content if we are skipping the document. Skipped content is not counted towards the
646 // shard size limit, so otherwise we might buffer too much data in memory before flushing.
647 doc.Content = nil
648 }
649
650 if b.size > b.opts.ShardMax {
651 return b.flush()
652 }
653
654 return nil
655}
656
657// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
658// has been changed or removed since the last indexing job for this repository.
659//
660// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
661func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
662 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
663}
664
665// Finish creates a last shard from the buffered documents, and clears
666// stale shards from previous runs. This should always be called, also
667// in failure cases, to ensure cleanup.
668//
669// It is safe to call Finish() multiple times.
670func (b *Builder) Finish() error {
671 if b.finishCalled {
672 return b.buildError
673 }
674
675 b.finishCalled = true
676
677 b.flush()
678 b.building.Wait()
679
680 if b.buildError != nil {
681 for tmp := range b.finishedShards {
682 log.Printf("Builder.Finish %s", tmp)
683 os.Remove(tmp)
684 }
685 b.finishedShards = map[string]string{}
686 return b.buildError
687 }
688
689 // map of temporary -> final names for all updated shards + shard metadata files
690 artifactPaths := make(map[string]string)
691 for tmp, final := range b.finishedShards {
692 artifactPaths[tmp] = final
693 }
694
695 oldShards := b.opts.FindAllShards()
696
697 if b.opts.IsDelta {
698 // Delta shard builds need to update FileTombstone and branch commit information for all
699 // existing shards
700 for _, shard := range oldShards {
701 repositories, _, err := zoekt.ReadMetadataPathAlive(shard)
702 if err != nil {
703 return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
704 }
705
706 if len(repositories) > 1 {
707 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
708 }
709
710 if len(repositories) == 0 {
711 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
712 }
713
714 repository := repositories[0]
715 if repository.ID != b.opts.RepositoryDescription.ID {
716 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
717 }
718
719 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
720 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
721 }
722
723 for _, f := range b.opts.changedOrRemovedFiles {
724 repository.FileTombstones[f] = struct{}{}
725 }
726
727 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
728 return deltaBranchSetError{
729 shardName: shard,
730 old: repository.Branches,
731 new: b.opts.RepositoryDescription.Branches,
732 }
733 }
734
735 if b.opts.GetHash() != repository.IndexOptions {
736 return &deltaIndexOptionsMismatchError{
737 shardName: shard,
738 newOptions: b.opts.HashOptions(),
739 }
740 }
741
742 repository.Branches = b.opts.RepositoryDescription.Branches
743
744 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
745
746 tempPath, finalPath, err := zoekt.JsonMarshalRepoMetaTemp(shard, repository)
747 if err != nil {
748 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
749 }
750
751 artifactPaths[tempPath] = finalPath
752 }
753 }
754
755 // We mark finished shards as empty when we successfully finish. Return now
756 // to allow call sites to call Finish idempotently.
757 if len(artifactPaths) == 0 {
758 return b.buildError
759 }
760
761 defer b.shardLogger.Close()
762
763 // Collect a map of the old shards on disk. For each new shard we replace we
764 // delete it from toDelete. Anything remaining in toDelete will be removed
765 // after we have renamed everything into place.
766
767 var toDelete map[string]struct{}
768 if !b.opts.IsDelta {
769 // Non-delta shard builds delete all existing shards before they write out
770 // new ones.
771 // By contrast, delta shard builds work by stacking changes on top of existing shards.
772 // So, we skip populating the toDelete map if we're building delta shards.
773
774 toDelete = make(map[string]struct{})
775 for _, name := range oldShards {
776 paths, err := zoekt.IndexFilePaths(name)
777 if err != nil {
778 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
779 }
780 for _, p := range paths {
781 toDelete[p] = struct{}{}
782 }
783 }
784 }
785
786 for tmp, final := range artifactPaths {
787 if err := os.Rename(tmp, final); err != nil {
788 b.buildError = err
789 continue
790 }
791
792 delete(toDelete, final)
793
794 b.shardLog("upsert", final, b.opts.RepositoryDescription.Name)
795 }
796
797 b.finishedShards = map[string]string{}
798
799 for p := range toDelete {
800 // Don't delete compound shards, set tombstones instead.
801 if zoekt.ShardMergingEnabled() && strings.HasPrefix(filepath.Base(p), "compound-") {
802 if !strings.HasSuffix(p, ".zoekt") {
803 continue
804 }
805 b.shardLog("tomb", p, b.opts.RepositoryDescription.Name)
806 err := zoekt.SetTombstone(p, b.opts.RepositoryDescription.ID)
807 b.buildError = err
808 continue
809 }
810 log.Printf("removing old shard file: %s", p)
811 b.shardLog("remove", p, b.opts.RepositoryDescription.Name)
812 if err := os.Remove(p); err != nil {
813 b.buildError = err
814 }
815 }
816
817 return b.buildError
818}
819
820// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
821// iff both slices specify the same set of branch names in the same order.
822func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
823 if len(a) != len(b) {
824 return false
825 }
826
827 for i := range a {
828 x, y := a[i], b[i]
829 if x.Name != y.Name {
830 return false
831 }
832 }
833
834 return true
835}
836
837func (b *Builder) flush() error {
838 todo := b.todo
839 b.todo = nil
840 b.size = 0
841 b.errMu.Lock()
842 defer b.errMu.Unlock()
843 if b.buildError != nil {
844 return b.buildError
845 }
846
847 hasShard := b.nextShardNum > 0
848 if len(todo) == 0 && hasShard {
849 return nil
850 }
851
852 shard := b.nextShardNum
853 b.nextShardNum++
854
855 if b.opts.Parallelism > 1 && b.opts.MemProfile == "" {
856 b.building.Add(1)
857 b.throttle <- 1
858 go func() {
859 done, err := b.buildShard(todo, shard)
860 <-b.throttle
861
862 b.errMu.Lock()
863 defer b.errMu.Unlock()
864 if err != nil && b.buildError == nil {
865 b.buildError = err
866 }
867 if err == nil {
868 b.finishedShards[done.temp] = done.final
869 }
870 b.building.Done()
871 }()
872 } else {
873 // No goroutines when we're not parallel. This
874 // simplifies memory profiling.
875 done, err := b.buildShard(todo, shard)
876 b.buildError = err
877 if err == nil {
878 b.finishedShards[done.temp] = done.final
879 }
880 if b.opts.MemProfile != "" {
881 // drop memory, and profile.
882 todo = nil
883 b.writeMemProfile(b.opts.MemProfile)
884 }
885
886 return b.buildError
887 }
888
889 return nil
890}
891
892func (b *Builder) shardLog(action, shard string, repoName string) {
893 shard = filepath.Base(shard)
894 var shardSize int64
895 if fi, err := os.Stat(filepath.Join(b.opts.IndexDir, shard)); err == nil {
896 shardSize = fi.Size()
897 }
898 _, _ = fmt.Fprintf(b.shardLogger, "%s\t%s\t%s\t%d\t%s\n", time.Now().UTC().Format(time.RFC3339), action, shard, shardSize, repoName)
899}
900
901var profileNumber int
902
903func (b *Builder) writeMemProfile(name string) {
904 nm := fmt.Sprintf("%s.%d", name, profileNumber)
905 profileNumber++
906 f, err := os.Create(nm)
907 if err != nil {
908 log.Fatal("could not create memory profile: ", err)
909 }
910 runtime.GC() // get up-to-date statistics
911 if err := pprof.WriteHeapProfile(f); err != nil {
912 log.Fatal("could not write memory profile: ", err)
913 }
914 f.Close()
915 log.Printf("wrote mem profile %q", nm)
916}
917
918// map [0,inf) to [0,1) monotonically
919func squashRange(j int) float64 {
920 x := float64(j)
921 return x / (1 + x)
922}
923
924// IsLowPriority takes a file name and makes an educated guess about its priority
925// in search results. A file is considered low priority if it looks like a test,
926// vendored, or generated file.
927//
928// These 'priority' criteria affects how documents are ordered within a shard. It's
929// also used to help guess a file's rank when we're missing ranking information.
930func IsLowPriority(file string) bool {
931 return testRe.MatchString(file) || isGenerated(file) || isVendored(file)
932}
933
934var testRe = regexp.MustCompile("[Tt]est")
935
936func isGenerated(file string) bool {
937 return strings.HasSuffix(file, "min.js") || strings.HasSuffix(file, "js.map")
938}
939
940func isVendored(file string) bool {
941 return strings.Contains(file, "vendor/") || strings.Contains(file, "node_modules/")
942}
943
944type rankedDoc struct {
945 *zoekt.Document
946 rank []float64
947}
948
949// rank returns a vector of scores which is used at index-time to sort documents
950// before writing them to disk. The order of documents in the shard is important
951// at query time, because earlier documents receive a boost at query time and
952// have a higher chance of being searched before limits kick in.
953func rank(d *zoekt.Document, origIdx int) []float64 {
954 skipped := 0.0
955 if d.SkipReason != "" {
956 skipped = 1.0
957 }
958
959 generated := 0.0
960 if isGenerated(d.Name) {
961 generated = 1.0
962 }
963
964 vendor := 0.0
965 if isVendored(d.Name) {
966 vendor = 1.0
967 }
968
969 test := 0.0
970 if testRe.MatchString(d.Name) {
971 test = 1.0
972 }
973
974 // Smaller is earlier (=better).
975 return []float64{
976 // Always place skipped docs last
977 skipped,
978
979 // Prefer docs that are not generated
980 generated,
981
982 // Prefer docs that are not vendored
983 vendor,
984
985 // Prefer docs that are not tests
986 test,
987
988 // With short names
989 squashRange(len(d.Name)),
990
991 // With many symbols
992 1.0 - squashRange(len(d.Symbols)),
993
994 // With short content
995 squashRange(len(d.Content)),
996
997 // That is present is as many branches as possible
998 1.0 - squashRange(len(d.Branches)),
999
1000 // Preserve original ordering.
1001 squashRange(origIdx),
1002 }
1003}
1004
1005func sortDocuments(todo []*zoekt.Document) {
1006 rs := make([]rankedDoc, 0, len(todo))
1007 for i, t := range todo {
1008 rd := rankedDoc{t, rank(t, i)}
1009 rs = append(rs, rd)
1010 }
1011 sort.Slice(rs, func(i, j int) bool {
1012 r1 := rs[i].rank
1013 r2 := rs[j].rank
1014 for i := range r1 {
1015 if r1[i] < r2[i] {
1016 return true
1017 }
1018 if r1[i] > r2[i] {
1019 return false
1020 }
1021 }
1022
1023 return false
1024 })
1025 for i := range todo {
1026 todo[i] = rs[i].Document
1027 }
1028}
1029
1030func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) {
1031 if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
1032 err := ctagsAddSymbolsParserMap(todo, b.opts.LanguageMap, b.parserMap)
1033 if b.opts.CTagsMustSucceed && err != nil {
1034 return nil, err
1035 }
1036 if err != nil {
1037 log.Printf("ignoring universal:%s or scip:%s error: %v", b.opts.CTagsPath, b.opts.ScipCTagsPath, err)
1038 }
1039 }
1040
1041 name := b.opts.shardName(nextShardNum)
1042
1043 shardBuilder, err := b.newShardBuilder()
1044 if err != nil {
1045 return nil, err
1046 }
1047
1048 sortDocuments(todo)
1049
1050 for _, t := range todo {
1051 if err := shardBuilder.Add(*t); err != nil {
1052 return nil, err
1053 }
1054 }
1055
1056 return b.writeShard(name, shardBuilder)
1057}
1058
1059func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) {
1060 desc := b.opts.RepositoryDescription
1061 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1062 desc.SubRepoMap = b.opts.SubRepositories
1063 desc.IndexOptions = b.opts.GetHash()
1064
1065 shardBuilder, err := zoekt.NewIndexBuilder(&desc)
1066 if err != nil {
1067 return nil, err
1068 }
1069 shardBuilder.IndexTime = b.indexTime
1070 shardBuilder.ID = b.id
1071 return shardBuilder, nil
1072}
1073
1074func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) {
1075 dir := filepath.Dir(fn)
1076 if err := os.MkdirAll(dir, 0o700); err != nil {
1077 return nil, err
1078 }
1079
1080 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1081 if err != nil {
1082 return nil, err
1083 }
1084 if runtime.GOOS != "windows" {
1085 if err := f.Chmod(0o666 &^ umask); err != nil {
1086 return nil, err
1087 }
1088 }
1089
1090 defer f.Close()
1091 if err := ib.Write(f); err != nil {
1092 return nil, err
1093 }
1094 fi, err := f.Stat()
1095 if err != nil {
1096 return nil, err
1097 }
1098 if err := f.Close(); err != nil {
1099 return nil, err
1100 }
1101
1102 log.Printf("finished shard %s: %d index bytes (overhead %3.1f), %d files processed \n",
1103 fn,
1104 fi.Size(),
1105 float64(fi.Size())/float64(ib.ContentSize()+1),
1106 ib.NumFiles())
1107
1108 return &finishedShard{f.Name(), fn}, nil
1109}
1110
1111type deltaBranchSetError struct {
1112 shardName string
1113 old, new []zoekt.RepositoryBranch
1114}
1115
1116func (e deltaBranchSetError) Error() string {
1117 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1118}
1119
1120type deltaIndexOptionsMismatchError struct {
1121 shardName string
1122 newOptions HashOptions
1123}
1124
1125func (e *deltaIndexOptionsMismatchError) Error() string {
1126 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1127}
1128
1129// umask holds the Umask of the current process
1130var umask os.FileMode