fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// package build implements a more convenient interface for building
16// zoekt indices.
17package build
18
19import (
20 "crypto/sha1"
21 "flag"
22 "fmt"
23 "io"
24 "log"
25 "math"
26 "net/url"
27 "os"
28 "os/exec"
29 "path"
30 "path/filepath"
31 "reflect"
32 "runtime"
33 "runtime/pprof"
34 "sort"
35 "strconv"
36 "strings"
37 "sync"
38 "time"
39
40 "github.com/bmatcuk/doublestar"
41 "github.com/grafana/regexp"
42 "github.com/rs/xid"
43 "gopkg.in/natefinch/lumberjack.v2"
44
45 "github.com/sourcegraph/zoekt"
46 "github.com/sourcegraph/zoekt/ctags"
47)
48
49var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt")
50
51// Branch describes a single branch version.
52type Branch struct {
53 Name string
54 Version string
55}
56
57// Options sets options for the index building.
58type Options struct {
59 // IndexDir is a directory that holds *.zoekt index files.
60 IndexDir string
61
62 // SizeMax is the maximum file size
63 SizeMax int
64
65 // Parallelism is the maximum number of shards to index in parallel
66 Parallelism int
67
68 // ShardMax sets the maximum corpus size for a single shard
69 ShardMax int
70
71 // TrigramMax sets the maximum number of distinct trigrams per document.
72 TrigramMax int
73
74 // RepositoryDescription holds names and URLs for the repository.
75 RepositoryDescription zoekt.Repository
76
77 // SubRepositories is a path => sub repository map.
78 SubRepositories map[string]*zoekt.Repository
79
80 // DisableCTags disables the generation of ctags metadata.
81 DisableCTags bool
82
83 // CtagsPath is the path to the ctags binary to run, or empty
84 // if a valid binary couldn't be found.
85 CTagsPath string
86
87 // If set, ctags must succeed.
88 CTagsMustSucceed bool
89
90 // Write memory profiles to this file.
91 MemProfile string
92
93 // LargeFiles is a slice of glob patterns, including ** for any number
94 // of directories, where matching file paths should be indexed
95 // regardless of their size. The full pattern syntax is here:
96 // https://github.com/bmatcuk/doublestar/tree/v1#patterns.
97 LargeFiles []string
98
99 // IsDelta is true if this run contains only the changed documents since the
100 // last run.
101 IsDelta bool
102
103 // DocumentRanksPath is the path to the file with document ranks. If empty,
104 // ranks will be computed on-the-fly.
105 DocumentRanksPath string
106
107 // DocumentRanksVersion is a string which when changed will cause us to
108 // reindex a shard. This field is used so that when the contents of
109 // DocumentRanksPath changes, we can reindex.
110 DocumentRanksVersion string
111
112 // changedOrRemovedFiles is a list of file paths that have been changed or removed
113 // since the last indexing job for this repository. These files will be tombstoned
114 // in the older shards for this repository.
115 changedOrRemovedFiles []string
116}
117
118// HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building.
119type HashOptions struct {
120 sizeMax int
121 disableCTags bool
122 ctagsPath string
123 cTagsMustSucceed bool
124 largeFiles []string
125
126 // documentRankVersion is an experimental field which will change when the
127 // DocumentRanksPath content changes. If empty we ignore it.
128 documentRankVersion string
129}
130
131func (o *Options) HashOptions() HashOptions {
132 return HashOptions{
133 sizeMax: o.SizeMax,
134 disableCTags: o.DisableCTags,
135 ctagsPath: o.CTagsPath,
136 cTagsMustSucceed: o.CTagsMustSucceed,
137 largeFiles: o.LargeFiles,
138 documentRankVersion: o.DocumentRanksVersion,
139 }
140}
141
142func (o *Options) GetHash() string {
143 h := o.HashOptions()
144 hasher := sha1.New()
145
146 hasher.Write([]byte(h.ctagsPath))
147 hasher.Write([]byte(fmt.Sprintf("%t", h.cTagsMustSucceed)))
148 hasher.Write([]byte(fmt.Sprintf("%d", h.sizeMax)))
149 hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles)))
150 hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags)))
151
152 if h.documentRankVersion != "" {
153 hasher.Write([]byte{0})
154 io.WriteString(hasher, h.documentRankVersion)
155 }
156
157 return fmt.Sprintf("%x", hasher.Sum(nil))
158}
159
160type largeFilesFlag struct{ *Options }
161
162func (f largeFilesFlag) String() string {
163 // From flag.Value documentation:
164 //
165 // The flag package may call the String method with a zero-valued receiver,
166 // such as a nil pointer.
167 if f.Options == nil {
168 return ""
169 }
170 s := append([]string{""}, f.LargeFiles...)
171 return strings.Join(s, "-large_file ")
172}
173
174func (f largeFilesFlag) Set(value string) error {
175 f.LargeFiles = append(f.LargeFiles, value)
176 return nil
177}
178
179// Flags adds flags for build options to fs. It is the "inverse" of Args.
180func (o *Options) Flags(fs *flag.FlagSet) {
181 x := *o
182 x.SetDefaults()
183 fs.IntVar(&o.SizeMax, "file_limit", x.SizeMax, "maximum file size")
184 fs.IntVar(&o.TrigramMax, "max_trigram_count", x.TrigramMax, "maximum number of trigrams per document")
185 fs.IntVar(&o.ShardMax, "shard_limit", x.ShardMax, "maximum corpus size for a shard")
186 fs.IntVar(&o.Parallelism, "parallelism", x.Parallelism, "maximum number of parallel indexing processes.")
187 fs.StringVar(&o.IndexDir, "index", x.IndexDir, "directory for search indices")
188 fs.BoolVar(&o.CTagsMustSucceed, "require_ctags", x.CTagsMustSucceed, "If set, ctags calls must succeed.")
189 fs.Var(largeFilesFlag{o}, "large_file", "A glob pattern where matching files are to be index regardless of their size. You can add multiple patterns by setting this more than once.")
190 fs.StringVar(&o.MemProfile, "memprofile", "", "write memory profile(s) to `file.shardnum`. Note: sets parallelism to 1.")
191
192 // Sourcegraph specific
193 fs.BoolVar(&o.DisableCTags, "disable_ctags", x.DisableCTags, "If set, ctags will not be called.")
194}
195
196// Args generates command line arguments for o. It is the "inverse" of Flags.
197func (o *Options) Args() []string {
198 var args []string
199
200 if o.SizeMax != 0 {
201 args = append(args, "-file_limit", strconv.Itoa(o.SizeMax))
202 }
203
204 if o.TrigramMax != 0 {
205 args = append(args, "-max_trigram_count", strconv.Itoa(o.TrigramMax))
206 }
207
208 if o.ShardMax != 0 {
209 args = append(args, "-shard_limit", strconv.Itoa(o.ShardMax))
210 }
211
212 if o.Parallelism != 0 {
213 args = append(args, "-parallelism", strconv.Itoa(o.Parallelism))
214 }
215
216 if o.IndexDir != "" {
217 args = append(args, "-index", o.IndexDir)
218 }
219
220 if o.CTagsMustSucceed {
221 args = append(args, "-require_ctags")
222 }
223
224 for _, a := range o.LargeFiles {
225 args = append(args, "-large_file", a)
226 }
227
228 // Sourcegraph specific
229 if o.DisableCTags {
230 args = append(args, "-disable_ctags")
231 }
232
233 return args
234}
235
236// Builder manages (parallel) creation of uniformly sized shards. The
237// builder buffers up documents until it collects enough documents and
238// then builds a shard and writes.
239type Builder struct {
240 opts Options
241 throttle chan int
242
243 nextShardNum int
244 todo []*zoekt.Document
245 size int
246
247 parser ctags.Parser
248
249 building sync.WaitGroup
250
251 errMu sync.Mutex
252 buildError error
253
254 // temp name => final name for finished shards. We only rename
255 // them once all shards succeed to avoid Frankstein corpuses.
256 finishedShards map[string]string
257
258 shardLogger io.WriteCloser
259
260 // indexTime is set by tests for doing reproducible builds.
261 indexTime time.Time
262
263 // a sortable 20 chars long id.
264 id string
265
266 finishCalled bool
267}
268
269type finishedShard struct {
270 temp, final string
271}
272
273func checkCTags() string {
274 if ctags := os.Getenv("CTAGS_COMMAND"); ctags != "" {
275 return ctags
276 }
277
278 if ctags, err := exec.LookPath("universal-ctags"); err == nil {
279 return ctags
280 }
281
282 if ctags, err := exec.LookPath("ctags-exuberant"); err == nil {
283 return ctags
284 }
285 return ""
286}
287
288// SetDefaults sets reasonable default options.
289func (o *Options) SetDefaults() {
290 if o.CTagsPath == "" && !o.DisableCTags {
291 o.CTagsPath = checkCTags()
292 }
293
294 if o.Parallelism == 0 {
295 o.Parallelism = 4
296 }
297 if o.SizeMax == 0 {
298 o.SizeMax = 2 << 20
299 }
300 if o.ShardMax == 0 {
301 o.ShardMax = 100 << 20
302 }
303 if o.TrigramMax == 0 {
304 o.TrigramMax = 20000
305 }
306
307 if o.RepositoryDescription.Name == "" && o.RepositoryDescription.URL != "" {
308 parsed, _ := url.Parse(o.RepositoryDescription.URL)
309 if parsed != nil {
310 o.RepositoryDescription.Name = filepath.Join(parsed.Host, parsed.Path)
311 }
312 }
313}
314
315func hashString(s string) string {
316 h := sha1.New()
317 _, _ = io.WriteString(h, s)
318 return fmt.Sprintf("%x", h.Sum(nil))
319}
320
321// ShardName returns the name the given index shard.
322func (o *Options) shardName(n int) string {
323 return o.shardNameVersion(zoekt.IndexFormatVersion, n)
324}
325
326func (o *Options) shardNameVersion(version, n int) string {
327 abs := url.QueryEscape(o.RepositoryDescription.Name)
328 if len(abs) > 200 {
329 abs = abs[:200] + hashString(abs)[:8]
330 }
331 return filepath.Join(o.IndexDir,
332 fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n))
333}
334
335type IndexState string
336
337const (
338 IndexStateMissing IndexState = "missing"
339 IndexStateCorrupt IndexState = "corrupt"
340 IndexStateVersion IndexState = "version-mismatch"
341 IndexStateOption IndexState = "option-mismatch"
342 IndexStateMeta IndexState = "meta-mismatch"
343 IndexStateContent IndexState = "content-mismatch"
344 IndexStateEqual IndexState = "equal"
345)
346
347var readVersions = []struct {
348 IndexFormatVersion int
349 FeatureVersion int
350}{{
351 IndexFormatVersion: zoekt.IndexFormatVersion,
352 FeatureVersion: zoekt.FeatureVersion,
353}, {
354 IndexFormatVersion: zoekt.NextIndexFormatVersion,
355 FeatureVersion: zoekt.FeatureVersion,
356}}
357
358// IncrementalSkipIndexing returns true if the index present on disk matches
359// the build options.
360func (o *Options) IncrementalSkipIndexing() bool {
361 state, _ := o.IndexState()
362 return state == IndexStateEqual
363}
364
365// IndexState checks how the index present on disk compares to the build
366// options and returns the IndexState and the name of the first shard.
367func (o *Options) IndexState() (IndexState, string) {
368 // Open the latest version we support that is on disk.
369 fn := o.findShard()
370 if fn == "" {
371 return IndexStateMissing, fn
372 }
373
374 repos, index, err := zoekt.ReadMetadataPathAlive(fn)
375 if os.IsNotExist(err) {
376 return IndexStateMissing, fn
377 } else if err != nil {
378 return IndexStateCorrupt, fn
379 }
380
381 for _, v := range readVersions {
382 if v.IndexFormatVersion == index.IndexFormatVersion && v.FeatureVersion != index.IndexFeatureVersion {
383 return IndexStateVersion, fn
384 }
385 }
386
387 var repo *zoekt.Repository
388 for _, cand := range repos {
389 if cand.Name == o.RepositoryDescription.Name {
390 repo = cand
391 break
392 }
393 }
394
395 if repo == nil {
396 return IndexStateCorrupt, fn
397 }
398
399 if repo.IndexOptions != o.GetHash() {
400 return IndexStateOption, fn
401 }
402
403 if !reflect.DeepEqual(repo.Branches, o.RepositoryDescription.Branches) {
404 return IndexStateContent, fn
405 }
406
407 // We can mutate repo since it lives in the scope of this function call.
408 if updated, err := repo.MergeMutable(&o.RepositoryDescription); err != nil {
409 // non-nil err means we are trying to update an immutable field =>
410 // reindex content.
411 log.Printf("warn: immutable field changed, requires re-index: %s", err)
412 return IndexStateContent, fn
413 } else if updated {
414 return IndexStateMeta, fn
415 }
416
417 return IndexStateEqual, fn
418}
419
420// FindRepositoryMetadata returns the index metadata for the repository
421// specified in the options. 'ok' is false if the repository's metadata
422// couldn't be found or if an error occurred.
423func (o *Options) FindRepositoryMetadata() (repository *zoekt.Repository, ok bool, err error) {
424 shard := o.findShard()
425 if shard == "" {
426 return nil, false, nil
427 }
428
429 repositories, _, err := zoekt.ReadMetadataPathAlive(shard)
430 if err != nil {
431 return nil, false, fmt.Errorf("reading metadata for shard %q: %w", shard, err)
432 }
433
434 ID := o.RepositoryDescription.ID
435 for _, r := range repositories {
436 // compound shards contain multiple repositories, so we
437 // have to pick only the one we're looking for
438 if r.ID == ID {
439 return r, true, nil
440 }
441 }
442
443 // If we're here, then we're somehow in a state where we found a matching
444 // shard that's missing the repository metadata we're looking for. This
445 // should never happen.
446 name := o.RepositoryDescription.Name
447 return nil, false, fmt.Errorf("matching shard %q doesn't contain metadata for repo id %d (%q)", shard, ID, name)
448}
449
450func (o *Options) findShard() string {
451 for _, v := range readVersions {
452 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
453 if _, err := os.Stat(fn); err == nil {
454 return fn
455 }
456 }
457
458 // Brute force finding the shard in compound shards. We should only hit this
459 // code path for repositories that are not already existing or are in
460 // compound shards.
461 //
462 // TODO add an oracle which can speed this up in the case of repositories
463 // already in compound shards.
464 compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
465 if err != nil {
466 return ""
467 }
468 for _, fn := range compoundShards {
469 repos, _, err := zoekt.ReadMetadataPathAlive(fn)
470 if err != nil {
471 continue
472 }
473 for _, repo := range repos {
474 if repo.ID == o.RepositoryDescription.ID {
475 return fn
476 }
477 }
478 }
479
480 return ""
481}
482
483func (o *Options) FindAllShards() []string {
484 for _, v := range readVersions {
485 fn := o.shardNameVersion(v.IndexFormatVersion, 0)
486 if _, err := os.Stat(fn); err == nil {
487 shards := []string{fn}
488 for i := 1; ; i++ {
489 fn := o.shardNameVersion(v.IndexFormatVersion, i)
490 if _, err := os.Stat(fn); err != nil {
491 return shards
492 }
493 shards = append(shards, fn)
494 }
495 }
496 }
497
498 // lazily fallback to findShard which will look for a compound shard.
499 if fn := o.findShard(); fn != "" {
500 return []string{fn}
501 }
502
503 return nil
504}
505
506// IgnoreSizeMax determines whether the max size should be ignored.
507func (o *Options) IgnoreSizeMax(name string) bool {
508 // A pattern match will override preceding pattern matches.
509 for i := len(o.LargeFiles) - 1; i >= 0; i-- {
510 pattern := strings.TrimSpace(o.LargeFiles[i])
511 negated, validatedPattern := checkIsNegatePattern(pattern)
512
513 if m, _ := doublestar.PathMatch(validatedPattern, name); m {
514 if negated {
515 return false
516 } else {
517 return true
518 }
519 }
520 }
521
522 return false
523}
524
525func checkIsNegatePattern(pattern string) (bool, string) {
526 negate := "!"
527
528 // if negated then strip prefix meta character which identifies negated filter pattern
529 if strings.HasPrefix(pattern, negate) {
530 return true, pattern[len(negate):]
531 }
532
533 return false, pattern
534}
535
536// NewBuilder creates a new Builder instance.
537func NewBuilder(opts Options) (*Builder, error) {
538 opts.SetDefaults()
539 if opts.RepositoryDescription.Name == "" {
540 return nil, fmt.Errorf("builder: must set Name")
541 }
542
543 b := &Builder{
544 opts: opts,
545 throttle: make(chan int, opts.Parallelism),
546 finishedShards: map[string]string{},
547 }
548
549 if b.opts.CTagsPath == "" && b.opts.CTagsMustSucceed {
550 return nil, fmt.Errorf("ctags binary not found, but CTagsMustSucceed set")
551 }
552
553 if strings.Contains(opts.CTagsPath, "universal-ctags") {
554 parser, err := ctags.NewParser(opts.CTagsPath)
555 if err != nil && opts.CTagsMustSucceed {
556 return nil, fmt.Errorf("ctags.NewParser: %v", err)
557 }
558
559 b.parser = parser
560 }
561
562 b.shardLogger = &lumberjack.Logger{
563 Filename: filepath.Join(opts.IndexDir, "zoekt-builder-shard-log.tsv"),
564 MaxSize: 100, // Megabyte
565 MaxBackups: 5,
566 }
567
568 if opts.IsDelta {
569 // Delta shards build on top of previously existing shards.
570 // As a consequence, the shardNum for delta shards starts from
571 // the number following the most recently generated shard - not 0.
572 //
573 // Using this numbering scheme allows all the shards to be
574 // discovered as a set.
575 shards := b.opts.FindAllShards()
576 b.nextShardNum = len(shards) // shards are zero indexed, so len() provides the next number after the last one
577 }
578
579 if _, err := b.newShardBuilder(); err != nil {
580 return nil, err
581 }
582
583 now := time.Now()
584 b.indexTime = now
585 b.id = xid.NewWithTime(now).String()
586
587 return b, nil
588}
589
590// AddFile is a convenience wrapper for the Add method
591func (b *Builder) AddFile(name string, content []byte) error {
592 return b.Add(zoekt.Document{Name: name, Content: content})
593}
594
595func (b *Builder) Add(doc zoekt.Document) error {
596 if b.finishCalled {
597 return nil
598 }
599
600 allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)
601
602 // Adjust trigramMax for allowed large files so we don't exclude them.
603 trigramMax := b.opts.TrigramMax
604 if allowLargeFile {
605 trigramMax = math.MaxInt64
606 }
607
608 if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
609 // We could pass the document on to the shardbuilder, but if
610 // we pass through a part of the source tree with binary/large
611 // files, the corresponding shard would be mostly empty, so
612 // insert a reason here too.
613 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
614 } else if err := zoekt.CheckText(doc.Content, trigramMax); err != nil {
615 doc.SkipReason = err.Error()
616 doc.Language = "binary"
617 }
618
619 b.todo = append(b.todo, &doc)
620
621 if doc.SkipReason == "" {
622 b.size += len(doc.Name) + len(doc.Content)
623 } else {
624 b.size += len(doc.Name) + len(doc.SkipReason)
625 }
626
627 if b.size > b.opts.ShardMax {
628 return b.flush()
629 }
630
631 return nil
632}
633
634// MarkFileAsChangedOrRemoved indicates that the file specified by the given path
635// has been changed or removed since the last indexing job for this repository.
636//
637// If this build is a delta build, these files will be tombstoned in the older shards for this repository.
638func (b *Builder) MarkFileAsChangedOrRemoved(path string) {
639 b.opts.changedOrRemovedFiles = append(b.opts.changedOrRemovedFiles, path)
640}
641
642// Finish creates a last shard from the buffered documents, and clears
643// stale shards from previous runs. This should always be called, also
644// in failure cases, to ensure cleanup.
645//
646// It is safe to call Finish() multiple times.
647func (b *Builder) Finish() error {
648 if b.finishCalled {
649 return b.buildError
650 }
651
652 b.finishCalled = true
653
654 b.flush()
655 b.building.Wait()
656
657 if b.buildError != nil {
658 for tmp := range b.finishedShards {
659 log.Printf("Builder.Finish %s", tmp)
660 os.Remove(tmp)
661 }
662 b.finishedShards = map[string]string{}
663 return b.buildError
664 }
665
666 // map of temporary -> final names for all updated shards + shard metadata files
667 artifactPaths := make(map[string]string)
668 for tmp, final := range b.finishedShards {
669 artifactPaths[tmp] = final
670 }
671
672 oldShards := b.opts.FindAllShards()
673
674 if b.opts.IsDelta {
675 // Delta shard builds need to update FileTombstone and branch commit information for all
676 // existing shards
677 for _, shard := range oldShards {
678 repositories, _, err := zoekt.ReadMetadataPathAlive(shard)
679 if err != nil {
680 return fmt.Errorf("reading metadata from shard %q: %w", shard, err)
681 }
682
683 if len(repositories) > 1 {
684 return fmt.Errorf("delta shard builds don't support repositories contained in compound shards (shard %q)", shard)
685 }
686
687 if len(repositories) == 0 {
688 return fmt.Errorf("failed to update repository metadata for shard %q - shard contains no repositories", shard)
689 }
690
691 repository := repositories[0]
692 if repository.ID != b.opts.RepositoryDescription.ID {
693 return fmt.Errorf("shard %q doesn't contain repository ID %d (%q)", shard, b.opts.RepositoryDescription.ID, b.opts.RepositoryDescription.Name)
694 }
695
696 if len(b.opts.changedOrRemovedFiles) > 0 && repository.FileTombstones == nil {
697 repository.FileTombstones = make(map[string]struct{}, len(b.opts.changedOrRemovedFiles))
698 }
699
700 for _, f := range b.opts.changedOrRemovedFiles {
701 repository.FileTombstones[f] = struct{}{}
702 }
703
704 if !BranchNamesEqual(repository.Branches, b.opts.RepositoryDescription.Branches) {
705 return deltaBranchSetError{
706 shardName: shard,
707 old: repository.Branches,
708 new: b.opts.RepositoryDescription.Branches,
709 }
710 }
711
712 if b.opts.GetHash() != repository.IndexOptions {
713 return &deltaIndexOptionsMismatchError{
714 shardName: shard,
715 newOptions: b.opts.HashOptions(),
716 }
717 }
718
719 repository.Branches = b.opts.RepositoryDescription.Branches
720
721 repository.LatestCommitDate = b.opts.RepositoryDescription.LatestCommitDate
722
723 tempPath, finalPath, err := zoekt.JsonMarshalRepoMetaTemp(shard, repository)
724 if err != nil {
725 return fmt.Errorf("writing repository metadta for shard %q: %w", shard, err)
726 }
727
728 artifactPaths[tempPath] = finalPath
729 }
730 }
731
732 // We mark finished shards as empty when we successfully finish. Return now
733 // to allow call sites to call Finish idempotently.
734 if len(artifactPaths) == 0 {
735 return b.buildError
736 }
737
738 defer b.shardLogger.Close()
739
740 // Collect a map of the old shards on disk. For each new shard we replace we
741 // delete it from toDelete. Anything remaining in toDelete will be removed
742 // after we have renamed everything into place.
743
744 var toDelete map[string]struct{}
745 if !b.opts.IsDelta {
746 // Non-delta shard builds delete all existing shards before they write out
747 // new ones.
748 // By contrast, delta shard builds work by stacking changes on top of existing shards.
749 // So, we skip populating the toDelete map if we're building delta shards.
750
751 toDelete = make(map[string]struct{})
752 for _, name := range oldShards {
753 paths, err := zoekt.IndexFilePaths(name)
754 if err != nil {
755 b.buildError = fmt.Errorf("failed to find old paths for %s: %w", name, err)
756 }
757 for _, p := range paths {
758 toDelete[p] = struct{}{}
759 }
760 }
761 }
762
763 for tmp, final := range artifactPaths {
764 if err := os.Rename(tmp, final); err != nil {
765 b.buildError = err
766 continue
767 }
768
769 delete(toDelete, final)
770
771 b.shardLog("upsert", final, b.opts.RepositoryDescription.Name)
772 }
773
774 b.finishedShards = map[string]string{}
775
776 for p := range toDelete {
777 // Don't delete compound shards, set tombstones instead.
778 if zoekt.ShardMergingEnabled() && strings.HasPrefix(filepath.Base(p), "compound-") {
779 if !strings.HasSuffix(p, ".zoekt") {
780 continue
781 }
782 b.shardLog("tomb", p, b.opts.RepositoryDescription.Name)
783 err := zoekt.SetTombstone(p, b.opts.RepositoryDescription.ID)
784 b.buildError = err
785 continue
786 }
787 log.Printf("removing old shard file: %s", p)
788 b.shardLog("remove", p, b.opts.RepositoryDescription.Name)
789 if err := os.Remove(p); err != nil {
790 b.buildError = err
791 }
792 }
793
794 return b.buildError
795}
796
797// BranchNamesEqual compares the given zoekt.RepositoryBranch slices, and returns true
798// iff both slices specify the same set of branch names in the same order.
799func BranchNamesEqual(a, b []zoekt.RepositoryBranch) bool {
800 if len(a) != len(b) {
801 return false
802 }
803
804 for i := range a {
805 x, y := a[i], b[i]
806 if x.Name != y.Name {
807 return false
808 }
809 }
810
811 return true
812}
813
814func (b *Builder) flush() error {
815 todo := b.todo
816 b.todo = nil
817 b.size = 0
818 b.errMu.Lock()
819 defer b.errMu.Unlock()
820 if b.buildError != nil {
821 return b.buildError
822 }
823
824 hasShard := b.nextShardNum > 0
825 if len(todo) == 0 && hasShard {
826 return nil
827 }
828
829 shard := b.nextShardNum
830 b.nextShardNum++
831
832 if b.opts.Parallelism > 1 && b.opts.MemProfile == "" {
833 b.building.Add(1)
834 go func() {
835 b.throttle <- 1
836 done, err := b.buildShard(todo, shard)
837 <-b.throttle
838
839 b.errMu.Lock()
840 defer b.errMu.Unlock()
841 if err != nil && b.buildError == nil {
842 b.buildError = err
843 }
844 if err == nil {
845 b.finishedShards[done.temp] = done.final
846 }
847 b.building.Done()
848 }()
849 } else {
850 // No goroutines when we're not parallel. This
851 // simplifies memory profiling.
852 done, err := b.buildShard(todo, shard)
853 b.buildError = err
854 if err == nil {
855 b.finishedShards[done.temp] = done.final
856 }
857 if b.opts.MemProfile != "" {
858 // drop memory, and profile.
859 todo = nil
860 b.writeMemProfile(b.opts.MemProfile)
861 }
862
863 return b.buildError
864 }
865
866 return nil
867}
868
869func (b *Builder) shardLog(action, shard string, repoName string) {
870 shard = filepath.Base(shard)
871 var shardSize int64
872 if fi, err := os.Stat(filepath.Join(b.opts.IndexDir, shard)); err == nil {
873 shardSize = fi.Size()
874 }
875 _, _ = fmt.Fprintf(b.shardLogger, "%s\t%s\t%s\t%d\t%s\n", time.Now().UTC().Format(time.RFC3339), action, shard, shardSize, repoName)
876}
877
878var profileNumber int
879
880func (b *Builder) writeMemProfile(name string) {
881 nm := fmt.Sprintf("%s.%d", name, profileNumber)
882 profileNumber++
883 f, err := os.Create(nm)
884 if err != nil {
885 log.Fatal("could not create memory profile: ", err)
886 }
887 runtime.GC() // get up-to-date statistics
888 if err := pprof.WriteHeapProfile(f); err != nil {
889 log.Fatal("could not write memory profile: ", err)
890 }
891 f.Close()
892 log.Printf("wrote mem profile %q", nm)
893}
894
895// map [0,inf) to [0,1) monotonically
896func squashRange(j int) float64 {
897 x := float64(j)
898 return x / (1 + x)
899}
900
901// IsLowPriority takes a file name and makes an educated guess about its priority
902// in search results. A file is considered low priority if it looks like a test,
903// vendored, or generated file.
904//
905// These 'priority' criteria affects how documents are ordered within a shard. It's
906// also used to help guess a file's rank when we're missing ranking information.
907func IsLowPriority(file string) bool {
908 return testRe.MatchString(file) || isGenerated(file) || isVendored(file)
909}
910
911var testRe = regexp.MustCompile("[Tt]est")
912
913func isGenerated(file string) bool {
914 return strings.HasSuffix(file, "min.js") || strings.HasSuffix(file, "js.map")
915}
916
917func isVendored(file string) bool {
918 return strings.Contains(file, "vendor/") || strings.Contains(file, "node_modules/")
919}
920
921type rankedDoc struct {
922 *zoekt.Document
923 rank []float64
924}
925
926// rank returns a vector of scores which is used at index-time to sort documents
927// before writing them to disk. The order of documents in the shard is important
928// at query time, because earlier documents receive a boost at query time and
929// have a higher chance of being searched before limits kick in.
930func rank(d *zoekt.Document, origIdx int) []float64 {
931 generated := 0.0
932 if isGenerated(d.Name) {
933 generated = 1.0
934 }
935
936 vendor := 0.0
937 if isVendored(d.Name) {
938 vendor = 1.0
939 }
940
941 test := 0.0
942 if testRe.MatchString(d.Name) {
943 test = 1.0
944 }
945
946 // Smaller is earlier (=better).
947 return []float64{
948 // Prefer docs that are not generated
949 generated,
950
951 // Prefer docs that are not vendored
952 vendor,
953
954 // Prefer docs that are not tests
955 test,
956
957 // With short names
958 squashRange(len(d.Name)),
959
960 // With many symbols
961 1.0 - squashRange(len(d.Symbols)),
962
963 // With short content
964 squashRange(len(d.Content)),
965
966 // That is present is as many branches as possible
967 1.0 - squashRange(len(d.Branches)),
968
969 // Preserve original ordering.
970 squashRange(origIdx),
971 }
972}
973
974func sortDocuments(todo []*zoekt.Document) {
975 rs := make([]rankedDoc, 0, len(todo))
976 for i, t := range todo {
977 rd := rankedDoc{t, rank(t, i)}
978 rs = append(rs, rd)
979 }
980 sort.Slice(rs, func(i, j int) bool {
981 r1 := rs[i].rank
982 r2 := rs[j].rank
983 for i := range r1 {
984 if r1[i] < r2[i] {
985 return true
986 }
987 if r1[i] > r2[i] {
988 return false
989 }
990 }
991
992 return false
993 })
994 for i := range todo {
995 todo[i] = rs[i].Document
996 }
997}
998
999func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) {
1000 if !b.opts.DisableCTags && b.opts.CTagsPath != "" {
1001 err := ctagsAddSymbols(todo, b.parser, b.opts.CTagsPath)
1002 if b.opts.CTagsMustSucceed && err != nil {
1003 return nil, err
1004 }
1005 if err != nil {
1006 log.Printf("ignoring %s error: %v", b.opts.CTagsPath, err)
1007 }
1008 }
1009
1010 name := b.opts.shardName(nextShardNum)
1011
1012 shardBuilder, err := b.newShardBuilder()
1013 if err != nil {
1014 return nil, err
1015 }
1016
1017 sortDocuments(todo)
1018
1019 for _, t := range todo {
1020 if err := shardBuilder.Add(*t); err != nil {
1021 return nil, err
1022 }
1023 }
1024
1025 return b.writeShard(name, shardBuilder)
1026}
1027
1028func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) {
1029 desc := b.opts.RepositoryDescription
1030 desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
1031 desc.SubRepoMap = b.opts.SubRepositories
1032 desc.IndexOptions = b.opts.GetHash()
1033
1034 shardBuilder, err := zoekt.NewIndexBuilder(&desc)
1035 if err != nil {
1036 return nil, err
1037 }
1038 shardBuilder.IndexTime = b.indexTime
1039 shardBuilder.ID = b.id
1040 return shardBuilder, nil
1041}
1042
1043func (b *Builder) writeShard(fn string, ib *zoekt.IndexBuilder) (*finishedShard, error) {
1044 dir := filepath.Dir(fn)
1045 if err := os.MkdirAll(dir, 0o700); err != nil {
1046 return nil, err
1047 }
1048
1049 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
1050 if err != nil {
1051 return nil, err
1052 }
1053 if runtime.GOOS != "windows" {
1054 if err := f.Chmod(0o666 &^ umask); err != nil {
1055 return nil, err
1056 }
1057 }
1058
1059 defer f.Close()
1060 if err := ib.Write(f); err != nil {
1061 return nil, err
1062 }
1063 fi, err := f.Stat()
1064 if err != nil {
1065 return nil, err
1066 }
1067 if err := f.Close(); err != nil {
1068 return nil, err
1069 }
1070
1071 log.Printf("finished %s: %d index bytes (overhead %3.1f)", fn, fi.Size(),
1072 float64(fi.Size())/float64(ib.ContentSize()+1))
1073
1074 return &finishedShard{f.Name(), fn}, nil
1075}
1076
1077type deltaBranchSetError struct {
1078 shardName string
1079 old, new []zoekt.RepositoryBranch
1080}
1081
1082func (e deltaBranchSetError) Error() string {
1083 return fmt.Sprintf("repository metadata in shard %q contains a different set of branch names than what was requested, which is unsupported in a delta shard build. old: %+v, new: %+v", e.shardName, e.old, e.new)
1084}
1085
1086type deltaIndexOptionsMismatchError struct {
1087 shardName string
1088 newOptions HashOptions
1089}
1090
1091func (e *deltaIndexOptionsMismatchError) Error() string {
1092 return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
1093}
1094
1095// umask holds the Umask of the current process
1096var umask os.FileMode