fork of https://github.com/sourcegraph/zoekt
1package main
2
3import (
4 "bytes"
5 "context"
6 "crypto/sha1"
7 "errors"
8 "fmt"
9 "io"
10 "net/url"
11 "os"
12 "os/exec"
13 "path/filepath"
14 "sort"
15 "strconv"
16 "strings"
17 "time"
18
19 sglog "github.com/sourcegraph/log"
20
21 "github.com/sourcegraph/zoekt"
22 configv1 "github.com/sourcegraph/zoekt/cmd/zoekt-sourcegraph-indexserver/grpc/protos/sourcegraph/zoekt/configuration/v1"
23 "github.com/sourcegraph/zoekt/index"
24 "github.com/sourcegraph/zoekt/internal/ctags"
25)
26
// defaultIndexingTimeout is the default upper bound on how long a single
// indexing job may run (one and a half hours).
const defaultIndexingTimeout = 90 * time.Minute
28
// IndexOptions are the options that Sourcegraph can set via its search
// configuration endpoint.
type IndexOptions struct {
	// LargeFiles is a slice of glob patterns where matching file paths should
	// be indexed regardless of their size. The pattern syntax can be found
	// here: https://golang.org/pkg/path/filepath/#Match.
	LargeFiles []string

	// Symbols, if true, makes zoekt index the output of ctags.
	Symbols bool

	// Branches is a slice of branches to index.
	Branches []zoekt.RepositoryBranch

	// RepoID is the Sourcegraph Repository ID.
	RepoID uint32

	// Name is the Repository Name.
	Name string

	// CloneURL is the internal clone URL for Name.
	CloneURL string

	// Priority indicates ranking in results, higher first.
	Priority float64

	// Public is true if the repository is public.
	Public bool

	// Fork is true if the repository is a fork.
	Fork bool

	// Archived is true if the repository is archived.
	Archived bool

	// LanguageMap maps a language to the ctags parser to use for it:
	// scip-ctags, universal-ctags, or neither.
	LanguageMap ctags.LanguageMap

	// ShardConcurrency is the number of threads to use for indexing shards.
	// Defaults to the number of available CPUs. If the server flag
	// -cpu_fraction is set, then this value overrides it.
	ShardConcurrency int32

	// TenantID is the tenant ID for the repository.
	TenantID int
}
74
// indexArgs represents the arguments we pass to zoekt-git-index. It embeds
// the IndexOptions received from Sourcegraph and adds the settings that are
// local to this index server.
type indexArgs struct {
	// IndexOptions holds the per-repository options (name, branches, ...).
	IndexOptions

	// Incremental indicates to skip indexing if already indexed.
	Incremental bool

	// IndexDir is the index directory to store the shards.
	IndexDir string

	// Parallelism is the number of shards to compute in parallel.
	Parallelism int

	// UseDelta is true if we want to use the new delta indexer. This should
	// only be true for repositories we explicitly enable.
	UseDelta bool

	// DeltaShardNumberFallbackThreshold is an upper limit on the number of
	// preexisting shards that can exist before attempting a delta build.
	DeltaShardNumberFallbackThreshold uint64

	// ShardMerging is true if we want zoekt-git-index to respect compound shards.
	ShardMerging bool
}
99
100// BuildOptions returns a index.Options represented by indexArgs. Note: it
101// doesn't set fields like repository/branch.
102func (o *indexArgs) BuildOptions() *index.Options {
103 return &index.Options{
104 // It is important that this RepositoryDescription exactly matches what
105 // the indexer we call will produce. This is to ensure that
106 // IncrementalSkipIndexing and IndexState can correctly calculate if
107 // nothing needs to be done.
108 RepositoryDescription: zoekt.Repository{
109 TenantID: o.TenantID,
110 ID: o.RepoID,
111 Name: o.Name,
112 Branches: o.Branches,
113 RawConfig: map[string]string{
114 "repoid": strconv.Itoa(int(o.RepoID)),
115 "priority": strconv.FormatFloat(o.Priority, 'g', -1, 64),
116 "public": marshalBool(o.Public),
117 "fork": marshalBool(o.Fork),
118 "archived": marshalBool(o.Archived),
119 // Calculate repo rank based on the latest commit date.
120 "latestCommitDate": "1",
121 "tenantID": strconv.Itoa(o.TenantID),
122 },
123 },
124 IndexDir: o.IndexDir,
125 Parallelism: o.Parallelism,
126 SizeMax: MaxFileSize,
127 LargeFiles: o.LargeFiles,
128 CTagsMustSucceed: o.Symbols,
129 DisableCTags: !o.Symbols,
130 IsDelta: o.UseDelta,
131
132 LanguageMap: o.LanguageMap,
133
134 ShardMerging: o.ShardMerging,
135 }
136}
137
// marshalBool encodes a boolean as the string "1" (true) or "0" (false).
func marshalBool(b bool) string {
	if !b {
		return "0"
	}
	return "1"
}
144
145func (o *indexArgs) String() string {
146 s := fmt.Sprintf("%d %s", o.RepoID, o.Name)
147 for i, b := range o.Branches {
148 if i == 0 {
149 s = fmt.Sprintf("%s@%s=%s", s, b.Name, b.Version)
150 } else {
151 s = fmt.Sprintf("%s,%s=%s", s, b.Name, b.Version)
152 }
153 }
154 return s
155}
156
// gitIndexConfig bundles the injectable dependencies and limits used by
// gitIndex.
type gitIndexConfig struct {
	// runCmd is the function that's used to execute all external commands (such as calls to "git" or "zoekt-git-index")
	// that gitIndex may construct.
	runCmd func(*exec.Cmd) error

	// findRepositoryMetadata is the function that returns the repository metadata for the
	// repository specified in args. 'ok' is false if the repository's metadata
	// couldn't be found or if an error occurred.
	//
	// The primary purpose of this configuration option is to be able to provide a stub
	// implementation for this in our test suite. All other callers should use build.Options.FindRepositoryMetadata().
	//
	// NOTE(review): this file only imports package "index" (see BuildOptions);
	// presumably the reference above means index.Options - confirm.
	findRepositoryMetadata func(args *indexArgs) (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error)

	// timeout defines how long the index server waits before killing an indexing job.
	// gitIndex applies it as a context timeout covering the whole job.
	timeout time.Duration
}
173
174func gitIndex(ctx context.Context, c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.Logger) error {
175 logger := l.Scoped("gitIndex")
176
177 if len(o.Branches) == 0 {
178 return errors.New("zoekt-git-index requires 1 or more branches")
179 }
180
181 if c.runCmd == nil {
182 return errors.New("runCmd in provided configuration was nil - a function must be provided")
183 }
184
185 if c.findRepositoryMetadata == nil {
186 return errors.New("findRepositoryMetadata in provided configuration was nil - a function must be provided")
187 }
188
189 ctx, cancel := context.WithTimeout(ctx, c.timeout)
190 defer cancel()
191
192 gitDir, err := tmpGitDir(o.Name)
193 if err != nil {
194 return err
195 }
196 defer os.RemoveAll(gitDir) // best-effort cleanup
197
198 err = fetchRepo(ctx, gitDir, o, c, logger)
199 if err != nil {
200 return err
201 }
202
203 err = setZoektConfig(ctx, gitDir, o, c)
204 if err != nil {
205 return err
206 }
207
208 err = indexRepo(ctx, gitDir, sourcegraph, o, c, logger)
209 if err != nil {
210 return err
211 }
212
213 return nil
214}
215
216func fetchRepo(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error {
217 // Create a repo to fetch into
218 cmd := exec.CommandContext(ctx, "git",
219 // use a random default branch. This is so that HEAD isn't a symref to a
220 // branch that is indexed. For example if you are indexing
221 // HEAD,master. Then HEAD would be pointing to master by default.
222 "-c", "init.defaultBranch=nonExistentBranchBB0FOFCH32",
223 "init",
224 // we don't need a working copy
225 "--bare",
226 gitDir)
227 cmd.Stdin = &bytes.Buffer{}
228 if err := c.runCmd(cmd); err != nil {
229 return err
230 }
231
232 for _, header := range []string{
233 "X-Sourcegraph-Actor-UID: internal",
234 "X-Sourcegraph-Tenant-ID: " + strconv.Itoa(o.TenantID),
235 } {
236 cmd = exec.CommandContext(ctx, "git", "-C", gitDir, "config", "--add", "http.extraHeader", header)
237 cmd.Stdin = &bytes.Buffer{}
238 if err := c.runCmd(cmd); err != nil {
239 return err
240 }
241 }
242
243 var fetchDuration time.Duration
244 successfullyFetchedCommitsCount := 0
245 allFetchesSucceeded := true
246
247 defer func() {
248 success := strconv.FormatBool(allFetchesSucceeded)
249 name := repoNameForMetric(o.Name)
250 metricFetchDuration.WithLabelValues(success, name).Observe(fetchDuration.Seconds())
251 }()
252
253 runFetch := func(branches []zoekt.RepositoryBranch) error {
254 // We shallow fetch each commit specified in zoekt.Branches. This requires
255 // the server to have configured both uploadpack.allowAnySHA1InWant and
256 // uploadpack.allowFilter. (See gitservice.go in the Sourcegraph repository)
257 fetchArgs := []string{
258 "-C", gitDir,
259 "-c", "protocol.version=2",
260 "fetch", "--depth=1", "--no-tags",
261 }
262
263 // Git's blob:limit filter excludes blobs whose size is >= the given limit,
264 // while zoekt indexes files up to and including FileLimit bytes.
265 if len(o.LargeFiles) == 0 {
266 fetchArgs = append(fetchArgs, fmt.Sprintf("--filter=blob:limit=%d", int64(MaxFileSize)+1))
267 }
268
269 fetchArgs = append(fetchArgs, o.CloneURL)
270
271 var commits []string
272 for _, b := range branches {
273 commits = append(commits, b.Version)
274 }
275
276 fetchArgs = append(fetchArgs, commits...)
277
278 cmd = exec.CommandContext(ctx, "git", fetchArgs...)
279 cmd.Stdin = &bytes.Buffer{}
280
281 start := time.Now()
282 err := c.runCmd(cmd)
283 fetchDuration += time.Since(start)
284
285 if err != nil {
286 allFetchesSucceeded = false
287 var bs []string
288 for _, b := range branches {
289 bs = append(bs, b.String())
290 }
291
292 formattedBranches := strings.Join(bs, ", ")
293 return fmt.Errorf("fetching %s: %w", formattedBranches, err)
294 }
295
296 successfullyFetchedCommitsCount += len(commits)
297 return nil
298 }
299
300 fetchPriorAndLatestCommits := func() error {
301 prior, err := priorBranches(c, o)
302 if err != nil {
303 return err
304 }
305
306 var allBranches []zoekt.RepositoryBranch
307 allBranches = append(allBranches, o.Branches...)
308 allBranches = append(allBranches, prior...)
309
310 return runFetch(allBranches)
311 }
312
313 fetchOnlyLatestCommits := func() error {
314 return runFetch(o.Branches)
315 }
316
317 if o.UseDelta {
318 err := fetchPriorAndLatestCommits()
319 if err != nil {
320 name := o.BuildOptions().RepositoryDescription.Name
321 id := o.BuildOptions().RepositoryDescription.ID
322
323 errorLog.Printf("delta build: failed to prepare delta build for %q (ID %d): failed to fetch both latest and prior commits: %s", name, id, err)
324 err = fetchOnlyLatestCommits()
325 if err != nil {
326 return err
327 }
328 }
329 } else {
330 err := fetchOnlyLatestCommits()
331 if err != nil {
332 return err
333 }
334 }
335
336 // We then create the relevant refs for each fetched commit.
337 for _, b := range o.Branches {
338 ref := b.Name
339 if ref != "HEAD" {
340 ref = "refs/heads/" + ref
341 }
342 cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "update-ref", ref, b.Version)
343 cmd.Stdin = &bytes.Buffer{}
344 if err := c.runCmd(cmd); err != nil {
345 return fmt.Errorf("failed update-ref %s to %s: %w", ref, b.Version, err)
346 }
347 }
348
349 logger.Debug("successfully fetched git data",
350 sglog.String("repo", o.Name),
351 sglog.Uint32("id", o.RepoID),
352 sglog.Int("commits_count", successfullyFetchedCommitsCount),
353 sglog.Duration("duration", fetchDuration),
354 )
355 return nil
356}
357
358func setZoektConfig(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig) error {
359 // create git configuration with options
360 type configKV struct{ Key, Value string }
361 config := []configKV{{
362 // zoekt.name is used by zoekt-git-index to set the repository name.
363 Key: "name",
364 Value: o.Name,
365 }}
366 for k, v := range o.BuildOptions().RepositoryDescription.RawConfig {
367 config = append(config, configKV{Key: k, Value: v})
368 }
369 sort.Slice(config, func(i, j int) bool {
370 return config[i].Key < config[j].Key
371 })
372
373 // write git configuration to repo
374 for _, kv := range config {
375 cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "config", "zoekt."+kv.Key, kv.Value)
376 cmd.Stdin = &bytes.Buffer{}
377 if err := c.runCmd(cmd); err != nil {
378 return err
379 }
380 }
381 return nil
382}
383
384func indexRepo(ctx context.Context, gitDir string, sourcegraph Sourcegraph, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error {
385 args := []string{
386 "-submodules=false",
387 }
388
389 // Even though we check for incremental in this process, we still pass it
390 // in just in case we regress in how we check in process. We will still
391 // notice thanks to metrics and increased load on gitserver.
392 if o.Incremental {
393 args = append(args, "-incremental")
394 }
395
396 var branches []string
397 for _, b := range o.Branches {
398 branches = append(branches, b.Name)
399 }
400 args = append(args, "-branches", strings.Join(branches, ","))
401
402 if o.UseDelta {
403 args = append(args, "-delta")
404 args = append(args, "-delta_threshold", strconv.FormatUint(o.DeltaShardNumberFallbackThreshold, 10))
405 }
406
407 if len(o.LanguageMap) > 0 {
408 var languageMap []string
409 for language, parser := range o.LanguageMap {
410 languageMap = append(languageMap, language+":"+ctags.ParserToString(parser))
411 }
412 args = append(args, "-language_map", strings.Join(languageMap, ","))
413 }
414
415 args = append(args, o.BuildOptions().Args()...)
416 args = append(args, gitDir)
417
418 cmd := exec.CommandContext(ctx, "zoekt-git-index", args...)
419 cmd.Stdin = &bytes.Buffer{}
420 if err := c.runCmd(cmd); err != nil {
421 return err
422 }
423 return nil
424}
425
426func priorBranches(c gitIndexConfig, o *indexArgs) ([]zoekt.RepositoryBranch, error) {
427 existingRepository, _, found, err := c.findRepositoryMetadata(o)
428 if err != nil {
429 return nil, fmt.Errorf("loading repository metadata: %w", err)
430 }
431
432 if !found || len(existingRepository.Branches) == 0 {
433 return nil, fmt.Errorf("no prior shards found")
434 }
435
436 return existingRepository.Branches, nil
437}
438
// tmpGitDir returns the path of the temporary git directory to use for the
// repository called name, and removes anything that already exists at that
// path. The directory itself is not created.
//
// The path is os.TempDir() joined with the query-escaped name plus ".git".
// Escaped names longer than 200 bytes are cut to 200 bytes and suffixed with
// the first 8 hex digits of the SHA-1 of the full escaped name, so distinct
// long names still map to distinct directories.
func tmpGitDir(name string) (string, error) {
	abs := url.QueryEscape(name)
	if len(abs) > 200 {
		h := sha1.New()
		// Writing to a hash.Hash never returns an error.
		_, _ = io.WriteString(h, abs)
		abs = abs[:200] + fmt.Sprintf("%x", h.Sum(nil))[:8]
	}
	dir := filepath.Join(os.TempDir(), abs+".git")

	// os.RemoveAll returns nil when the path does not exist, so no prior
	// os.Stat is needed. Calling it unconditionally also clears stale entries
	// that Stat cannot resolve (for example a dangling symlink).
	if err := os.RemoveAll(dir); err != nil {
		return "", fmt.Errorf("removing stale git dir %q: %w", dir, err)
	}
	return dir, nil
}
454
455// FromProto converts a ZoektIndexOptions proto message into an IndexOptions struct.
456func (o *IndexOptions) FromProto(x *configv1.ZoektIndexOptions) {
457 branches := make([]zoekt.RepositoryBranch, 0, len(x.Branches))
458 for _, b := range x.GetBranches() {
459 branches = append(branches, zoekt.RepositoryBranch{
460 Name: b.GetName(),
461 Version: b.GetVersion(),
462 })
463 }
464
465 languageMap := make(map[string]ctags.CTagsParserType)
466 for _, lang := range x.GetLanguageMap() {
467 languageMap[lang.GetLanguage()] = ctags.CTagsParserType(lang.GetCtags().Number())
468 }
469
470 *o = IndexOptions{
471 RepoID: uint32(x.GetRepoId()),
472 LargeFiles: x.GetLargeFiles(),
473 Symbols: x.GetSymbols(),
474 Branches: branches,
475 Name: x.GetName(),
476
477 Priority: x.GetPriority(),
478
479 Public: x.GetPublic(),
480 Fork: x.GetFork(),
481 Archived: x.GetArchived(),
482
483 LanguageMap: languageMap,
484 ShardConcurrency: x.GetShardConcurrency(),
485
486 TenantID: int(x.TenantId),
487 }
488}