// Fork of https://github.com/sourcegraph/zoekt
1package main
2
3import (
4 "bytes"
5 "context"
6 "crypto/sha1"
7 "errors"
8 "fmt"
9 "io"
10 "log"
11 "net/url"
12 "os"
13 "os/exec"
14 "path/filepath"
15 "sort"
16 "strconv"
17 "strings"
18 "time"
19
20 sglog "github.com/sourcegraph/log"
21
22 "github.com/sourcegraph/zoekt"
23 "github.com/sourcegraph/zoekt/build"
24 "github.com/sourcegraph/zoekt/ctags"
25 "github.com/sourcegraph/zoekt/internal/tenant"
26)
27
// defaultIndexingTimeout is the default time limit for an indexing job: one
// and a half hours.
const defaultIndexingTimeout = 90 * time.Minute
29
30// IndexOptions are the options that Sourcegraph can set via it's search
31// configuration endpoint.
32type IndexOptions struct {
33 // LargeFiles is a slice of glob patterns where matching file paths should
34 // be indexed regardless of their size. The pattern syntax can be found
35 // here: https://golang.org/pkg/path/filepath/#Match.
36 LargeFiles []string
37
38 // Symbols if true will make zoekt index the output of ctags.
39 Symbols bool
40
41 // Branches is a slice of branches to index.
42 Branches []zoekt.RepositoryBranch
43
44 // RepoID is the Sourcegraph Repository ID.
45 RepoID uint32
46
47 // Name is the Repository Name.
48 Name string
49
50 // CloneURL is the internal clone URL for Name.
51 CloneURL string
52
53 // Priority indicates ranking in results, higher first.
54 Priority float64
55
56 // Public is true if the repository is public.
57 Public bool
58
59 // Fork is true if the repository is a fork.
60 Fork bool
61
62 // Archived is true if the repository is archived.
63 Archived bool
64
65 // Map from language to scip-ctags, universal-ctags, or neither
66 LanguageMap ctags.LanguageMap
67
68 // The number of threads to use for indexing shards. Defaults to the number of available
69 // CPUs. If the server flag -cpu_fraction is set, then this value overrides it.
70 ShardConcurrency int32
71
72 // TenantID is the tenant ID for the repository.
73 TenantID int
74}
75
76// indexArgs represents the arguments we pass to zoekt-git-index
77type indexArgs struct {
78 IndexOptions
79
80 // Incremental indicates to skip indexing if already indexed.
81 Incremental bool
82
83 // IndexDir is the index directory to store the shards.
84 IndexDir string
85
86 // Parallelism is the number of shards to compute in parallel.
87 Parallelism int
88
89 // FileLimit is the maximum size of a file
90 FileLimit int
91
92 // UseDelta is true if we want to use the new delta indexer. This should
93 // only be true for repositories we explicitly enable.
94 UseDelta bool
95
96 // DeltaShardNumberFallbackThreshold is an upper limit on the number of preexisting shards that can exist
97 // before attempting a delta build.
98 DeltaShardNumberFallbackThreshold uint64
99
100 // ShardMerging is true if we want zoekt-git-index to respect compound shards.
101 ShardMerging bool
102}
103
104// BuildOptions returns a build.Options represented by indexArgs. Note: it
105// doesn't set fields like repository/branch.
106func (o *indexArgs) BuildOptions() *build.Options {
107 shardPrefix := ""
108 if tenant.EnforceTenant() {
109 shardPrefix = tenant.SrcPrefix(o.TenantID, o.RepoID)
110 }
111
112 return &build.Options{
113 // It is important that this RepositoryDescription exactly matches what
114 // the indexer we call will produce. This is to ensure that
115 // IncrementalSkipIndexing and IndexState can correctly calculate if
116 // nothing needs to be done.
117 RepositoryDescription: zoekt.Repository{
118 TenantID: o.TenantID,
119 ID: uint32(o.IndexOptions.RepoID),
120 Name: o.Name,
121 Branches: o.Branches,
122 RawConfig: map[string]string{
123 "repoid": strconv.Itoa(int(o.IndexOptions.RepoID)),
124 "priority": strconv.FormatFloat(o.Priority, 'g', -1, 64),
125 "public": marshalBool(o.Public),
126 "fork": marshalBool(o.Fork),
127 "archived": marshalBool(o.Archived),
128 // Calculate repo rank based on the latest commit date.
129 "latestCommitDate": "1",
130 "tenantID": strconv.Itoa(o.TenantID),
131 },
132 },
133 IndexDir: o.IndexDir,
134 Parallelism: o.Parallelism,
135 SizeMax: o.FileLimit,
136 LargeFiles: o.LargeFiles,
137 CTagsMustSucceed: o.Symbols,
138 DisableCTags: !o.Symbols,
139 IsDelta: o.UseDelta,
140
141 LanguageMap: o.LanguageMap,
142
143 ShardMerging: o.ShardMerging,
144
145 ShardPrefix: shardPrefix,
146 }
147}
148
// marshalBool encodes b as the string "1" when it is true and "0" when it is
// false.
func marshalBool(b bool) string {
	switch b {
	case true:
		return "1"
	default:
		return "0"
	}
}
155
156func (o *indexArgs) String() string {
157 s := fmt.Sprintf("%d %s", o.RepoID, o.Name)
158 for i, b := range o.Branches {
159 if i == 0 {
160 s = fmt.Sprintf("%s@%s=%s", s, b.Name, b.Version)
161 } else {
162 s = fmt.Sprintf("%s,%s=%s", s, b.Name, b.Version)
163 }
164 }
165 return s
166}
167
168type gitIndexConfig struct {
169 // runCmd is the function that's used to execute all external commands (such as calls to "git" or "zoekt-git-index")
170 // that gitIndex may construct.
171 runCmd func(*exec.Cmd) error
172
173 // findRepositoryMetadata is the function that returns the repository metadata for the
174 // repository specified in args. 'ok' is false if the repository's metadata
175 // couldn't be found or if an error occurred.
176 //
177 // The primary purpose of this configuration option is to be able to provide a stub
178 // implementation for this in our test suite. All other callers should use build.Options.FindRepositoryMetadata().
179 findRepositoryMetadata func(args *indexArgs) (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error)
180
181 // timeout defines how long the index server waits before killing an indexing job.
182 timeout time.Duration
183}
184
185func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.Logger) error {
186 logger := l.Scoped("gitIndex")
187
188 if len(o.Branches) == 0 {
189 return errors.New("zoekt-git-index requires 1 or more branches")
190 }
191
192 if c.runCmd == nil {
193 return errors.New("runCmd in provided configuration was nil - a function must be provided")
194 }
195
196 if c.findRepositoryMetadata == nil {
197 return errors.New("findRepositoryMetadata in provided configuration was nil - a function must be provided")
198 }
199
200 ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
201 defer cancel()
202
203 gitDir, err := tmpGitDir(o.Name)
204 if err != nil {
205 return err
206 }
207 defer os.RemoveAll(gitDir) // best-effort cleanup
208
209 err = fetchRepo(ctx, gitDir, o, c, logger)
210 if err != nil {
211 return err
212 }
213
214 err = setZoektConfig(ctx, gitDir, o, c)
215 if err != nil {
216 return err
217 }
218
219 err = indexRepo(ctx, gitDir, sourcegraph, o, c, logger)
220 if err != nil {
221 return err
222 }
223
224 return nil
225}
226
227func fetchRepo(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error {
228 // Create a repo to fetch into
229 cmd := exec.CommandContext(ctx, "git",
230 // use a random default branch. This is so that HEAD isn't a symref to a
231 // branch that is indexed. For example if you are indexing
232 // HEAD,master. Then HEAD would be pointing to master by default.
233 "-c", "init.defaultBranch=nonExistentBranchBB0FOFCH32",
234 "init",
235 // we don't need a working copy
236 "--bare",
237 gitDir)
238 cmd.Stdin = &bytes.Buffer{}
239 if err := c.runCmd(cmd); err != nil {
240 return err
241 }
242
243 var fetchDuration time.Duration
244 successfullyFetchedCommitsCount := 0
245 allFetchesSucceeded := true
246
247 defer func() {
248 success := strconv.FormatBool(allFetchesSucceeded)
249 name := repoNameForMetric(o.Name)
250 metricFetchDuration.WithLabelValues(success, name).Observe(fetchDuration.Seconds())
251 }()
252
253 runFetch := func(branches []zoekt.RepositoryBranch) error {
254 // We shallow fetch each commit specified in zoekt.Branches. This requires
255 // the server to have configured both uploadpack.allowAnySHA1InWant and
256 // uploadpack.allowFilter. (See gitservice.go in the Sourcegraph repository)
257 fetchArgs := []string{
258 "-C", gitDir,
259 "-c", "protocol.version=2",
260 "-c", "http.extraHeader=X-Sourcegraph-Actor-UID: internal",
261 "-c", "http.extraHeader=X-Sourcegraph-Tenant-ID: " + strconv.Itoa(o.TenantID),
262 "fetch", "--depth=1", "--no-tags",
263 }
264
265 // If there are no exceptions to MaxFileSize (1MB), we can avoid fetching these large files.
266 if len(o.LargeFiles) == 0 {
267 fetchArgs = append(fetchArgs, "--filter=blob:limit=1m")
268 }
269
270 fetchArgs = append(fetchArgs, o.CloneURL)
271
272 var commits []string
273 for _, b := range branches {
274 commits = append(commits, b.Version)
275 }
276
277 fetchArgs = append(fetchArgs, commits...)
278
279 cmd = exec.CommandContext(ctx, "git", fetchArgs...)
280 cmd.Stdin = &bytes.Buffer{}
281
282 start := time.Now()
283 err := c.runCmd(cmd)
284 fetchDuration += time.Since(start)
285
286 if err != nil {
287 allFetchesSucceeded = false
288 var bs []string
289 for _, b := range branches {
290 bs = append(bs, b.String())
291 }
292
293 formattedBranches := strings.Join(bs, ", ")
294 return fmt.Errorf("fetching %s: %w", formattedBranches, err)
295 }
296
297 successfullyFetchedCommitsCount += len(commits)
298 return nil
299 }
300
301 fetchPriorAndLatestCommits := func() error {
302 prior, err := priorBranches(c, o)
303 if err != nil {
304 return err
305 }
306
307 var allBranches []zoekt.RepositoryBranch
308 allBranches = append(allBranches, o.Branches...)
309 allBranches = append(allBranches, prior...)
310
311 return runFetch(allBranches)
312 }
313
314 fetchOnlyLatestCommits := func() error {
315 return runFetch(o.Branches)
316 }
317
318 if o.UseDelta {
319 err := fetchPriorAndLatestCommits()
320 if err != nil {
321 name := o.BuildOptions().RepositoryDescription.Name
322 id := o.BuildOptions().RepositoryDescription.ID
323
324 log.Printf("delta build: failed to prepare delta build for %q (ID %d): failed to fetch both latest and prior commits: %s", name, id, err)
325 err = fetchOnlyLatestCommits()
326 if err != nil {
327 return err
328 }
329 }
330 } else {
331 err := fetchOnlyLatestCommits()
332 if err != nil {
333 return err
334 }
335 }
336
337 // We then create the relevant refs for each fetched commit.
338 for _, b := range o.Branches {
339 ref := b.Name
340 if ref != "HEAD" {
341 ref = "refs/heads/" + ref
342 }
343 cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "update-ref", ref, b.Version)
344 cmd.Stdin = &bytes.Buffer{}
345 if err := c.runCmd(cmd); err != nil {
346 return fmt.Errorf("failed update-ref %s to %s: %w", ref, b.Version, err)
347 }
348 }
349
350 logger.Debug("successfully fetched git data",
351 sglog.String("repo", o.Name),
352 sglog.Uint32("id", o.RepoID),
353 sglog.Int("commits_count", successfullyFetchedCommitsCount),
354 sglog.Duration("duration", fetchDuration),
355 )
356 return nil
357}
358
359func setZoektConfig(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig) error {
360 // create git configuration with options
361 type configKV struct{ Key, Value string }
362 config := []configKV{{
363 // zoekt.name is used by zoekt-git-index to set the repository name.
364 Key: "name",
365 Value: o.Name,
366 }}
367 for k, v := range o.BuildOptions().RepositoryDescription.RawConfig {
368 config = append(config, configKV{Key: k, Value: v})
369 }
370 sort.Slice(config, func(i, j int) bool {
371 return config[i].Key < config[j].Key
372 })
373
374 // write git configuration to repo
375 for _, kv := range config {
376 cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "config", "zoekt."+kv.Key, kv.Value)
377 cmd.Stdin = &bytes.Buffer{}
378 if err := c.runCmd(cmd); err != nil {
379 return err
380 }
381 }
382 return nil
383}
384
385func indexRepo(ctx context.Context, gitDir string, sourcegraph Sourcegraph, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error {
386 args := []string{
387 "-submodules=false",
388 }
389
390 // Even though we check for incremental in this process, we still pass it
391 // in just in case we regress in how we check in process. We will still
392 // notice thanks to metrics and increased load on gitserver.
393 if o.Incremental {
394 args = append(args, "-incremental")
395 }
396
397 var branches []string
398 for _, b := range o.Branches {
399 branches = append(branches, b.Name)
400 }
401 args = append(args, "-branches", strings.Join(branches, ","))
402
403 if o.UseDelta {
404 args = append(args, "-delta")
405 args = append(args, "-delta_threshold", strconv.FormatUint(o.DeltaShardNumberFallbackThreshold, 10))
406 }
407
408 if len(o.LanguageMap) > 0 {
409 var languageMap []string
410 for language, parser := range o.LanguageMap {
411 languageMap = append(languageMap, language+":"+ctags.ParserToString(parser))
412 }
413 args = append(args, "-language_map", strings.Join(languageMap, ","))
414 }
415
416 args = append(args, o.BuildOptions().Args()...)
417 args = append(args, gitDir)
418
419 cmd := exec.CommandContext(ctx, "zoekt-git-index", args...)
420 cmd.Stdin = &bytes.Buffer{}
421 if err := c.runCmd(cmd); err != nil {
422 return err
423 }
424 return nil
425}
426
427func priorBranches(c gitIndexConfig, o *indexArgs) ([]zoekt.RepositoryBranch, error) {
428 existingRepository, _, found, err := c.findRepositoryMetadata(o)
429 if err != nil {
430 return nil, fmt.Errorf("loading repository metadata: %w", err)
431 }
432
433 if !found || len(existingRepository.Branches) == 0 {
434 return nil, fmt.Errorf("no prior shards found")
435 }
436
437 return existingRepository.Branches, nil
438}
439
// tmpGitDir returns the path, inside os.TempDir(), of the scratch git
// directory used for the repository called name. The path is not created;
// if something already exists at it, that is removed first.
//
// The directory name is the query-escaped repository name plus ".git".
// Escaped names longer than 200 bytes are cut to 200 bytes and followed by
// the first 8 hex digits of the SHA-1 of the full escaped name.
func tmpGitDir(name string) (string, error) {
	const maxNameLen = 200

	escaped := url.QueryEscape(name)
	if len(escaped) > maxNameLen {
		digest := sha1.New()
		_, _ = io.WriteString(digest, escaped)
		suffix := fmt.Sprintf("%x", digest.Sum(nil))[:8]
		escaped = escaped[:maxNameLen] + suffix
	}

	dir := filepath.Join(os.TempDir(), escaped+".git")

	// Only attempt a removal when the path can be stat'ed.
	if _, statErr := os.Stat(dir); statErr != nil {
		return dir, nil
	}
	if rmErr := os.RemoveAll(dir); rmErr != nil {
		return "", rmErr
	}
	return dir, nil
}