fork of https://github.com/sourcegraph/zoekt
1package main
2
3import (
4 "bytes"
5 "context"
6 "crypto/sha1"
7 "errors"
8 "fmt"
9 "io"
10 "net/url"
11 "os"
12 "os/exec"
13 "path/filepath"
14 "sort"
15 "strconv"
16 "strings"
17 "time"
18
19 sglog "github.com/sourcegraph/log"
20 "github.com/sourcegraph/zoekt"
21 "github.com/sourcegraph/zoekt/index"
22 "github.com/sourcegraph/zoekt/internal/ctags"
23 "github.com/sourcegraph/zoekt/internal/tenant"
24)
25
// defaultIndexingTimeout is a duration of one and a half hours.
// NOTE(review): presumably the default value for gitIndexConfig.timeout when no
// explicit indexing timeout is configured — the use site is not in this
// section; confirm.
const defaultIndexingTimeout = 1*time.Hour + 30*time.Minute
27
// IndexOptions are the options that Sourcegraph can set via its search
// configuration endpoint.
type IndexOptions struct {
	// LargeFiles is a slice of glob patterns where matching file paths should
	// be indexed regardless of their size. The pattern syntax can be found
	// here: https://golang.org/pkg/path/filepath/#Match.
	//
	// When this slice is empty, fetchRepo adds a blob size filter to the git
	// fetch so that large blobs are not downloaded at all.
	LargeFiles []string

	// Symbols if true will make zoekt index the output of ctags.
	Symbols bool

	// Branches is a slice of branches to index.
	Branches []zoekt.RepositoryBranch

	// RepoID is the Sourcegraph Repository ID.
	RepoID uint32

	// Name is the Repository Name.
	Name string

	// CloneURL is the internal clone URL for Name.
	CloneURL string

	// Priority indicates ranking in results, higher first.
	Priority float64

	// Public is true if the repository is public.
	Public bool

	// Fork is true if the repository is a fork.
	Fork bool

	// Archived is true if the repository is archived.
	Archived bool

	// LanguageMap maps a language to the symbol parser to use for it:
	// scip-ctags, universal-ctags, or neither.
	LanguageMap ctags.LanguageMap

	// ShardConcurrency is the number of threads to use for indexing shards.
	// Defaults to the number of available CPUs. If the server flag
	// -cpu_fraction is set, then this value overrides it.
	ShardConcurrency int32

	// TenantID is the tenant ID for the repository.
	TenantID int
}
73
// indexArgs represents the arguments we pass to zoekt-git-index. It embeds the
// Sourcegraph-provided IndexOptions and adds settings that are local to this
// process.
type indexArgs struct {
	IndexOptions

	// Incremental indicates to skip indexing if already indexed. When set,
	// indexRepo passes -incremental to zoekt-git-index.
	Incremental bool

	// IndexDir is the index directory to store the shards.
	IndexDir string

	// Parallelism is the number of shards to compute in parallel.
	Parallelism int

	// FileLimit is the maximum size of a file. BuildOptions passes it on as
	// index.Options.SizeMax.
	FileLimit int

	// UseDelta is true if we want to use the new delta indexer. This should
	// only be true for repositories we explicitly enable.
	UseDelta bool

	// DeltaShardNumberFallbackThreshold is an upper limit on the number of preexisting shards that can exist
	// before attempting a delta build. It is only passed to zoekt-git-index
	// (as -delta_threshold) when UseDelta is true.
	DeltaShardNumberFallbackThreshold uint64

	// ShardMerging is true if we want zoekt-git-index to respect compound shards.
	ShardMerging bool
}
101
102// BuildOptions returns a index.Options represented by indexArgs. Note: it
103// doesn't set fields like repository/branch.
104func (o *indexArgs) BuildOptions() *index.Options {
105 shardPrefix := ""
106 if tenant.EnforceTenant() {
107 shardPrefix = tenant.SrcPrefix(o.TenantID, o.RepoID)
108 }
109
110 return &index.Options{
111 // It is important that this RepositoryDescription exactly matches what
112 // the indexer we call will produce. This is to ensure that
113 // IncrementalSkipIndexing and IndexState can correctly calculate if
114 // nothing needs to be done.
115 RepositoryDescription: zoekt.Repository{
116 TenantID: o.TenantID,
117 ID: uint32(o.IndexOptions.RepoID),
118 Name: o.Name,
119 Branches: o.Branches,
120 RawConfig: map[string]string{
121 "repoid": strconv.Itoa(int(o.IndexOptions.RepoID)),
122 "priority": strconv.FormatFloat(o.Priority, 'g', -1, 64),
123 "public": marshalBool(o.Public),
124 "fork": marshalBool(o.Fork),
125 "archived": marshalBool(o.Archived),
126 // Calculate repo rank based on the latest commit date.
127 "latestCommitDate": "1",
128 "tenantID": strconv.Itoa(o.TenantID),
129 },
130 },
131 IndexDir: o.IndexDir,
132 Parallelism: o.Parallelism,
133 SizeMax: o.FileLimit,
134 LargeFiles: o.LargeFiles,
135 CTagsMustSucceed: o.Symbols,
136 DisableCTags: !o.Symbols,
137 IsDelta: o.UseDelta,
138
139 LanguageMap: o.LanguageMap,
140
141 ShardMerging: o.ShardMerging,
142
143 ShardPrefix: shardPrefix,
144 }
145}
146
// marshalBool encodes b as the string "1" for true and "0" for false, the
// representation used for boolean values in the repository's RawConfig.
func marshalBool(b bool) string {
	encoded := "0"
	if b {
		encoded = "1"
	}
	return encoded
}
153
154func (o *indexArgs) String() string {
155 s := fmt.Sprintf("%d %s", o.RepoID, o.Name)
156 for i, b := range o.Branches {
157 if i == 0 {
158 s = fmt.Sprintf("%s@%s=%s", s, b.Name, b.Version)
159 } else {
160 s = fmt.Sprintf("%s,%s=%s", s, b.Name, b.Version)
161 }
162 }
163 return s
164}
165
// gitIndexConfig bundles the injectable dependencies and limits that gitIndex
// uses. gitIndex returns an error if runCmd or findRepositoryMetadata is nil.
type gitIndexConfig struct {
	// runCmd is the function that's used to execute all external commands (such as calls to "git" or "zoekt-git-index")
	// that gitIndex may construct.
	runCmd func(*exec.Cmd) error

	// findRepositoryMetadata is the function that returns the repository metadata for the
	// repository specified in args. 'ok' is false if the repository's metadata
	// couldn't be found or if an error occurred.
	//
	// The primary purpose of this configuration option is to be able to provide a stub
	// implementation for this in our test suite. All other callers should use build.Options.FindRepositoryMetadata().
	//
	// NOTE(review): this file builds its options as index.Options, so the
	// "build.Options" reference above may be stale — confirm the current name.
	findRepositoryMetadata func(args *indexArgs) (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error)

	// timeout defines how long the index server waits before killing an indexing job.
	// gitIndex applies it as the deadline of the context under which all of
	// its commands run.
	timeout time.Duration
}
182
183func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.Logger) error {
184 logger := l.Scoped("gitIndex")
185
186 if len(o.Branches) == 0 {
187 return errors.New("zoekt-git-index requires 1 or more branches")
188 }
189
190 if c.runCmd == nil {
191 return errors.New("runCmd in provided configuration was nil - a function must be provided")
192 }
193
194 if c.findRepositoryMetadata == nil {
195 return errors.New("findRepositoryMetadata in provided configuration was nil - a function must be provided")
196 }
197
198 ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
199 defer cancel()
200
201 gitDir, err := tmpGitDir(o.Name)
202 if err != nil {
203 return err
204 }
205 defer os.RemoveAll(gitDir) // best-effort cleanup
206
207 err = fetchRepo(ctx, gitDir, o, c, logger)
208 if err != nil {
209 return err
210 }
211
212 err = setZoektConfig(ctx, gitDir, o, c)
213 if err != nil {
214 return err
215 }
216
217 err = indexRepo(ctx, gitDir, sourcegraph, o, c, logger)
218 if err != nil {
219 return err
220 }
221
222 return nil
223}
224
225func fetchRepo(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error {
226 // Create a repo to fetch into
227 cmd := exec.CommandContext(ctx, "git",
228 // use a random default branch. This is so that HEAD isn't a symref to a
229 // branch that is indexed. For example if you are indexing
230 // HEAD,master. Then HEAD would be pointing to master by default.
231 "-c", "init.defaultBranch=nonExistentBranchBB0FOFCH32",
232 "init",
233 // we don't need a working copy
234 "--bare",
235 gitDir)
236 cmd.Stdin = &bytes.Buffer{}
237 if err := c.runCmd(cmd); err != nil {
238 return err
239 }
240
241 var fetchDuration time.Duration
242 successfullyFetchedCommitsCount := 0
243 allFetchesSucceeded := true
244
245 defer func() {
246 success := strconv.FormatBool(allFetchesSucceeded)
247 name := repoNameForMetric(o.Name)
248 metricFetchDuration.WithLabelValues(success, name).Observe(fetchDuration.Seconds())
249 }()
250
251 runFetch := func(branches []zoekt.RepositoryBranch) error {
252 // We shallow fetch each commit specified in zoekt.Branches. This requires
253 // the server to have configured both uploadpack.allowAnySHA1InWant and
254 // uploadpack.allowFilter. (See gitservice.go in the Sourcegraph repository)
255 fetchArgs := []string{
256 "-C", gitDir,
257 "-c", "protocol.version=2",
258 "-c", "http.extraHeader=X-Sourcegraph-Actor-UID: internal",
259 "-c", "http.extraHeader=X-Sourcegraph-Tenant-ID: " + strconv.Itoa(o.TenantID),
260 "fetch", "--depth=1", "--no-tags",
261 }
262
263 // If there are no exceptions to MaxFileSize (1MB), we can avoid fetching these large files.
264 if len(o.LargeFiles) == 0 {
265 fetchArgs = append(fetchArgs, "--filter=blob:limit=1m")
266 }
267
268 fetchArgs = append(fetchArgs, o.CloneURL)
269
270 var commits []string
271 for _, b := range branches {
272 commits = append(commits, b.Version)
273 }
274
275 fetchArgs = append(fetchArgs, commits...)
276
277 cmd = exec.CommandContext(ctx, "git", fetchArgs...)
278 cmd.Stdin = &bytes.Buffer{}
279
280 start := time.Now()
281 err := c.runCmd(cmd)
282 fetchDuration += time.Since(start)
283
284 if err != nil {
285 allFetchesSucceeded = false
286 var bs []string
287 for _, b := range branches {
288 bs = append(bs, b.String())
289 }
290
291 formattedBranches := strings.Join(bs, ", ")
292 return fmt.Errorf("fetching %s: %w", formattedBranches, err)
293 }
294
295 successfullyFetchedCommitsCount += len(commits)
296 return nil
297 }
298
299 fetchPriorAndLatestCommits := func() error {
300 prior, err := priorBranches(c, o)
301 if err != nil {
302 return err
303 }
304
305 var allBranches []zoekt.RepositoryBranch
306 allBranches = append(allBranches, o.Branches...)
307 allBranches = append(allBranches, prior...)
308
309 return runFetch(allBranches)
310 }
311
312 fetchOnlyLatestCommits := func() error {
313 return runFetch(o.Branches)
314 }
315
316 if o.UseDelta {
317 err := fetchPriorAndLatestCommits()
318 if err != nil {
319 name := o.BuildOptions().RepositoryDescription.Name
320 id := o.BuildOptions().RepositoryDescription.ID
321
322 errorLog.Printf("delta build: failed to prepare delta build for %q (ID %d): failed to fetch both latest and prior commits: %s", name, id, err)
323 err = fetchOnlyLatestCommits()
324 if err != nil {
325 return err
326 }
327 }
328 } else {
329 err := fetchOnlyLatestCommits()
330 if err != nil {
331 return err
332 }
333 }
334
335 // We then create the relevant refs for each fetched commit.
336 for _, b := range o.Branches {
337 ref := b.Name
338 if ref != "HEAD" {
339 ref = "refs/heads/" + ref
340 }
341 cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "update-ref", ref, b.Version)
342 cmd.Stdin = &bytes.Buffer{}
343 if err := c.runCmd(cmd); err != nil {
344 return fmt.Errorf("failed update-ref %s to %s: %w", ref, b.Version, err)
345 }
346 }
347
348 logger.Debug("successfully fetched git data",
349 sglog.String("repo", o.Name),
350 sglog.Uint32("id", o.RepoID),
351 sglog.Int("commits_count", successfullyFetchedCommitsCount),
352 sglog.Duration("duration", fetchDuration),
353 )
354 return nil
355}
356
357func setZoektConfig(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig) error {
358 // create git configuration with options
359 type configKV struct{ Key, Value string }
360 config := []configKV{{
361 // zoekt.name is used by zoekt-git-index to set the repository name.
362 Key: "name",
363 Value: o.Name,
364 }}
365 for k, v := range o.BuildOptions().RepositoryDescription.RawConfig {
366 config = append(config, configKV{Key: k, Value: v})
367 }
368 sort.Slice(config, func(i, j int) bool {
369 return config[i].Key < config[j].Key
370 })
371
372 // write git configuration to repo
373 for _, kv := range config {
374 cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "config", "zoekt."+kv.Key, kv.Value)
375 cmd.Stdin = &bytes.Buffer{}
376 if err := c.runCmd(cmd); err != nil {
377 return err
378 }
379 }
380 return nil
381}
382
383func indexRepo(ctx context.Context, gitDir string, sourcegraph Sourcegraph, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error {
384 args := []string{
385 "-submodules=false",
386 }
387
388 // Even though we check for incremental in this process, we still pass it
389 // in just in case we regress in how we check in process. We will still
390 // notice thanks to metrics and increased load on gitserver.
391 if o.Incremental {
392 args = append(args, "-incremental")
393 }
394
395 var branches []string
396 for _, b := range o.Branches {
397 branches = append(branches, b.Name)
398 }
399 args = append(args, "-branches", strings.Join(branches, ","))
400
401 if o.UseDelta {
402 args = append(args, "-delta")
403 args = append(args, "-delta_threshold", strconv.FormatUint(o.DeltaShardNumberFallbackThreshold, 10))
404 }
405
406 if len(o.LanguageMap) > 0 {
407 var languageMap []string
408 for language, parser := range o.LanguageMap {
409 languageMap = append(languageMap, language+":"+ctags.ParserToString(parser))
410 }
411 args = append(args, "-language_map", strings.Join(languageMap, ","))
412 }
413
414 args = append(args, o.BuildOptions().Args()...)
415 args = append(args, gitDir)
416
417 cmd := exec.CommandContext(ctx, "zoekt-git-index", args...)
418 cmd.Stdin = &bytes.Buffer{}
419 if err := c.runCmd(cmd); err != nil {
420 return err
421 }
422 return nil
423}
424
425func priorBranches(c gitIndexConfig, o *indexArgs) ([]zoekt.RepositoryBranch, error) {
426 existingRepository, _, found, err := c.findRepositoryMetadata(o)
427 if err != nil {
428 return nil, fmt.Errorf("loading repository metadata: %w", err)
429 }
430
431 if !found || len(existingRepository.Branches) == 0 {
432 return nil, fmt.Errorf("no prior shards found")
433 }
434
435 return existingRepository.Branches, nil
436}
437
// tmpGitDir returns the path of a scratch "<escaped name>.git" directory under
// os.TempDir() for the repository called name. The name is query-escaped; an
// escaped name longer than 200 bytes is cut to 200 bytes and suffixed with the
// first 8 hex digits of the SHA-1 of the full escaped name.
//
// The directory is not created. If something already exists at the path it is
// removed first, and a failure to remove it is returned as the error.
func tmpGitDir(name string) (string, error) {
	const maxEscapedLen = 200

	base := url.QueryEscape(name)
	if len(base) > maxEscapedLen {
		digest := sha1.New()
		_, _ = io.WriteString(digest, base) // a hash.Hash Write never returns an error
		hexSum := fmt.Sprintf("%x", digest.Sum(nil))
		base = base[:maxEscapedLen] + hexSum[:8]
	}

	dir := filepath.Join(os.TempDir(), base+".git")

	// Only clean up when Stat succeeds; any Stat error leaves the path as is.
	if _, statErr := os.Stat(dir); statErr != nil {
		return dir, nil
	}
	if err := os.RemoveAll(dir); err != nil {
		return "", err
	}
	return dir, nil
}
452}