fork of https://github.com/sourcegraph/zoekt
1package main
2
3import (
4 "bytes"
5 "context"
6 "crypto/sha1"
7 "encoding/json"
8 "errors"
9 "fmt"
10 "io"
11 "log"
12 "net/url"
13 "os"
14 "os/exec"
15 "path/filepath"
16 "sort"
17 "strconv"
18 "strings"
19 "time"
20
21 "github.com/sourcegraph/zoekt"
22 "github.com/sourcegraph/zoekt/build"
23 "github.com/sourcegraph/zoekt/ctags"
24
25 sglog "github.com/sourcegraph/log"
26)
27
// defaultIndexingTimeout is 90 minutes.
//
// NOTE(review): presumably the default value for gitIndexConfig.timeout; the
// place where it is assigned is not visible in this file — confirm.
const defaultIndexingTimeout = 90 * time.Minute
29
// IndexOptions are the options that Sourcegraph can set via its search
// configuration endpoint.
type IndexOptions struct {
	// LargeFiles is a slice of glob patterns where matching file paths should
	// be indexed regardless of their size. The pattern syntax can be found
	// here: https://golang.org/pkg/path/filepath/#Match.
	LargeFiles []string

	// Symbols if true will make zoekt index the output of ctags.
	Symbols bool

	// Branches is a slice of branches to index.
	Branches []zoekt.RepositoryBranch

	// RepoID is the Sourcegraph Repository ID.
	RepoID uint32

	// Name is the Repository Name.
	Name string

	// CloneURL is the internal clone URL for Name.
	CloneURL string

	// Priority indicates ranking in results, higher first.
	Priority float64

	// DocumentRanksVersion when non-empty will lead to indexing using offline
	// ranking. When the string changes this will also cause us to re-index with
	// new ranks.
	DocumentRanksVersion string

	// Public is true if the repository is public.
	Public bool

	// Fork is true if the repository is a fork.
	Fork bool

	// Archived is true if the repository is archived.
	Archived bool

	// LanguageMap maps a language to the symbol parser to use for it:
	// scip-ctags, universal-ctags, or neither.
	LanguageMap ctags.LanguageMap

	// ShardConcurrency is the number of threads to use for indexing shards.
	// Defaults to the number of available CPUs. If the server flag
	// -cpu_fraction is set, then this value overrides it.
	ShardConcurrency int32
}
77
// indexArgs represents the arguments we pass to zoekt-git-index. It embeds
// the Sourcegraph-controlled IndexOptions and adds the settings that are
// local to this index server.
type indexArgs struct {
	IndexOptions

	// Incremental indicates to skip indexing if already indexed. gitIndex
	// forwards it to zoekt-git-index as the -incremental flag.
	Incremental bool

	// IndexDir is the index directory to store the shards.
	IndexDir string

	// Parallelism is the number of shards to compute in parallel.
	Parallelism int

	// FileLimit is the maximum size of a file. BuildOptions passes it on as
	// build.Options.SizeMax.
	FileLimit int

	// UseDelta is true if we want to use the new delta indexer. This should
	// only be true for repositories we explicitly enable.
	UseDelta bool

	// DeltaShardNumberFallbackThreshold is an upper limit on the number of preexisting shards that can exist
	// before attempting a delta build. gitIndex forwards it as -delta_threshold
	// when UseDelta is set.
	DeltaShardNumberFallbackThreshold uint64

	// ShardMerging is true if we want zoekt-git-index to respect compound shards.
	ShardMerging bool
}
105
106// BuildOptions returns a build.Options represented by indexArgs. Note: it
107// doesn't set fields like repository/branch.
108func (o *indexArgs) BuildOptions() *build.Options {
109 return &build.Options{
110 // It is important that this RepositoryDescription exactly matches what
111 // the indexer we call will produce. This is to ensure that
112 // IncrementalSkipIndexing and IndexState can correctly calculate if
113 // nothing needs to be done.
114 RepositoryDescription: zoekt.Repository{
115 ID: uint32(o.IndexOptions.RepoID),
116 Name: o.Name,
117 Branches: o.Branches,
118 RawConfig: map[string]string{
119 "repoid": strconv.Itoa(int(o.IndexOptions.RepoID)),
120 "priority": strconv.FormatFloat(o.Priority, 'g', -1, 64),
121 "public": marshalBool(o.Public),
122 "fork": marshalBool(o.Fork),
123 "archived": marshalBool(o.Archived),
124 },
125 },
126 IndexDir: o.IndexDir,
127 Parallelism: o.Parallelism,
128 SizeMax: o.FileLimit,
129 LargeFiles: o.LargeFiles,
130 CTagsMustSucceed: o.Symbols,
131 DisableCTags: !o.Symbols,
132 IsDelta: o.UseDelta,
133
134 DocumentRanksVersion: o.DocumentRanksVersion,
135
136 LanguageMap: o.LanguageMap,
137
138 ShardMerging: o.ShardMerging,
139 }
140}
141
// marshalBool encodes a boolean as "1" (true) or "0" (false).
func marshalBool(b bool) string {
	encoded := "0"
	if b {
		encoded = "1"
	}
	return encoded
}
148
149func (o *indexArgs) String() string {
150 s := fmt.Sprintf("%d %s", o.RepoID, o.Name)
151 for i, b := range o.Branches {
152 if i == 0 {
153 s = fmt.Sprintf("%s@%s=%s", s, b.Name, b.Version)
154 } else {
155 s = fmt.Sprintf("%s,%s=%s", s, b.Name, b.Version)
156 }
157 }
158 return s
159}
160
// gitIndexConfig holds the injectable dependencies and settings used by
// gitIndex.
type gitIndexConfig struct {
	// runCmd is the function that's used to execute all external commands (such as calls to "git" or "zoekt-git-index")
	// that gitIndex may construct. gitIndex returns an error if it is nil.
	runCmd func(*exec.Cmd) error

	// findRepositoryMetadata is the function that returns the repository metadata for the
	// repository specified in args. 'ok' is false if the repository's metadata
	// couldn't be found or if an error occurred. gitIndex returns an error if
	// this field is nil.
	//
	// The primary purpose of this configuration option is to be able to provide a stub
	// implementation for this in our test suite. All other callers should use build.Options.FindRepositoryMetadata().
	findRepositoryMetadata func(args *indexArgs) (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error)

	// timeout defines how long the index server waits before killing an indexing job.
	// gitIndex passes it unchanged to context.WithTimeout, so a zero value
	// produces a context that is already expired.
	timeout time.Duration
}
177
178func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.Logger) error {
179 logger := l.Scoped("gitIndex")
180
181 if len(o.Branches) == 0 {
182 return errors.New("zoekt-git-index requires 1 or more branches")
183 }
184
185 if c.runCmd == nil {
186 return errors.New("runCmd in provided configuration was nil - a function must be provided")
187 }
188 runCmd := c.runCmd
189
190 if c.findRepositoryMetadata == nil {
191 return errors.New("findRepositoryMetadata in provided configuration was nil - a function must be provided")
192 }
193
194 buildOptions := o.BuildOptions()
195 ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
196 defer cancel()
197
198 gitDir, err := tmpGitDir(o.Name)
199 if err != nil {
200 return err
201 }
202 defer os.RemoveAll(gitDir) // best-effort cleanup
203
204 // Create a repo to fetch into
205 cmd := exec.CommandContext(ctx, "git",
206 // use a random default branch. This is so that HEAD isn't a symref to a
207 // branch that is indexed. For example if you are indexing
208 // HEAD,master. Then HEAD would be pointing to master by default.
209 "-c", "init.defaultBranch=nonExistentBranchBB0FOFCH32",
210 "init",
211 // we don't need a working copy
212 "--bare",
213 gitDir)
214 cmd.Stdin = &bytes.Buffer{}
215 if err := runCmd(cmd); err != nil {
216 return err
217 }
218
219 var fetchDuration time.Duration
220 successfullyFetchedCommitsCount := 0
221 allFetchesSucceeded := true
222
223 defer func() {
224 success := strconv.FormatBool(allFetchesSucceeded)
225 name := repoNameForMetric(o.Name)
226 metricFetchDuration.WithLabelValues(success, name).Observe(fetchDuration.Seconds())
227 }()
228
229 runFetch := func(branches []zoekt.RepositoryBranch) error {
230 // We shallow fetch each commit specified in zoekt.Branches. This requires
231 // the server to have configured both uploadpack.allowAnySHA1InWant and
232 // uploadpack.allowFilter. (See gitservice.go in the Sourcegraph repository)
233 fetchArgs := []string{
234 "-C", gitDir,
235 "-c", "protocol.version=2",
236 "-c", "http.extraHeader=X-Sourcegraph-Actor-UID: internal",
237 "fetch", "--depth=1", o.CloneURL,
238 }
239
240 var commits []string
241 for _, b := range branches {
242 commits = append(commits, b.Version)
243 }
244
245 fetchArgs = append(fetchArgs, commits...)
246
247 cmd = exec.CommandContext(ctx, "git", fetchArgs...)
248 cmd.Stdin = &bytes.Buffer{}
249
250 start := time.Now()
251 err := runCmd(cmd)
252 fetchDuration += time.Since(start)
253
254 if err != nil {
255 allFetchesSucceeded = false
256 var bs []string
257 for _, b := range branches {
258 bs = append(bs, b.String())
259 }
260
261 formattedBranches := strings.Join(bs, ", ")
262 return fmt.Errorf("fetching %s: %w", formattedBranches, err)
263 }
264
265 successfullyFetchedCommitsCount += len(commits)
266 return nil
267 }
268
269 fetchPriorAndLatestCommits := func() error {
270 prior, err := priorBranches(c, o)
271 if err != nil {
272 return err
273 }
274
275 var allBranches []zoekt.RepositoryBranch
276 allBranches = append(allBranches, o.Branches...)
277 allBranches = append(allBranches, prior...)
278
279 return runFetch(allBranches)
280 }
281
282 fetchOnlyLatestCommits := func() error {
283 return runFetch(o.Branches)
284 }
285
286 if o.UseDelta {
287 err := fetchPriorAndLatestCommits()
288 if err != nil {
289 name := buildOptions.RepositoryDescription.Name
290 id := buildOptions.RepositoryDescription.ID
291
292 log.Printf("delta build: failed to prepare delta build for %q (ID %d): failed to fetch both latest and prior commits: %s", name, id, err)
293 err = fetchOnlyLatestCommits()
294 if err != nil {
295 return err
296 }
297 }
298 } else {
299 err := fetchOnlyLatestCommits()
300 if err != nil {
301 return err
302 }
303 }
304
305 logger.Debug("successfully fetched git data",
306 sglog.String("repo", o.Name),
307 sglog.Uint32("id", o.RepoID),
308 sglog.Int("commits_count", successfullyFetchedCommitsCount),
309 sglog.Duration("duration", fetchDuration),
310 )
311
312 // We then create the relevant refs for each fetched commit.
313 for _, b := range o.Branches {
314 ref := b.Name
315 if ref != "HEAD" {
316 ref = "refs/heads/" + ref
317 }
318 cmd = exec.CommandContext(ctx, "git", "-C", gitDir, "update-ref", ref, b.Version)
319 cmd.Stdin = &bytes.Buffer{}
320 if err := runCmd(cmd); err != nil {
321 return fmt.Errorf("failed update-ref %s to %s: %w", ref, b.Version, err)
322 }
323 }
324
325 // create git configuration with options
326 type configKV struct{ Key, Value string }
327 config := []configKV{{
328 // zoekt.name is used by zoekt-git-index to set the repository name.
329 Key: "name",
330 Value: o.Name,
331 }}
332 for k, v := range buildOptions.RepositoryDescription.RawConfig {
333 config = append(config, configKV{Key: k, Value: v})
334 }
335 sort.Slice(config, func(i, j int) bool {
336 return config[i].Key < config[j].Key
337 })
338
339 // write git configuration to repo
340 for _, kv := range config {
341 cmd = exec.CommandContext(ctx, "git", "-C", gitDir, "config", "zoekt."+kv.Key, kv.Value)
342 cmd.Stdin = &bytes.Buffer{}
343 if err := runCmd(cmd); err != nil {
344 return err
345 }
346 }
347
348 args := []string{
349 "-submodules=false",
350 }
351
352 if o.DocumentRanksVersion != "" {
353 // We store the document ranks as JSON in gitDir and tell zoekt-git-index where
354 // to find the file.
355 documentsRankFile := filepath.Join(gitDir, "documents.rank")
356
357 saveDocumentRanks := func() error {
358 r, err := sourcegraph.GetDocumentRanks(context.Background(), o.Name)
359 if err != nil {
360 return fmt.Errorf("GetDocumentRanks: %w", err)
361 }
362
363 b, err := json.Marshal(r)
364 if err != nil {
365 return err
366 }
367
368 if err := os.WriteFile(documentsRankFile, b, 0o600); err != nil {
369 return fmt.Errorf("failed to write %s to disk: %w", documentsRankFile, err)
370 }
371
372 return nil
373 }
374
375 if err := saveDocumentRanks(); err != nil {
376 // log and fall back to online ranking
377 logger.Warn(
378 "error saving document ranks. Falling back to online ranking",
379 sglog.Error(err),
380 sglog.String("repo", o.Name),
381 sglog.Uint32("id", o.RepoID),
382 )
383 } else {
384 args = append(args,
385 "-offline_ranking", documentsRankFile,
386 "-offline_ranking_version", o.DocumentRanksVersion)
387 }
388 }
389
390 // Even though we check for incremental in this process, we still pass it
391 // in just in case we regress in how we check in process. We will still
392 // notice thanks to metrics and increased load on gitserver.
393 if o.Incremental {
394 args = append(args, "-incremental")
395 }
396
397 var branches []string
398 for _, b := range o.Branches {
399 branches = append(branches, b.Name)
400 }
401 args = append(args, "-branches", strings.Join(branches, ","))
402
403 if o.UseDelta {
404 args = append(args, "-delta")
405 args = append(args, "-delta_threshold", strconv.FormatUint(o.DeltaShardNumberFallbackThreshold, 10))
406 }
407
408 if len(o.LanguageMap) > 0 {
409 var languageMap []string
410 for language, parser := range o.LanguageMap {
411 languageMap = append(languageMap, language+":"+ctags.ParserToString(parser))
412 }
413 args = append(args, "-language_map", strings.Join(languageMap, ","))
414 }
415
416 args = append(args, buildOptions.Args()...)
417 args = append(args, gitDir)
418
419 cmd = exec.CommandContext(ctx, "zoekt-git-index", args...)
420 cmd.Stdin = &bytes.Buffer{}
421 if err := runCmd(cmd); err != nil {
422 return err
423 }
424
425 return nil
426}
427
428func priorBranches(c gitIndexConfig, o *indexArgs) ([]zoekt.RepositoryBranch, error) {
429 existingRepository, _, found, err := c.findRepositoryMetadata(o)
430 if err != nil {
431 return nil, fmt.Errorf("loading repository metadata: %w", err)
432 }
433
434 if !found || len(existingRepository.Branches) == 0 {
435 return nil, fmt.Errorf("no prior shards found")
436 }
437
438 return existingRepository.Branches, nil
439}
440
// tmpGitDir returns the path of the temporary git directory to use for the
// repository called name, removing anything that is left at that path.
//
// The directory name is the query-escaped repository name plus ".git" inside
// os.TempDir(). Escaped names longer than 200 bytes are cut to 200 bytes and
// suffixed with the first 8 hex digits of their SHA-1 hash, which keeps the
// path short while still distinguishing long names. The directory itself is
// not created.
//
// An error is returned if an existing entry at the path cannot be removed.
func tmpGitDir(name string) (string, error) {
	abs := url.QueryEscape(name)
	if len(abs) > 200 {
		h := sha1.New()
		// Writes to a hash.Hash never return an error.
		_, _ = io.WriteString(h, abs)
		abs = abs[:200] + fmt.Sprintf("%x", h.Sum(nil))[:8]
	}
	dir := filepath.Join(os.TempDir(), abs+".git")

	// os.RemoveAll succeeds when dir does not exist, so no prior existence
	// check is needed. Removing unconditionally also clears entries a Stat
	// call cannot resolve, such as a dangling symlink.
	if err := os.RemoveAll(dir); err != nil {
		return "", fmt.Errorf("removing stale temporary git directory %q: %w", dir, err)
	}
	return dir, nil
}