fork of https://github.com/sourcegraph/zoekt
1package main
2
3import (
4 "bytes"
5 "context"
6 "crypto/sha1"
7 "encoding/json"
8 "errors"
9 "fmt"
10 "io"
11 "log"
12 "net/url"
13 "os"
14 "os/exec"
15 "path/filepath"
16 "sort"
17 "strconv"
18 "strings"
19 "time"
20
21 "github.com/sourcegraph/zoekt"
22 "github.com/sourcegraph/zoekt/build"
23 "github.com/sourcegraph/zoekt/ctags"
24
25 sglog "github.com/sourcegraph/log"
26)
27
// indexTimeout bounds how long the indexserver lets a single indexing job
// run before the job is killed. An index should never take longer than an
// hour and a half.
const indexTimeout = 90 * time.Minute
31
// IndexOptions are the options that Sourcegraph can set via its search
// configuration endpoint.
//
// The struct is embedded in indexArgs, which translates it into build.Options
// (see BuildOptions) and into zoekt-git-index arguments (see gitIndex).
type IndexOptions struct {
	// LargeFiles is a slice of glob patterns where matching file paths should
	// be indexed regardless of their size. The pattern syntax can be found
	// here: https://golang.org/pkg/path/filepath/#Match.
	LargeFiles []string

	// Symbols if true will make zoekt index the output of ctags. When false,
	// BuildOptions disables ctags; when true, ctags is required to succeed.
	Symbols bool

	// Branches is a slice of branches to index. gitIndex refuses to run when
	// it is empty.
	Branches []zoekt.RepositoryBranch

	// RepoID is the Sourcegraph Repository ID. It is also written to the
	// "repoid" entry of the repository's raw config.
	RepoID uint32

	// Name is the Repository Name.
	Name string

	// CloneURL is the internal clone URL for Name. gitIndex uses it as the
	// remote for "git fetch".
	CloneURL string

	// Priority indicates ranking in results, higher first.
	Priority float64

	// DocumentRanksVersion when non-empty will lead to indexing using offline
	// ranking. When the string changes this will also cause us to re-index with
	// new ranks.
	DocumentRanksVersion string

	// Public is true if the repository is public.
	Public bool

	// Fork is true if the repository is a fork.
	Fork bool

	// Archived is true if the repository is archived.
	Archived bool

	// Map from language to scip-ctags, universal-ctags, or neither
	LanguageMap ctags.LanguageMap
}
75
// indexArgs represents the arguments we pass to zoekt-git-index
//
// It combines the Sourcegraph-provided IndexOptions with settings that are
// local to this indexserver.
type indexArgs struct {
	IndexOptions

	// Incremental indicates to skip indexing if already indexed. gitIndex
	// forwards it as the "-incremental" flag.
	Incremental bool

	// IndexDir is the index directory to store the shards.
	IndexDir string

	// Parallelism is the number of shards to compute in parallel.
	Parallelism int

	// FileLimit is the maximum size of a file. It becomes build.Options.SizeMax.
	FileLimit int

	// UseDelta is true if we want to use the new delta indexer. This should
	// only be true for repositories we explicitly enable.
	UseDelta bool

	// DeltaShardNumberFallbackThreshold is an upper limit on the number of preexisting shards that can exist
	// before attempting a delta build. gitIndex forwards it as "-delta_threshold"
	// when UseDelta is set.
	DeltaShardNumberFallbackThreshold uint64
}
100
101// BuildOptions returns a build.Options represented by indexArgs. Note: it
102// doesn't set fields like repository/branch.
103func (o *indexArgs) BuildOptions() *build.Options {
104 return &build.Options{
105 // It is important that this RepositoryDescription exactly matches what
106 // the indexer we call will produce. This is to ensure that
107 // IncrementalSkipIndexing and IndexState can correctly calculate if
108 // nothing needs to be done.
109 RepositoryDescription: zoekt.Repository{
110 ID: uint32(o.IndexOptions.RepoID),
111 Name: o.Name,
112 Branches: o.Branches,
113 RawConfig: map[string]string{
114 "repoid": strconv.Itoa(int(o.IndexOptions.RepoID)),
115 "priority": strconv.FormatFloat(o.Priority, 'g', -1, 64),
116 "public": marshalBool(o.Public),
117 "fork": marshalBool(o.Fork),
118 "archived": marshalBool(o.Archived),
119 },
120 },
121 IndexDir: o.IndexDir,
122 Parallelism: o.Parallelism,
123 SizeMax: o.FileLimit,
124 LargeFiles: o.LargeFiles,
125 CTagsMustSucceed: o.Symbols,
126 DisableCTags: !o.Symbols,
127 IsDelta: o.UseDelta,
128
129 DocumentRanksVersion: o.DocumentRanksVersion,
130
131 LanguageMap: o.LanguageMap,
132 }
133}
134
// marshalBool encodes a boolean as the string "1" (true) or "0" (false).
func marshalBool(b bool) string {
	if !b {
		return "0"
	}
	return "1"
}
141
142func (o *indexArgs) String() string {
143 s := fmt.Sprintf("%d %s", o.RepoID, o.Name)
144 for i, b := range o.Branches {
145 if i == 0 {
146 s = fmt.Sprintf("%s@%s=%s", s, b.Name, b.Version)
147 } else {
148 s = fmt.Sprintf("%s,%s=%s", s, b.Name, b.Version)
149 }
150 }
151 return s
152}
153
// gitIndexConfig holds the injectable dependencies of gitIndex. Both fields
// are required: gitIndex returns an error if either one is nil.
type gitIndexConfig struct {
	// runCmd is the function that's used to execute all external commands (such as calls to "git" or "zoekt-git-index")
	// that gitIndex may construct.
	runCmd func(*exec.Cmd) error

	// findRepositoryMetadata is the function that returns the repository metadata for the
	// repository specified in args. 'ok' is false if the repository's metadata
	// couldn't be found or if an error occurred.
	//
	// The primary purpose of this configuration option is to be able to provide a stub
	// implementation for this in our test suite. All other callers should use build.Options.FindRepositoryMetadata().
	findRepositoryMetadata func(args *indexArgs) (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error)
}
167
168func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.Logger) error {
169 logger := l.Scoped("gitIndex", "fetch commits and then run zoekt-git-index against contents")
170
171 if len(o.Branches) == 0 {
172 return errors.New("zoekt-git-index requires 1 or more branches")
173 }
174
175 if c.runCmd == nil {
176 return errors.New("runCmd in provided configuration was nil - a function must be provided")
177 }
178 runCmd := c.runCmd
179
180 if c.findRepositoryMetadata == nil {
181 return errors.New("findRepositoryMetadata in provided configuration was nil - a function must be provided")
182 }
183
184 buildOptions := o.BuildOptions()
185
186 ctx, cancel := context.WithTimeout(context.Background(), indexTimeout)
187 defer cancel()
188
189 gitDir, err := tmpGitDir(o.Name)
190 if err != nil {
191 return err
192 }
193 defer os.RemoveAll(gitDir) // best-effort cleanup
194
195 // Create a repo to fetch into
196 cmd := exec.CommandContext(ctx, "git",
197 // use a random default branch. This is so that HEAD isn't a symref to a
198 // branch that is indexed. For example if you are indexing
199 // HEAD,master. Then HEAD would be pointing to master by default.
200 "-c", "init.defaultBranch=nonExistentBranchBB0FOFCH32",
201 "init",
202 // we don't need a working copy
203 "--bare",
204 gitDir)
205 cmd.Stdin = &bytes.Buffer{}
206 if err := runCmd(cmd); err != nil {
207 return err
208 }
209
210 var fetchDuration time.Duration
211 successfullyFetchedCommitsCount := 0
212 allFetchesSucceeded := true
213
214 defer func() {
215 success := strconv.FormatBool(allFetchesSucceeded)
216 name := repoNameForMetric(o.Name)
217 metricFetchDuration.WithLabelValues(success, name).Observe(fetchDuration.Seconds())
218 }()
219
220 var runFetch = func(branches []zoekt.RepositoryBranch) error {
221 // We shallow fetch each commit specified in zoekt.Branches. This requires
222 // the server to have configured both uploadpack.allowAnySHA1InWant and
223 // uploadpack.allowFilter. (See gitservice.go in the Sourcegraph repository)
224 fetchArgs := []string{
225 "-C", gitDir,
226 "-c", "protocol.version=2",
227 "-c", "http.extraHeader=X-Sourcegraph-Actor-UID: internal",
228 "fetch", "--depth=1", o.CloneURL}
229
230 var commits []string
231 for _, b := range branches {
232 commits = append(commits, b.Version)
233 }
234
235 fetchArgs = append(fetchArgs, commits...)
236
237 cmd = exec.CommandContext(ctx, "git", fetchArgs...)
238 cmd.Stdin = &bytes.Buffer{}
239
240 start := time.Now()
241 err := runCmd(cmd)
242 fetchDuration += time.Since(start)
243
244 if err != nil {
245 allFetchesSucceeded = false
246 var bs []string
247 for _, b := range branches {
248 bs = append(bs, b.String())
249 }
250
251 formattedBranches := strings.Join(bs, ", ")
252 return fmt.Errorf("fetching %s: %w", formattedBranches, err)
253 }
254
255 successfullyFetchedCommitsCount += len(commits)
256 return nil
257 }
258
259 fetchPriorAndLatestCommits := func() error {
260 prior, err := priorBranches(c, o)
261 if err != nil {
262 return err
263 }
264
265 var allBranches []zoekt.RepositoryBranch
266 allBranches = append(allBranches, o.Branches...)
267 allBranches = append(allBranches, prior...)
268
269 return runFetch(allBranches)
270 }
271
272 fetchOnlyLatestCommits := func() error {
273 return runFetch(o.Branches)
274 }
275
276 if o.UseDelta {
277 err := fetchPriorAndLatestCommits()
278 if err != nil {
279 name := buildOptions.RepositoryDescription.Name
280 id := buildOptions.RepositoryDescription.ID
281
282 log.Printf("delta build: failed to prepare delta build for %q (ID %d): failed to fetch both latest and prior commits: %s", name, id, err)
283 err = fetchOnlyLatestCommits()
284 if err != nil {
285 return err
286 }
287 }
288 } else {
289 err := fetchOnlyLatestCommits()
290 if err != nil {
291 return err
292 }
293 }
294
295 logger.Debug("successfully fetched git data",
296 sglog.String("repo", o.Name),
297 sglog.Uint32("id", o.RepoID),
298 sglog.Int("commits_count", successfullyFetchedCommitsCount),
299 sglog.Duration("duration", fetchDuration),
300 )
301
302 // We then create the relevant refs for each fetched commit.
303 for _, b := range o.Branches {
304 ref := b.Name
305 if ref != "HEAD" {
306 ref = "refs/heads/" + ref
307 }
308 cmd = exec.CommandContext(ctx, "git", "-C", gitDir, "update-ref", ref, b.Version)
309 cmd.Stdin = &bytes.Buffer{}
310 if err := runCmd(cmd); err != nil {
311 return fmt.Errorf("failed update-ref %s to %s: %w", ref, b.Version, err)
312 }
313 }
314
315 // create git configuration with options
316 type configKV struct{ Key, Value string }
317 config := []configKV{{
318 // zoekt.name is used by zoekt-git-index to set the repository name.
319 Key: "name",
320 Value: o.Name,
321 }}
322 for k, v := range buildOptions.RepositoryDescription.RawConfig {
323 config = append(config, configKV{Key: k, Value: v})
324 }
325 sort.Slice(config, func(i, j int) bool {
326 return config[i].Key < config[j].Key
327 })
328
329 // write git configuration to repo
330 for _, kv := range config {
331 cmd = exec.CommandContext(ctx, "git", "-C", gitDir, "config", "zoekt."+kv.Key, kv.Value)
332 cmd.Stdin = &bytes.Buffer{}
333 if err := runCmd(cmd); err != nil {
334 return err
335 }
336 }
337
338 args := []string{
339 "-submodules=false",
340 }
341
342 if o.DocumentRanksVersion != "" {
343 // We store the document ranks as JSON in gitDir and tell zoekt-git-index where
344 // to find the file.
345 documentsRankFile := filepath.Join(gitDir, "documents.rank")
346
347 saveDocumentRanks := func() error {
348 r, err := sourcegraph.GetDocumentRanks(context.Background(), o.Name)
349 if err != nil {
350 return fmt.Errorf("GetDocumentRanks: %w", err)
351 }
352
353 b, err := json.Marshal(r)
354 if err != nil {
355 return err
356 }
357
358 if err := os.WriteFile(documentsRankFile, b, 0600); err != nil {
359 return fmt.Errorf("failed to write %s to disk: %w", documentsRankFile, err)
360 }
361
362 return nil
363 }
364
365 if err := saveDocumentRanks(); err != nil {
366 // log and fall back to online ranking
367 logger.Warn(
368 "error saving document ranks. Falling back to online ranking",
369 sglog.Error(err),
370 sglog.String("repo", o.Name),
371 sglog.Uint32("id", o.RepoID),
372 )
373 } else {
374 args = append(args,
375 "-offline_ranking", documentsRankFile,
376 "-offline_ranking_version", o.DocumentRanksVersion)
377 }
378 }
379
380 // Even though we check for incremental in this process, we still pass it
381 // in just in case we regress in how we check in process. We will still
382 // notice thanks to metrics and increased load on gitserver.
383 if o.Incremental {
384 args = append(args, "-incremental")
385 }
386
387 var branches []string
388 for _, b := range o.Branches {
389 branches = append(branches, b.Name)
390 }
391 args = append(args, "-branches", strings.Join(branches, ","))
392
393 if o.UseDelta {
394 args = append(args, "-delta")
395 args = append(args, "-delta_threshold", strconv.FormatUint(o.DeltaShardNumberFallbackThreshold, 10))
396 }
397
398 if len(o.LanguageMap) > 0 {
399 var languageMap []string
400 for language, parser := range o.LanguageMap {
401 languageMap = append(languageMap, language+":"+ctags.ParserToString(parser))
402 }
403 args = append(args, "-language_map", strings.Join(languageMap, ","))
404 }
405
406 args = append(args, buildOptions.Args()...)
407 args = append(args, gitDir)
408
409 cmd = exec.CommandContext(ctx, "zoekt-git-index", args...)
410 cmd.Stdin = &bytes.Buffer{}
411 if err := runCmd(cmd); err != nil {
412 return err
413 }
414
415 return nil
416}
417
418func priorBranches(c gitIndexConfig, o *indexArgs) ([]zoekt.RepositoryBranch, error) {
419 existingRepository, _, found, err := c.findRepositoryMetadata(o)
420 if err != nil {
421 return nil, fmt.Errorf("loading repository metadata: %w", err)
422 }
423
424 if !found || len(existingRepository.Branches) == 0 {
425 return nil, fmt.Errorf("no prior shards found")
426 }
427
428 return existingRepository.Branches, nil
429}
430
// tmpGitDir returns the path of a scratch git directory for the repository
// called name, located directly inside os.TempDir().
//
// The directory name is the query-escaped repository name plus a ".git"
// suffix. Escaped names longer than 200 bytes are truncated to 200 bytes and
// get the first 8 hex digits of their SHA-1 appended, so that long names
// sharing a prefix still map to different directories.
//
// Anything already present at the returned path is removed; the directory
// itself is NOT created. An error is returned only if that removal fails.
func tmpGitDir(name string) (string, error) {
	const maxEscapedLen = 200

	escaped := url.QueryEscape(name)
	if len(escaped) > maxEscapedLen {
		h := sha1.New()
		_, _ = io.WriteString(h, escaped) // hash.Hash writes never return an error
		escaped = escaped[:maxEscapedLen] + fmt.Sprintf("%x", h.Sum(nil))[:8]
	}

	dir := filepath.Join(os.TempDir(), escaped+".git")

	// os.RemoveAll returns nil when the path does not exist, so no prior
	// os.Stat is needed. Checking first would also silently skip the cleanup
	// whenever Stat failed for a reason other than "not exist".
	if err := os.RemoveAll(dir); err != nil {
		return "", fmt.Errorf("removing stale %s: %w", dir, err)
	}
	return dir, nil
}