fork of https://github.com/sourcegraph/zoekt
1// Command zoekt-archive-index indexes an archive.
2//
3// Example via github.com:
4//
5// zoekt-archive-index -incremental -commit b57cb1605fd11ba2ecfa7f68992b4b9cc791934d -name github.com/gorilla/mux -strip_components 1 https://codeload.github.com/gorilla/mux/legacy.tar.gz/b57cb1605fd11ba2ecfa7f68992b4b9cc791934d
6//
7// zoekt-archive-index -branch master https://github.com/gorilla/mux/commit/b57cb1605fd11ba2ecfa7f68992b4b9cc791934d
8package main
9
10import (
11 "errors"
12 "flag"
13 "fmt"
14 "io"
15 "log"
16 "net/url"
17 "strings"
18
19 "github.com/sourcegraph/zoekt"
20 "github.com/sourcegraph/zoekt/build"
21 "github.com/sourcegraph/zoekt/cmd"
22 "go.uber.org/automaxprocs/maxprocs"
23)
24
// stripComponents removes the first count "/"-separated elements from path
// and returns what is left. If path runs out of separators before count
// elements have been removed, the empty string is returned. A count of zero
// or less returns path unchanged.
func stripComponents(path string, count int) string {
	for removed := 0; removed < count && path != ""; removed++ {
		sep := strings.Index(path, "/")
		if sep < 0 {
			// Fewer elements than requested: nothing remains.
			return ""
		}
		path = path[sep+1:]
	}
	return path
}
37
// isGitOID reports whether s has the shape of a full git object ID: exactly
// 40 hexadecimal digits, in either letter case.
//
// Note: The check is purely syntactic. A match does not mean the SHA exists
// in a repository, nor that s is not a ref; git allows 40-char hexadecimal
// strings to be references.
func isGitOID(s string) bool {
	const oidLen = 40
	if len(s) != oidLen {
		return false
	}
	for i := 0; i < len(s); i++ {
		switch c := s[i]; {
		case c >= '0' && c <= '9':
		case c >= 'a' && c <= 'f':
		case c >= 'A' && c <= 'F':
		default:
			return false
		}
	}
	return true
}
55
// Options holds the settings for indexing a single archive.
type Options struct {
	// Incremental enables the check that lets indexing be skipped.
	Incremental bool

	// Archive is the location of the archive to index.
	Archive string
	// Name is the repository name for the archive.
	Name string
	// RepoURL is the repository URL for the archive.
	RepoURL string
	// Branch is the branch name the indexed documents are recorded under.
	Branch string
	// Commit is the commit sha recorded as the version of Branch.
	Commit string
	// Strip is the number of leading path elements removed from each
	// archive entry name.
	Strip int
}
66
67func (o *Options) SetDefaults() {
68 // We guess based on the archive URL.
69 u, _ := url.Parse(o.Archive)
70 if u == nil {
71 return
72 }
73
74 setRef := func(ref string) {
75 if isGitOID(ref) && o.Commit == "" {
76 o.Commit = ref
77 }
78 if !isGitOID(ref) && o.Branch == "" {
79 o.Branch = ref
80 }
81 }
82
83 switch u.Host {
84 case "github.com", "codeload.github.com":
85 // https://github.com/octokit/octokit.rb/commit/3d21ec53a331a6f037a91c368710b99387d012c1
86 // https://github.com/octokit/octokit.rb/blob/master/README.md
87 // https://github.com/octokit/octokit.rb/tree/master/lib
88 // https://codeload.github.com/octokit/octokit.rb/legacy.tar.gz/master
89 parts := strings.Split(u.Path, "/")
90 if len(parts) > 2 && o.Name == "" {
91 o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2])
92 o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2])
93 }
94 if len(parts) > 4 {
95 setRef(parts[4])
96 if u.Host == "github.com" {
97 o.Archive = fmt.Sprintf("https://codeload.github.com/%s/%s/legacy.tar.gz/%s", parts[1], parts[2], parts[4])
98 }
99 }
100 o.Strip = 1
101 case "api.github.com":
102 // https://api.github.com/repos/octokit/octokit.rb/tarball/master
103 parts := strings.Split(u.Path, "/")
104 if len(parts) > 2 && o.Name == "" {
105 o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2])
106 o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2])
107 }
108 if len(parts) > 5 {
109 setRef(parts[5])
110 }
111 o.Strip = 1
112 }
113}
114
115func do(opts Options, bopts build.Options) error {
116 opts.SetDefaults()
117
118 if opts.Name == "" && opts.RepoURL == "" {
119 return errors.New("-name or -url required")
120 }
121 if opts.Branch == "" {
122 return errors.New("-branch required")
123 }
124
125 if opts.Name != "" {
126 bopts.RepositoryDescription.Name = opts.Name
127 }
128 // We do not use this functionality to avoid pulling in the transitive deps of gitindex
129 /*
130 if opts.RepoURL != "" {
131 u, err := url.Parse(opts.RepoURL)
132 if err != nil {
133 return err
134 }
135 if err := gitindex.SetTemplatesFromOrigin(&bopts.RepositoryDescription, u); err != nil {
136 return err
137 }
138 }
139 */
140 bopts.SetDefaults()
141 bopts.RepositoryDescription.Branches = []zoekt.RepositoryBranch{{Name: opts.Branch, Version: opts.Commit}}
142 brs := []string{opts.Branch}
143
144 if opts.Incremental && bopts.IncrementalSkipIndexing() {
145 return nil
146 }
147
148 a, err := openArchive(opts.Archive)
149 if err != nil {
150 return err
151 }
152 defer a.Close()
153
154 bopts.RepositoryDescription.Source = opts.Archive
155 builder, err := build.NewBuilder(bopts)
156 if err != nil {
157 return err
158 }
159
160 add := func(f *File) error {
161 defer f.Close()
162
163 contents, err := io.ReadAll(f)
164 if err != nil {
165 return err
166 }
167
168 name := stripComponents(f.Name, opts.Strip)
169 if name == "" {
170 return nil
171 }
172
173 return builder.Add(zoekt.Document{
174 Name: name,
175 Content: contents,
176 Branches: brs,
177 })
178 }
179
180 for {
181 f, err := a.Next()
182 if err == io.EOF {
183 break
184 }
185 if err != nil {
186 return err
187 }
188
189 if err := add(f); err != nil {
190 return err
191 }
192 }
193
194 return builder.Finish()
195}
196
197func main() {
198 var (
199 incremental = flag.Bool("incremental", true, "only index changed repositories")
200
201 name = flag.String("name", "", "The repository name for the archive")
202 urlRaw = flag.String("url", "", "The repository URL for the archive")
203 branch = flag.String("branch", "", "The branch name for the archive")
204 commit = flag.String("commit", "", "The commit sha for the archive. If incremental this will avoid updating shards already at commit")
205 strip = flag.Int("strip_components", 0, "Remove the specified number of leading path elements. Pathnames with fewer elements will be silently skipped.")
206
207 downloadLimitMbps = flag.Int64("download-limit-mbps", 0, "If non-zero, limit archive downloads to specified amount in megabits per second")
208 )
209 flag.Parse()
210
211 // Tune GOMAXPROCS to match Linux container CPU quota.
212 _, _ = maxprocs.Set()
213
214 log.SetFlags(log.LstdFlags | log.Lshortfile)
215
216 if len(flag.Args()) != 1 {
217 log.Fatal("expected argument for archive location")
218 }
219 archive := flag.Args()[0]
220 bopts := cmd.OptionsFromFlags()
221 opts := Options{
222 Incremental: *incremental,
223
224 Archive: archive,
225 Name: *name,
226 RepoURL: *urlRaw,
227 Branch: *branch,
228 Commit: *commit,
229 Strip: *strip,
230 }
231
232 // Sourcegraph specific: Limit HTTP traffic
233 limitHTTPDefaultClient(*downloadLimitMbps)
234
235 if err := do(opts, *bopts); err != nil {
236 log.Fatal(err)
237 }
238}