fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
// Command zoekt-repo-index indexes a repository that uses the Android 'repo'
16// tool (https://android.googlesource.com/tools/repo). The constituent git
17// repositories should already have been downloaded to the --repo_cache
18// directory, for example:
19//
20// go install github.com/sourcegraph/zoekt/cmd/zoekt-repo-index &&
21//
22// zoekt-repo-index -base_url https://gfiber.googlesource.com/ \
23// -manifest_repo_url https://gfiber.googlesource.com/manifests \
24// -manifest_rev_prefix=refs/heads/ \
25// -rev_prefix="refs/remotes/" \
26// -repo_cache ~/zoekt-serving/repos/ \
27// -shard_limit 50000000 \
28// master:default_unrestricted.xml
29package main
30
31import (
32 "crypto/sha1"
33 "flag"
34 "fmt"
35 "io"
36 "log"
37 "net/url"
38 "path"
39 "path/filepath"
40 "sort"
41 "strings"
42
43 "github.com/google/slothfs/manifest"
44 "github.com/sourcegraph/zoekt"
45 "github.com/sourcegraph/zoekt/ignore"
46 "github.com/sourcegraph/zoekt/index"
47 "github.com/sourcegraph/zoekt/internal/gitindex"
48 "go.uber.org/automaxprocs/maxprocs"
49
50 git "github.com/go-git/go-git/v5"
51 "github.com/go-git/go-git/v5/plumbing"
52)
53
// Reference the log package through the blank identifier so the import
// stays in use regardless of which other log calls exist in this file.
var _ = log.Println
55
56type fileKey struct {
57 SubRepoPath string
58 Path string
59 ID plumbing.Hash
60}
61
62func (k *fileKey) FullPath() string {
63 return filepath.Join(k.SubRepoPath, k.Path)
64}
65
66type branchFile struct {
67 branch, file string
68 mf *manifest.Manifest
69 manifestPath string
70}
71
72func parseBranches(manifestRepoURL, revPrefix string, cache *gitindex.RepoCache, args []string) ([]branchFile, error) {
73 var branches []branchFile
74 if manifestRepoURL != "" {
75 u, err := url.Parse(manifestRepoURL)
76 if err != nil {
77 return nil, err
78 }
79
80 repo, err := cache.Open(u)
81 if err != nil {
82 return nil, err
83 }
84
85 for _, f := range args {
86 fs := strings.SplitN(f, ":", 2)
87 if len(fs) != 2 {
88 return nil, fmt.Errorf("cannot parse %q as BRANCH:FILE", f)
89 }
90 mf, err := getManifest(repo, revPrefix+fs[0], fs[1])
91 if err != nil {
92 return nil, fmt.Errorf("manifest %s:%s: %v", fs[0], fs[1], err)
93 }
94
95 branches = append(branches, branchFile{
96 branch: fs[0],
97 file: fs[1],
98 mf: mf,
99 manifestPath: cache.Path(u),
100 })
101 }
102 } else {
103 if len(args) == 0 {
104 return nil, fmt.Errorf("must give XML file argument")
105 }
106 for _, f := range args {
107 mf, err := manifest.ParseFile(f)
108 if err != nil {
109 return nil, err
110 }
111
112 branches = append(branches, branchFile{
113 branch: "HEAD",
114 file: filepath.Base(f),
115 mf: mf,
116 manifestPath: f,
117 })
118 }
119 }
120 return branches, nil
121}
122
123func main() {
124 sizeMax := flag.Int("file_limit", 128<<10, "maximum file size")
125 shardLimit := flag.Int("shard_limit", 100<<20, "maximum corpus size for a shard")
126 parallelism := flag.Int("parallelism", 1, "maximum number of parallel indexing processes")
127
128 revPrefix := flag.String("rev_prefix", "refs/remotes/origin/", "prefix for references")
129 baseURLStr := flag.String("base_url", "", "base url to interpret repository names")
130 repoCacheDir := flag.String("repo_cache", "", "root for repository cache")
131 indexDir := flag.String("index", index.DefaultDir, "index directory for *.zoekt files")
132 manifestRepoURL := flag.String("manifest_repo_url", "", "set a URL for a git repository holding manifest XML file. Provide the BRANCH:XML-FILE as further command-line arguments")
133 manifestRevPrefix := flag.String("manifest_rev_prefix", "refs/remotes/origin/", "prefixes for branches in manifest repository")
134 repoName := flag.String("name", "", "set repository name")
135 repoURL := flag.String("url", "", "set repository URL")
136 maxSubProjects := flag.Int("max_sub_projects", 0, "trim number of projects in manifest, for debugging.")
137 incremental := flag.Bool("incremental", true, "only index if the repository has changed.")
138 flag.Parse()
139
140 // Tune GOMAXPROCS to match Linux container CPU quota.
141 _, _ = maxprocs.Set()
142
143 if *repoCacheDir == "" {
144 log.Fatal("must set --repo_cache")
145 }
146 repoCache := gitindex.NewRepoCache(*repoCacheDir)
147
148 if u, err := url.Parse(*baseURLStr); err != nil {
149 log.Fatalf("Parse(%q): %v", u, err)
150 } else if *repoName == "" {
151 *repoName = filepath.Join(u.Host, u.Path)
152 }
153
154 opts := index.Options{
155 Parallelism: *parallelism,
156 SizeMax: *sizeMax,
157 ShardMax: *shardLimit,
158 IndexDir: *indexDir,
159 RepositoryDescription: zoekt.Repository{
160 Name: *repoName,
161 URL: *repoURL,
162 },
163 }
164 opts.SetDefaults()
165 baseURL, err := url.Parse(*baseURLStr)
166 if err != nil {
167 log.Fatalf("Parse baseURL %q: %v", *baseURLStr, err)
168 }
169
170 branches, err := parseBranches(*manifestRepoURL, *manifestRevPrefix, repoCache, flag.Args())
171 if err != nil {
172 log.Fatalf("parseBranches(%s, %s): %v", *manifestRepoURL, *manifestRevPrefix, err)
173 }
174 if len(branches) == 0 {
175 log.Fatal("must specify at least one branch")
176 }
177 if *maxSubProjects > 0 {
178 for _, b := range branches {
179 if *maxSubProjects < len(b.mf.Project) {
180 b.mf.Project = b.mf.Project[:*maxSubProjects]
181 }
182 }
183 }
184
185 perBranch := map[string]map[fileKey]gitindex.BlobLocation{}
186 opts.SubRepositories = map[string]*zoekt.Repository{}
187
188 // branch => repo => version
189 versionMap := map[string]map[string]plumbing.Hash{}
190 for _, br := range branches {
191 br.mf.Filter()
192 files, versions, err := iterateManifest(br.mf, *baseURL, *revPrefix, repoCache)
193 if err != nil {
194 log.Fatalf("iterateManifest: %v", err)
195 }
196
197 perBranch[br.branch] = files
198 for key, repo := range files {
199 _, ok := opts.SubRepositories[key.SubRepoPath]
200 if ok {
201 // This can be incorrect: if the layout of manifests
202 // changes across branches, then the same file could
203 // be in different subRepos. We'll pretend this is not
204 // a problem.
205 continue
206 }
207
208 desc := &zoekt.Repository{}
209 if err := gitindex.SetTemplatesFromOrigin(desc, repo.URL); err != nil {
210 log.Fatalf("SetTemplatesFromOrigin(%s): %v", repo.URL, err)
211 }
212
213 opts.SubRepositories[key.SubRepoPath] = desc
214 }
215 versionMap[br.branch] = versions
216 }
217
218 for _, br := range branches {
219 var paths []string
220 for p := range opts.SubRepositories {
221 paths = append(paths, p)
222 }
223 sort.Strings(paths)
224
225 // Compute a version of the aggregate. This version
226 // has nothing to do with git, but will let us do
227 // incrementality correctly.
228 hasher := sha1.New()
229 for _, p := range paths {
230 repo := opts.SubRepositories[p]
231 id := versionMap[br.branch][p]
232
233 // it is possible that 'id' is zero, if this
234 // branch of the manifest doesn't have this
235 // particular subrepository.
236 hasher.Write([]byte(p))
237 hasher.Write([]byte(id.String()))
238 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{
239 Name: br.branch,
240 Version: id.String(),
241 })
242 }
243
244 opts.RepositoryDescription.Branches = append(opts.RepositoryDescription.Branches, zoekt.RepositoryBranch{
245 Name: br.branch,
246 Version: fmt.Sprintf("%x", hasher.Sum(nil)),
247 })
248 }
249
250 // key => branch
251 all := map[fileKey][]string{}
252 for br, files := range perBranch {
253 for k := range files {
254 all[k] = append(all[k], br)
255 }
256 }
257
258 if *incremental && opts.IncrementalSkipIndexing() {
259 return
260 }
261
262 builder, err := index.NewBuilder(opts)
263 if err != nil {
264 log.Fatal(err)
265 }
266 for k, branches := range all {
267 loc := perBranch[branches[0]][k]
268 data, err := loc.Blob(&k.ID)
269 if err != nil {
270 log.Fatal(err)
271 }
272
273 doc := index.Document{
274 Name: k.FullPath(),
275 Content: data,
276 SubRepositoryPath: k.SubRepoPath,
277 }
278
279 doc.Branches = append(doc.Branches, branches...)
280 if err := builder.Add(doc); err != nil {
281 log.Printf("Add(%s): %v", doc.Name, err)
282 break
283 }
284 }
285 if err := builder.Finish(); err != nil {
286 log.Fatalf("Finish: %v", err)
287 }
288}
289
290// getManifest parses the manifest XML at the given branch/path inside a Git repository.
291func getManifest(repo *git.Repository, branch, path string) (*manifest.Manifest, error) {
292 ref, err := repo.Reference(plumbing.ReferenceName("refs/heads/"+branch), true)
293 if err != nil {
294 return nil, err
295 }
296
297 commit, err := repo.CommitObject(ref.Hash())
298 if err != nil {
299 return nil, err
300 }
301
302 tree, err := repo.TreeObject(commit.TreeHash)
303 if err != nil {
304 return nil, err
305 }
306
307 entry, err := tree.FindEntry(path)
308 if err != nil {
309 return nil, err
310 }
311
312 blob, err := repo.BlobObject(entry.Hash)
313 if err != nil {
314 return nil, err
315 }
316 r, err := blob.Reader()
317 if err != nil {
318 return nil, err
319 }
320 defer r.Close()
321
322 content, _ := io.ReadAll(r)
323 return manifest.Parse(content)
324}
325
326// iterateManifest constructs a complete tree from the given Manifest.
327func iterateManifest(mf *manifest.Manifest,
328 baseURL url.URL, revPrefix string,
329 cache *gitindex.RepoCache,
330) (map[fileKey]gitindex.BlobLocation, map[string]plumbing.Hash, error) {
331 allFiles := map[fileKey]gitindex.BlobLocation{}
332 allVersions := map[string]plumbing.Hash{}
333 for _, p := range mf.Project {
334 rev := mf.ProjectRevision(&p)
335
336 projURL := baseURL
337 projURL.Path = path.Join(projURL.Path, p.Name)
338
339 topRepo, err := cache.Open(&projURL)
340 if err != nil {
341 return nil, nil, err
342 }
343
344 ref, err := topRepo.Reference(plumbing.ReferenceName(revPrefix+rev), true)
345 if err != nil {
346 return nil, nil, err
347 }
348
349 commit, err := topRepo.CommitObject(ref.Hash())
350 if err != nil {
351 return nil, nil, err
352 }
353 if err != nil {
354 return nil, nil, err
355 }
356
357 allVersions[p.GetPath()] = commit.Hash
358
359 tree, err := commit.Tree()
360 if err != nil {
361 return nil, nil, err
362 }
363
364 rw := gitindex.NewRepoWalker(topRepo, projURL.String(), cache)
365 subVersions, err := rw.CollectFiles(tree, rev, &ignore.Matcher{})
366 if err != nil {
367 return nil, nil, err
368 }
369
370 for key, repo := range rw.Files {
371 allFiles[fileKey{
372 SubRepoPath: filepath.Join(p.GetPath(), key.SubRepoPath),
373 Path: key.Path,
374 ID: key.ID,
375 }] = repo
376 }
377
378 for path, version := range subVersions {
379 allVersions[filepath.Join(p.GetPath(), path)] = version
380 }
381 }
382
383 return allFiles, allVersions, nil
384}