// Fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
// zoekt-repo-index indexes a repo-based repository. The constituent git
// repositories should already have been downloaded to the --repo_cache
// directory, e.g.
//
//	go install github.com/sourcegraph/zoekt/cmd/zoekt-repo-index &&
//
//	zoekt-repo-index -base_url https://gfiber.googlesource.com/ \
//	  -manifest_repo_url https://gfiber.googlesource.com/manifests \
//	  -manifest_rev_prefix=refs/heads/ \
//	  -rev_prefix="refs/remotes/" \
//	  -repo_cache ~/zoekt-serving/repos/ \
//	  -shard_limit 50000000 \
//	  master:default_unrestricted.xml
28package main
29
30import (
31 "crypto/sha1"
32 "flag"
33 "fmt"
34 "io"
35 "log"
36 "net/url"
37 "path"
38 "path/filepath"
39 "sort"
40 "strings"
41
42 "github.com/google/slothfs/manifest"
43 "github.com/sourcegraph/zoekt"
44 "github.com/sourcegraph/zoekt/build"
45 "github.com/sourcegraph/zoekt/gitindex"
46 "github.com/sourcegraph/zoekt/ignore"
47 "go.uber.org/automaxprocs/maxprocs"
48
49 git "github.com/go-git/go-git/v5"
50 "github.com/go-git/go-git/v5/plumbing"
51)
52
// Reference log.Println through the blank identifier; presumably this keeps
// the "log" import in use regardless of other call sites — TODO confirm it is
// still needed, since main also uses the log package.
var _ = log.Println
54
55type fileKey struct {
56 SubRepoPath string
57 Path string
58 ID plumbing.Hash
59}
60
61func (k *fileKey) FullPath() string {
62 return filepath.Join(k.SubRepoPath, k.Path)
63}
64
65type branchFile struct {
66 branch, file string
67 mf *manifest.Manifest
68 manifestPath string
69}
70
71func parseBranches(manifestRepoURL, revPrefix string, cache *gitindex.RepoCache, args []string) ([]branchFile, error) {
72 var branches []branchFile
73 if manifestRepoURL != "" {
74 u, err := url.Parse(manifestRepoURL)
75 if err != nil {
76 return nil, err
77 }
78
79 repo, err := cache.Open(u)
80 if err != nil {
81 return nil, err
82 }
83
84 for _, f := range args {
85 fs := strings.SplitN(f, ":", 2)
86 if len(fs) != 2 {
87 return nil, fmt.Errorf("cannot parse %q as BRANCH:FILE", f)
88 }
89 mf, err := getManifest(repo, revPrefix+fs[0], fs[1])
90 if err != nil {
91 return nil, fmt.Errorf("manifest %s:%s: %v", fs[0], fs[1], err)
92 }
93
94 branches = append(branches, branchFile{
95 branch: fs[0],
96 file: fs[1],
97 mf: mf,
98 manifestPath: cache.Path(u),
99 })
100 }
101 } else {
102 if len(args) == 0 {
103 return nil, fmt.Errorf("must give XML file argument")
104 }
105 for _, f := range args {
106 mf, err := manifest.ParseFile(f)
107 if err != nil {
108 return nil, err
109 }
110
111 branches = append(branches, branchFile{
112 branch: "HEAD",
113 file: filepath.Base(f),
114 mf: mf,
115 manifestPath: f,
116 })
117 }
118 }
119 return branches, nil
120}
121
122func main() {
123 sizeMax := flag.Int("file_limit", 128<<10, "maximum file size")
124 shardLimit := flag.Int("shard_limit", 100<<20, "maximum corpus size for a shard")
125 parallelism := flag.Int("parallelism", 1, "maximum number of parallel indexing processes")
126
127 revPrefix := flag.String("rev_prefix", "refs/remotes/origin/", "prefix for references")
128 baseURLStr := flag.String("base_url", "", "base url to interpret repository names")
129 repoCacheDir := flag.String("repo_cache", "", "root for repository cache")
130 indexDir := flag.String("index", build.DefaultDir, "index directory for *.zoekt files")
131 manifestRepoURL := flag.String("manifest_repo_url", "", "set a URL for a git repository holding manifest XML file. Provide the BRANCH:XML-FILE as further command-line arguments")
132 manifestRevPrefix := flag.String("manifest_rev_prefix", "refs/remotes/origin/", "prefixes for branches in manifest repository")
133 repoName := flag.String("name", "", "set repository name")
134 repoURL := flag.String("url", "", "set repository URL")
135 maxSubProjects := flag.Int("max_sub_projects", 0, "trim number of projects in manifest, for debugging.")
136 incremental := flag.Bool("incremental", true, "only index if the repository has changed.")
137 flag.Parse()
138
139 // Tune GOMAXPROCS to match Linux container CPU quota.
140 _, _ = maxprocs.Set()
141
142 if *repoCacheDir == "" {
143 log.Fatal("must set --repo_cache")
144 }
145 repoCache := gitindex.NewRepoCache(*repoCacheDir)
146
147 if u, err := url.Parse(*baseURLStr); err != nil {
148 log.Fatalf("Parse(%q): %v", u, err)
149 } else if *repoName == "" {
150 *repoName = filepath.Join(u.Host, u.Path)
151 }
152
153 opts := build.Options{
154 Parallelism: *parallelism,
155 SizeMax: *sizeMax,
156 ShardMax: *shardLimit,
157 IndexDir: *indexDir,
158 RepositoryDescription: zoekt.Repository{
159 Name: *repoName,
160 URL: *repoURL,
161 },
162 }
163 opts.SetDefaults()
164 baseURL, err := url.Parse(*baseURLStr)
165 if err != nil {
166 log.Fatalf("Parse baseURL %q: %v", *baseURLStr, err)
167 }
168
169 branches, err := parseBranches(*manifestRepoURL, *manifestRevPrefix, repoCache, flag.Args())
170 if err != nil {
171 log.Fatalf("parseBranches(%s, %s): %v", *manifestRepoURL, *manifestRevPrefix, err)
172 }
173 if len(branches) == 0 {
174 log.Fatal("must specify at least one branch")
175 }
176 if *maxSubProjects > 0 {
177 for _, b := range branches {
178 if *maxSubProjects < len(b.mf.Project) {
179 b.mf.Project = b.mf.Project[:*maxSubProjects]
180 }
181 }
182 }
183
184 perBranch := map[string]map[fileKey]gitindex.BlobLocation{}
185 opts.SubRepositories = map[string]*zoekt.Repository{}
186
187 // branch => repo => version
188 versionMap := map[string]map[string]plumbing.Hash{}
189 for _, br := range branches {
190 br.mf.Filter()
191 files, versions, err := iterateManifest(br.mf, *baseURL, *revPrefix, repoCache)
192 if err != nil {
193 log.Fatalf("iterateManifest: %v", err)
194 }
195
196 perBranch[br.branch] = files
197 for key, repo := range files {
198 _, ok := opts.SubRepositories[key.SubRepoPath]
199 if ok {
200 // This can be incorrect: if the layout of manifests
201 // changes across branches, then the same file could
202 // be in different subRepos. We'll pretend this is not
203 // a problem.
204 continue
205 }
206
207 desc := &zoekt.Repository{}
208 if err := gitindex.SetTemplatesFromOrigin(desc, repo.URL); err != nil {
209 log.Fatalf("SetTemplatesFromOrigin(%s): %v", repo.URL, err)
210 }
211
212 opts.SubRepositories[key.SubRepoPath] = desc
213 }
214 versionMap[br.branch] = versions
215 }
216
217 for _, br := range branches {
218 var paths []string
219 for p := range opts.SubRepositories {
220 paths = append(paths, p)
221 }
222 sort.Strings(paths)
223
224 // Compute a version of the aggregate. This version
225 // has nothing to do with git, but will let us do
226 // incrementality correctly.
227 hasher := sha1.New()
228 for _, p := range paths {
229 repo := opts.SubRepositories[p]
230 id := versionMap[br.branch][p]
231
232 // it is possible that 'id' is zero, if this
233 // branch of the manifest doesn't have this
234 // particular subrepository.
235 hasher.Write([]byte(p))
236 hasher.Write([]byte(id.String()))
237 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{
238 Name: br.branch,
239 Version: id.String(),
240 })
241 }
242
243 opts.RepositoryDescription.Branches = append(opts.RepositoryDescription.Branches, zoekt.RepositoryBranch{
244 Name: br.branch,
245 Version: fmt.Sprintf("%x", hasher.Sum(nil)),
246 })
247 }
248
249 // key => branch
250 all := map[fileKey][]string{}
251 for br, files := range perBranch {
252 for k := range files {
253 all[k] = append(all[k], br)
254 }
255 }
256
257 if *incremental && opts.IncrementalSkipIndexing() {
258 return
259 }
260
261 builder, err := build.NewBuilder(opts)
262 if err != nil {
263 log.Fatal(err)
264 }
265 for k, branches := range all {
266 loc := perBranch[branches[0]][k]
267 data, err := loc.Blob(&k.ID)
268 if err != nil {
269 log.Fatal(err)
270 }
271
272 doc := zoekt.Document{
273 Name: k.FullPath(),
274 Content: data,
275 SubRepositoryPath: k.SubRepoPath,
276 }
277
278 doc.Branches = append(doc.Branches, branches...)
279 if err := builder.Add(doc); err != nil {
280 log.Printf("Add(%s): %v", doc.Name, err)
281 break
282 }
283 }
284 if err := builder.Finish(); err != nil {
285 log.Fatalf("Finish: %v", err)
286 }
287}
288
289// getManifest parses the manifest XML at the given branch/path inside a Git repository.
290func getManifest(repo *git.Repository, branch, path string) (*manifest.Manifest, error) {
291 ref, err := repo.Reference(plumbing.ReferenceName("refs/heads/"+branch), true)
292 if err != nil {
293 return nil, err
294 }
295
296 commit, err := repo.CommitObject(ref.Hash())
297 if err != nil {
298 return nil, err
299 }
300
301 tree, err := repo.TreeObject(commit.TreeHash)
302 if err != nil {
303 return nil, err
304 }
305
306 entry, err := tree.FindEntry(path)
307 if err != nil {
308 return nil, err
309 }
310
311 blob, err := repo.BlobObject(entry.Hash)
312 if err != nil {
313 return nil, err
314 }
315 r, err := blob.Reader()
316 if err != nil {
317 return nil, err
318 }
319 defer r.Close()
320
321 content, _ := io.ReadAll(r)
322 return manifest.Parse(content)
323}
324
325// iterateManifest constructs a complete tree from the given Manifest.
326func iterateManifest(mf *manifest.Manifest,
327 baseURL url.URL, revPrefix string,
328 cache *gitindex.RepoCache,
329) (map[fileKey]gitindex.BlobLocation, map[string]plumbing.Hash, error) {
330 allFiles := map[fileKey]gitindex.BlobLocation{}
331 allVersions := map[string]plumbing.Hash{}
332 for _, p := range mf.Project {
333 rev := mf.ProjectRevision(&p)
334
335 projURL := baseURL
336 projURL.Path = path.Join(projURL.Path, p.Name)
337
338 topRepo, err := cache.Open(&projURL)
339 if err != nil {
340 return nil, nil, err
341 }
342
343 ref, err := topRepo.Reference(plumbing.ReferenceName(revPrefix+rev), true)
344 if err != nil {
345 return nil, nil, err
346 }
347
348 commit, err := topRepo.CommitObject(ref.Hash())
349 if err != nil {
350 return nil, nil, err
351 }
352 if err != nil {
353 return nil, nil, err
354 }
355
356 allVersions[p.GetPath()] = commit.Hash
357
358 tree, err := commit.Tree()
359 if err != nil {
360 return nil, nil, err
361 }
362
363 rw := gitindex.NewRepoWalker(topRepo, projURL.String(), cache)
364 subVersions, err := rw.CollectFiles(tree, rev, &ignore.Matcher{})
365 if err != nil {
366 return nil, nil, err
367 }
368
369 for key, repo := range rw.Files {
370 allFiles[fileKey{
371 SubRepoPath: filepath.Join(p.GetPath(), key.SubRepoPath),
372 Path: key.Path,
373 ID: key.ID,
374 }] = repo
375 }
376
377 for path, version := range subVersions {
378 allVersions[filepath.Join(p.GetPath(), path)] = version
379 }
380 }
381
382 return allFiles, allVersions, nil
383}