fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// zoekt-repo-index indexes a repo-based repository. The constituent git
16// repositories should already have been downloaded to the --repo_cache
17// directory, eg.
18//
19// go install github.com/sourcegraph/zoekt/cmd/zoekt-repo-index &&
20//
21// zoekt-repo-index -base_url https://gfiber.googlesource.com/ \
22// -manifest_repo_url https://gfiber.googlesource.com/manifests \
23// -manifest_rev_prefix=refs/heads/ \
24// -rev_prefix="refs/remotes/" \
25// -repo_cache ~/zoekt-serving/repos/ \
26// -shard_limit 50000000 \
27// master:default_unrestricted.xml
28package main
29
30import (
31 "crypto/sha1"
32 "flag"
33 "fmt"
34 "io"
35 "log"
36 "net/url"
37 "path"
38 "path/filepath"
39 "sort"
40 "strings"
41
42 "github.com/google/slothfs/manifest"
43 "github.com/sourcegraph/zoekt"
44 "github.com/sourcegraph/zoekt/build"
45 "github.com/sourcegraph/zoekt/gitindex"
46 "go.uber.org/automaxprocs/maxprocs"
47
48 git "github.com/go-git/go-git/v5"
49 "github.com/go-git/go-git/v5/plumbing"
50)
51
// Reference log.Println through the blank identifier so the "log" import is
// always used by this file.
var _ = log.Println
53
54type fileKey struct {
55 SubRepoPath string
56 Path string
57 ID plumbing.Hash
58}
59
60func (k *fileKey) FullPath() string {
61 return filepath.Join(k.SubRepoPath, k.Path)
62}
63
64type branchFile struct {
65 branch, file string
66 mf *manifest.Manifest
67 manifestPath string
68}
69
70func parseBranches(manifestRepoURL, revPrefix string, cache *gitindex.RepoCache, args []string) ([]branchFile, error) {
71 var branches []branchFile
72 if manifestRepoURL != "" {
73 u, err := url.Parse(manifestRepoURL)
74 if err != nil {
75 return nil, err
76 }
77
78 repo, err := cache.Open(u)
79 if err != nil {
80 return nil, err
81 }
82
83 for _, f := range args {
84 fs := strings.SplitN(f, ":", 2)
85 if len(fs) != 2 {
86 return nil, fmt.Errorf("cannot parse %q as BRANCH:FILE", f)
87 }
88 mf, err := getManifest(repo, revPrefix+fs[0], fs[1])
89 if err != nil {
90 return nil, fmt.Errorf("manifest %s:%s: %v", fs[0], fs[1], err)
91 }
92
93 branches = append(branches, branchFile{
94 branch: fs[0],
95 file: fs[1],
96 mf: mf,
97 manifestPath: cache.Path(u),
98 })
99 }
100 } else {
101 if len(args) == 0 {
102 return nil, fmt.Errorf("must give XML file argument")
103 }
104 for _, f := range args {
105 mf, err := manifest.ParseFile(f)
106 if err != nil {
107 return nil, err
108 }
109
110 branches = append(branches, branchFile{
111 branch: "HEAD",
112 file: filepath.Base(f),
113 mf: mf,
114 manifestPath: f,
115 })
116 }
117 }
118 return branches, nil
119}
120
121func main() {
122 sizeMax := flag.Int("file_limit", 128<<10, "maximum file size")
123 shardLimit := flag.Int("shard_limit", 100<<20, "maximum corpus size for a shard")
124 parallelism := flag.Int("parallelism", 1, "maximum number of parallel indexing processes")
125
126 revPrefix := flag.String("rev_prefix", "refs/remotes/origin/", "prefix for references")
127 baseURLStr := flag.String("base_url", "", "base url to interpret repository names")
128 repoCacheDir := flag.String("repo_cache", "", "root for repository cache")
129 indexDir := flag.String("index", build.DefaultDir, "index directory for *.zoekt files")
130 manifestRepoURL := flag.String("manifest_repo_url", "", "set a URL for a git repository holding manifest XML file. Provide the BRANCH:XML-FILE as further command-line arguments")
131 manifestRevPrefix := flag.String("manifest_rev_prefix", "refs/remotes/origin/", "prefixes for branches in manifest repository")
132 repoName := flag.String("name", "", "set repository name")
133 repoURL := flag.String("url", "", "set repository URL")
134 maxSubProjects := flag.Int("max_sub_projects", 0, "trim number of projects in manifest, for debugging.")
135 incremental := flag.Bool("incremental", true, "only index if the repository has changed.")
136 flag.Parse()
137
138 // Tune GOMAXPROCS to match Linux container CPU quota.
139 _, _ = maxprocs.Set()
140
141 if *repoCacheDir == "" {
142 log.Fatal("must set --repo_cache")
143 }
144 repoCache := gitindex.NewRepoCache(*repoCacheDir)
145
146 if u, err := url.Parse(*baseURLStr); err != nil {
147 log.Fatalf("Parse(%q): %v", u, err)
148 } else if *repoName == "" {
149 *repoName = filepath.Join(u.Host, u.Path)
150 }
151
152 opts := build.Options{
153 Parallelism: *parallelism,
154 SizeMax: *sizeMax,
155 ShardMax: *shardLimit,
156 IndexDir: *indexDir,
157 RepositoryDescription: zoekt.Repository{
158 Name: *repoName,
159 URL: *repoURL,
160 },
161 }
162 opts.SetDefaults()
163 baseURL, err := url.Parse(*baseURLStr)
164 if err != nil {
165 log.Fatalf("Parse baseURL %q: %v", *baseURLStr, err)
166 }
167
168 branches, err := parseBranches(*manifestRepoURL, *manifestRevPrefix, repoCache, flag.Args())
169 if err != nil {
170 log.Fatalf("parseBranches(%s, %s): %v", *manifestRepoURL, *manifestRevPrefix, err)
171 }
172 if len(branches) == 0 {
173 log.Fatal("must specify at least one branch")
174 }
175 if *maxSubProjects > 0 {
176 for _, b := range branches {
177 if *maxSubProjects < len(b.mf.Project) {
178 b.mf.Project = b.mf.Project[:*maxSubProjects]
179 }
180 }
181 }
182
183 perBranch := map[string]map[fileKey]gitindex.BlobLocation{}
184 opts.SubRepositories = map[string]*zoekt.Repository{}
185
186 // branch => repo => version
187 versionMap := map[string]map[string]plumbing.Hash{}
188 for _, br := range branches {
189 br.mf.Filter()
190 files, versions, err := iterateManifest(br.mf, *baseURL, *revPrefix, repoCache)
191 if err != nil {
192 log.Fatalf("iterateManifest: %v", err)
193 }
194
195 perBranch[br.branch] = files
196 for key, loc := range files {
197 _, ok := opts.SubRepositories[key.SubRepoPath]
198 if ok {
199 // This can be incorrect: if the layout of manifests
200 // changes across branches, then the same file could
201 // be in different subRepos. We'll pretend this is not
202 // a problem.
203 continue
204 }
205
206 desc := &zoekt.Repository{}
207 if err := gitindex.SetTemplatesFromOrigin(desc, loc.URL); err != nil {
208 log.Fatalf("SetTemplatesFromOrigin(%s): %v", loc.URL, err)
209 }
210
211 opts.SubRepositories[key.SubRepoPath] = desc
212 }
213 versionMap[br.branch] = versions
214 }
215
216 for _, br := range branches {
217 var paths []string
218 for p := range opts.SubRepositories {
219 paths = append(paths, p)
220 }
221 sort.Strings(paths)
222
223 // Compute a version of the aggregate. This version
224 // has nothing to do with git, but will let us do
225 // incrementality correctly.
226 hasher := sha1.New()
227 for _, p := range paths {
228 repo := opts.SubRepositories[p]
229 id := versionMap[br.branch][p]
230
231 // it is possible that 'id' is zero, if this
232 // branch of the manifest doesn't have this
233 // particular subrepository.
234 hasher.Write([]byte(p))
235 hasher.Write([]byte(id.String()))
236 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{
237 Name: br.branch,
238 Version: id.String(),
239 })
240 }
241
242 opts.RepositoryDescription.Branches = append(opts.RepositoryDescription.Branches, zoekt.RepositoryBranch{
243 Name: br.branch,
244 Version: fmt.Sprintf("%x", hasher.Sum(nil)),
245 })
246 }
247
248 // key => branch
249 all := map[fileKey][]string{}
250 for br, files := range perBranch {
251 for k := range files {
252 all[k] = append(all[k], br)
253 }
254 }
255
256 if *incremental && opts.IncrementalSkipIndexing() {
257 return
258 }
259
260 builder, err := build.NewBuilder(opts)
261 if err != nil {
262 log.Fatal(err)
263 }
264 for k, branches := range all {
265 loc := perBranch[branches[0]][k]
266 data, err := loc.Blob(&k.ID)
267 if err != nil {
268 log.Fatal(err)
269 }
270
271 doc := zoekt.Document{
272 Name: k.FullPath(),
273 Content: data,
274 SubRepositoryPath: k.SubRepoPath,
275 }
276
277 doc.Branches = append(doc.Branches, branches...)
278 if err := builder.Add(doc); err != nil {
279 log.Printf("Add(%s): %v", doc.Name, err)
280 break
281 }
282 }
283 if err := builder.Finish(); err != nil {
284 log.Fatalf("Finish: %v", err)
285 }
286}
287
288// getManifest parses the manifest XML at the given branch/path inside a Git repository.
289func getManifest(repo *git.Repository, branch, path string) (*manifest.Manifest, error) {
290 ref, err := repo.Reference(plumbing.ReferenceName("refs/heads/"+branch), true)
291 if err != nil {
292 return nil, err
293 }
294
295 commit, err := repo.CommitObject(ref.Hash())
296 if err != nil {
297 return nil, err
298 }
299
300 tree, err := repo.TreeObject(commit.TreeHash)
301 if err != nil {
302 return nil, err
303 }
304
305 entry, err := tree.FindEntry(path)
306 if err != nil {
307 return nil, err
308 }
309
310 blob, err := repo.BlobObject(entry.Hash)
311 if err != nil {
312 return nil, err
313 }
314 r, err := blob.Reader()
315 if err != nil {
316 return nil, err
317 }
318 defer r.Close()
319
320 content, _ := io.ReadAll(r)
321 return manifest.Parse(content)
322}
323
324// iterateManifest constructs a complete tree from the given Manifest.
325func iterateManifest(mf *manifest.Manifest,
326 baseURL url.URL, revPrefix string,
327 cache *gitindex.RepoCache,
328) (map[fileKey]gitindex.BlobLocation, map[string]plumbing.Hash, error) {
329 allFiles := map[fileKey]gitindex.BlobLocation{}
330 allVersions := map[string]plumbing.Hash{}
331 for _, p := range mf.Project {
332 rev := mf.ProjectRevision(&p)
333
334 projURL := baseURL
335 projURL.Path = path.Join(projURL.Path, p.Name)
336
337 topRepo, err := cache.Open(&projURL)
338 if err != nil {
339 return nil, nil, err
340 }
341
342 ref, err := topRepo.Reference(plumbing.ReferenceName(revPrefix+rev), true)
343 if err != nil {
344 return nil, nil, err
345 }
346
347 commit, err := topRepo.CommitObject(ref.Hash())
348 if err != nil {
349 return nil, nil, err
350 }
351 if err != nil {
352 return nil, nil, err
353 }
354
355 allVersions[p.GetPath()] = commit.Hash
356
357 tree, err := commit.Tree()
358 if err != nil {
359 return nil, nil, err
360 }
361
362 files, versions, err := gitindex.TreeToFiles(topRepo, tree, projURL.String(), cache)
363 if err != nil {
364 return nil, nil, err
365 }
366
367 for key, repo := range files {
368 allFiles[fileKey{
369 SubRepoPath: filepath.Join(p.GetPath(), key.SubRepoPath),
370 Path: key.Path,
371 ID: key.ID,
372 }] = repo
373 }
374
375 for path, version := range versions {
376 allVersions[filepath.Join(p.GetPath(), path)] = version
377 }
378 }
379
380 return allFiles, allVersions, nil
381}