fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// zoekt-repo-index indexes a repo-based repository. The constituent git 16// repositories should already have been downloaded to the --repo_cache 17// directory, eg. 18// 19// go install github.com/sourcegraph/zoekt/cmd/zoekt-repo-index && 20// 21// zoekt-repo-index -base_url https://gfiber.googlesource.com/ \ 22// -manifest_repo_url https://gfiber.googlesource.com/manifests \ 23// -manifest_rev_prefix=refs/heads/ \ 24// -rev_prefix="refs/remotes/" \ 25// -repo_cache ~/zoekt-serving/repos/ \ 26// -shard_limit 50000000 \ 27// master:default_unrestricted.xml 28package main 29 30import ( 31 "crypto/sha1" 32 "flag" 33 "fmt" 34 "io" 35 "log" 36 "net/url" 37 "path" 38 "path/filepath" 39 "sort" 40 "strings" 41 42 "github.com/google/slothfs/manifest" 43 "github.com/sourcegraph/zoekt" 44 "github.com/sourcegraph/zoekt/build" 45 "github.com/sourcegraph/zoekt/gitindex" 46 "go.uber.org/automaxprocs/maxprocs" 47 48 git "github.com/go-git/go-git/v5" 49 "github.com/go-git/go-git/v5/plumbing" 50) 51 52var _ = log.Println 53 54type fileKey struct { 55 SubRepoPath string 56 Path string 57 ID plumbing.Hash 58} 59 60func (k *fileKey) FullPath() string { 61 return filepath.Join(k.SubRepoPath, k.Path) 62} 63 64type branchFile struct { 65 branch, file string 66 mf *manifest.Manifest 67 manifestPath string 68} 69 70func parseBranches(manifestRepoURL, revPrefix string, cache *gitindex.RepoCache, args []string) ([]branchFile, error) { 71 var branches []branchFile 72 if manifestRepoURL != "" { 73 u, err := url.Parse(manifestRepoURL) 74 if err != nil { 75 return nil, err 76 } 77 78 repo, err := cache.Open(u) 79 if err != nil { 80 return nil, err 81 } 82 83 for _, f := range args { 84 fs := strings.SplitN(f, ":", 2) 85 if len(fs) != 2 { 86 return nil, fmt.Errorf("cannot parse %q as BRANCH:FILE", f) 87 } 88 mf, err := getManifest(repo, revPrefix+fs[0], fs[1]) 89 if err != nil { 90 return nil, fmt.Errorf("manifest %s:%s: %v", fs[0], fs[1], err) 91 } 92 93 branches = append(branches, branchFile{ 94 branch: fs[0], 95 file: fs[1], 96 mf: mf, 97 manifestPath: cache.Path(u), 98 }) 99 } 100 } else { 101 if len(args) == 0 { 102 return nil, fmt.Errorf("must give XML file argument") 103 } 104 for _, f := range args { 105 mf, err := manifest.ParseFile(f) 106 if err != nil { 107 return nil, err 108 } 109 110 branches = append(branches, branchFile{ 111 branch: "HEAD", 112 file: filepath.Base(f), 113 mf: mf, 114 manifestPath: f, 115 }) 116 } 117 } 118 return branches, nil 119} 120 121func main() { 122 sizeMax := flag.Int("file_limit", 128<<10, "maximum file size") 123 shardLimit := flag.Int("shard_limit", 100<<20, "maximum corpus size for a shard") 124 parallelism := flag.Int("parallelism", 1, "maximum number of parallel indexing processes") 125 126 revPrefix := flag.String("rev_prefix", "refs/remotes/origin/", "prefix for references") 127 baseURLStr := flag.String("base_url", "", "base url to interpret repository names") 128 repoCacheDir := flag.String("repo_cache", "", "root for repository cache") 129 indexDir := flag.String("index", build.DefaultDir, "index directory for *.zoekt files") 130 manifestRepoURL := flag.String("manifest_repo_url", "", "set a URL for a git repository holding manifest XML file. Provide the BRANCH:XML-FILE as further command-line arguments") 131 manifestRevPrefix := flag.String("manifest_rev_prefix", "refs/remotes/origin/", "prefixes for branches in manifest repository") 132 repoName := flag.String("name", "", "set repository name") 133 repoURL := flag.String("url", "", "set repository URL") 134 maxSubProjects := flag.Int("max_sub_projects", 0, "trim number of projects in manifest, for debugging.") 135 incremental := flag.Bool("incremental", true, "only index if the repository has changed.") 136 flag.Parse() 137 138 // Tune GOMAXPROCS to match Linux container CPU quota. 139 _, _ = maxprocs.Set() 140 141 if *repoCacheDir == "" { 142 log.Fatal("must set --repo_cache") 143 } 144 repoCache := gitindex.NewRepoCache(*repoCacheDir) 145 146 if u, err := url.Parse(*baseURLStr); err != nil { 147 log.Fatalf("Parse(%q): %v", u, err) 148 } else if *repoName == "" { 149 *repoName = filepath.Join(u.Host, u.Path) 150 } 151 152 opts := build.Options{ 153 Parallelism: *parallelism, 154 SizeMax: *sizeMax, 155 ShardMax: *shardLimit, 156 IndexDir: *indexDir, 157 RepositoryDescription: zoekt.Repository{ 158 Name: *repoName, 159 URL: *repoURL, 160 }, 161 } 162 opts.SetDefaults() 163 baseURL, err := url.Parse(*baseURLStr) 164 if err != nil { 165 log.Fatalf("Parse baseURL %q: %v", *baseURLStr, err) 166 } 167 168 branches, err := parseBranches(*manifestRepoURL, *manifestRevPrefix, repoCache, flag.Args()) 169 if err != nil { 170 log.Fatalf("parseBranches(%s, %s): %v", *manifestRepoURL, *manifestRevPrefix, err) 171 } 172 if len(branches) == 0 { 173 log.Fatal("must specify at least one branch") 174 } 175 if *maxSubProjects > 0 { 176 for _, b := range branches { 177 if *maxSubProjects < len(b.mf.Project) { 178 b.mf.Project = b.mf.Project[:*maxSubProjects] 179 } 180 } 181 } 182 183 perBranch := map[string]map[fileKey]gitindex.BlobLocation{} 184 opts.SubRepositories = map[string]*zoekt.Repository{} 185 186 // branch => repo => version 187 versionMap := map[string]map[string]plumbing.Hash{} 188 for _, br := range branches { 189 br.mf.Filter() 190 files, versions, err := iterateManifest(br.mf, *baseURL, *revPrefix, repoCache) 191 if err != nil { 192 log.Fatalf("iterateManifest: %v", err) 193 } 194 195 perBranch[br.branch] = files 196 for key, loc := range files { 197 _, ok := opts.SubRepositories[key.SubRepoPath] 198 if ok { 199 // This can be incorrect: if the layout of manifests 200 // changes across branches, then the same file could 201 // be in different subRepos. We'll pretend this is not 202 // a problem. 203 continue 204 } 205 206 desc := &zoekt.Repository{} 207 if err := gitindex.SetTemplatesFromOrigin(desc, loc.URL); err != nil { 208 log.Fatalf("SetTemplatesFromOrigin(%s): %v", loc.URL, err) 209 } 210 211 opts.SubRepositories[key.SubRepoPath] = desc 212 } 213 versionMap[br.branch] = versions 214 } 215 216 for _, br := range branches { 217 var paths []string 218 for p := range opts.SubRepositories { 219 paths = append(paths, p) 220 } 221 sort.Strings(paths) 222 223 // Compute a version of the aggregate. This version 224 // has nothing to do with git, but will let us do 225 // incrementality correctly. 226 hasher := sha1.New() 227 for _, p := range paths { 228 repo := opts.SubRepositories[p] 229 id := versionMap[br.branch][p] 230 231 // it is possible that 'id' is zero, if this 232 // branch of the manifest doesn't have this 233 // particular subrepository. 234 hasher.Write([]byte(p)) 235 hasher.Write([]byte(id.String())) 236 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{ 237 Name: br.branch, 238 Version: id.String(), 239 }) 240 } 241 242 opts.RepositoryDescription.Branches = append(opts.RepositoryDescription.Branches, zoekt.RepositoryBranch{ 243 Name: br.branch, 244 Version: fmt.Sprintf("%x", hasher.Sum(nil)), 245 }) 246 } 247 248 // key => branch 249 all := map[fileKey][]string{} 250 for br, files := range perBranch { 251 for k := range files { 252 all[k] = append(all[k], br) 253 } 254 } 255 256 if *incremental && opts.IncrementalSkipIndexing() { 257 return 258 } 259 260 builder, err := build.NewBuilder(opts) 261 if err != nil { 262 log.Fatal(err) 263 } 264 for k, branches := range all { 265 loc := perBranch[branches[0]][k] 266 data, err := loc.Blob(&k.ID) 267 if err != nil { 268 log.Fatal(err) 269 } 270 271 doc := zoekt.Document{ 272 Name: k.FullPath(), 273 Content: data, 274 SubRepositoryPath: k.SubRepoPath, 275 } 276 277 doc.Branches = append(doc.Branches, branches...) 278 if err := builder.Add(doc); err != nil { 279 log.Printf("Add(%s): %v", doc.Name, err) 280 break 281 } 282 } 283 if err := builder.Finish(); err != nil { 284 log.Fatalf("Finish: %v", err) 285 } 286} 287 288// getManifest parses the manifest XML at the given branch/path inside a Git repository. 289func getManifest(repo *git.Repository, branch, path string) (*manifest.Manifest, error) { 290 ref, err := repo.Reference(plumbing.ReferenceName("refs/heads/"+branch), true) 291 if err != nil { 292 return nil, err 293 } 294 295 commit, err := repo.CommitObject(ref.Hash()) 296 if err != nil { 297 return nil, err 298 } 299 300 tree, err := repo.TreeObject(commit.TreeHash) 301 if err != nil { 302 return nil, err 303 } 304 305 entry, err := tree.FindEntry(path) 306 if err != nil { 307 return nil, err 308 } 309 310 blob, err := repo.BlobObject(entry.Hash) 311 if err != nil { 312 return nil, err 313 } 314 r, err := blob.Reader() 315 if err != nil { 316 return nil, err 317 } 318 defer r.Close() 319 320 content, _ := io.ReadAll(r) 321 return manifest.Parse(content) 322} 323 324// iterateManifest constructs a complete tree from the given Manifest. 325func iterateManifest(mf *manifest.Manifest, 326 baseURL url.URL, revPrefix string, 327 cache *gitindex.RepoCache, 328) (map[fileKey]gitindex.BlobLocation, map[string]plumbing.Hash, error) { 329 allFiles := map[fileKey]gitindex.BlobLocation{} 330 allVersions := map[string]plumbing.Hash{} 331 for _, p := range mf.Project { 332 rev := mf.ProjectRevision(&p) 333 334 projURL := baseURL 335 projURL.Path = path.Join(projURL.Path, p.Name) 336 337 topRepo, err := cache.Open(&projURL) 338 if err != nil { 339 return nil, nil, err 340 } 341 342 ref, err := topRepo.Reference(plumbing.ReferenceName(revPrefix+rev), true) 343 if err != nil { 344 return nil, nil, err 345 } 346 347 commit, err := topRepo.CommitObject(ref.Hash()) 348 if err != nil { 349 return nil, nil, err 350 } 351 if err != nil { 352 return nil, nil, err 353 } 354 355 allVersions[p.GetPath()] = commit.Hash 356 357 tree, err := commit.Tree() 358 if err != nil { 359 return nil, nil, err 360 } 361 362 files, versions, err := gitindex.TreeToFiles(topRepo, tree, projURL.String(), cache) 363 if err != nil { 364 return nil, nil, err 365 } 366 367 for key, repo := range files { 368 allFiles[fileKey{ 369 SubRepoPath: filepath.Join(p.GetPath(), key.SubRepoPath), 370 Path: key.Path, 371 ID: key.ID, 372 }] = repo 373 } 374 375 for path, version := range versions { 376 allVersions[filepath.Join(p.GetPath(), path)] = version 377 } 378 } 379 380 return allFiles, allVersions, nil 381}