fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// zoekt-repo-index indexes a repo-based repository. The constituent git 16// repositories should already have been downloaded to the --repo_cache 17// directory, eg. 18// 19// go install github.com/sourcegraph/zoekt/cmd/zoekt-repo-index && 20// 21// zoekt-repo-index -base_url https://gfiber.googlesource.com/ \ 22// -manifest_repo_url https://gfiber.googlesource.com/manifests \ 23// -manifest_rev_prefix=refs/heads/ \ 24// -rev_prefix="refs/remotes/" \ 25// -repo_cache ~/zoekt-serving/repos/ \ 26// -shard_limit 50000000 \ 27// master:default_unrestricted.xml 28package main 29 30import ( 31 "crypto/sha1" 32 "flag" 33 "fmt" 34 "io" 35 "log" 36 "net/url" 37 "path" 38 "path/filepath" 39 "sort" 40 "strings" 41 42 "github.com/google/slothfs/manifest" 43 "github.com/sourcegraph/zoekt" 44 "github.com/sourcegraph/zoekt/build" 45 "github.com/sourcegraph/zoekt/gitindex" 46 "github.com/sourcegraph/zoekt/ignore" 47 "go.uber.org/automaxprocs/maxprocs" 48 49 git "github.com/go-git/go-git/v5" 50 "github.com/go-git/go-git/v5/plumbing" 51) 52 53var _ = log.Println 54 55type fileKey struct { 56 SubRepoPath string 57 Path string 58 ID plumbing.Hash 59} 60 61func (k *fileKey) FullPath() string { 62 return filepath.Join(k.SubRepoPath, k.Path) 63} 64 65type branchFile struct { 66 branch, file string 67 mf *manifest.Manifest 68 manifestPath string 69} 70 71func parseBranches(manifestRepoURL, revPrefix string, cache *gitindex.RepoCache, args []string) ([]branchFile, error) { 72 var branches []branchFile 73 if manifestRepoURL != "" { 74 u, err := url.Parse(manifestRepoURL) 75 if err != nil { 76 return nil, err 77 } 78 79 repo, err := cache.Open(u) 80 if err != nil { 81 return nil, err 82 } 83 84 for _, f := range args { 85 fs := strings.SplitN(f, ":", 2) 86 if len(fs) != 2 { 87 return nil, fmt.Errorf("cannot parse %q as BRANCH:FILE", f) 88 } 89 mf, err := getManifest(repo, revPrefix+fs[0], fs[1]) 90 if err != nil { 91 return nil, fmt.Errorf("manifest %s:%s: %v", fs[0], fs[1], err) 92 } 93 94 branches = append(branches, branchFile{ 95 branch: fs[0], 96 file: fs[1], 97 mf: mf, 98 manifestPath: cache.Path(u), 99 }) 100 } 101 } else { 102 if len(args) == 0 { 103 return nil, fmt.Errorf("must give XML file argument") 104 } 105 for _, f := range args { 106 mf, err := manifest.ParseFile(f) 107 if err != nil { 108 return nil, err 109 } 110 111 branches = append(branches, branchFile{ 112 branch: "HEAD", 113 file: filepath.Base(f), 114 mf: mf, 115 manifestPath: f, 116 }) 117 } 118 } 119 return branches, nil 120} 121 122func main() { 123 sizeMax := flag.Int("file_limit", 128<<10, "maximum file size") 124 shardLimit := flag.Int("shard_limit", 100<<20, "maximum corpus size for a shard") 125 parallelism := flag.Int("parallelism", 1, "maximum number of parallel indexing processes") 126 127 revPrefix := flag.String("rev_prefix", "refs/remotes/origin/", "prefix for references") 128 baseURLStr := flag.String("base_url", "", "base url to interpret repository names") 129 repoCacheDir := flag.String("repo_cache", "", "root for repository cache") 130 indexDir := flag.String("index", build.DefaultDir, "index directory for *.zoekt files") 131 manifestRepoURL := flag.String("manifest_repo_url", "", "set a URL for a git repository holding manifest XML file. Provide the BRANCH:XML-FILE as further command-line arguments") 132 manifestRevPrefix := flag.String("manifest_rev_prefix", "refs/remotes/origin/", "prefixes for branches in manifest repository") 133 repoName := flag.String("name", "", "set repository name") 134 repoURL := flag.String("url", "", "set repository URL") 135 maxSubProjects := flag.Int("max_sub_projects", 0, "trim number of projects in manifest, for debugging.") 136 incremental := flag.Bool("incremental", true, "only index if the repository has changed.") 137 flag.Parse() 138 139 // Tune GOMAXPROCS to match Linux container CPU quota. 140 _, _ = maxprocs.Set() 141 142 if *repoCacheDir == "" { 143 log.Fatal("must set --repo_cache") 144 } 145 repoCache := gitindex.NewRepoCache(*repoCacheDir) 146 147 if u, err := url.Parse(*baseURLStr); err != nil { 148 log.Fatalf("Parse(%q): %v", u, err) 149 } else if *repoName == "" { 150 *repoName = filepath.Join(u.Host, u.Path) 151 } 152 153 opts := build.Options{ 154 Parallelism: *parallelism, 155 SizeMax: *sizeMax, 156 ShardMax: *shardLimit, 157 IndexDir: *indexDir, 158 RepositoryDescription: zoekt.Repository{ 159 Name: *repoName, 160 URL: *repoURL, 161 }, 162 } 163 opts.SetDefaults() 164 baseURL, err := url.Parse(*baseURLStr) 165 if err != nil { 166 log.Fatalf("Parse baseURL %q: %v", *baseURLStr, err) 167 } 168 169 branches, err := parseBranches(*manifestRepoURL, *manifestRevPrefix, repoCache, flag.Args()) 170 if err != nil { 171 log.Fatalf("parseBranches(%s, %s): %v", *manifestRepoURL, *manifestRevPrefix, err) 172 } 173 if len(branches) == 0 { 174 log.Fatal("must specify at least one branch") 175 } 176 if *maxSubProjects > 0 { 177 for _, b := range branches { 178 if *maxSubProjects < len(b.mf.Project) { 179 b.mf.Project = b.mf.Project[:*maxSubProjects] 180 } 181 } 182 } 183 184 perBranch := map[string]map[fileKey]gitindex.BlobLocation{} 185 opts.SubRepositories = map[string]*zoekt.Repository{} 186 187 // branch => repo => version 188 versionMap := map[string]map[string]plumbing.Hash{} 189 for _, br := range branches { 190 br.mf.Filter() 191 files, versions, err := iterateManifest(br.mf, *baseURL, *revPrefix, repoCache) 192 if err != nil { 193 log.Fatalf("iterateManifest: %v", err) 194 } 195 196 perBranch[br.branch] = files 197 for key, repo := range files { 198 _, ok := opts.SubRepositories[key.SubRepoPath] 199 if ok { 200 // This can be incorrect: if the layout of manifests 201 // changes across branches, then the same file could 202 // be in different subRepos. We'll pretend this is not 203 // a problem. 204 continue 205 } 206 207 desc := &zoekt.Repository{} 208 if err := gitindex.SetTemplatesFromOrigin(desc, repo.URL); err != nil { 209 log.Fatalf("SetTemplatesFromOrigin(%s): %v", repo.URL, err) 210 } 211 212 opts.SubRepositories[key.SubRepoPath] = desc 213 } 214 versionMap[br.branch] = versions 215 } 216 217 for _, br := range branches { 218 var paths []string 219 for p := range opts.SubRepositories { 220 paths = append(paths, p) 221 } 222 sort.Strings(paths) 223 224 // Compute a version of the aggregate. This version 225 // has nothing to do with git, but will let us do 226 // incrementality correctly. 227 hasher := sha1.New() 228 for _, p := range paths { 229 repo := opts.SubRepositories[p] 230 id := versionMap[br.branch][p] 231 232 // it is possible that 'id' is zero, if this 233 // branch of the manifest doesn't have this 234 // particular subrepository. 235 hasher.Write([]byte(p)) 236 hasher.Write([]byte(id.String())) 237 repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{ 238 Name: br.branch, 239 Version: id.String(), 240 }) 241 } 242 243 opts.RepositoryDescription.Branches = append(opts.RepositoryDescription.Branches, zoekt.RepositoryBranch{ 244 Name: br.branch, 245 Version: fmt.Sprintf("%x", hasher.Sum(nil)), 246 }) 247 } 248 249 // key => branch 250 all := map[fileKey][]string{} 251 for br, files := range perBranch { 252 for k := range files { 253 all[k] = append(all[k], br) 254 } 255 } 256 257 if *incremental && opts.IncrementalSkipIndexing() { 258 return 259 } 260 261 builder, err := build.NewBuilder(opts) 262 if err != nil { 263 log.Fatal(err) 264 } 265 for k, branches := range all { 266 loc := perBranch[branches[0]][k] 267 data, err := loc.Blob(&k.ID) 268 if err != nil { 269 log.Fatal(err) 270 } 271 272 doc := zoekt.Document{ 273 Name: k.FullPath(), 274 Content: data, 275 SubRepositoryPath: k.SubRepoPath, 276 } 277 278 doc.Branches = append(doc.Branches, branches...) 279 if err := builder.Add(doc); err != nil { 280 log.Printf("Add(%s): %v", doc.Name, err) 281 break 282 } 283 } 284 if err := builder.Finish(); err != nil { 285 log.Fatalf("Finish: %v", err) 286 } 287} 288 289// getManifest parses the manifest XML at the given branch/path inside a Git repository. 290func getManifest(repo *git.Repository, branch, path string) (*manifest.Manifest, error) { 291 ref, err := repo.Reference(plumbing.ReferenceName("refs/heads/"+branch), true) 292 if err != nil { 293 return nil, err 294 } 295 296 commit, err := repo.CommitObject(ref.Hash()) 297 if err != nil { 298 return nil, err 299 } 300 301 tree, err := repo.TreeObject(commit.TreeHash) 302 if err != nil { 303 return nil, err 304 } 305 306 entry, err := tree.FindEntry(path) 307 if err != nil { 308 return nil, err 309 } 310 311 blob, err := repo.BlobObject(entry.Hash) 312 if err != nil { 313 return nil, err 314 } 315 r, err := blob.Reader() 316 if err != nil { 317 return nil, err 318 } 319 defer r.Close() 320 321 content, _ := io.ReadAll(r) 322 return manifest.Parse(content) 323} 324 325// iterateManifest constructs a complete tree from the given Manifest. 326func iterateManifest(mf *manifest.Manifest, 327 baseURL url.URL, revPrefix string, 328 cache *gitindex.RepoCache, 329) (map[fileKey]gitindex.BlobLocation, map[string]plumbing.Hash, error) { 330 allFiles := map[fileKey]gitindex.BlobLocation{} 331 allVersions := map[string]plumbing.Hash{} 332 for _, p := range mf.Project { 333 rev := mf.ProjectRevision(&p) 334 335 projURL := baseURL 336 projURL.Path = path.Join(projURL.Path, p.Name) 337 338 topRepo, err := cache.Open(&projURL) 339 if err != nil { 340 return nil, nil, err 341 } 342 343 ref, err := topRepo.Reference(plumbing.ReferenceName(revPrefix+rev), true) 344 if err != nil { 345 return nil, nil, err 346 } 347 348 commit, err := topRepo.CommitObject(ref.Hash()) 349 if err != nil { 350 return nil, nil, err 351 } 352 if err != nil { 353 return nil, nil, err 354 } 355 356 allVersions[p.GetPath()] = commit.Hash 357 358 tree, err := commit.Tree() 359 if err != nil { 360 return nil, nil, err 361 } 362 363 rw := gitindex.NewRepoWalker(topRepo, projURL.String(), cache) 364 subVersions, err := rw.CollectFiles(tree, rev, &ignore.Matcher{}) 365 if err != nil { 366 return nil, nil, err 367 } 368 369 for key, repo := range rw.Files { 370 allFiles[fileKey{ 371 SubRepoPath: filepath.Join(p.GetPath(), key.SubRepoPath), 372 Path: key.Path, 373 ID: key.ID, 374 }] = repo 375 } 376 377 for path, version := range subVersions { 378 allVersions[filepath.Join(p.GetPath(), path)] = version 379 } 380 } 381 382 return allFiles, allVersions, nil 383}