fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package gitindex
16
17import (
18 "fmt"
19 "io"
20 "log"
21 "net/url"
22 "path"
23 "path/filepath"
24 "strings"
25
26 "github.com/go-git/go-git/v5"
27 "github.com/go-git/go-git/v5/plumbing"
28 "github.com/go-git/go-git/v5/plumbing/filemode"
29 "github.com/go-git/go-git/v5/plumbing/object"
30
31 "github.com/sourcegraph/zoekt/ignore"
32)
33
34// RepoWalker walks one or more commit trees, collecting the files to index in its Files map.
35//
36// It also recurses into submodules if Options.Submodules is enabled.
37type RepoWalker struct {
38 Files map[fileKey]BlobLocation
39
40 repo *git.Repository
41 repoURL *url.URL
42
43 // Path => SubmoduleEntry
44 submodules map[string]*SubmoduleEntry
45 repoCache *RepoCache
46}
47
48// subURL returns the URL for a submodule.
49func (rw *RepoWalker) subURL(relURL string) (*url.URL, error) {
50 if rw.repoURL == nil {
51 return nil, fmt.Errorf("no URL for base repo")
52 }
53 if strings.HasPrefix(relURL, "../") {
54 u := *rw.repoURL
55 u.Path = path.Join(u.Path, relURL)
56 return &u, nil
57 }
58
59 return url.Parse(relURL)
60}
61
62// NewRepoWalker creates a new RepoWalker.
63func NewRepoWalker(r *git.Repository, repoURL string, repoCache *RepoCache) *RepoWalker {
64 u, _ := url.Parse(repoURL)
65 return &RepoWalker{
66 repo: r,
67 repoURL: u,
68 Files: map[fileKey]BlobLocation{},
69 repoCache: repoCache,
70 }
71}
72
73// parseModuleMap initializes rw.submodules.
74func (rw *RepoWalker) parseModuleMap(t *object.Tree) error {
75 if rw.repoCache == nil {
76 return nil
77 }
78 modEntry, _ := t.File(".gitmodules")
79 if modEntry != nil {
80 c, err := blobContents(&modEntry.Blob)
81 if err != nil {
82 return fmt.Errorf("blobContents: %w", err)
83 }
84 mods, err := ParseGitModules(c)
85 if err != nil {
86 return fmt.Errorf("ParseGitModules: %w", err)
87 }
88 rw.submodules = map[string]*SubmoduleEntry{}
89 for _, entry := range mods {
90 rw.submodules[entry.Path] = entry
91 }
92 }
93 return nil
94}
95
96// CollectFiles fetches the blob SHA1s for the tree. If repoCache is
97// non-nil, recurse into submodules. In addition, it returns a mapping
98// that indicates in which repo each SHA1 can be found.
99//
100// The collected files are available through the RepoWalker.Files map.
101func (rw *RepoWalker) CollectFiles(t *object.Tree, branch string, ig *ignore.Matcher) (map[string]plumbing.Hash, error) {
102 if err := rw.parseModuleMap(t); err != nil {
103 return nil, fmt.Errorf("parseModuleMap: %w", err)
104 }
105
106 ig, err := newIgnoreMatcher(t)
107 if err != nil {
108 return nil, fmt.Errorf("newIgnoreMatcher: %w", err)
109 }
110
111 tw := object.NewTreeWalker(t, true, make(map[plumbing.Hash]bool))
112 defer tw.Close()
113
114 // Path => commit SHA1
115 subRepoVersions := make(map[string]plumbing.Hash)
116 for {
117 name, entry, err := tw.Next()
118 if err == io.EOF {
119 break
120 }
121 if err := rw.handleEntry(name, &entry, branch, subRepoVersions, ig); err != nil {
122 return nil, fmt.Errorf("handleEntry: %w", err)
123 }
124 }
125 return subRepoVersions, nil
126}
127
128func (rw *RepoWalker) tryHandleSubmodule(p string, id *plumbing.Hash, branch string, subRepoVersions map[string]plumbing.Hash, ig *ignore.Matcher) error {
129 if err := rw.handleSubmodule(p, id, branch, subRepoVersions, ig); err != nil {
130 log.Printf("submodule %s: ignoring error %v", p, err)
131 }
132 return nil
133}
134
135func (rw *RepoWalker) handleSubmodule(p string, id *plumbing.Hash, branch string, subRepoVersions map[string]plumbing.Hash, ig *ignore.Matcher) error {
136 submod := rw.submodules[p]
137 if submod == nil {
138 return fmt.Errorf("no entry for submodule path %q", rw.repoURL)
139 }
140
141 subURL, err := rw.subURL(submod.URL)
142 if err != nil {
143 return err
144 }
145
146 subRepo, err := rw.repoCache.Open(subURL)
147 if err != nil {
148 return err
149 }
150
151 obj, err := subRepo.CommitObject(*id)
152 if err != nil {
153 return err
154 }
155 tree, err := subRepo.TreeObject(obj.TreeHash)
156 if err != nil {
157 return err
158 }
159
160 subRepoVersions[p] = *id
161
162 sw := NewRepoWalker(subRepo, subURL.String(), rw.repoCache)
163 subVersions, err := sw.CollectFiles(tree, branch, ig)
164 if err != nil {
165 return err
166 }
167 for k, repo := range sw.Files {
168 rw.Files[fileKey{
169 SubRepoPath: filepath.Join(p, k.SubRepoPath),
170 Path: k.Path,
171 ID: k.ID,
172 }] = repo
173 }
174 for k, v := range subVersions {
175 subRepoVersions[filepath.Join(p, k)] = v
176 }
177 return nil
178}
179
180func (rw *RepoWalker) handleEntry(p string, e *object.TreeEntry, branch string, subRepoVersions map[string]plumbing.Hash, ig *ignore.Matcher) error {
181 if e.Mode == filemode.Submodule {
182 if rw.repoCache != nil {
183 // Index the submodule using repo cache
184 if err := rw.tryHandleSubmodule(p, &e.Hash, branch, subRepoVersions, ig); err != nil {
185 return fmt.Errorf("submodule %s: %v", p, err)
186 }
187 } else {
188 // Record the commit ID for the submodule path
189 // This will be the submodule's commit hash, not the parent's
190 subRepoVersions[p] = e.Hash
191 }
192 }
193
194 switch e.Mode {
195 case filemode.Regular, filemode.Executable, filemode.Symlink:
196 default:
197 return nil
198 }
199
200 // Skip ignored files
201 if ig.Match(p) {
202 return nil
203 }
204
205 key := fileKey{Path: p, ID: e.Hash}
206 if existing, ok := rw.Files[key]; ok {
207 existing.Branches = append(existing.Branches, branch)
208 rw.Files[key] = existing
209 } else {
210 rw.Files[key] = BlobLocation{GitRepo: rw.repo, URL: rw.repoURL, Branches: []string{branch}}
211 }
212
213 return nil
214}
215
216// fileKey describes a blob at a location in the final tree. We also
217// record the subrepository from where it came.
218type fileKey struct {
219 SubRepoPath string
220 Path string
221 ID plumbing.Hash
222}
223
224func (k *fileKey) FullPath() string {
225 return filepath.Join(k.SubRepoPath, k.Path)
226}
227
228// BlobLocation holds the repo where the blob can be found, plus other information
229// needed for indexing like its branches.
230type BlobLocation struct {
231 GitRepo *git.Repository
232 URL *url.URL
233
234 // Branches is the list of branches that contain the blob.
235 Branches []string
236}
237
238func (l *BlobLocation) Blob(id *plumbing.Hash) ([]byte, error) {
239 blob, err := l.GitRepo.BlobObject(*id)
240 if err != nil {
241 return nil, err
242 }
243 return blobContents(blob)
244}