fork of https://github.com/sourcegraph/zoekt
// Package archive provides indexing of archives from remote URLs.
2package archive
3
4import (
5 "errors"
6 "fmt"
7 "io"
8 "net/url"
9 "strings"
10 "sync"
11
12 "github.com/sourcegraph/zoekt"
13 "github.com/sourcegraph/zoekt/build"
14)
15
// Options holds the archive-specific settings consumed by Index.
type Options struct {
	// Incremental lets Index return early, before the archive is opened,
	// when the build options report that indexing can be skipped.
	Incremental bool

	// Archive is the location of the archive to index. SetDefaults parses
	// it as a URL in order to guess the remaining fields.
	Archive string
	// Name is the repository name recorded in the repository description.
	// Index requires Name or RepoURL to be set.
	Name string
	// RepoURL is the URL of the repository; SetDefaults derives it together
	// with Name for GitHub archives.
	RepoURL string
	// Branch is the branch name every indexed document is attributed to.
	// Index requires it.
	Branch string
	// Commit is recorded as the version of Branch.
	Commit string
	// Strip is the number of leading path elements removed from each file
	// name in the archive.
	Strip int
}
27
28func (o *Options) SetDefaults() {
29 // We guess based on the archive URL.
30 u, _ := url.Parse(o.Archive)
31 if u == nil {
32 return
33 }
34
35 setRef := func(ref string) {
36 if isGitOID(ref) && o.Commit == "" {
37 o.Commit = ref
38 }
39 if !isGitOID(ref) && o.Branch == "" {
40 o.Branch = ref
41 }
42 }
43
44 switch u.Host {
45 case "github.com", "codeload.github.com":
46 // https://github.com/octokit/octokit.rb/commit/3d21ec53a331a6f037a91c368710b99387d012c1
47 // https://github.com/octokit/octokit.rb/blob/master/README.md
48 // https://github.com/octokit/octokit.rb/tree/master/lib
49 // https://codeload.github.com/octokit/octokit.rb/legacy.tar.gz/master
50 parts := strings.Split(u.Path, "/")
51 if len(parts) > 2 && o.Name == "" {
52 o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2])
53 o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2])
54 }
55 if len(parts) > 4 {
56 setRef(parts[4])
57 if u.Host == "github.com" {
58 o.Archive = fmt.Sprintf("https://codeload.github.com/%s/%s/legacy.tar.gz/%s", parts[1], parts[2], parts[4])
59 }
60 }
61 o.Strip = 1
62 case "api.github.com":
63 // https://api.github.com/repos/octokit/octokit.rb/tarball/master
64 parts := strings.Split(u.Path, "/")
65 if len(parts) > 2 && o.Name == "" {
66 o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2])
67 o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2])
68 }
69 if len(parts) > 5 {
70 setRef(parts[5])
71 }
72 o.Strip = 1
73 }
74}
75
76// Index archive specified in opts using bopts.
77func Index(opts Options, bopts build.Options) error {
78 opts.SetDefaults()
79
80 if opts.Name == "" && opts.RepoURL == "" {
81 return errors.New("-name or -url required")
82 }
83 if opts.Branch == "" {
84 return errors.New("-branch required")
85 }
86
87 if opts.Name != "" {
88 bopts.RepositoryDescription.Name = opts.Name
89 }
90 // We do not use this functionality to avoid pulling in the transitive deps of gitindex
91 /*
92 if opts.RepoURL != "" {
93 u, err := url.Parse(opts.RepoURL)
94 if err != nil {
95 return err
96 }
97 if err := gitindex.SetTemplatesFromOrigin(&bopts.RepositoryDescription, u); err != nil {
98 return err
99 }
100 }
101 */
102 bopts.SetDefaults()
103 bopts.RepositoryDescription.Branches = []zoekt.RepositoryBranch{{Name: opts.Branch, Version: opts.Commit}}
104 brs := []string{opts.Branch}
105
106 if opts.Incremental && bopts.IncrementalSkipIndexing() {
107 return nil
108 }
109
110 a, err := openArchive(opts.Archive)
111 if err != nil {
112 return err
113 }
114 defer a.Close()
115
116 bopts.RepositoryDescription.Source = opts.Archive
117 var builder *build.Builder
118
119 once := sync.Once{}
120 var onceErr error
121 add := func(f *File) error {
122 defer f.Close()
123
124 once.Do(func() {
125 // We use the ModTime of the first file as a proxy for the latest commit date.
126 bopts.RepositoryDescription.LatestCommitDate = f.ModTime
127 builder, onceErr = build.NewBuilder(bopts)
128 })
129 if onceErr != nil {
130 return onceErr
131 }
132
133 contents, err := io.ReadAll(f)
134 if err != nil {
135 return err
136 }
137
138 name := stripComponents(f.Name, opts.Strip)
139 if name == "" {
140 return nil
141 }
142
143 return builder.Add(zoekt.Document{
144 Name: name,
145 Content: contents,
146 Branches: brs,
147 })
148 }
149
150 for {
151 f, err := a.Next()
152 if err == io.EOF {
153 break
154 }
155 if err != nil {
156 return err
157 }
158
159 if err := add(f); err != nil {
160 return err
161 }
162 }
163
164 return builder.Finish()
165}
166
// stripComponents drops the first count slash-separated elements of path and
// returns what is left. When path runs out of elements before count of them
// have been removed, the result is the empty string.
func stripComponents(path string, count int) string {
	rest := path
	for removed := 0; removed < count; removed++ {
		if rest == "" {
			break
		}
		sep := strings.Index(rest, "/")
		if sep == -1 {
			// No separator left: the remaining text is itself an element to drop.
			return ""
		}
		rest = rest[sep+1:]
	}
	return rest
}
179
// isGitOID reports whether s has the shape of a git object ID: exactly 40
// hexadecimal digits, in either case.
//
// Note: a match neither proves that the object exists in any repository nor
// rules out that s names a ref, since git accepts 40-character hexadecimal
// strings as reference names.
func isGitOID(s string) bool {
	const oidLen = 40
	if len(s) != oidLen {
		return false
	}
	for i := 0; i < len(s); i++ {
		switch c := s[i]; {
		case c >= '0' && c <= '9':
		case c >= 'a' && c <= 'f':
		case c >= 'A' && c <= 'F':
		default:
			return false
		}
	}
	return true
}