// Fork of https://github.com/sourcegraph/zoekt

// Package archive provides indexing of archives from remote URLs.
2package archive
3
4import (
5 "errors"
6 "fmt"
7 "io"
8 "net/url"
9 "strings"
10
11 "github.com/sourcegraph/zoekt"
12 "github.com/sourcegraph/zoekt/build"
13)
14
// Options specifies the archive-specific indexing options.
//
// Fields left empty may be filled in by SetDefaults when Archive points at a
// recognized GitHub host.
type Options struct {
	// Incremental lets Index return early, without opening the archive,
	// when the build options report that indexing can be skipped.
	Incremental bool

	// Archive is the location of the archive to index. It is parsed as a
	// URL by SetDefaults and recorded as the repository source by Index.
	Archive string
	// Name is the repository name stored in the repository description.
	Name string
	// RepoURL is the URL of the repository the archive belongs to. Index
	// requires at least one of Name and RepoURL to be non-empty.
	RepoURL string
	// Branch is the branch name attached to every indexed document. Index
	// fails if it is empty.
	Branch string
	// Commit is recorded as the version of Branch.
	Commit string
	// Strip is the number of leading path components removed from each
	// file name in the archive before it is indexed.
	Strip int
}
26
27func (o *Options) SetDefaults() {
28 // We guess based on the archive URL.
29 u, _ := url.Parse(o.Archive)
30 if u == nil {
31 return
32 }
33
34 setRef := func(ref string) {
35 if isGitOID(ref) && o.Commit == "" {
36 o.Commit = ref
37 }
38 if !isGitOID(ref) && o.Branch == "" {
39 o.Branch = ref
40 }
41 }
42
43 switch u.Host {
44 case "github.com", "codeload.github.com":
45 // https://github.com/octokit/octokit.rb/commit/3d21ec53a331a6f037a91c368710b99387d012c1
46 // https://github.com/octokit/octokit.rb/blob/master/README.md
47 // https://github.com/octokit/octokit.rb/tree/master/lib
48 // https://codeload.github.com/octokit/octokit.rb/legacy.tar.gz/master
49 parts := strings.Split(u.Path, "/")
50 if len(parts) > 2 && o.Name == "" {
51 o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2])
52 o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2])
53 }
54 if len(parts) > 4 {
55 setRef(parts[4])
56 if u.Host == "github.com" {
57 o.Archive = fmt.Sprintf("https://codeload.github.com/%s/%s/legacy.tar.gz/%s", parts[1], parts[2], parts[4])
58 }
59 }
60 o.Strip = 1
61 case "api.github.com":
62 // https://api.github.com/repos/octokit/octokit.rb/tarball/master
63 parts := strings.Split(u.Path, "/")
64 if len(parts) > 2 && o.Name == "" {
65 o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2])
66 o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2])
67 }
68 if len(parts) > 5 {
69 setRef(parts[5])
70 }
71 o.Strip = 1
72 }
73}
74
75// Index archive specified in opts using bopts.
76func Index(opts Options, bopts build.Options) error {
77 opts.SetDefaults()
78
79 if opts.Name == "" && opts.RepoURL == "" {
80 return errors.New("-name or -url required")
81 }
82 if opts.Branch == "" {
83 return errors.New("-branch required")
84 }
85
86 if opts.Name != "" {
87 bopts.RepositoryDescription.Name = opts.Name
88 }
89 // We do not use this functionality to avoid pulling in the transitive deps of gitindex
90 /*
91 if opts.RepoURL != "" {
92 u, err := url.Parse(opts.RepoURL)
93 if err != nil {
94 return err
95 }
96 if err := gitindex.SetTemplatesFromOrigin(&bopts.RepositoryDescription, u); err != nil {
97 return err
98 }
99 }
100 */
101 bopts.SetDefaults()
102 bopts.RepositoryDescription.Branches = []zoekt.RepositoryBranch{{Name: opts.Branch, Version: opts.Commit}}
103 brs := []string{opts.Branch}
104
105 if opts.Incremental && bopts.IncrementalSkipIndexing() {
106 return nil
107 }
108
109 a, err := openArchive(opts.Archive)
110 if err != nil {
111 return err
112 }
113 defer a.Close()
114
115 bopts.RepositoryDescription.Source = opts.Archive
116 builder, err := build.NewBuilder(bopts)
117 if err != nil {
118 return err
119 }
120
121 add := func(f *File) error {
122 defer f.Close()
123
124 contents, err := io.ReadAll(f)
125 if err != nil {
126 return err
127 }
128
129 name := stripComponents(f.Name, opts.Strip)
130 if name == "" {
131 return nil
132 }
133
134 return builder.Add(zoekt.Document{
135 Name: name,
136 Content: contents,
137 Branches: brs,
138 })
139 }
140
141 for {
142 f, err := a.Next()
143 if err == io.EOF {
144 break
145 }
146 if err != nil {
147 return err
148 }
149
150 if err := add(f); err != nil {
151 return err
152 }
153 }
154
155 return builder.Finish()
156}
157
// stripComponents drops the first count slash-separated elements from path
// and returns what is left. A path that runs out of elements before count of
// them have been dropped yields the empty string. A count of zero or less
// returns path unchanged.
func stripComponents(path string, count int) string {
	remaining := path
	for dropped := 0; dropped < count; dropped++ {
		if remaining == "" {
			break
		}
		sep := strings.IndexByte(remaining, '/')
		if sep == -1 {
			// No separator left: the final element is consumed too.
			return ""
		}
		remaining = remaining[sep+1:]
	}
	return remaining
}
170
// isGitOID reports whether s has the shape of a git object ID: exactly 40
// hexadecimal digits, in either letter case.
//
// Note: a match does not prove the object exists in any repository, nor that
// s is not a ref name. Git allows 40-char hexadecimal strings to be
// references.
func isGitOID(s string) bool {
	const oidLen = 40
	if len(s) != oidLen {
		return false
	}
	for i := 0; i < len(s); i++ {
		switch c := s[i]; {
		case '0' <= c && c <= '9':
		case 'a' <= c && c <= 'f':
		case 'A' <= c && c <= 'F':
		default:
			return false
		}
	}
	return true
}