fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

zoekt-archive-index: split out ranking tests and archive indexing (#712)

We had ranking e2e tests living in the zoekt-archive-index cmd for
convenience since that contained useful functions for indexing a remote
tarball from the GitHub API. This commit splits the archive
functionality into a new internal/archive package and the ranking tests
into a new internal/e2e package.

The zoekt-archive-index code is now quite minimal. This is similar to
how zoekt-git-index mostly just calls out to the gitindex package. What
is different is that the archive package is marked internal, unlike
gitindex. gitindex should also be internal, but the code predates Go's
support for internal packages.

I suspect more of our e2e tests will end up in this package.

Test Plan: go test ./...

+224 -193
+4 -3
cmd/zoekt-archive-index/archive.go internal/archive/archive.go
··· 1 - package main 1 + package archive 2 2 3 3 import ( 4 4 "archive/tar" ··· 126 126 return ct, io.MultiReader(bytes.NewReader(buf[:n]), r), nil 127 127 } 128 128 129 - func openReader(u string) (io.ReadCloser, error) { 129 + // OpenReader returns a reader for the archive at the URL u. 130 + func OpenReader(u string) (io.ReadCloser, error) { 130 131 if strings.HasPrefix(u, "https://") || strings.HasPrefix(u, "http://") { 131 132 resp, err := http.Get(u) 132 133 if err != nil { ··· 155 156 // openArchive opens the tar at the URL or filepath u. Also supported is tgz 156 157 // files over http. 157 158 func openArchive(u string) (ar Archive, err error) { 158 - readCloser, err := openReader(u) 159 + readCloser, err := OpenReader(u) 159 160 if err != nil { 160 161 return nil, err 161 162 }
+5 -4
cmd/zoekt-archive-index/e2e_rank_test.go internal/e2e/e2e_rank_test.go
··· 1 - package main 1 + package e2e 2 2 3 3 import ( 4 4 "bytes" ··· 17 17 "github.com/google/go-cmp/cmp" 18 18 "github.com/sourcegraph/zoekt" 19 19 "github.com/sourcegraph/zoekt/build" 20 + "github.com/sourcegraph/zoekt/internal/archive" 20 21 "github.com/sourcegraph/zoekt/query" 21 22 "github.com/sourcegraph/zoekt/shards" 22 23 ) ··· 135 136 return err 136 137 } 137 138 138 - opts := Options{ 139 + opts := archive.Options{ 139 140 Archive: u, 140 141 } 141 142 opts.SetDefaults() // sets metadata like Name and the codeload URL ··· 158 159 // languageMap[lang] = ctags.ScipCTags 159 160 // } 160 161 161 - err := do(opts, build.Options{ 162 + err := archive.Index(opts, build.Options{ 162 163 IndexDir: indexDir, 163 164 CTagsMustSucceed: true, 164 165 }) ··· 172 173 func download(url, dst string) error { 173 174 tmpPath := dst + ".part" 174 175 175 - rc, err := openReader(url) 176 + rc, err := archive.OpenReader(url) 176 177 if err != nil { 177 178 return err 178 179 }
+2 -2
cmd/zoekt-archive-index/e2e_test.go internal/archive/e2e_test.go
··· 1 - package main 1 + package archive 2 2 3 3 import ( 4 4 "archive/tar" ··· 163 163 Strip: 0, 164 164 } 165 165 166 - if err := do(opts, bopts); err != nil { 166 + if err := Index(opts, bopts); err != nil { 167 167 t.Fatalf("error creating index: %v", err) 168 168 } 169 169
+7 -184
cmd/zoekt-archive-index/main.go
··· 8 8 package main 9 9 10 10 import ( 11 - "errors" 12 11 "flag" 13 - "fmt" 14 - "io" 15 12 "log" 16 - "net/url" 17 - "strings" 18 13 19 - "github.com/sourcegraph/zoekt" 20 - "github.com/sourcegraph/zoekt/build" 21 - "github.com/sourcegraph/zoekt/cmd" 22 14 "go.uber.org/automaxprocs/maxprocs" 23 - ) 24 15 25 - // stripComponents removes the specified number of leading path 26 - // elements. Pathnames with fewer elements will return the empty string. 27 - func stripComponents(path string, count int) string { 28 - for i := 0; path != "" && i < count; i++ { 29 - i := strings.Index(path, "/") 30 - if i < 0 { 31 - return "" 32 - } 33 - path = path[i+1:] 34 - } 35 - return path 36 - } 37 - 38 - // isGitOID checks if the revision is a git OID SHA string. 39 - // 40 - // Note: This doesn't mean the SHA exists in a repository, nor does it mean it 41 - // isn't a ref. Git allows 40-char hexadecimal strings to be references. 42 - func isGitOID(s string) bool { 43 - if len(s) != 40 { 44 - return false 45 - } 46 - for _, r := range s { 47 - if !(('0' <= r && r <= '9') || 48 - ('a' <= r && r <= 'f') || 49 - ('A' <= r && r <= 'F')) { 50 - return false 51 - } 52 - } 53 - return true 54 - } 55 - 56 - type Options struct { 57 - Incremental bool 58 - 59 - Archive string 60 - Name string 61 - RepoURL string 62 - Branch string 63 - Commit string 64 - Strip int 65 - } 66 - 67 - func (o *Options) SetDefaults() { 68 - // We guess based on the archive URL. 
69 - u, _ := url.Parse(o.Archive) 70 - if u == nil { 71 - return 72 - } 73 - 74 - setRef := func(ref string) { 75 - if isGitOID(ref) && o.Commit == "" { 76 - o.Commit = ref 77 - } 78 - if !isGitOID(ref) && o.Branch == "" { 79 - o.Branch = ref 80 - } 81 - } 82 - 83 - switch u.Host { 84 - case "github.com", "codeload.github.com": 85 - // https://github.com/octokit/octokit.rb/commit/3d21ec53a331a6f037a91c368710b99387d012c1 86 - // https://github.com/octokit/octokit.rb/blob/master/README.md 87 - // https://github.com/octokit/octokit.rb/tree/master/lib 88 - // https://codeload.github.com/octokit/octokit.rb/legacy.tar.gz/master 89 - parts := strings.Split(u.Path, "/") 90 - if len(parts) > 2 && o.Name == "" { 91 - o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2]) 92 - o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2]) 93 - } 94 - if len(parts) > 4 { 95 - setRef(parts[4]) 96 - if u.Host == "github.com" { 97 - o.Archive = fmt.Sprintf("https://codeload.github.com/%s/%s/legacy.tar.gz/%s", parts[1], parts[2], parts[4]) 98 - } 99 - } 100 - o.Strip = 1 101 - case "api.github.com": 102 - // https://api.github.com/repos/octokit/octokit.rb/tarball/master 103 - parts := strings.Split(u.Path, "/") 104 - if len(parts) > 2 && o.Name == "" { 105 - o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2]) 106 - o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2]) 107 - } 108 - if len(parts) > 5 { 109 - setRef(parts[5]) 110 - } 111 - o.Strip = 1 112 - } 113 - } 114 - 115 - func do(opts Options, bopts build.Options) error { 116 - opts.SetDefaults() 117 - 118 - if opts.Name == "" && opts.RepoURL == "" { 119 - return errors.New("-name or -url required") 120 - } 121 - if opts.Branch == "" { 122 - return errors.New("-branch required") 123 - } 124 - 125 - if opts.Name != "" { 126 - bopts.RepositoryDescription.Name = opts.Name 127 - } 128 - // We do not use this functionality to avoid pulling in the transitive deps of gitindex 129 - /* 130 - 
if opts.RepoURL != "" { 131 - u, err := url.Parse(opts.RepoURL) 132 - if err != nil { 133 - return err 134 - } 135 - if err := gitindex.SetTemplatesFromOrigin(&bopts.RepositoryDescription, u); err != nil { 136 - return err 137 - } 138 - } 139 - */ 140 - bopts.SetDefaults() 141 - bopts.RepositoryDescription.Branches = []zoekt.RepositoryBranch{{Name: opts.Branch, Version: opts.Commit}} 142 - brs := []string{opts.Branch} 143 - 144 - if opts.Incremental && bopts.IncrementalSkipIndexing() { 145 - return nil 146 - } 147 - 148 - a, err := openArchive(opts.Archive) 149 - if err != nil { 150 - return err 151 - } 152 - defer a.Close() 153 - 154 - bopts.RepositoryDescription.Source = opts.Archive 155 - builder, err := build.NewBuilder(bopts) 156 - if err != nil { 157 - return err 158 - } 159 - 160 - add := func(f *File) error { 161 - defer f.Close() 162 - 163 - contents, err := io.ReadAll(f) 164 - if err != nil { 165 - return err 166 - } 167 - 168 - name := stripComponents(f.Name, opts.Strip) 169 - if name == "" { 170 - return nil 171 - } 172 - 173 - return builder.Add(zoekt.Document{ 174 - Name: name, 175 - Content: contents, 176 - Branches: brs, 177 - }) 178 - } 179 - 180 - for { 181 - f, err := a.Next() 182 - if err == io.EOF { 183 - break 184 - } 185 - if err != nil { 186 - return err 187 - } 188 - 189 - if err := add(f); err != nil { 190 - return err 191 - } 192 - } 193 - 194 - return builder.Finish() 195 - } 16 + "github.com/sourcegraph/zoekt/cmd" 17 + "github.com/sourcegraph/zoekt/internal/archive" 18 + ) 196 19 197 20 func main() { 198 21 var ( ··· 216 39 if len(flag.Args()) != 1 { 217 40 log.Fatal("expected argument for archive location") 218 41 } 219 - archive := flag.Args()[0] 42 + archiveURL := flag.Args()[0] 220 43 bopts := cmd.OptionsFromFlags() 221 - opts := Options{ 44 + opts := archive.Options{ 222 45 Incremental: *incremental, 223 46 224 - Archive: archive, 47 + Archive: archiveURL, 225 48 Name: *name, 226 49 RepoURL: *urlRaw, 227 50 Branch: *branch, ··· 232 
55 // Sourcegraph specific: Limit HTTP traffic 233 56 limitHTTPDefaultClient(*downloadLimitMbps) 234 57 235 - if err := do(opts, *bopts); err != nil { 58 + if err := archive.Index(opts, *bopts); err != nil { 236 59 log.Fatal(err) 237 60 } 238 61 }
cmd/zoekt-archive-index/testdata/Get_databaseuser.txt internal/e2e/testdata/Get_databaseuser.txt
cmd/zoekt-archive-index/testdata/InternalDoer.txt internal/e2e/testdata/InternalDoer.txt
cmd/zoekt-archive-index/testdata/Repository_metadata_Write_rbac.txt internal/e2e/testdata/Repository_metadata_Write_rbac.txt
cmd/zoekt-archive-index/testdata/bufio_buffer.txt internal/e2e/testdata/bufio_buffer.txt
cmd/zoekt-archive-index/testdata/bytes_buffer.txt internal/e2e/testdata/bytes_buffer.txt
cmd/zoekt-archive-index/testdata/generate_unit_test.txt internal/e2e/testdata/generate_unit_test.txt
cmd/zoekt-archive-index/testdata/graphql_type_User.txt internal/e2e/testdata/graphql_type_User.txt
cmd/zoekt-archive-index/testdata/r_cody_sourcegraph_url.txt internal/e2e/testdata/r_cody_sourcegraph_url.txt
cmd/zoekt-archive-index/testdata/test_server.txt internal/e2e/testdata/test_server.txt
+187
internal/archive/index.go
··· 1 + // package archive provides indexing of archives from remote URLs. 2 + package archive 3 + 4 + import ( 5 + "errors" 6 + "fmt" 7 + "io" 8 + "net/url" 9 + "strings" 10 + 11 + "github.com/sourcegraph/zoekt" 12 + "github.com/sourcegraph/zoekt/build" 13 + ) 14 + 15 + // Options specify the archive specific indexing options. 16 + type Options struct { 17 + Incremental bool 18 + 19 + Archive string 20 + Name string 21 + RepoURL string 22 + Branch string 23 + Commit string 24 + Strip int 25 + } 26 + 27 + func (o *Options) SetDefaults() { 28 + // We guess based on the archive URL. 29 + u, _ := url.Parse(o.Archive) 30 + if u == nil { 31 + return 32 + } 33 + 34 + setRef := func(ref string) { 35 + if isGitOID(ref) && o.Commit == "" { 36 + o.Commit = ref 37 + } 38 + if !isGitOID(ref) && o.Branch == "" { 39 + o.Branch = ref 40 + } 41 + } 42 + 43 + switch u.Host { 44 + case "github.com", "codeload.github.com": 45 + // https://github.com/octokit/octokit.rb/commit/3d21ec53a331a6f037a91c368710b99387d012c1 46 + // https://github.com/octokit/octokit.rb/blob/master/README.md 47 + // https://github.com/octokit/octokit.rb/tree/master/lib 48 + // https://codeload.github.com/octokit/octokit.rb/legacy.tar.gz/master 49 + parts := strings.Split(u.Path, "/") 50 + if len(parts) > 2 && o.Name == "" { 51 + o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2]) 52 + o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2]) 53 + } 54 + if len(parts) > 4 { 55 + setRef(parts[4]) 56 + if u.Host == "github.com" { 57 + o.Archive = fmt.Sprintf("https://codeload.github.com/%s/%s/legacy.tar.gz/%s", parts[1], parts[2], parts[4]) 58 + } 59 + } 60 + o.Strip = 1 61 + case "api.github.com": 62 + // https://api.github.com/repos/octokit/octokit.rb/tarball/master 63 + parts := strings.Split(u.Path, "/") 64 + if len(parts) > 2 && o.Name == "" { 65 + o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2]) 66 + o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2]) 
67 + } 68 + if len(parts) > 5 { 69 + setRef(parts[5]) 70 + } 71 + o.Strip = 1 72 + } 73 + } 74 + 75 + // Index archive specified in opts using bopts. 76 + func Index(opts Options, bopts build.Options) error { 77 + opts.SetDefaults() 78 + 79 + if opts.Name == "" && opts.RepoURL == "" { 80 + return errors.New("-name or -url required") 81 + } 82 + if opts.Branch == "" { 83 + return errors.New("-branch required") 84 + } 85 + 86 + if opts.Name != "" { 87 + bopts.RepositoryDescription.Name = opts.Name 88 + } 89 + // We do not use this functionality to avoid pulling in the transitive deps of gitindex 90 + /* 91 + if opts.RepoURL != "" { 92 + u, err := url.Parse(opts.RepoURL) 93 + if err != nil { 94 + return err 95 + } 96 + if err := gitindex.SetTemplatesFromOrigin(&bopts.RepositoryDescription, u); err != nil { 97 + return err 98 + } 99 + } 100 + */ 101 + bopts.SetDefaults() 102 + bopts.RepositoryDescription.Branches = []zoekt.RepositoryBranch{{Name: opts.Branch, Version: opts.Commit}} 103 + brs := []string{opts.Branch} 104 + 105 + if opts.Incremental && bopts.IncrementalSkipIndexing() { 106 + return nil 107 + } 108 + 109 + a, err := openArchive(opts.Archive) 110 + if err != nil { 111 + return err 112 + } 113 + defer a.Close() 114 + 115 + bopts.RepositoryDescription.Source = opts.Archive 116 + builder, err := build.NewBuilder(bopts) 117 + if err != nil { 118 + return err 119 + } 120 + 121 + add := func(f *File) error { 122 + defer f.Close() 123 + 124 + contents, err := io.ReadAll(f) 125 + if err != nil { 126 + return err 127 + } 128 + 129 + name := stripComponents(f.Name, opts.Strip) 130 + if name == "" { 131 + return nil 132 + } 133 + 134 + return builder.Add(zoekt.Document{ 135 + Name: name, 136 + Content: contents, 137 + Branches: brs, 138 + }) 139 + } 140 + 141 + for { 142 + f, err := a.Next() 143 + if err == io.EOF { 144 + break 145 + } 146 + if err != nil { 147 + return err 148 + } 149 + 150 + if err := add(f); err != nil { 151 + return err 152 + } 153 + } 154 + 155 
+ return builder.Finish() 156 + } 157 + 158 + // stripComponents removes the specified number of leading path 159 + // elements. Pathnames with fewer elements will return the empty string. 160 + func stripComponents(path string, count int) string { 161 + for i := 0; path != "" && i < count; i++ { 162 + i := strings.Index(path, "/") 163 + if i < 0 { 164 + return "" 165 + } 166 + path = path[i+1:] 167 + } 168 + return path 169 + } 170 + 171 + // isGitOID checks if the revision is a git OID SHA string. 172 + // 173 + // Note: This doesn't mean the SHA exists in a repository, nor does it mean it 174 + // isn't a ref. Git allows 40-char hexadecimal strings to be references. 175 + func isGitOID(s string) bool { 176 + if len(s) != 40 { 177 + return false 178 + } 179 + for _, r := range s { 180 + if !(('0' <= r && r <= '9') || 181 + ('a' <= r && r <= 'f') || 182 + ('A' <= r && r <= 'F')) { 183 + return false 184 + } 185 + } 186 + return true 187 + }
+2
internal/e2e/doc.go
··· 1 + // package e2e contains end to end tests 2 + package e2e
+17
internal/e2e/e2e_test.go
··· 1 + package e2e 2 + 3 + import ( 4 + "flag" 5 + "io" 6 + "log" 7 + "os" 8 + "testing" 9 + ) 10 + 11 + func TestMain(m *testing.M) { 12 + flag.Parse() 13 + if !testing.Verbose() { 14 + log.SetOutput(io.Discard) 15 + } 16 + os.Exit(m.Run()) 17 + }