fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package main
16
17import (
18 "bytes"
19 "fmt"
20 "io"
21 "log"
22 "net/http"
23 "net/url"
24 "strings"
25
26 "github.com/grafana/regexp"
27)
28
// cgitRepoEntryRE matches a single repository entry on a CGit index page.
// Submatch 1 is the anchor's title attribute and submatch 2 is its href.
//
// Scraping HTML with a regexp is fragile, but CGit does not appear to offer
// a JSON interface to use instead.
var cgitRepoEntryRE = regexp.MustCompile(`class='sublevel-repo'><a title='([^'"]*)' href='([^']*)'>`)
33
// normalizedGet issues an HTTP GET for u and returns the response body with
// every newline byte replaced by a single space.
//
// Any response status other than 200 is reported as an error, as are
// failures to perform the request or to read the body.
//
// NOTE(review): the newline flattening is presumably there so that the
// package's regexps can match markup that is wrapped across lines — confirm.
func normalizedGet(u *url.URL) ([]byte, error) {
	resp, err := http.Get(u.String())
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("status %s", resp.Status)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	return bytes.ReplaceAll(body, []byte("\n"), []byte(" ")), nil
}
52
53// getCGitRepos finds repo names from the CGit index page hosted at
54// URL `u`.
55func getCGitRepos(u *url.URL, filter func(string) bool) (map[string]*crawlTarget, error) {
56 c, err := normalizedGet(u)
57 if err != nil {
58 return nil, err
59 }
60
61 pages := map[string]*crawlTarget{}
62 for _, m := range cgitRepoEntryRE.FindAllSubmatch(c, -1) {
63 nm := strings.TrimSuffix(string(m[1]), ".git")
64
65 if !filter(nm) {
66 continue
67 }
68
69 relUrl := string(m[2])
70
71 u, err := u.Parse(relUrl)
72 if err != nil {
73 log.Printf("ignoring u.Parse(%q): %v", relUrl, err)
74 continue
75 }
76 pages[nm] = &crawlTarget{
77 webURL: u.String(),
78 webURLType: "cgit",
79 }
80 }
81
82 // TODO - parallel?
83 for _, target := range pages {
84 u, _ := url.Parse(target.webURL)
85 c, err := cgitCloneURL(u)
86 if err != nil {
87 log.Printf("ignoring cgitCloneURL(%s): %v", u, c)
88 continue
89 }
90
91 target.cloneURL = c.String()
92 }
93 return pages, nil
94}
95
// cloneURLRe matches a rel=vcs-git link on a CGit repository page and
// captures its href as submatch 1.
//
// Callers take the first URL matched. This may select the git:// URL (which
// is insecure), but individual machines (such as git.savannah.gnu) probably
// would rather receive git:// traffic, which is more efficient.
//
// TODO - do something like `Clone.*<a.*href=` to get the first
// URL. Older versions don't say vcs-git.
var cloneURLRe = regexp.MustCompile(`rel=["']vcs-git["'] *href=["']([^"']*)["']`)
105
106func cgitCloneURL(u *url.URL) (*url.URL, error) {
107 c, err := normalizedGet(u)
108 if err != nil {
109 return nil, err
110 }
111
112 m := cloneURLRe.FindSubmatch(c)
113 cl, err := url.Parse(string(m[1]))
114 if err != nil {
115 return nil, err
116 }
117
118 return cl, nil
119}