fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 2.9 kB View raw
1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package main 16 17import ( 18 "bytes" 19 "fmt" 20 "io" 21 "log" 22 "net/http" 23 "net/url" 24 "strings" 25 26 "github.com/grafana/regexp" 27) 28 29// I will go to programmer hell for trying to parse HTML with 30// regexps. Why doesn't CGit have a JSON interface? 31var cgitRepoEntryRE = regexp.MustCompile( 32 `class='sublevel-repo'><a title='([^'"]*)' href='([^']*)'>`) 33 34func normalizedGet(u *url.URL) ([]byte, error) { 35 rep, err := http.Get(u.String()) 36 if err != nil { 37 return nil, err 38 } 39 defer rep.Body.Close() 40 if rep.StatusCode != 200 { 41 return nil, fmt.Errorf("status %s", rep.Status) 42 } 43 44 c, err := io.ReadAll(rep.Body) 45 if err != nil { 46 return nil, err 47 } 48 49 c = bytes.Replace(c, []byte{'\n'}, []byte{' '}, -1) 50 return c, nil 51} 52 53// getCGitRepos finds repo names from the CGit index page hosted at 54// URL `u`. 55func getCGitRepos(u *url.URL, filter func(string) bool) (map[string]*crawlTarget, error) { 56 c, err := normalizedGet(u) 57 if err != nil { 58 return nil, err 59 } 60 61 pages := map[string]*crawlTarget{} 62 for _, m := range cgitRepoEntryRE.FindAllSubmatch(c, -1) { 63 nm := strings.TrimSuffix(string(m[1]), ".git") 64 65 if !filter(nm) { 66 continue 67 } 68 69 relUrl := string(m[2]) 70 71 u, err := u.Parse(relUrl) 72 if err != nil { 73 log.Printf("ignoring u.Parse(%q): %v", relUrl, err) 74 continue 75 } 76 pages[nm] = &crawlTarget{ 77 webURL: u.String(), 78 webURLType: "cgit", 79 } 80 } 81 82 // TODO - parallel? 83 for _, target := range pages { 84 u, _ := url.Parse(target.webURL) 85 c, err := cgitCloneURL(u) 86 if err != nil { 87 log.Printf("ignoring cgitCloneURL(%s): %v", u, c) 88 continue 89 } 90 91 target.cloneURL = c.String() 92 } 93 return pages, nil 94} 95 96// We'll take the first URL we get. This may put the git:// URL (which 97// is insecure) at the top, but individual machines (such as 98// git.savannah.gnu) probably would rather receive git:// traffic 99// which is more efficient. 100 101// TODO - do something like `Clone.*<a.*href=` to get the first 102// URL. Older versions don't say vcs-git. 103var cloneURLRe = regexp.MustCompile( 104 `rel=["']vcs-git["'] *href=["']([^"']*)["']`) 105 106func cgitCloneURL(u *url.URL) (*url.URL, error) { 107 c, err := normalizedGet(u) 108 if err != nil { 109 return nil, err 110 } 111 112 m := cloneURLRe.FindSubmatch(c) 113 cl, err := url.Parse(string(m[1])) 114 if err != nil { 115 return nil, err 116 } 117 118 return cl, nil 119}