cmd/zoekt-mirror-gitiles/cgit.go at 22155f1aa1e35c6be9edcd65670e856c17b3ddd7 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / cmd / zoekt-mirror-gitiles / cgit.go
at 22155f1aa1e35c6be9edcd65670e856c17b3ddd7 2.9 kB View raw
Maxim Pimenov all: stop using the deprecated package io/ioutil (#360) 4y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package main
 16
 17import (
 18	"bytes"
 19	"fmt"
 20	"io"
 21	"log"
 22	"net/http"
 23	"net/url"
 24	"strings"
 25
 26	"github.com/grafana/regexp"
 27)
 28
 29// I will go to programmer hell for trying to parse HTML with
 30// regexps. Why doesn't CGit have a JSON interface?
 31var cgitRepoEntryRE = regexp.MustCompile(
 32	`class='sublevel-repo'><a title='([^'"]*)' href='([^']*)'>`)
 33
 34func normalizedGet(u *url.URL) ([]byte, error) {
 35	rep, err := http.Get(u.String())
 36	if err != nil {
 37		return nil, err
 38	}
 39	defer rep.Body.Close()
 40	if rep.StatusCode != 200 {
 41		return nil, fmt.Errorf("status %s", rep.Status)
 42	}
 43
 44	c, err := io.ReadAll(rep.Body)
 45	if err != nil {
 46		return nil, err
 47	}
 48
 49	c = bytes.Replace(c, []byte{'\n'}, []byte{' '}, -1)
 50	return c, nil
 51}
 52
 53// getCGitRepos finds repo names from the CGit index page hosted at
 54// URL `u`.
 55func getCGitRepos(u *url.URL, filter func(string) bool) (map[string]*crawlTarget, error) {
 56	c, err := normalizedGet(u)
 57	if err != nil {
 58		return nil, err
 59	}
 60
 61	pages := map[string]*crawlTarget{}
 62	for _, m := range cgitRepoEntryRE.FindAllSubmatch(c, -1) {
 63		nm := strings.TrimSuffix(string(m[1]), ".git")
 64
 65		if !filter(nm) {
 66			continue
 67		}
 68
 69		relUrl := string(m[2])
 70
 71		u, err := u.Parse(relUrl)
 72		if err != nil {
 73			log.Printf("ignoring u.Parse(%q): %v", relUrl, err)
 74			continue
 75		}
 76		pages[nm] = &crawlTarget{
 77			webURL:     u.String(),
 78			webURLType: "cgit",
 79		}
 80	}
 81
 82	// TODO - parallel?
 83	for _, target := range pages {
 84		u, _ := url.Parse(target.webURL)
 85		c, err := cgitCloneURL(u)
 86		if err != nil {
 87			log.Printf("ignoring cgitCloneURL(%s): %v", u, c)
 88			continue
 89		}
 90
 91		target.cloneURL = c.String()
 92	}
 93	return pages, nil
 94}
 95
 96// We'll take the first URL we get. This may put the git:// URL (which
 97// is insecure) at the top, but individual machines (such as
 98// git.savannah.gnu) probably would rather receive git:// traffic
 99// which is more efficient.
100
101// TODO - do something like `Clone.*<a.*href=` to get the first
102// URL. Older versions don't say vcs-git.
103var cloneURLRe = regexp.MustCompile(
104	`rel=["']vcs-git["'] *href=["']([^"']*)["']`)
105
106func cgitCloneURL(u *url.URL) (*url.URL, error) {
107	c, err := normalizedGet(u)
108	if err != nil {
109		return nil, err
110	}
111
112	m := cloneURLRe.FindSubmatch(c)
113	cl, err := url.Parse(string(m[1]))
114	if err != nil {
115		return nil, err
116	}
117
118	return cl, nil
119}
Configure Feed

Configure Feed