fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1package main 2 3import ( 4 "bytes" 5 "context" 6 "crypto/sha1" 7 "encoding/json" 8 "errors" 9 "fmt" 10 "io" 11 "log" 12 "net/url" 13 "os" 14 "os/exec" 15 "path/filepath" 16 "sort" 17 "strconv" 18 "strings" 19 "time" 20 21 "github.com/sourcegraph/zoekt" 22 "github.com/sourcegraph/zoekt/build" 23 "github.com/sourcegraph/zoekt/ctags" 24 25 sglog "github.com/sourcegraph/log" 26) 27 28// indexTimeout defines how long the indexserver waits before 29// killing an indexing job. 30const indexTimeout = 1*time.Hour + 30*time.Minute // an index should never take longer than an hour and a half 31 32// IndexOptions are the options that Sourcegraph can set via it's search 33// configuration endpoint. 34type IndexOptions struct { 35 // LargeFiles is a slice of glob patterns where matching file paths should 36 // be indexed regardless of their size. The pattern syntax can be found 37 // here: https://golang.org/pkg/path/filepath/#Match. 38 LargeFiles []string 39 40 // Symbols if true will make zoekt index the output of ctags. 41 Symbols bool 42 43 // Branches is a slice of branches to index. 44 Branches []zoekt.RepositoryBranch 45 46 // RepoID is the Sourcegraph Repository ID. 47 RepoID uint32 48 49 // Name is the Repository Name. 50 Name string 51 52 // CloneURL is the internal clone URL for Name. 53 CloneURL string 54 55 // Priority indicates ranking in results, higher first. 56 Priority float64 57 58 // DocumentRanksVersion when non-empty will lead to indexing using offline 59 // ranking. When the string changes this will also cause us to re-index with 60 // new ranks. 61 DocumentRanksVersion string 62 63 // Public is true if the repository is public. 64 Public bool 65 66 // Fork is true if the repository is a fork. 67 Fork bool 68 69 // Archived is true if the repository is archived. 70 Archived bool 71 72 // Map from language to scip-ctags, universal-ctags, or neither 73 LanguageMap ctags.LanguageMap 74} 75 76// indexArgs represents the arguments we pass to zoekt-git-index 77type indexArgs struct { 78 IndexOptions 79 80 // Incremental indicates to skip indexing if already indexed. 81 Incremental bool 82 83 // IndexDir is the index directory to store the shards. 84 IndexDir string 85 86 // Parallelism is the number of shards to compute in parallel. 87 Parallelism int 88 89 // FileLimit is the maximum size of a file 90 FileLimit int 91 92 // UseDelta is true if we want to use the new delta indexer. This should 93 // only be true for repositories we explicitly enable. 94 UseDelta bool 95 96 // DeltaShardNumberFallbackThreshold is an upper limit on the number of preexisting shards that can exist 97 // before attempting a delta build. 98 DeltaShardNumberFallbackThreshold uint64 99} 100 101// BuildOptions returns a build.Options represented by indexArgs. Note: it 102// doesn't set fields like repository/branch. 103func (o *indexArgs) BuildOptions() *build.Options { 104 return &build.Options{ 105 // It is important that this RepositoryDescription exactly matches what 106 // the indexer we call will produce. This is to ensure that 107 // IncrementalSkipIndexing and IndexState can correctly calculate if 108 // nothing needs to be done. 109 RepositoryDescription: zoekt.Repository{ 110 ID: uint32(o.IndexOptions.RepoID), 111 Name: o.Name, 112 Branches: o.Branches, 113 RawConfig: map[string]string{ 114 "repoid": strconv.Itoa(int(o.IndexOptions.RepoID)), 115 "priority": strconv.FormatFloat(o.Priority, 'g', -1, 64), 116 "public": marshalBool(o.Public), 117 "fork": marshalBool(o.Fork), 118 "archived": marshalBool(o.Archived), 119 }, 120 }, 121 IndexDir: o.IndexDir, 122 Parallelism: o.Parallelism, 123 SizeMax: o.FileLimit, 124 LargeFiles: o.LargeFiles, 125 CTagsMustSucceed: o.Symbols, 126 DisableCTags: !o.Symbols, 127 IsDelta: o.UseDelta, 128 129 DocumentRanksVersion: o.DocumentRanksVersion, 130 131 LanguageMap: o.LanguageMap, 132 } 133} 134 135func marshalBool(b bool) string { 136 if b { 137 return "1" 138 } 139 return "0" 140} 141 142func (o *indexArgs) String() string { 143 s := fmt.Sprintf("%d %s", o.RepoID, o.Name) 144 for i, b := range o.Branches { 145 if i == 0 { 146 s = fmt.Sprintf("%s@%s=%s", s, b.Name, b.Version) 147 } else { 148 s = fmt.Sprintf("%s,%s=%s", s, b.Name, b.Version) 149 } 150 } 151 return s 152} 153 154type gitIndexConfig struct { 155 // runCmd is the function that's used to execute all external commands (such as calls to "git" or "zoekt-git-index") 156 // that gitIndex may construct. 157 runCmd func(*exec.Cmd) error 158 159 // findRepositoryMetadata is the function that returns the repository metadata for the 160 // repository specified in args. 'ok' is false if the repository's metadata 161 // couldn't be found or if an error occurred. 162 // 163 // The primary purpose of this configuration option is to be able to provide a stub 164 // implementation for this in our test suite. All other callers should use build.Options.FindRepositoryMetadata(). 165 findRepositoryMetadata func(args *indexArgs) (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) 166} 167 168func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.Logger) error { 169 logger := l.Scoped("gitIndex") 170 171 if len(o.Branches) == 0 { 172 return errors.New("zoekt-git-index requires 1 or more branches") 173 } 174 175 if c.runCmd == nil { 176 return errors.New("runCmd in provided configuration was nil - a function must be provided") 177 } 178 runCmd := c.runCmd 179 180 if c.findRepositoryMetadata == nil { 181 return errors.New("findRepositoryMetadata in provided configuration was nil - a function must be provided") 182 } 183 184 buildOptions := o.BuildOptions() 185 186 ctx, cancel := context.WithTimeout(context.Background(), indexTimeout) 187 defer cancel() 188 189 gitDir, err := tmpGitDir(o.Name) 190 if err != nil { 191 return err 192 } 193 defer os.RemoveAll(gitDir) // best-effort cleanup 194 195 // Create a repo to fetch into 196 cmd := exec.CommandContext(ctx, "git", 197 // use a random default branch. This is so that HEAD isn't a symref to a 198 // branch that is indexed. For example if you are indexing 199 // HEAD,master. Then HEAD would be pointing to master by default. 200 "-c", "init.defaultBranch=nonExistentBranchBB0FOFCH32", 201 "init", 202 // we don't need a working copy 203 "--bare", 204 gitDir) 205 cmd.Stdin = &bytes.Buffer{} 206 if err := runCmd(cmd); err != nil { 207 return err 208 } 209 210 var fetchDuration time.Duration 211 successfullyFetchedCommitsCount := 0 212 allFetchesSucceeded := true 213 214 defer func() { 215 success := strconv.FormatBool(allFetchesSucceeded) 216 name := repoNameForMetric(o.Name) 217 metricFetchDuration.WithLabelValues(success, name).Observe(fetchDuration.Seconds()) 218 }() 219 220 var runFetch = func(branches []zoekt.RepositoryBranch) error { 221 // We shallow fetch each commit specified in zoekt.Branches. This requires 222 // the server to have configured both uploadpack.allowAnySHA1InWant and 223 // uploadpack.allowFilter. (See gitservice.go in the Sourcegraph repository) 224 fetchArgs := []string{ 225 "-C", gitDir, 226 "-c", "protocol.version=2", 227 "-c", "http.extraHeader=X-Sourcegraph-Actor-UID: internal", 228 "fetch", "--depth=1", o.CloneURL} 229 230 var commits []string 231 for _, b := range branches { 232 commits = append(commits, b.Version) 233 } 234 235 fetchArgs = append(fetchArgs, commits...) 236 237 cmd = exec.CommandContext(ctx, "git", fetchArgs...) 238 cmd.Stdin = &bytes.Buffer{} 239 240 start := time.Now() 241 err := runCmd(cmd) 242 fetchDuration += time.Since(start) 243 244 if err != nil { 245 allFetchesSucceeded = false 246 var bs []string 247 for _, b := range branches { 248 bs = append(bs, b.String()) 249 } 250 251 formattedBranches := strings.Join(bs, ", ") 252 return fmt.Errorf("fetching %s: %w", formattedBranches, err) 253 } 254 255 successfullyFetchedCommitsCount += len(commits) 256 return nil 257 } 258 259 fetchPriorAndLatestCommits := func() error { 260 prior, err := priorBranches(c, o) 261 if err != nil { 262 return err 263 } 264 265 var allBranches []zoekt.RepositoryBranch 266 allBranches = append(allBranches, o.Branches...) 267 allBranches = append(allBranches, prior...) 268 269 return runFetch(allBranches) 270 } 271 272 fetchOnlyLatestCommits := func() error { 273 return runFetch(o.Branches) 274 } 275 276 if o.UseDelta { 277 err := fetchPriorAndLatestCommits() 278 if err != nil { 279 name := buildOptions.RepositoryDescription.Name 280 id := buildOptions.RepositoryDescription.ID 281 282 log.Printf("delta build: failed to prepare delta build for %q (ID %d): failed to fetch both latest and prior commits: %s", name, id, err) 283 err = fetchOnlyLatestCommits() 284 if err != nil { 285 return err 286 } 287 } 288 } else { 289 err := fetchOnlyLatestCommits() 290 if err != nil { 291 return err 292 } 293 } 294 295 logger.Debug("successfully fetched git data", 296 sglog.String("repo", o.Name), 297 sglog.Uint32("id", o.RepoID), 298 sglog.Int("commits_count", successfullyFetchedCommitsCount), 299 sglog.Duration("duration", fetchDuration), 300 ) 301 302 // We then create the relevant refs for each fetched commit. 303 for _, b := range o.Branches { 304 ref := b.Name 305 if ref != "HEAD" { 306 ref = "refs/heads/" + ref 307 } 308 cmd = exec.CommandContext(ctx, "git", "-C", gitDir, "update-ref", ref, b.Version) 309 cmd.Stdin = &bytes.Buffer{} 310 if err := runCmd(cmd); err != nil { 311 return fmt.Errorf("failed update-ref %s to %s: %w", ref, b.Version, err) 312 } 313 } 314 315 // create git configuration with options 316 type configKV struct{ Key, Value string } 317 config := []configKV{{ 318 // zoekt.name is used by zoekt-git-index to set the repository name. 319 Key: "name", 320 Value: o.Name, 321 }} 322 for k, v := range buildOptions.RepositoryDescription.RawConfig { 323 config = append(config, configKV{Key: k, Value: v}) 324 } 325 sort.Slice(config, func(i, j int) bool { 326 return config[i].Key < config[j].Key 327 }) 328 329 // write git configuration to repo 330 for _, kv := range config { 331 cmd = exec.CommandContext(ctx, "git", "-C", gitDir, "config", "zoekt."+kv.Key, kv.Value) 332 cmd.Stdin = &bytes.Buffer{} 333 if err := runCmd(cmd); err != nil { 334 return err 335 } 336 } 337 338 args := []string{ 339 "-submodules=false", 340 } 341 342 if o.DocumentRanksVersion != "" { 343 // We store the document ranks as JSON in gitDir and tell zoekt-git-index where 344 // to find the file. 345 documentsRankFile := filepath.Join(gitDir, "documents.rank") 346 347 saveDocumentRanks := func() error { 348 r, err := sourcegraph.GetDocumentRanks(context.Background(), o.Name) 349 if err != nil { 350 return fmt.Errorf("GetDocumentRanks: %w", err) 351 } 352 353 b, err := json.Marshal(r) 354 if err != nil { 355 return err 356 } 357 358 if err := os.WriteFile(documentsRankFile, b, 0600); err != nil { 359 return fmt.Errorf("failed to write %s to disk: %w", documentsRankFile, err) 360 } 361 362 return nil 363 } 364 365 if err := saveDocumentRanks(); err != nil { 366 // log and fall back to online ranking 367 logger.Warn( 368 "error saving document ranks. Falling back to online ranking", 369 sglog.Error(err), 370 sglog.String("repo", o.Name), 371 sglog.Uint32("id", o.RepoID), 372 ) 373 } else { 374 args = append(args, 375 "-offline_ranking", documentsRankFile, 376 "-offline_ranking_version", o.DocumentRanksVersion) 377 } 378 } 379 380 // Even though we check for incremental in this process, we still pass it 381 // in just in case we regress in how we check in process. We will still 382 // notice thanks to metrics and increased load on gitserver. 383 if o.Incremental { 384 args = append(args, "-incremental") 385 } 386 387 var branches []string 388 for _, b := range o.Branches { 389 branches = append(branches, b.Name) 390 } 391 args = append(args, "-branches", strings.Join(branches, ",")) 392 393 if o.UseDelta { 394 args = append(args, "-delta") 395 args = append(args, "-delta_threshold", strconv.FormatUint(o.DeltaShardNumberFallbackThreshold, 10)) 396 } 397 398 if len(o.LanguageMap) > 0 { 399 var languageMap []string 400 for language, parser := range o.LanguageMap { 401 languageMap = append(languageMap, language+":"+ctags.ParserToString(parser)) 402 } 403 args = append(args, "-language_map", strings.Join(languageMap, ",")) 404 } 405 406 args = append(args, buildOptions.Args()...) 407 args = append(args, gitDir) 408 409 cmd = exec.CommandContext(ctx, "zoekt-git-index", args...) 410 cmd.Stdin = &bytes.Buffer{} 411 if err := runCmd(cmd); err != nil { 412 return err 413 } 414 415 return nil 416} 417 418func priorBranches(c gitIndexConfig, o *indexArgs) ([]zoekt.RepositoryBranch, error) { 419 existingRepository, _, found, err := c.findRepositoryMetadata(o) 420 if err != nil { 421 return nil, fmt.Errorf("loading repository metadata: %w", err) 422 } 423 424 if !found || len(existingRepository.Branches) == 0 { 425 return nil, fmt.Errorf("no prior shards found") 426 } 427 428 return existingRepository.Branches, nil 429} 430 431func tmpGitDir(name string) (string, error) { 432 abs := url.QueryEscape(name) 433 if len(abs) > 200 { 434 h := sha1.New() 435 _, _ = io.WriteString(h, abs) 436 abs = abs[:200] + fmt.Sprintf("%x", h.Sum(nil))[:8] 437 } 438 dir := filepath.Join(os.TempDir(), abs+".git") 439 if _, err := os.Stat(dir); err == nil { 440 if err := os.RemoveAll(dir); err != nil { 441 return "", err 442 } 443 } 444 return dir, nil 445}