fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1package main 2 3import ( 4 "bytes" 5 "context" 6 "crypto/sha1" 7 "errors" 8 "fmt" 9 "io" 10 "log" 11 "net/url" 12 "os" 13 "os/exec" 14 "path/filepath" 15 "sort" 16 "strconv" 17 "strings" 18 "time" 19 20 sglog "github.com/sourcegraph/log" 21 22 "github.com/sourcegraph/zoekt" 23 "github.com/sourcegraph/zoekt/build" 24 "github.com/sourcegraph/zoekt/ctags" 25 "github.com/sourcegraph/zoekt/internal/tenant" 26) 27 28const defaultIndexingTimeout = 1*time.Hour + 30*time.Minute 29 30// IndexOptions are the options that Sourcegraph can set via it's search 31// configuration endpoint. 32type IndexOptions struct { 33 // LargeFiles is a slice of glob patterns where matching file paths should 34 // be indexed regardless of their size. The pattern syntax can be found 35 // here: https://golang.org/pkg/path/filepath/#Match. 36 LargeFiles []string 37 38 // Symbols if true will make zoekt index the output of ctags. 39 Symbols bool 40 41 // Branches is a slice of branches to index. 42 Branches []zoekt.RepositoryBranch 43 44 // RepoID is the Sourcegraph Repository ID. 45 RepoID uint32 46 47 // Name is the Repository Name. 48 Name string 49 50 // CloneURL is the internal clone URL for Name. 51 CloneURL string 52 53 // Priority indicates ranking in results, higher first. 54 Priority float64 55 56 // Public is true if the repository is public. 57 Public bool 58 59 // Fork is true if the repository is a fork. 60 Fork bool 61 62 // Archived is true if the repository is archived. 63 Archived bool 64 65 // Map from language to scip-ctags, universal-ctags, or neither 66 LanguageMap ctags.LanguageMap 67 68 // The number of threads to use for indexing shards. Defaults to the number of available 69 // CPUs. If the server flag -cpu_fraction is set, then this value overrides it. 70 ShardConcurrency int32 71 72 // TenantID is the tenant ID for the repository. 73 TenantID int 74} 75 76// indexArgs represents the arguments we pass to zoekt-git-index 77type indexArgs struct { 78 IndexOptions 79 80 // Incremental indicates to skip indexing if already indexed. 81 Incremental bool 82 83 // IndexDir is the index directory to store the shards. 84 IndexDir string 85 86 // Parallelism is the number of shards to compute in parallel. 87 Parallelism int 88 89 // FileLimit is the maximum size of a file 90 FileLimit int 91 92 // UseDelta is true if we want to use the new delta indexer. This should 93 // only be true for repositories we explicitly enable. 94 UseDelta bool 95 96 // DeltaShardNumberFallbackThreshold is an upper limit on the number of preexisting shards that can exist 97 // before attempting a delta build. 98 DeltaShardNumberFallbackThreshold uint64 99 100 // ShardMerging is true if we want zoekt-git-index to respect compound shards. 101 ShardMerging bool 102} 103 104// BuildOptions returns a build.Options represented by indexArgs. Note: it 105// doesn't set fields like repository/branch. 106func (o *indexArgs) BuildOptions() *build.Options { 107 shardPrefix := "" 108 if tenant.EnforceTenant() { 109 shardPrefix = tenant.SrcPrefix(o.TenantID, o.RepoID) 110 } 111 112 return &build.Options{ 113 // It is important that this RepositoryDescription exactly matches what 114 // the indexer we call will produce. This is to ensure that 115 // IncrementalSkipIndexing and IndexState can correctly calculate if 116 // nothing needs to be done. 117 RepositoryDescription: zoekt.Repository{ 118 TenantID: o.TenantID, 119 ID: uint32(o.IndexOptions.RepoID), 120 Name: o.Name, 121 Branches: o.Branches, 122 RawConfig: map[string]string{ 123 "repoid": strconv.Itoa(int(o.IndexOptions.RepoID)), 124 "priority": strconv.FormatFloat(o.Priority, 'g', -1, 64), 125 "public": marshalBool(o.Public), 126 "fork": marshalBool(o.Fork), 127 "archived": marshalBool(o.Archived), 128 // Calculate repo rank based on the latest commit date. 129 "latestCommitDate": "1", 130 "tenantID": strconv.Itoa(o.TenantID), 131 }, 132 }, 133 IndexDir: o.IndexDir, 134 Parallelism: o.Parallelism, 135 SizeMax: o.FileLimit, 136 LargeFiles: o.LargeFiles, 137 CTagsMustSucceed: o.Symbols, 138 DisableCTags: !o.Symbols, 139 IsDelta: o.UseDelta, 140 141 LanguageMap: o.LanguageMap, 142 143 ShardMerging: o.ShardMerging, 144 145 ShardPrefix: shardPrefix, 146 } 147} 148 149func marshalBool(b bool) string { 150 if b { 151 return "1" 152 } 153 return "0" 154} 155 156func (o *indexArgs) String() string { 157 s := fmt.Sprintf("%d %s", o.RepoID, o.Name) 158 for i, b := range o.Branches { 159 if i == 0 { 160 s = fmt.Sprintf("%s@%s=%s", s, b.Name, b.Version) 161 } else { 162 s = fmt.Sprintf("%s,%s=%s", s, b.Name, b.Version) 163 } 164 } 165 return s 166} 167 168type gitIndexConfig struct { 169 // runCmd is the function that's used to execute all external commands (such as calls to "git" or "zoekt-git-index") 170 // that gitIndex may construct. 171 runCmd func(*exec.Cmd) error 172 173 // findRepositoryMetadata is the function that returns the repository metadata for the 174 // repository specified in args. 'ok' is false if the repository's metadata 175 // couldn't be found or if an error occurred. 176 // 177 // The primary purpose of this configuration option is to be able to provide a stub 178 // implementation for this in our test suite. All other callers should use build.Options.FindRepositoryMetadata(). 179 findRepositoryMetadata func(args *indexArgs) (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) 180 181 // timeout defines how long the index server waits before killing an indexing job. 182 timeout time.Duration 183} 184 185func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.Logger) error { 186 logger := l.Scoped("gitIndex") 187 188 if len(o.Branches) == 0 { 189 return errors.New("zoekt-git-index requires 1 or more branches") 190 } 191 192 if c.runCmd == nil { 193 return errors.New("runCmd in provided configuration was nil - a function must be provided") 194 } 195 196 if c.findRepositoryMetadata == nil { 197 return errors.New("findRepositoryMetadata in provided configuration was nil - a function must be provided") 198 } 199 200 ctx, cancel := context.WithTimeout(context.Background(), c.timeout) 201 defer cancel() 202 203 gitDir, err := tmpGitDir(o.Name) 204 if err != nil { 205 return err 206 } 207 defer os.RemoveAll(gitDir) // best-effort cleanup 208 209 err = fetchRepo(ctx, gitDir, o, c, logger) 210 if err != nil { 211 return err 212 } 213 214 err = setZoektConfig(ctx, gitDir, o, c) 215 if err != nil { 216 return err 217 } 218 219 err = indexRepo(ctx, gitDir, sourcegraph, o, c, logger) 220 if err != nil { 221 return err 222 } 223 224 return nil 225} 226 227func fetchRepo(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error { 228 // Create a repo to fetch into 229 cmd := exec.CommandContext(ctx, "git", 230 // use a random default branch. This is so that HEAD isn't a symref to a 231 // branch that is indexed. For example if you are indexing 232 // HEAD,master. Then HEAD would be pointing to master by default. 233 "-c", "init.defaultBranch=nonExistentBranchBB0FOFCH32", 234 "init", 235 // we don't need a working copy 236 "--bare", 237 gitDir) 238 cmd.Stdin = &bytes.Buffer{} 239 if err := c.runCmd(cmd); err != nil { 240 return err 241 } 242 243 var fetchDuration time.Duration 244 successfullyFetchedCommitsCount := 0 245 allFetchesSucceeded := true 246 247 defer func() { 248 success := strconv.FormatBool(allFetchesSucceeded) 249 name := repoNameForMetric(o.Name) 250 metricFetchDuration.WithLabelValues(success, name).Observe(fetchDuration.Seconds()) 251 }() 252 253 runFetch := func(branches []zoekt.RepositoryBranch) error { 254 // We shallow fetch each commit specified in zoekt.Branches. This requires 255 // the server to have configured both uploadpack.allowAnySHA1InWant and 256 // uploadpack.allowFilter. (See gitservice.go in the Sourcegraph repository) 257 fetchArgs := []string{ 258 "-C", gitDir, 259 "-c", "protocol.version=2", 260 "-c", "http.extraHeader=X-Sourcegraph-Actor-UID: internal", 261 "-c", "http.extraHeader=X-Sourcegraph-Tenant-ID: " + strconv.Itoa(o.TenantID), 262 "fetch", "--depth=1", "--no-tags", 263 } 264 265 // If there are no exceptions to MaxFileSize (1MB), we can avoid fetching these large files. 266 if len(o.LargeFiles) == 0 { 267 fetchArgs = append(fetchArgs, "--filter=blob:limit=1m") 268 } 269 270 fetchArgs = append(fetchArgs, o.CloneURL) 271 272 var commits []string 273 for _, b := range branches { 274 commits = append(commits, b.Version) 275 } 276 277 fetchArgs = append(fetchArgs, commits...) 278 279 cmd = exec.CommandContext(ctx, "git", fetchArgs...) 280 cmd.Stdin = &bytes.Buffer{} 281 282 start := time.Now() 283 err := c.runCmd(cmd) 284 fetchDuration += time.Since(start) 285 286 if err != nil { 287 allFetchesSucceeded = false 288 var bs []string 289 for _, b := range branches { 290 bs = append(bs, b.String()) 291 } 292 293 formattedBranches := strings.Join(bs, ", ") 294 return fmt.Errorf("fetching %s: %w", formattedBranches, err) 295 } 296 297 successfullyFetchedCommitsCount += len(commits) 298 return nil 299 } 300 301 fetchPriorAndLatestCommits := func() error { 302 prior, err := priorBranches(c, o) 303 if err != nil { 304 return err 305 } 306 307 var allBranches []zoekt.RepositoryBranch 308 allBranches = append(allBranches, o.Branches...) 309 allBranches = append(allBranches, prior...) 310 311 return runFetch(allBranches) 312 } 313 314 fetchOnlyLatestCommits := func() error { 315 return runFetch(o.Branches) 316 } 317 318 if o.UseDelta { 319 err := fetchPriorAndLatestCommits() 320 if err != nil { 321 name := o.BuildOptions().RepositoryDescription.Name 322 id := o.BuildOptions().RepositoryDescription.ID 323 324 log.Printf("delta build: failed to prepare delta build for %q (ID %d): failed to fetch both latest and prior commits: %s", name, id, err) 325 err = fetchOnlyLatestCommits() 326 if err != nil { 327 return err 328 } 329 } 330 } else { 331 err := fetchOnlyLatestCommits() 332 if err != nil { 333 return err 334 } 335 } 336 337 // We then create the relevant refs for each fetched commit. 338 for _, b := range o.Branches { 339 ref := b.Name 340 if ref != "HEAD" { 341 ref = "refs/heads/" + ref 342 } 343 cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "update-ref", ref, b.Version) 344 cmd.Stdin = &bytes.Buffer{} 345 if err := c.runCmd(cmd); err != nil { 346 return fmt.Errorf("failed update-ref %s to %s: %w", ref, b.Version, err) 347 } 348 } 349 350 logger.Debug("successfully fetched git data", 351 sglog.String("repo", o.Name), 352 sglog.Uint32("id", o.RepoID), 353 sglog.Int("commits_count", successfullyFetchedCommitsCount), 354 sglog.Duration("duration", fetchDuration), 355 ) 356 return nil 357} 358 359func setZoektConfig(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig) error { 360 // create git configuration with options 361 type configKV struct{ Key, Value string } 362 config := []configKV{{ 363 // zoekt.name is used by zoekt-git-index to set the repository name. 364 Key: "name", 365 Value: o.Name, 366 }} 367 for k, v := range o.BuildOptions().RepositoryDescription.RawConfig { 368 config = append(config, configKV{Key: k, Value: v}) 369 } 370 sort.Slice(config, func(i, j int) bool { 371 return config[i].Key < config[j].Key 372 }) 373 374 // write git configuration to repo 375 for _, kv := range config { 376 cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "config", "zoekt."+kv.Key, kv.Value) 377 cmd.Stdin = &bytes.Buffer{} 378 if err := c.runCmd(cmd); err != nil { 379 return err 380 } 381 } 382 return nil 383} 384 385func indexRepo(ctx context.Context, gitDir string, sourcegraph Sourcegraph, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error { 386 args := []string{ 387 "-submodules=false", 388 } 389 390 // Even though we check for incremental in this process, we still pass it 391 // in just in case we regress in how we check in process. We will still 392 // notice thanks to metrics and increased load on gitserver. 393 if o.Incremental { 394 args = append(args, "-incremental") 395 } 396 397 var branches []string 398 for _, b := range o.Branches { 399 branches = append(branches, b.Name) 400 } 401 args = append(args, "-branches", strings.Join(branches, ",")) 402 403 if o.UseDelta { 404 args = append(args, "-delta") 405 args = append(args, "-delta_threshold", strconv.FormatUint(o.DeltaShardNumberFallbackThreshold, 10)) 406 } 407 408 if len(o.LanguageMap) > 0 { 409 var languageMap []string 410 for language, parser := range o.LanguageMap { 411 languageMap = append(languageMap, language+":"+ctags.ParserToString(parser)) 412 } 413 args = append(args, "-language_map", strings.Join(languageMap, ",")) 414 } 415 416 args = append(args, o.BuildOptions().Args()...) 417 args = append(args, gitDir) 418 419 cmd := exec.CommandContext(ctx, "zoekt-git-index", args...) 420 cmd.Stdin = &bytes.Buffer{} 421 if err := c.runCmd(cmd); err != nil { 422 return err 423 } 424 return nil 425} 426 427func priorBranches(c gitIndexConfig, o *indexArgs) ([]zoekt.RepositoryBranch, error) { 428 existingRepository, _, found, err := c.findRepositoryMetadata(o) 429 if err != nil { 430 return nil, fmt.Errorf("loading repository metadata: %w", err) 431 } 432 433 if !found || len(existingRepository.Branches) == 0 { 434 return nil, fmt.Errorf("no prior shards found") 435 } 436 437 return existingRepository.Branches, nil 438} 439 440func tmpGitDir(name string) (string, error) { 441 abs := url.QueryEscape(name) 442 if len(abs) > 200 { 443 h := sha1.New() 444 _, _ = io.WriteString(h, abs) 445 abs = abs[:200] + fmt.Sprintf("%x", h.Sum(nil))[:8] 446 } 447 dir := filepath.Join(os.TempDir(), abs+".git") 448 if _, err := os.Stat(dir); err == nil { 449 if err := os.RemoveAll(dir); err != nil { 450 return "", err 451 } 452 } 453 return dir, nil 454}