fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1package main 2 3import ( 4 "bytes" 5 "context" 6 "crypto/sha1" 7 "errors" 8 "fmt" 9 "io" 10 "net/url" 11 "os" 12 "os/exec" 13 "path/filepath" 14 "sort" 15 "strconv" 16 "strings" 17 "time" 18 19 sglog "github.com/sourcegraph/log" 20 "github.com/sourcegraph/zoekt" 21 "github.com/sourcegraph/zoekt/index" 22 "github.com/sourcegraph/zoekt/internal/ctags" 23 "github.com/sourcegraph/zoekt/internal/tenant" 24) 25 26const defaultIndexingTimeout = 1*time.Hour + 30*time.Minute 27 28// IndexOptions are the options that Sourcegraph can set via it's search 29// configuration endpoint. 30type IndexOptions struct { 31 // LargeFiles is a slice of glob patterns where matching file paths should 32 // be indexed regardless of their size. The pattern syntax can be found 33 // here: https://golang.org/pkg/path/filepath/#Match. 34 LargeFiles []string 35 36 // Symbols if true will make zoekt index the output of ctags. 37 Symbols bool 38 39 // Branches is a slice of branches to index. 40 Branches []zoekt.RepositoryBranch 41 42 // RepoID is the Sourcegraph Repository ID. 43 RepoID uint32 44 45 // Name is the Repository Name. 46 Name string 47 48 // CloneURL is the internal clone URL for Name. 49 CloneURL string 50 51 // Priority indicates ranking in results, higher first. 52 Priority float64 53 54 // Public is true if the repository is public. 55 Public bool 56 57 // Fork is true if the repository is a fork. 58 Fork bool 59 60 // Archived is true if the repository is archived. 61 Archived bool 62 63 // Map from language to scip-ctags, universal-ctags, or neither 64 LanguageMap ctags.LanguageMap 65 66 // The number of threads to use for indexing shards. Defaults to the number of available 67 // CPUs. If the server flag -cpu_fraction is set, then this value overrides it. 68 ShardConcurrency int32 69 70 // TenantID is the tenant ID for the repository. 71 TenantID int 72} 73 74// indexArgs represents the arguments we pass to zoekt-git-index 75type indexArgs struct { 76 IndexOptions 77 78 // Incremental indicates to skip indexing if already indexed. 79 Incremental bool 80 81 // IndexDir is the index directory to store the shards. 82 IndexDir string 83 84 // Parallelism is the number of shards to compute in parallel. 85 Parallelism int 86 87 // FileLimit is the maximum size of a file 88 FileLimit int 89 90 // UseDelta is true if we want to use the new delta indexer. This should 91 // only be true for repositories we explicitly enable. 92 UseDelta bool 93 94 // DeltaShardNumberFallbackThreshold is an upper limit on the number of preexisting shards that can exist 95 // before attempting a delta build. 96 DeltaShardNumberFallbackThreshold uint64 97 98 // ShardMerging is true if we want zoekt-git-index to respect compound shards. 99 ShardMerging bool 100} 101 102// BuildOptions returns a index.Options represented by indexArgs. Note: it 103// doesn't set fields like repository/branch. 104func (o *indexArgs) BuildOptions() *index.Options { 105 shardPrefix := "" 106 if tenant.EnforceTenant() { 107 shardPrefix = tenant.SrcPrefix(o.TenantID, o.RepoID) 108 } 109 110 return &index.Options{ 111 // It is important that this RepositoryDescription exactly matches what 112 // the indexer we call will produce. This is to ensure that 113 // IncrementalSkipIndexing and IndexState can correctly calculate if 114 // nothing needs to be done. 115 RepositoryDescription: zoekt.Repository{ 116 TenantID: o.TenantID, 117 ID: uint32(o.IndexOptions.RepoID), 118 Name: o.Name, 119 Branches: o.Branches, 120 RawConfig: map[string]string{ 121 "repoid": strconv.Itoa(int(o.IndexOptions.RepoID)), 122 "priority": strconv.FormatFloat(o.Priority, 'g', -1, 64), 123 "public": marshalBool(o.Public), 124 "fork": marshalBool(o.Fork), 125 "archived": marshalBool(o.Archived), 126 // Calculate repo rank based on the latest commit date. 127 "latestCommitDate": "1", 128 "tenantID": strconv.Itoa(o.TenantID), 129 }, 130 }, 131 IndexDir: o.IndexDir, 132 Parallelism: o.Parallelism, 133 SizeMax: o.FileLimit, 134 LargeFiles: o.LargeFiles, 135 CTagsMustSucceed: o.Symbols, 136 DisableCTags: !o.Symbols, 137 IsDelta: o.UseDelta, 138 139 LanguageMap: o.LanguageMap, 140 141 ShardMerging: o.ShardMerging, 142 143 ShardPrefix: shardPrefix, 144 } 145} 146 147func marshalBool(b bool) string { 148 if b { 149 return "1" 150 } 151 return "0" 152} 153 154func (o *indexArgs) String() string { 155 s := fmt.Sprintf("%d %s", o.RepoID, o.Name) 156 for i, b := range o.Branches { 157 if i == 0 { 158 s = fmt.Sprintf("%s@%s=%s", s, b.Name, b.Version) 159 } else { 160 s = fmt.Sprintf("%s,%s=%s", s, b.Name, b.Version) 161 } 162 } 163 return s 164} 165 166type gitIndexConfig struct { 167 // runCmd is the function that's used to execute all external commands (such as calls to "git" or "zoekt-git-index") 168 // that gitIndex may construct. 169 runCmd func(*exec.Cmd) error 170 171 // findRepositoryMetadata is the function that returns the repository metadata for the 172 // repository specified in args. 'ok' is false if the repository's metadata 173 // couldn't be found or if an error occurred. 174 // 175 // The primary purpose of this configuration option is to be able to provide a stub 176 // implementation for this in our test suite. All other callers should use build.Options.FindRepositoryMetadata(). 177 findRepositoryMetadata func(args *indexArgs) (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) 178 179 // timeout defines how long the index server waits before killing an indexing job. 180 timeout time.Duration 181} 182 183func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.Logger) error { 184 logger := l.Scoped("gitIndex") 185 186 if len(o.Branches) == 0 { 187 return errors.New("zoekt-git-index requires 1 or more branches") 188 } 189 190 if c.runCmd == nil { 191 return errors.New("runCmd in provided configuration was nil - a function must be provided") 192 } 193 194 if c.findRepositoryMetadata == nil { 195 return errors.New("findRepositoryMetadata in provided configuration was nil - a function must be provided") 196 } 197 198 ctx, cancel := context.WithTimeout(context.Background(), c.timeout) 199 defer cancel() 200 201 gitDir, err := tmpGitDir(o.Name) 202 if err != nil { 203 return err 204 } 205 defer os.RemoveAll(gitDir) // best-effort cleanup 206 207 err = fetchRepo(ctx, gitDir, o, c, logger) 208 if err != nil { 209 return err 210 } 211 212 err = setZoektConfig(ctx, gitDir, o, c) 213 if err != nil { 214 return err 215 } 216 217 err = indexRepo(ctx, gitDir, sourcegraph, o, c, logger) 218 if err != nil { 219 return err 220 } 221 222 return nil 223} 224 225func fetchRepo(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error { 226 // Create a repo to fetch into 227 cmd := exec.CommandContext(ctx, "git", 228 // use a random default branch. This is so that HEAD isn't a symref to a 229 // branch that is indexed. For example if you are indexing 230 // HEAD,master. Then HEAD would be pointing to master by default. 231 "-c", "init.defaultBranch=nonExistentBranchBB0FOFCH32", 232 "init", 233 // we don't need a working copy 234 "--bare", 235 gitDir) 236 cmd.Stdin = &bytes.Buffer{} 237 if err := c.runCmd(cmd); err != nil { 238 return err 239 } 240 241 var fetchDuration time.Duration 242 successfullyFetchedCommitsCount := 0 243 allFetchesSucceeded := true 244 245 defer func() { 246 success := strconv.FormatBool(allFetchesSucceeded) 247 name := repoNameForMetric(o.Name) 248 metricFetchDuration.WithLabelValues(success, name).Observe(fetchDuration.Seconds()) 249 }() 250 251 runFetch := func(branches []zoekt.RepositoryBranch) error { 252 // We shallow fetch each commit specified in zoekt.Branches. This requires 253 // the server to have configured both uploadpack.allowAnySHA1InWant and 254 // uploadpack.allowFilter. (See gitservice.go in the Sourcegraph repository) 255 fetchArgs := []string{ 256 "-C", gitDir, 257 "-c", "protocol.version=2", 258 "-c", "http.extraHeader=X-Sourcegraph-Actor-UID: internal", 259 "-c", "http.extraHeader=X-Sourcegraph-Tenant-ID: " + strconv.Itoa(o.TenantID), 260 "fetch", "--depth=1", "--no-tags", 261 } 262 263 // If there are no exceptions to MaxFileSize (1MB), we can avoid fetching these large files. 264 if len(o.LargeFiles) == 0 { 265 fetchArgs = append(fetchArgs, "--filter=blob:limit=1m") 266 } 267 268 fetchArgs = append(fetchArgs, o.CloneURL) 269 270 var commits []string 271 for _, b := range branches { 272 commits = append(commits, b.Version) 273 } 274 275 fetchArgs = append(fetchArgs, commits...) 276 277 cmd = exec.CommandContext(ctx, "git", fetchArgs...) 278 cmd.Stdin = &bytes.Buffer{} 279 280 start := time.Now() 281 err := c.runCmd(cmd) 282 fetchDuration += time.Since(start) 283 284 if err != nil { 285 allFetchesSucceeded = false 286 var bs []string 287 for _, b := range branches { 288 bs = append(bs, b.String()) 289 } 290 291 formattedBranches := strings.Join(bs, ", ") 292 return fmt.Errorf("fetching %s: %w", formattedBranches, err) 293 } 294 295 successfullyFetchedCommitsCount += len(commits) 296 return nil 297 } 298 299 fetchPriorAndLatestCommits := func() error { 300 prior, err := priorBranches(c, o) 301 if err != nil { 302 return err 303 } 304 305 var allBranches []zoekt.RepositoryBranch 306 allBranches = append(allBranches, o.Branches...) 307 allBranches = append(allBranches, prior...) 308 309 return runFetch(allBranches) 310 } 311 312 fetchOnlyLatestCommits := func() error { 313 return runFetch(o.Branches) 314 } 315 316 if o.UseDelta { 317 err := fetchPriorAndLatestCommits() 318 if err != nil { 319 name := o.BuildOptions().RepositoryDescription.Name 320 id := o.BuildOptions().RepositoryDescription.ID 321 322 errorLog.Printf("delta build: failed to prepare delta build for %q (ID %d): failed to fetch both latest and prior commits: %s", name, id, err) 323 err = fetchOnlyLatestCommits() 324 if err != nil { 325 return err 326 } 327 } 328 } else { 329 err := fetchOnlyLatestCommits() 330 if err != nil { 331 return err 332 } 333 } 334 335 // We then create the relevant refs for each fetched commit. 336 for _, b := range o.Branches { 337 ref := b.Name 338 if ref != "HEAD" { 339 ref = "refs/heads/" + ref 340 } 341 cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "update-ref", ref, b.Version) 342 cmd.Stdin = &bytes.Buffer{} 343 if err := c.runCmd(cmd); err != nil { 344 return fmt.Errorf("failed update-ref %s to %s: %w", ref, b.Version, err) 345 } 346 } 347 348 logger.Debug("successfully fetched git data", 349 sglog.String("repo", o.Name), 350 sglog.Uint32("id", o.RepoID), 351 sglog.Int("commits_count", successfullyFetchedCommitsCount), 352 sglog.Duration("duration", fetchDuration), 353 ) 354 return nil 355} 356 357func setZoektConfig(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig) error { 358 // create git configuration with options 359 type configKV struct{ Key, Value string } 360 config := []configKV{{ 361 // zoekt.name is used by zoekt-git-index to set the repository name. 362 Key: "name", 363 Value: o.Name, 364 }} 365 for k, v := range o.BuildOptions().RepositoryDescription.RawConfig { 366 config = append(config, configKV{Key: k, Value: v}) 367 } 368 sort.Slice(config, func(i, j int) bool { 369 return config[i].Key < config[j].Key 370 }) 371 372 // write git configuration to repo 373 for _, kv := range config { 374 cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "config", "zoekt."+kv.Key, kv.Value) 375 cmd.Stdin = &bytes.Buffer{} 376 if err := c.runCmd(cmd); err != nil { 377 return err 378 } 379 } 380 return nil 381} 382 383func indexRepo(ctx context.Context, gitDir string, sourcegraph Sourcegraph, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error { 384 args := []string{ 385 "-submodules=false", 386 } 387 388 // Even though we check for incremental in this process, we still pass it 389 // in just in case we regress in how we check in process. We will still 390 // notice thanks to metrics and increased load on gitserver. 391 if o.Incremental { 392 args = append(args, "-incremental") 393 } 394 395 var branches []string 396 for _, b := range o.Branches { 397 branches = append(branches, b.Name) 398 } 399 args = append(args, "-branches", strings.Join(branches, ",")) 400 401 if o.UseDelta { 402 args = append(args, "-delta") 403 args = append(args, "-delta_threshold", strconv.FormatUint(o.DeltaShardNumberFallbackThreshold, 10)) 404 } 405 406 if len(o.LanguageMap) > 0 { 407 var languageMap []string 408 for language, parser := range o.LanguageMap { 409 languageMap = append(languageMap, language+":"+ctags.ParserToString(parser)) 410 } 411 args = append(args, "-language_map", strings.Join(languageMap, ",")) 412 } 413 414 args = append(args, o.BuildOptions().Args()...) 415 args = append(args, gitDir) 416 417 cmd := exec.CommandContext(ctx, "zoekt-git-index", args...) 418 cmd.Stdin = &bytes.Buffer{} 419 if err := c.runCmd(cmd); err != nil { 420 return err 421 } 422 return nil 423} 424 425func priorBranches(c gitIndexConfig, o *indexArgs) ([]zoekt.RepositoryBranch, error) { 426 existingRepository, _, found, err := c.findRepositoryMetadata(o) 427 if err != nil { 428 return nil, fmt.Errorf("loading repository metadata: %w", err) 429 } 430 431 if !found || len(existingRepository.Branches) == 0 { 432 return nil, fmt.Errorf("no prior shards found") 433 } 434 435 return existingRepository.Branches, nil 436} 437 438func tmpGitDir(name string) (string, error) { 439 abs := url.QueryEscape(name) 440 if len(abs) > 200 { 441 h := sha1.New() 442 _, _ = io.WriteString(h, abs) 443 abs = abs[:200] + fmt.Sprintf("%x", h.Sum(nil))[:8] 444 } 445 dir := filepath.Join(os.TempDir(), abs+".git") 446 if _, err := os.Stat(dir); err == nil { 447 if err := os.RemoveAll(dir); err != nil { 448 return "", err 449 } 450 } 451 return dir, nil 452}