fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1package main 2 3import ( 4 "bytes" 5 "context" 6 "crypto/sha1" 7 "errors" 8 "fmt" 9 "io" 10 "net/url" 11 "os" 12 "os/exec" 13 "path/filepath" 14 "sort" 15 "strconv" 16 "strings" 17 "time" 18 19 sglog "github.com/sourcegraph/log" 20 "github.com/sourcegraph/zoekt" 21 configv1 "github.com/sourcegraph/zoekt/cmd/zoekt-sourcegraph-indexserver/grpc/protos/sourcegraph/zoekt/configuration/v1" 22 "github.com/sourcegraph/zoekt/index" 23 "github.com/sourcegraph/zoekt/internal/ctags" 24) 25 26const defaultIndexingTimeout = 1*time.Hour + 30*time.Minute 27 28// IndexOptions are the options that Sourcegraph can set via it's search 29// configuration endpoint. 30type IndexOptions struct { 31 // LargeFiles is a slice of glob patterns where matching file paths should 32 // be indexed regardless of their size. The pattern syntax can be found 33 // here: https://golang.org/pkg/path/filepath/#Match. 34 LargeFiles []string 35 36 // Symbols if true will make zoekt index the output of ctags. 37 Symbols bool 38 39 // Branches is a slice of branches to index. 40 Branches []zoekt.RepositoryBranch 41 42 // RepoID is the Sourcegraph Repository ID. 43 RepoID uint32 44 45 // Name is the Repository Name. 46 Name string 47 48 // CloneURL is the internal clone URL for Name. 49 CloneURL string 50 51 // Priority indicates ranking in results, higher first. 52 Priority float64 53 54 // Public is true if the repository is public. 55 Public bool 56 57 // Fork is true if the repository is a fork. 58 Fork bool 59 60 // Archived is true if the repository is archived. 61 Archived bool 62 63 // Map from language to scip-ctags, universal-ctags, or neither 64 LanguageMap ctags.LanguageMap 65 66 // The number of threads to use for indexing shards. Defaults to the number of available 67 // CPUs. If the server flag -cpu_fraction is set, then this value overrides it. 68 ShardConcurrency int32 69 70 // TenantID is the tenant ID for the repository. 71 TenantID int 72} 73 74// indexArgs represents the arguments we pass to zoekt-git-index 75type indexArgs struct { 76 IndexOptions 77 78 // Incremental indicates to skip indexing if already indexed. 79 Incremental bool 80 81 // IndexDir is the index directory to store the shards. 82 IndexDir string 83 84 // Parallelism is the number of shards to compute in parallel. 85 Parallelism int 86 87 // FileLimit is the maximum size of a file 88 FileLimit int 89 90 // UseDelta is true if we want to use the new delta indexer. This should 91 // only be true for repositories we explicitly enable. 92 UseDelta bool 93 94 // DeltaShardNumberFallbackThreshold is an upper limit on the number of preexisting shards that can exist 95 // before attempting a delta build. 96 DeltaShardNumberFallbackThreshold uint64 97 98 // ShardMerging is true if we want zoekt-git-index to respect compound shards. 99 ShardMerging bool 100} 101 102// BuildOptions returns a index.Options represented by indexArgs. Note: it 103// doesn't set fields like repository/branch. 104func (o *indexArgs) BuildOptions() *index.Options { 105 return &index.Options{ 106 // It is important that this RepositoryDescription exactly matches what 107 // the indexer we call will produce. This is to ensure that 108 // IncrementalSkipIndexing and IndexState can correctly calculate if 109 // nothing needs to be done. 110 RepositoryDescription: zoekt.Repository{ 111 TenantID: o.TenantID, 112 ID: o.IndexOptions.RepoID, 113 Name: o.Name, 114 Branches: o.Branches, 115 RawConfig: map[string]string{ 116 "repoid": strconv.Itoa(int(o.IndexOptions.RepoID)), 117 "priority": strconv.FormatFloat(o.Priority, 'g', -1, 64), 118 "public": marshalBool(o.Public), 119 "fork": marshalBool(o.Fork), 120 "archived": marshalBool(o.Archived), 121 // Calculate repo rank based on the latest commit date. 122 "latestCommitDate": "1", 123 "tenantID": strconv.Itoa(o.TenantID), 124 }, 125 }, 126 IndexDir: o.IndexDir, 127 Parallelism: o.Parallelism, 128 SizeMax: o.FileLimit, 129 LargeFiles: o.LargeFiles, 130 CTagsMustSucceed: o.Symbols, 131 DisableCTags: !o.Symbols, 132 IsDelta: o.UseDelta, 133 134 LanguageMap: o.LanguageMap, 135 136 ShardMerging: o.ShardMerging, 137 138 TenantID: o.TenantID, 139 RepoID: o.RepoID, 140 } 141} 142 143func marshalBool(b bool) string { 144 if b { 145 return "1" 146 } 147 return "0" 148} 149 150func (o *indexArgs) String() string { 151 s := fmt.Sprintf("%d %s", o.RepoID, o.Name) 152 for i, b := range o.Branches { 153 if i == 0 { 154 s = fmt.Sprintf("%s@%s=%s", s, b.Name, b.Version) 155 } else { 156 s = fmt.Sprintf("%s,%s=%s", s, b.Name, b.Version) 157 } 158 } 159 return s 160} 161 162type gitIndexConfig struct { 163 // runCmd is the function that's used to execute all external commands (such as calls to "git" or "zoekt-git-index") 164 // that gitIndex may construct. 165 runCmd func(*exec.Cmd) error 166 167 // findRepositoryMetadata is the function that returns the repository metadata for the 168 // repository specified in args. 'ok' is false if the repository's metadata 169 // couldn't be found or if an error occurred. 170 // 171 // The primary purpose of this configuration option is to be able to provide a stub 172 // implementation for this in our test suite. All other callers should use build.Options.FindRepositoryMetadata(). 173 findRepositoryMetadata func(args *indexArgs) (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) 174 175 // timeout defines how long the index server waits before killing an indexing job. 176 timeout time.Duration 177} 178 179func gitIndex(ctx context.Context, c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.Logger) error { 180 logger := l.Scoped("gitIndex") 181 182 if len(o.Branches) == 0 { 183 return errors.New("zoekt-git-index requires 1 or more branches") 184 } 185 186 if c.runCmd == nil { 187 return errors.New("runCmd in provided configuration was nil - a function must be provided") 188 } 189 190 if c.findRepositoryMetadata == nil { 191 return errors.New("findRepositoryMetadata in provided configuration was nil - a function must be provided") 192 } 193 194 ctx, cancel := context.WithTimeout(ctx, c.timeout) 195 defer cancel() 196 197 gitDir, err := tmpGitDir(o.Name) 198 if err != nil { 199 return err 200 } 201 defer os.RemoveAll(gitDir) // best-effort cleanup 202 203 err = fetchRepo(ctx, gitDir, o, c, logger) 204 if err != nil { 205 return err 206 } 207 208 err = setZoektConfig(ctx, gitDir, o, c) 209 if err != nil { 210 return err 211 } 212 213 err = indexRepo(ctx, gitDir, sourcegraph, o, c, logger) 214 if err != nil { 215 return err 216 } 217 218 return nil 219} 220 221func fetchRepo(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error { 222 // Create a repo to fetch into 223 cmd := exec.CommandContext(ctx, "git", 224 // use a random default branch. This is so that HEAD isn't a symref to a 225 // branch that is indexed. For example if you are indexing 226 // HEAD,master. Then HEAD would be pointing to master by default. 227 "-c", "init.defaultBranch=nonExistentBranchBB0FOFCH32", 228 "init", 229 // we don't need a working copy 230 "--bare", 231 gitDir) 232 cmd.Stdin = &bytes.Buffer{} 233 if err := c.runCmd(cmd); err != nil { 234 return err 235 } 236 237 var fetchDuration time.Duration 238 successfullyFetchedCommitsCount := 0 239 allFetchesSucceeded := true 240 241 defer func() { 242 success := strconv.FormatBool(allFetchesSucceeded) 243 name := repoNameForMetric(o.Name) 244 metricFetchDuration.WithLabelValues(success, name).Observe(fetchDuration.Seconds()) 245 }() 246 247 runFetch := func(branches []zoekt.RepositoryBranch) error { 248 // We shallow fetch each commit specified in zoekt.Branches. This requires 249 // the server to have configured both uploadpack.allowAnySHA1InWant and 250 // uploadpack.allowFilter. (See gitservice.go in the Sourcegraph repository) 251 fetchArgs := []string{ 252 "-C", gitDir, 253 "-c", "protocol.version=2", 254 "-c", "http.extraHeader=X-Sourcegraph-Actor-UID: internal", 255 "-c", "http.extraHeader=X-Sourcegraph-Tenant-ID: " + strconv.Itoa(o.TenantID), 256 "fetch", "--depth=1", "--no-tags", 257 } 258 259 // If there are no exceptions to MaxFileSize (1MB), we can avoid fetching these large files. 260 if len(o.LargeFiles) == 0 { 261 fetchArgs = append(fetchArgs, "--filter=blob:limit=1m") 262 } 263 264 fetchArgs = append(fetchArgs, o.CloneURL) 265 266 var commits []string 267 for _, b := range branches { 268 commits = append(commits, b.Version) 269 } 270 271 fetchArgs = append(fetchArgs, commits...) 272 273 cmd = exec.CommandContext(ctx, "git", fetchArgs...) 274 cmd.Stdin = &bytes.Buffer{} 275 276 start := time.Now() 277 err := c.runCmd(cmd) 278 fetchDuration += time.Since(start) 279 280 if err != nil { 281 allFetchesSucceeded = false 282 var bs []string 283 for _, b := range branches { 284 bs = append(bs, b.String()) 285 } 286 287 formattedBranches := strings.Join(bs, ", ") 288 return fmt.Errorf("fetching %s: %w", formattedBranches, err) 289 } 290 291 successfullyFetchedCommitsCount += len(commits) 292 return nil 293 } 294 295 fetchPriorAndLatestCommits := func() error { 296 prior, err := priorBranches(c, o) 297 if err != nil { 298 return err 299 } 300 301 var allBranches []zoekt.RepositoryBranch 302 allBranches = append(allBranches, o.Branches...) 303 allBranches = append(allBranches, prior...) 304 305 return runFetch(allBranches) 306 } 307 308 fetchOnlyLatestCommits := func() error { 309 return runFetch(o.Branches) 310 } 311 312 if o.UseDelta { 313 err := fetchPriorAndLatestCommits() 314 if err != nil { 315 name := o.BuildOptions().RepositoryDescription.Name 316 id := o.BuildOptions().RepositoryDescription.ID 317 318 errorLog.Printf("delta build: failed to prepare delta build for %q (ID %d): failed to fetch both latest and prior commits: %s", name, id, err) 319 err = fetchOnlyLatestCommits() 320 if err != nil { 321 return err 322 } 323 } 324 } else { 325 err := fetchOnlyLatestCommits() 326 if err != nil { 327 return err 328 } 329 } 330 331 // We then create the relevant refs for each fetched commit. 332 for _, b := range o.Branches { 333 ref := b.Name 334 if ref != "HEAD" { 335 ref = "refs/heads/" + ref 336 } 337 cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "update-ref", ref, b.Version) 338 cmd.Stdin = &bytes.Buffer{} 339 if err := c.runCmd(cmd); err != nil { 340 return fmt.Errorf("failed update-ref %s to %s: %w", ref, b.Version, err) 341 } 342 } 343 344 logger.Debug("successfully fetched git data", 345 sglog.String("repo", o.Name), 346 sglog.Uint32("id", o.RepoID), 347 sglog.Int("commits_count", successfullyFetchedCommitsCount), 348 sglog.Duration("duration", fetchDuration), 349 ) 350 return nil 351} 352 353func setZoektConfig(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig) error { 354 // create git configuration with options 355 type configKV struct{ Key, Value string } 356 config := []configKV{{ 357 // zoekt.name is used by zoekt-git-index to set the repository name. 358 Key: "name", 359 Value: o.Name, 360 }} 361 for k, v := range o.BuildOptions().RepositoryDescription.RawConfig { 362 config = append(config, configKV{Key: k, Value: v}) 363 } 364 sort.Slice(config, func(i, j int) bool { 365 return config[i].Key < config[j].Key 366 }) 367 368 // write git configuration to repo 369 for _, kv := range config { 370 cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "config", "zoekt."+kv.Key, kv.Value) 371 cmd.Stdin = &bytes.Buffer{} 372 if err := c.runCmd(cmd); err != nil { 373 return err 374 } 375 } 376 return nil 377} 378 379func indexRepo(ctx context.Context, gitDir string, sourcegraph Sourcegraph, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error { 380 args := []string{ 381 "-submodules=false", 382 } 383 384 // Even though we check for incremental in this process, we still pass it 385 // in just in case we regress in how we check in process. We will still 386 // notice thanks to metrics and increased load on gitserver. 387 if o.Incremental { 388 args = append(args, "-incremental") 389 } 390 391 var branches []string 392 for _, b := range o.Branches { 393 branches = append(branches, b.Name) 394 } 395 args = append(args, "-branches", strings.Join(branches, ",")) 396 397 if o.UseDelta { 398 args = append(args, "-delta") 399 args = append(args, "-delta_threshold", strconv.FormatUint(o.DeltaShardNumberFallbackThreshold, 10)) 400 } 401 402 if len(o.LanguageMap) > 0 { 403 var languageMap []string 404 for language, parser := range o.LanguageMap { 405 languageMap = append(languageMap, language+":"+ctags.ParserToString(parser)) 406 } 407 args = append(args, "-language_map", strings.Join(languageMap, ",")) 408 } 409 410 args = append(args, o.BuildOptions().Args()...) 411 args = append(args, gitDir) 412 413 cmd := exec.CommandContext(ctx, "zoekt-git-index", args...) 414 cmd.Stdin = &bytes.Buffer{} 415 if err := c.runCmd(cmd); err != nil { 416 return err 417 } 418 return nil 419} 420 421func priorBranches(c gitIndexConfig, o *indexArgs) ([]zoekt.RepositoryBranch, error) { 422 existingRepository, _, found, err := c.findRepositoryMetadata(o) 423 if err != nil { 424 return nil, fmt.Errorf("loading repository metadata: %w", err) 425 } 426 427 if !found || len(existingRepository.Branches) == 0 { 428 return nil, fmt.Errorf("no prior shards found") 429 } 430 431 return existingRepository.Branches, nil 432} 433 434func tmpGitDir(name string) (string, error) { 435 abs := url.QueryEscape(name) 436 if len(abs) > 200 { 437 h := sha1.New() 438 _, _ = io.WriteString(h, abs) 439 abs = abs[:200] + fmt.Sprintf("%x", h.Sum(nil))[:8] 440 } 441 dir := filepath.Join(os.TempDir(), abs+".git") 442 if _, err := os.Stat(dir); err == nil { 443 if err := os.RemoveAll(dir); err != nil { 444 return "", err 445 } 446 } 447 return dir, nil 448} 449 450// FromProto converts a ZoektIndexOptions proto message into an IndexOptions struct. 451func (o *IndexOptions) FromProto(x *configv1.ZoektIndexOptions) { 452 branches := make([]zoekt.RepositoryBranch, 0, len(x.Branches)) 453 for _, b := range x.GetBranches() { 454 branches = append(branches, zoekt.RepositoryBranch{ 455 Name: b.GetName(), 456 Version: b.GetVersion(), 457 }) 458 } 459 460 languageMap := make(map[string]ctags.CTagsParserType) 461 for _, lang := range x.GetLanguageMap() { 462 languageMap[lang.GetLanguage()] = ctags.CTagsParserType(lang.GetCtags().Number()) 463 } 464 465 *o = IndexOptions{ 466 RepoID: uint32(x.GetRepoId()), 467 LargeFiles: x.GetLargeFiles(), 468 Symbols: x.GetSymbols(), 469 Branches: branches, 470 Name: x.GetName(), 471 472 Priority: x.GetPriority(), 473 474 Public: x.GetPublic(), 475 Fork: x.GetFork(), 476 Archived: x.GetArchived(), 477 478 LanguageMap: languageMap, 479 ShardConcurrency: x.GetShardConcurrency(), 480 481 TenantID: int(x.TenantId), 482 } 483}