fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1package main 2 3import ( 4 "bytes" 5 "context" 6 "crypto/sha1" 7 "errors" 8 "fmt" 9 "io" 10 "net/url" 11 "os" 12 "os/exec" 13 "path/filepath" 14 "sort" 15 "strconv" 16 "strings" 17 "time" 18 19 sglog "github.com/sourcegraph/log" 20 "github.com/sourcegraph/zoekt" 21 configv1 "github.com/sourcegraph/zoekt/cmd/zoekt-sourcegraph-indexserver/grpc/protos/sourcegraph/zoekt/configuration/v1" 22 "github.com/sourcegraph/zoekt/index" 23 "github.com/sourcegraph/zoekt/internal/ctags" 24 "github.com/sourcegraph/zoekt/internal/tenant" 25) 26 27const defaultIndexingTimeout = 1*time.Hour + 30*time.Minute 28 29// IndexOptions are the options that Sourcegraph can set via it's search 30// configuration endpoint. 31type IndexOptions struct { 32 // LargeFiles is a slice of glob patterns where matching file paths should 33 // be indexed regardless of their size. The pattern syntax can be found 34 // here: https://golang.org/pkg/path/filepath/#Match. 35 LargeFiles []string 36 37 // Symbols if true will make zoekt index the output of ctags. 38 Symbols bool 39 40 // Branches is a slice of branches to index. 41 Branches []zoekt.RepositoryBranch 42 43 // RepoID is the Sourcegraph Repository ID. 44 RepoID uint32 45 46 // Name is the Repository Name. 47 Name string 48 49 // CloneURL is the internal clone URL for Name. 50 CloneURL string 51 52 // Priority indicates ranking in results, higher first. 53 Priority float64 54 55 // Public is true if the repository is public. 56 Public bool 57 58 // Fork is true if the repository is a fork. 59 Fork bool 60 61 // Archived is true if the repository is archived. 62 Archived bool 63 64 // Map from language to scip-ctags, universal-ctags, or neither 65 LanguageMap ctags.LanguageMap 66 67 // The number of threads to use for indexing shards. Defaults to the number of available 68 // CPUs. If the server flag -cpu_fraction is set, then this value overrides it. 69 ShardConcurrency int32 70 71 // TenantID is the tenant ID for the repository. 72 TenantID int 73} 74 75// indexArgs represents the arguments we pass to zoekt-git-index 76type indexArgs struct { 77 IndexOptions 78 79 // Incremental indicates to skip indexing if already indexed. 80 Incremental bool 81 82 // IndexDir is the index directory to store the shards. 83 IndexDir string 84 85 // Parallelism is the number of shards to compute in parallel. 86 Parallelism int 87 88 // FileLimit is the maximum size of a file 89 FileLimit int 90 91 // UseDelta is true if we want to use the new delta indexer. This should 92 // only be true for repositories we explicitly enable. 93 UseDelta bool 94 95 // DeltaShardNumberFallbackThreshold is an upper limit on the number of preexisting shards that can exist 96 // before attempting a delta build. 97 DeltaShardNumberFallbackThreshold uint64 98 99 // ShardMerging is true if we want zoekt-git-index to respect compound shards. 100 ShardMerging bool 101} 102 103// BuildOptions returns a index.Options represented by indexArgs. Note: it 104// doesn't set fields like repository/branch. 105func (o *indexArgs) BuildOptions() *index.Options { 106 shardPrefix := "" 107 if tenant.EnforceTenant() { 108 shardPrefix = tenant.SrcPrefix(o.TenantID, o.RepoID) 109 } 110 111 return &index.Options{ 112 // It is important that this RepositoryDescription exactly matches what 113 // the indexer we call will produce. This is to ensure that 114 // IncrementalSkipIndexing and IndexState can correctly calculate if 115 // nothing needs to be done. 116 RepositoryDescription: zoekt.Repository{ 117 TenantID: o.TenantID, 118 ID: o.IndexOptions.RepoID, 119 Name: o.Name, 120 Branches: o.Branches, 121 RawConfig: map[string]string{ 122 "repoid": strconv.Itoa(int(o.IndexOptions.RepoID)), 123 "priority": strconv.FormatFloat(o.Priority, 'g', -1, 64), 124 "public": marshalBool(o.Public), 125 "fork": marshalBool(o.Fork), 126 "archived": marshalBool(o.Archived), 127 // Calculate repo rank based on the latest commit date. 128 "latestCommitDate": "1", 129 "tenantID": strconv.Itoa(o.TenantID), 130 }, 131 }, 132 IndexDir: o.IndexDir, 133 Parallelism: o.Parallelism, 134 SizeMax: o.FileLimit, 135 LargeFiles: o.LargeFiles, 136 CTagsMustSucceed: o.Symbols, 137 DisableCTags: !o.Symbols, 138 IsDelta: o.UseDelta, 139 140 LanguageMap: o.LanguageMap, 141 142 ShardMerging: o.ShardMerging, 143 144 ShardPrefix: shardPrefix, 145 } 146} 147 148func marshalBool(b bool) string { 149 if b { 150 return "1" 151 } 152 return "0" 153} 154 155func (o *indexArgs) String() string { 156 s := fmt.Sprintf("%d %s", o.RepoID, o.Name) 157 for i, b := range o.Branches { 158 if i == 0 { 159 s = fmt.Sprintf("%s@%s=%s", s, b.Name, b.Version) 160 } else { 161 s = fmt.Sprintf("%s,%s=%s", s, b.Name, b.Version) 162 } 163 } 164 return s 165} 166 167type gitIndexConfig struct { 168 // runCmd is the function that's used to execute all external commands (such as calls to "git" or "zoekt-git-index") 169 // that gitIndex may construct. 170 runCmd func(*exec.Cmd) error 171 172 // findRepositoryMetadata is the function that returns the repository metadata for the 173 // repository specified in args. 'ok' is false if the repository's metadata 174 // couldn't be found or if an error occurred. 175 // 176 // The primary purpose of this configuration option is to be able to provide a stub 177 // implementation for this in our test suite. All other callers should use build.Options.FindRepositoryMetadata(). 178 findRepositoryMetadata func(args *indexArgs) (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) 179 180 // timeout defines how long the index server waits before killing an indexing job. 181 timeout time.Duration 182} 183 184func gitIndex(ctx context.Context, c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.Logger) error { 185 logger := l.Scoped("gitIndex") 186 187 if len(o.Branches) == 0 { 188 return errors.New("zoekt-git-index requires 1 or more branches") 189 } 190 191 if c.runCmd == nil { 192 return errors.New("runCmd in provided configuration was nil - a function must be provided") 193 } 194 195 if c.findRepositoryMetadata == nil { 196 return errors.New("findRepositoryMetadata in provided configuration was nil - a function must be provided") 197 } 198 199 ctx, cancel := context.WithTimeout(ctx, c.timeout) 200 defer cancel() 201 202 gitDir, err := tmpGitDir(o.Name) 203 if err != nil { 204 return err 205 } 206 defer os.RemoveAll(gitDir) // best-effort cleanup 207 208 err = fetchRepo(ctx, gitDir, o, c, logger) 209 if err != nil { 210 return err 211 } 212 213 err = setZoektConfig(ctx, gitDir, o, c) 214 if err != nil { 215 return err 216 } 217 218 err = indexRepo(ctx, gitDir, sourcegraph, o, c, logger) 219 if err != nil { 220 return err 221 } 222 223 return nil 224} 225 226func fetchRepo(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error { 227 // Create a repo to fetch into 228 cmd := exec.CommandContext(ctx, "git", 229 // use a random default branch. This is so that HEAD isn't a symref to a 230 // branch that is indexed. For example if you are indexing 231 // HEAD,master. Then HEAD would be pointing to master by default. 232 "-c", "init.defaultBranch=nonExistentBranchBB0FOFCH32", 233 "init", 234 // we don't need a working copy 235 "--bare", 236 gitDir) 237 cmd.Stdin = &bytes.Buffer{} 238 if err := c.runCmd(cmd); err != nil { 239 return err 240 } 241 242 var fetchDuration time.Duration 243 successfullyFetchedCommitsCount := 0 244 allFetchesSucceeded := true 245 246 defer func() { 247 success := strconv.FormatBool(allFetchesSucceeded) 248 name := repoNameForMetric(o.Name) 249 metricFetchDuration.WithLabelValues(success, name).Observe(fetchDuration.Seconds()) 250 }() 251 252 runFetch := func(branches []zoekt.RepositoryBranch) error { 253 // We shallow fetch each commit specified in zoekt.Branches. This requires 254 // the server to have configured both uploadpack.allowAnySHA1InWant and 255 // uploadpack.allowFilter. (See gitservice.go in the Sourcegraph repository) 256 fetchArgs := []string{ 257 "-C", gitDir, 258 "-c", "protocol.version=2", 259 "-c", "http.extraHeader=X-Sourcegraph-Actor-UID: internal", 260 "-c", "http.extraHeader=X-Sourcegraph-Tenant-ID: " + strconv.Itoa(o.TenantID), 261 "fetch", "--depth=1", "--no-tags", 262 } 263 264 // If there are no exceptions to MaxFileSize (1MB), we can avoid fetching these large files. 265 if len(o.LargeFiles) == 0 { 266 fetchArgs = append(fetchArgs, "--filter=blob:limit=1m") 267 } 268 269 fetchArgs = append(fetchArgs, o.CloneURL) 270 271 var commits []string 272 for _, b := range branches { 273 commits = append(commits, b.Version) 274 } 275 276 fetchArgs = append(fetchArgs, commits...) 277 278 cmd = exec.CommandContext(ctx, "git", fetchArgs...) 279 cmd.Stdin = &bytes.Buffer{} 280 281 start := time.Now() 282 err := c.runCmd(cmd) 283 fetchDuration += time.Since(start) 284 285 if err != nil { 286 allFetchesSucceeded = false 287 var bs []string 288 for _, b := range branches { 289 bs = append(bs, b.String()) 290 } 291 292 formattedBranches := strings.Join(bs, ", ") 293 return fmt.Errorf("fetching %s: %w", formattedBranches, err) 294 } 295 296 successfullyFetchedCommitsCount += len(commits) 297 return nil 298 } 299 300 fetchPriorAndLatestCommits := func() error { 301 prior, err := priorBranches(c, o) 302 if err != nil { 303 return err 304 } 305 306 var allBranches []zoekt.RepositoryBranch 307 allBranches = append(allBranches, o.Branches...) 308 allBranches = append(allBranches, prior...) 309 310 return runFetch(allBranches) 311 } 312 313 fetchOnlyLatestCommits := func() error { 314 return runFetch(o.Branches) 315 } 316 317 if o.UseDelta { 318 err := fetchPriorAndLatestCommits() 319 if err != nil { 320 name := o.BuildOptions().RepositoryDescription.Name 321 id := o.BuildOptions().RepositoryDescription.ID 322 323 errorLog.Printf("delta build: failed to prepare delta build for %q (ID %d): failed to fetch both latest and prior commits: %s", name, id, err) 324 err = fetchOnlyLatestCommits() 325 if err != nil { 326 return err 327 } 328 } 329 } else { 330 err := fetchOnlyLatestCommits() 331 if err != nil { 332 return err 333 } 334 } 335 336 // We then create the relevant refs for each fetched commit. 337 for _, b := range o.Branches { 338 ref := b.Name 339 if ref != "HEAD" { 340 ref = "refs/heads/" + ref 341 } 342 cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "update-ref", ref, b.Version) 343 cmd.Stdin = &bytes.Buffer{} 344 if err := c.runCmd(cmd); err != nil { 345 return fmt.Errorf("failed update-ref %s to %s: %w", ref, b.Version, err) 346 } 347 } 348 349 logger.Debug("successfully fetched git data", 350 sglog.String("repo", o.Name), 351 sglog.Uint32("id", o.RepoID), 352 sglog.Int("commits_count", successfullyFetchedCommitsCount), 353 sglog.Duration("duration", fetchDuration), 354 ) 355 return nil 356} 357 358func setZoektConfig(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig) error { 359 // create git configuration with options 360 type configKV struct{ Key, Value string } 361 config := []configKV{{ 362 // zoekt.name is used by zoekt-git-index to set the repository name. 363 Key: "name", 364 Value: o.Name, 365 }} 366 for k, v := range o.BuildOptions().RepositoryDescription.RawConfig { 367 config = append(config, configKV{Key: k, Value: v}) 368 } 369 sort.Slice(config, func(i, j int) bool { 370 return config[i].Key < config[j].Key 371 }) 372 373 // write git configuration to repo 374 for _, kv := range config { 375 cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "config", "zoekt."+kv.Key, kv.Value) 376 cmd.Stdin = &bytes.Buffer{} 377 if err := c.runCmd(cmd); err != nil { 378 return err 379 } 380 } 381 return nil 382} 383 384func indexRepo(ctx context.Context, gitDir string, sourcegraph Sourcegraph, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error { 385 args := []string{ 386 "-submodules=false", 387 } 388 389 // Even though we check for incremental in this process, we still pass it 390 // in just in case we regress in how we check in process. We will still 391 // notice thanks to metrics and increased load on gitserver. 392 if o.Incremental { 393 args = append(args, "-incremental") 394 } 395 396 var branches []string 397 for _, b := range o.Branches { 398 branches = append(branches, b.Name) 399 } 400 args = append(args, "-branches", strings.Join(branches, ",")) 401 402 if o.UseDelta { 403 args = append(args, "-delta") 404 args = append(args, "-delta_threshold", strconv.FormatUint(o.DeltaShardNumberFallbackThreshold, 10)) 405 } 406 407 if len(o.LanguageMap) > 0 { 408 var languageMap []string 409 for language, parser := range o.LanguageMap { 410 languageMap = append(languageMap, language+":"+ctags.ParserToString(parser)) 411 } 412 args = append(args, "-language_map", strings.Join(languageMap, ",")) 413 } 414 415 args = append(args, o.BuildOptions().Args()...) 416 args = append(args, gitDir) 417 418 cmd := exec.CommandContext(ctx, "zoekt-git-index", args...) 419 cmd.Stdin = &bytes.Buffer{} 420 if err := c.runCmd(cmd); err != nil { 421 return err 422 } 423 return nil 424} 425 426func priorBranches(c gitIndexConfig, o *indexArgs) ([]zoekt.RepositoryBranch, error) { 427 existingRepository, _, found, err := c.findRepositoryMetadata(o) 428 if err != nil { 429 return nil, fmt.Errorf("loading repository metadata: %w", err) 430 } 431 432 if !found || len(existingRepository.Branches) == 0 { 433 return nil, fmt.Errorf("no prior shards found") 434 } 435 436 return existingRepository.Branches, nil 437} 438 439func tmpGitDir(name string) (string, error) { 440 abs := url.QueryEscape(name) 441 if len(abs) > 200 { 442 h := sha1.New() 443 _, _ = io.WriteString(h, abs) 444 abs = abs[:200] + fmt.Sprintf("%x", h.Sum(nil))[:8] 445 } 446 dir := filepath.Join(os.TempDir(), abs+".git") 447 if _, err := os.Stat(dir); err == nil { 448 if err := os.RemoveAll(dir); err != nil { 449 return "", err 450 } 451 } 452 return dir, nil 453} 454 455// FromProto converts a ZoektIndexOptions proto message into an IndexOptions struct. 456func (o *IndexOptions) FromProto(x *configv1.ZoektIndexOptions) { 457 branches := make([]zoekt.RepositoryBranch, 0, len(x.Branches)) 458 for _, b := range x.GetBranches() { 459 branches = append(branches, zoekt.RepositoryBranch{ 460 Name: b.GetName(), 461 Version: b.GetVersion(), 462 }) 463 } 464 465 languageMap := make(map[string]ctags.CTagsParserType) 466 for _, lang := range x.GetLanguageMap() { 467 languageMap[lang.GetLanguage()] = ctags.CTagsParserType(lang.GetCtags().Number()) 468 } 469 470 *o = IndexOptions{ 471 RepoID: uint32(x.GetRepoId()), 472 LargeFiles: x.GetLargeFiles(), 473 Symbols: x.GetSymbols(), 474 Branches: branches, 475 Name: x.GetName(), 476 477 Priority: x.GetPriority(), 478 479 Public: x.GetPublic(), 480 Fork: x.GetFork(), 481 Archived: x.GetArchived(), 482 483 LanguageMap: languageMap, 484 ShardConcurrency: x.GetShardConcurrency(), 485 486 TenantID: int(x.TenantId), 487 } 488}