fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1package main 2 3import ( 4 "bytes" 5 "context" 6 "crypto/sha1" 7 "errors" 8 "fmt" 9 "io" 10 "net/url" 11 "os" 12 "os/exec" 13 "path/filepath" 14 "sort" 15 "strconv" 16 "strings" 17 "time" 18 19 sglog "github.com/sourcegraph/log" 20 21 "github.com/sourcegraph/zoekt" 22 configv1 "github.com/sourcegraph/zoekt/cmd/zoekt-sourcegraph-indexserver/grpc/protos/sourcegraph/zoekt/configuration/v1" 23 "github.com/sourcegraph/zoekt/index" 24 "github.com/sourcegraph/zoekt/internal/ctags" 25) 26 27const defaultIndexingTimeout = 1*time.Hour + 30*time.Minute 28 29// IndexOptions are the options that Sourcegraph can set via it's search 30// configuration endpoint. 31type IndexOptions struct { 32 // LargeFiles is a slice of glob patterns where matching file paths should 33 // be indexed regardless of their size. The pattern syntax can be found 34 // here: https://golang.org/pkg/path/filepath/#Match. 35 LargeFiles []string 36 37 // Symbols if true will make zoekt index the output of ctags. 38 Symbols bool 39 40 // Branches is a slice of branches to index. 41 Branches []zoekt.RepositoryBranch 42 43 // RepoID is the Sourcegraph Repository ID. 44 RepoID uint32 45 46 // Name is the Repository Name. 47 Name string 48 49 // CloneURL is the internal clone URL for Name. 50 CloneURL string 51 52 // Priority indicates ranking in results, higher first. 53 Priority float64 54 55 // Public is true if the repository is public. 56 Public bool 57 58 // Fork is true if the repository is a fork. 59 Fork bool 60 61 // Archived is true if the repository is archived. 62 Archived bool 63 64 // Map from language to scip-ctags, universal-ctags, or neither 65 LanguageMap ctags.LanguageMap 66 67 // The number of threads to use for indexing shards. Defaults to the number of available 68 // CPUs. If the server flag -cpu_fraction is set, then this value overrides it. 69 ShardConcurrency int32 70 71 // TenantID is the tenant ID for the repository. 72 TenantID int 73} 74 75// indexArgs represents the arguments we pass to zoekt-git-index 76type indexArgs struct { 77 IndexOptions 78 79 // Incremental indicates to skip indexing if already indexed. 80 Incremental bool 81 82 // IndexDir is the index directory to store the shards. 83 IndexDir string 84 85 // Parallelism is the number of shards to compute in parallel. 86 Parallelism int 87 88 // UseDelta is true if we want to use the new delta indexer. This should 89 // only be true for repositories we explicitly enable. 90 UseDelta bool 91 92 // DeltaShardNumberFallbackThreshold is an upper limit on the number of preexisting shards that can exist 93 // before attempting a delta build. 94 DeltaShardNumberFallbackThreshold uint64 95 96 // ShardMerging is true if we want zoekt-git-index to respect compound shards. 97 ShardMerging bool 98} 99 100// BuildOptions returns a index.Options represented by indexArgs. Note: it 101// doesn't set fields like repository/branch. 102func (o *indexArgs) BuildOptions() *index.Options { 103 return &index.Options{ 104 // It is important that this RepositoryDescription exactly matches what 105 // the indexer we call will produce. This is to ensure that 106 // IncrementalSkipIndexing and IndexState can correctly calculate if 107 // nothing needs to be done. 108 RepositoryDescription: zoekt.Repository{ 109 TenantID: o.TenantID, 110 ID: o.RepoID, 111 Name: o.Name, 112 Branches: o.Branches, 113 RawConfig: map[string]string{ 114 "repoid": strconv.Itoa(int(o.RepoID)), 115 "priority": strconv.FormatFloat(o.Priority, 'g', -1, 64), 116 "public": marshalBool(o.Public), 117 "fork": marshalBool(o.Fork), 118 "archived": marshalBool(o.Archived), 119 // Calculate repo rank based on the latest commit date. 120 "latestCommitDate": "1", 121 "tenantID": strconv.Itoa(o.TenantID), 122 }, 123 }, 124 IndexDir: o.IndexDir, 125 Parallelism: o.Parallelism, 126 SizeMax: MaxFileSize, 127 LargeFiles: o.LargeFiles, 128 CTagsMustSucceed: o.Symbols, 129 DisableCTags: !o.Symbols, 130 IsDelta: o.UseDelta, 131 132 LanguageMap: o.LanguageMap, 133 134 ShardMerging: o.ShardMerging, 135 } 136} 137 138func marshalBool(b bool) string { 139 if b { 140 return "1" 141 } 142 return "0" 143} 144 145func (o *indexArgs) String() string { 146 s := fmt.Sprintf("%d %s", o.RepoID, o.Name) 147 for i, b := range o.Branches { 148 if i == 0 { 149 s = fmt.Sprintf("%s@%s=%s", s, b.Name, b.Version) 150 } else { 151 s = fmt.Sprintf("%s,%s=%s", s, b.Name, b.Version) 152 } 153 } 154 return s 155} 156 157type gitIndexConfig struct { 158 // runCmd is the function that's used to execute all external commands (such as calls to "git" or "zoekt-git-index") 159 // that gitIndex may construct. 160 runCmd func(*exec.Cmd) error 161 162 // findRepositoryMetadata is the function that returns the repository metadata for the 163 // repository specified in args. 'ok' is false if the repository's metadata 164 // couldn't be found or if an error occurred. 165 // 166 // The primary purpose of this configuration option is to be able to provide a stub 167 // implementation for this in our test suite. All other callers should use build.Options.FindRepositoryMetadata(). 168 findRepositoryMetadata func(args *indexArgs) (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) 169 170 // timeout defines how long the index server waits before killing an indexing job. 171 timeout time.Duration 172} 173 174func gitIndex(ctx context.Context, c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.Logger) error { 175 logger := l.Scoped("gitIndex") 176 177 if len(o.Branches) == 0 { 178 return errors.New("zoekt-git-index requires 1 or more branches") 179 } 180 181 if c.runCmd == nil { 182 return errors.New("runCmd in provided configuration was nil - a function must be provided") 183 } 184 185 if c.findRepositoryMetadata == nil { 186 return errors.New("findRepositoryMetadata in provided configuration was nil - a function must be provided") 187 } 188 189 ctx, cancel := context.WithTimeout(ctx, c.timeout) 190 defer cancel() 191 192 gitDir, err := tmpGitDir(o.Name) 193 if err != nil { 194 return err 195 } 196 defer os.RemoveAll(gitDir) // best-effort cleanup 197 198 err = fetchRepo(ctx, gitDir, o, c, logger) 199 if err != nil { 200 return err 201 } 202 203 err = setZoektConfig(ctx, gitDir, o, c) 204 if err != nil { 205 return err 206 } 207 208 err = indexRepo(ctx, gitDir, sourcegraph, o, c, logger) 209 if err != nil { 210 return err 211 } 212 213 return nil 214} 215 216func fetchRepo(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error { 217 // Create a repo to fetch into 218 cmd := exec.CommandContext(ctx, "git", 219 // use a random default branch. This is so that HEAD isn't a symref to a 220 // branch that is indexed. For example if you are indexing 221 // HEAD,master. Then HEAD would be pointing to master by default. 222 "-c", "init.defaultBranch=nonExistentBranchBB0FOFCH32", 223 "init", 224 // we don't need a working copy 225 "--bare", 226 gitDir) 227 cmd.Stdin = &bytes.Buffer{} 228 if err := c.runCmd(cmd); err != nil { 229 return err 230 } 231 232 for _, header := range []string{ 233 "X-Sourcegraph-Actor-UID: internal", 234 "X-Sourcegraph-Tenant-ID: " + strconv.Itoa(o.TenantID), 235 } { 236 cmd = exec.CommandContext(ctx, "git", "-C", gitDir, "config", "--add", "http.extraHeader", header) 237 cmd.Stdin = &bytes.Buffer{} 238 if err := c.runCmd(cmd); err != nil { 239 return err 240 } 241 } 242 243 var fetchDuration time.Duration 244 successfullyFetchedCommitsCount := 0 245 allFetchesSucceeded := true 246 247 defer func() { 248 success := strconv.FormatBool(allFetchesSucceeded) 249 name := repoNameForMetric(o.Name) 250 metricFetchDuration.WithLabelValues(success, name).Observe(fetchDuration.Seconds()) 251 }() 252 253 runFetch := func(branches []zoekt.RepositoryBranch) error { 254 // We shallow fetch each commit specified in zoekt.Branches. This requires 255 // the server to have configured both uploadpack.allowAnySHA1InWant and 256 // uploadpack.allowFilter. (See gitservice.go in the Sourcegraph repository) 257 fetchArgs := []string{ 258 "-C", gitDir, 259 "-c", "protocol.version=2", 260 "fetch", "--depth=1", "--no-tags", 261 } 262 263 // Git's blob:limit filter excludes blobs whose size is >= the given limit, 264 // while zoekt indexes files up to and including FileLimit bytes. 265 if len(o.LargeFiles) == 0 { 266 fetchArgs = append(fetchArgs, fmt.Sprintf("--filter=blob:limit=%d", int64(MaxFileSize)+1)) 267 } 268 269 fetchArgs = append(fetchArgs, o.CloneURL) 270 271 var commits []string 272 for _, b := range branches { 273 commits = append(commits, b.Version) 274 } 275 276 fetchArgs = append(fetchArgs, commits...) 277 278 cmd = exec.CommandContext(ctx, "git", fetchArgs...) 279 cmd.Stdin = &bytes.Buffer{} 280 281 start := time.Now() 282 err := c.runCmd(cmd) 283 fetchDuration += time.Since(start) 284 285 if err != nil { 286 allFetchesSucceeded = false 287 var bs []string 288 for _, b := range branches { 289 bs = append(bs, b.String()) 290 } 291 292 formattedBranches := strings.Join(bs, ", ") 293 return fmt.Errorf("fetching %s: %w", formattedBranches, err) 294 } 295 296 successfullyFetchedCommitsCount += len(commits) 297 return nil 298 } 299 300 fetchPriorAndLatestCommits := func() error { 301 prior, err := priorBranches(c, o) 302 if err != nil { 303 return err 304 } 305 306 var allBranches []zoekt.RepositoryBranch 307 allBranches = append(allBranches, o.Branches...) 308 allBranches = append(allBranches, prior...) 309 310 return runFetch(allBranches) 311 } 312 313 fetchOnlyLatestCommits := func() error { 314 return runFetch(o.Branches) 315 } 316 317 if o.UseDelta { 318 err := fetchPriorAndLatestCommits() 319 if err != nil { 320 name := o.BuildOptions().RepositoryDescription.Name 321 id := o.BuildOptions().RepositoryDescription.ID 322 323 errorLog.Printf("delta build: failed to prepare delta build for %q (ID %d): failed to fetch both latest and prior commits: %s", name, id, err) 324 err = fetchOnlyLatestCommits() 325 if err != nil { 326 return err 327 } 328 } 329 } else { 330 err := fetchOnlyLatestCommits() 331 if err != nil { 332 return err 333 } 334 } 335 336 // We then create the relevant refs for each fetched commit. 337 for _, b := range o.Branches { 338 ref := b.Name 339 if ref != "HEAD" { 340 ref = "refs/heads/" + ref 341 } 342 cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "update-ref", ref, b.Version) 343 cmd.Stdin = &bytes.Buffer{} 344 if err := c.runCmd(cmd); err != nil { 345 return fmt.Errorf("failed update-ref %s to %s: %w", ref, b.Version, err) 346 } 347 } 348 349 logger.Debug("successfully fetched git data", 350 sglog.String("repo", o.Name), 351 sglog.Uint32("id", o.RepoID), 352 sglog.Int("commits_count", successfullyFetchedCommitsCount), 353 sglog.Duration("duration", fetchDuration), 354 ) 355 return nil 356} 357 358func setZoektConfig(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig) error { 359 // create git configuration with options 360 type configKV struct{ Key, Value string } 361 config := []configKV{{ 362 // zoekt.name is used by zoekt-git-index to set the repository name. 363 Key: "name", 364 Value: o.Name, 365 }} 366 for k, v := range o.BuildOptions().RepositoryDescription.RawConfig { 367 config = append(config, configKV{Key: k, Value: v}) 368 } 369 sort.Slice(config, func(i, j int) bool { 370 return config[i].Key < config[j].Key 371 }) 372 373 // write git configuration to repo 374 for _, kv := range config { 375 cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "config", "zoekt."+kv.Key, kv.Value) 376 cmd.Stdin = &bytes.Buffer{} 377 if err := c.runCmd(cmd); err != nil { 378 return err 379 } 380 } 381 return nil 382} 383 384func indexRepo(ctx context.Context, gitDir string, sourcegraph Sourcegraph, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error { 385 args := []string{ 386 "-submodules=false", 387 } 388 389 // Even though we check for incremental in this process, we still pass it 390 // in just in case we regress in how we check in process. We will still 391 // notice thanks to metrics and increased load on gitserver. 392 args = append(args, fmt.Sprintf("-incremental=%v", o.Incremental)) 393 394 var branches []string 395 for _, b := range o.Branches { 396 branches = append(branches, b.Name) 397 } 398 args = append(args, "-branches", strings.Join(branches, ",")) 399 400 if o.UseDelta { 401 args = append(args, "-delta") 402 args = append(args, "-delta_threshold", strconv.FormatUint(o.DeltaShardNumberFallbackThreshold, 10)) 403 } 404 405 if len(o.LanguageMap) > 0 { 406 var languageMap []string 407 for language, parser := range o.LanguageMap { 408 languageMap = append(languageMap, language+":"+ctags.ParserToString(parser)) 409 } 410 args = append(args, "-language_map", strings.Join(languageMap, ",")) 411 } 412 413 args = append(args, o.BuildOptions().Args()...) 414 args = append(args, gitDir) 415 416 cmd := exec.CommandContext(ctx, "zoekt-git-index", args...) 417 cmd.Stdin = &bytes.Buffer{} 418 if err := c.runCmd(cmd); err != nil { 419 return err 420 } 421 return nil 422} 423 424func priorBranches(c gitIndexConfig, o *indexArgs) ([]zoekt.RepositoryBranch, error) { 425 existingRepository, _, found, err := c.findRepositoryMetadata(o) 426 if err != nil { 427 return nil, fmt.Errorf("loading repository metadata: %w", err) 428 } 429 430 if !found || len(existingRepository.Branches) == 0 { 431 return nil, fmt.Errorf("no prior shards found") 432 } 433 434 return existingRepository.Branches, nil 435} 436 437func tmpGitDir(name string) (string, error) { 438 abs := url.QueryEscape(name) 439 if len(abs) > 200 { 440 h := sha1.New() 441 _, _ = io.WriteString(h, abs) 442 abs = abs[:200] + fmt.Sprintf("%x", h.Sum(nil))[:8] 443 } 444 dir := filepath.Join(os.TempDir(), abs+".git") 445 if _, err := os.Stat(dir); err == nil { 446 if err := os.RemoveAll(dir); err != nil { 447 return "", err 448 } 449 } 450 return dir, nil 451} 452 453// FromProto converts a ZoektIndexOptions proto message into an IndexOptions struct. 454func (o *IndexOptions) FromProto(x *configv1.ZoektIndexOptions) { 455 branches := make([]zoekt.RepositoryBranch, 0, len(x.Branches)) 456 for _, b := range x.GetBranches() { 457 branches = append(branches, zoekt.RepositoryBranch{ 458 Name: b.GetName(), 459 Version: b.GetVersion(), 460 }) 461 } 462 463 languageMap := make(map[string]ctags.CTagsParserType) 464 for _, lang := range x.GetLanguageMap() { 465 languageMap[lang.GetLanguage()] = ctags.CTagsParserType(lang.GetCtags().Number()) 466 } 467 468 *o = IndexOptions{ 469 RepoID: uint32(x.GetRepoId()), 470 LargeFiles: x.GetLargeFiles(), 471 Symbols: x.GetSymbols(), 472 Branches: branches, 473 Name: x.GetName(), 474 475 Priority: x.GetPriority(), 476 477 Public: x.GetPublic(), 478 Fork: x.GetFork(), 479 Archived: x.GetArchived(), 480 481 LanguageMap: languageMap, 482 ShardConcurrency: x.GetShardConcurrency(), 483 484 TenantID: int(x.TenantId), 485 } 486}