fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Avoid fetching files over size limit (#814)

We never index files over 1MB unless they match the "LargeFiles" allowlist. So
in most cases (when no allowlist is configured), we can avoid fetching them at
all.

This PR updates the `git fetch` to filter out files over 1MB when possible
(`--filter=blob:limit=1m`), and to exclude tags (`--no-tags`). It also refactors
the very long `gitIndex` method into `fetchRepo`, `setZoektConfig`, and
`indexRepo`.

+78 -38
+52 -23
cmd/zoekt-sourcegraph-indexserver/index.go
··· 185 185 if c.runCmd == nil { 186 186 return errors.New("runCmd in provided configuration was nil - a function must be provided") 187 187 } 188 - runCmd := c.runCmd 189 188 190 189 if c.findRepositoryMetadata == nil { 191 190 return errors.New("findRepositoryMetadata in provided configuration was nil - a function must be provided") 192 191 } 193 192 194 - buildOptions := o.BuildOptions() 195 193 ctx, cancel := context.WithTimeout(context.Background(), c.timeout) 196 194 defer cancel() 197 195 ··· 201 199 } 202 200 defer os.RemoveAll(gitDir) // best-effort cleanup 203 201 202 + err = fetchRepo(ctx, gitDir, o, c, logger) 203 + if err != nil { 204 + return err 205 + } 206 + 207 + err = setZoektConfig(ctx, gitDir, o, c) 208 + if err != nil { 209 + return err 210 + } 211 + 212 + err = indexRepo(ctx, gitDir, sourcegraph, o, c, logger) 213 + if err != nil { 214 + return err 215 + } 216 + 217 + return nil 218 + } 219 + 220 + func fetchRepo(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error { 204 221 // Create a repo to fetch into 205 222 cmd := exec.CommandContext(ctx, "git", 206 223 // use a random default branch. This is so that HEAD isn't a symref to a ··· 212 229 "--bare", 213 230 gitDir) 214 231 cmd.Stdin = &bytes.Buffer{} 215 - if err := runCmd(cmd); err != nil { 232 + if err := c.runCmd(cmd); err != nil { 216 233 return err 217 234 } 218 235 ··· 234 251 "-C", gitDir, 235 252 "-c", "protocol.version=2", 236 253 "-c", "http.extraHeader=X-Sourcegraph-Actor-UID: internal", 237 - "fetch", "--depth=1", o.CloneURL, 254 + "fetch", "--depth=1", "--no-tags", 255 + } 256 + 257 + // If there are no exceptions to MaxFileSize (1MB), we can avoid fetching these large files. 
258 + if len(o.LargeFiles) == 0 { 259 + fetchArgs = append(fetchArgs, "--filter=blob:limit=1m") 238 260 } 261 + 262 + fetchArgs = append(fetchArgs, o.CloneURL) 239 263 240 264 var commits []string 241 265 for _, b := range branches { ··· 248 272 cmd.Stdin = &bytes.Buffer{} 249 273 250 274 start := time.Now() 251 - err := runCmd(cmd) 275 + err := c.runCmd(cmd) 252 276 fetchDuration += time.Since(start) 253 277 254 278 if err != nil { ··· 286 310 if o.UseDelta { 287 311 err := fetchPriorAndLatestCommits() 288 312 if err != nil { 289 - name := buildOptions.RepositoryDescription.Name 290 - id := buildOptions.RepositoryDescription.ID 313 + name := o.BuildOptions().RepositoryDescription.Name 314 + id := o.BuildOptions().RepositoryDescription.ID 291 315 292 316 log.Printf("delta build: failed to prepare delta build for %q (ID %d): failed to fetch both latest and prior commits: %s", name, id, err) 293 317 err = fetchOnlyLatestCommits() ··· 302 326 } 303 327 } 304 328 305 - logger.Debug("successfully fetched git data", 306 - sglog.String("repo", o.Name), 307 - sglog.Uint32("id", o.RepoID), 308 - sglog.Int("commits_count", successfullyFetchedCommitsCount), 309 - sglog.Duration("duration", fetchDuration), 310 - ) 311 - 312 329 // We then create the relevant refs for each fetched commit. 
313 330 for _, b := range o.Branches { 314 331 ref := b.Name 315 332 if ref != "HEAD" { 316 333 ref = "refs/heads/" + ref 317 334 } 318 - cmd = exec.CommandContext(ctx, "git", "-C", gitDir, "update-ref", ref, b.Version) 335 + cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "update-ref", ref, b.Version) 319 336 cmd.Stdin = &bytes.Buffer{} 320 - if err := runCmd(cmd); err != nil { 337 + if err := c.runCmd(cmd); err != nil { 321 338 return fmt.Errorf("failed update-ref %s to %s: %w", ref, b.Version, err) 322 339 } 323 340 } 324 341 342 + logger.Debug("successfully fetched git data", 343 + sglog.String("repo", o.Name), 344 + sglog.Uint32("id", o.RepoID), 345 + sglog.Int("commits_count", successfullyFetchedCommitsCount), 346 + sglog.Duration("duration", fetchDuration), 347 + ) 348 + return nil 349 + } 350 + 351 + func setZoektConfig(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig) error { 325 352 // create git configuration with options 326 353 type configKV struct{ Key, Value string } 327 354 config := []configKV{{ ··· 329 356 Key: "name", 330 357 Value: o.Name, 331 358 }} 332 - for k, v := range buildOptions.RepositoryDescription.RawConfig { 359 + for k, v := range o.BuildOptions().RepositoryDescription.RawConfig { 333 360 config = append(config, configKV{Key: k, Value: v}) 334 361 } 335 362 sort.Slice(config, func(i, j int) bool { ··· 338 365 339 366 // write git configuration to repo 340 367 for _, kv := range config { 341 - cmd = exec.CommandContext(ctx, "git", "-C", gitDir, "config", "zoekt."+kv.Key, kv.Value) 368 + cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "config", "zoekt."+kv.Key, kv.Value) 342 369 cmd.Stdin = &bytes.Buffer{} 343 - if err := runCmd(cmd); err != nil { 370 + if err := c.runCmd(cmd); err != nil { 344 371 return err 345 372 } 346 373 } 374 + return nil 375 + } 347 376 377 + func indexRepo(ctx context.Context, gitDir string, sourcegraph Sourcegraph, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error { 348 
378 args := []string{ 349 379 "-submodules=false", 350 380 } ··· 413 443 args = append(args, "-language_map", strings.Join(languageMap, ",")) 414 444 } 415 445 416 - args = append(args, buildOptions.Args()...) 446 + args = append(args, o.BuildOptions().Args()...) 417 447 args = append(args, gitDir) 418 448 419 - cmd = exec.CommandContext(ctx, "zoekt-git-index", args...) 449 + cmd := exec.CommandContext(ctx, "zoekt-git-index", args...) 420 450 cmd.Stdin = &bytes.Buffer{} 421 - if err := runCmd(cmd); err != nil { 451 + if err := c.runCmd(cmd); err != nil { 422 452 return err 423 453 } 424 - 425 454 return nil 426 455 } 427 456
+4 -4
cmd/zoekt-sourcegraph-indexserver/index_test.go
··· 489 489 }, 490 490 want: []string{ 491 491 "git -c init.defaultBranch=nonExistentBranchBB0FOFCH32 init --bare $TMPDIR/test%2Frepo.git", 492 - "git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 http://api.test/.internal/git/test/repo deadbeef", 492 + "git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 --no-tags --filter=blob:limit=1m http://api.test/.internal/git/test/repo deadbeef", 493 493 "git -C $TMPDIR/test%2Frepo.git update-ref HEAD deadbeef", 494 494 "git -C $TMPDIR/test%2Frepo.git config zoekt.archived 0", 495 495 "git -C $TMPDIR/test%2Frepo.git config zoekt.fork 0", ··· 511 511 }, 512 512 want: []string{ 513 513 "git -c init.defaultBranch=nonExistentBranchBB0FOFCH32 init --bare $TMPDIR/test%2Frepo.git", 514 - "git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 http://api.test/.internal/git/test/repo deadbeef", 514 + "git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 --no-tags --filter=blob:limit=1m http://api.test/.internal/git/test/repo deadbeef", 515 515 "git -C $TMPDIR/test%2Frepo.git update-ref HEAD deadbeef", 516 516 "git -C $TMPDIR/test%2Frepo.git config zoekt.archived 0", 517 517 "git -C $TMPDIR/test%2Frepo.git config zoekt.fork 0", ··· 541 541 }, 542 542 want: []string{ 543 543 "git -c init.defaultBranch=nonExistentBranchBB0FOFCH32 init --bare $TMPDIR/test%2Frepo.git", 544 - "git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 http://api.test/.internal/git/test/repo deadbeef feebdaed", 544 + "git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 --no-tags http://api.test/.internal/git/test/repo deadbeef feebdaed", 545 545 
"git -C $TMPDIR/test%2Frepo.git update-ref HEAD deadbeef", 546 546 "git -C $TMPDIR/test%2Frepo.git update-ref refs/heads/dev feebdaed", 547 547 "git -C $TMPDIR/test%2Frepo.git config zoekt.archived 0", ··· 587 587 }, 588 588 want: []string{ 589 589 "git -c init.defaultBranch=nonExistentBranchBB0FOFCH32 init --bare $TMPDIR/test%2Frepo.git", 590 - "git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 http://api.test/.internal/git/test/repo deadbeef feebdaed 12345678 oldhead olddev oldrelease", 590 + "git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 --no-tags http://api.test/.internal/git/test/repo deadbeef feebdaed 12345678 oldhead olddev oldrelease", 591 591 "git -C $TMPDIR/test%2Frepo.git update-ref HEAD deadbeef", 592 592 "git -C $TMPDIR/test%2Frepo.git update-ref refs/heads/dev feebdaed", 593 593 "git -C $TMPDIR/test%2Frepo.git update-ref refs/heads/release 12345678",
+6 -5
cmd/zoekt-sourcegraph-indexserver/main.go
··· 136 136 clientMetrics *grpcprom.ClientMetrics 137 137 ) 138 138 139 + // 1 MB; match https://sourcegraph.sgdev.org/github.com/sourcegraph/sourcegraph/-/blob/cmd/symbols/internal/symbols/search.go#L22 140 + // NOTE: if you change this, you must also update gitIndex to use the same value when fetching the repo. 141 + const MaxFileSize = 1 << 20 142 + 139 143 // set of repositories that we want to capture separate indexing metrics for 140 144 var reposWithSeparateIndexingMetrics = make(map[string]struct{}) 141 145 ··· 656 660 IndexDir: s.IndexDir, 657 661 Parallelism: parallelism, 658 662 Incremental: true, 659 - 660 - // 1 MB; match https://sourcegraph.sgdev.org/github.com/sourcegraph/sourcegraph/-/blob/cmd/symbols/internal/symbols/search.go#L22 661 - FileLimit: 1 << 20, 662 - 663 + FileLimit: MaxFileSize, 663 664 ShardMerging: s.shardMerging, 664 665 } 665 666 } ··· 754 755 <a href="?show_repos=false">hide repos</a><br /> 755 756 <table style="margin-top: 20px"> 756 757 <th style="text-align:left">Name</th> 757 - <th style="text-align:left">ID</th> 758 + <th style="text-align:left">ID (click to reindex)</th> 758 759 {{range .Repos}} 759 760 <tr> 760 761 <td>{{.Name}}</td>
+16 -6
gitindex/index.go
··· 887 887 opts build.Options, 888 888 ) (zoekt.Document, error) { 889 889 blob, err := repos[key].Repo.BlobObject(key.ID) 890 + 891 + // We filter out large documents when fetching the repo. So if an object is too large, it will not be found. 892 + if errors.Is(err, plumbing.ErrObjectNotFound) { 893 + return skippedLargeDoc(key, branchMap, opts), nil 894 + } 895 + 890 896 if err != nil { 891 897 return zoekt.Document{}, err 892 898 } 893 899 894 900 keyFullPath := key.FullPath() 895 901 if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) { 896 - return zoekt.Document{ 897 - SkipReason: fmt.Sprintf("file size %d exceeds maximum size %d", blob.Size, opts.SizeMax), 898 - Name: key.FullPath(), 899 - Branches: branchMap[key], 900 - SubRepositoryPath: key.SubRepoPath, 901 - }, nil 902 + return skippedLargeDoc(key, branchMap, opts), nil 902 903 } 903 904 904 905 contents, err := blobContents(blob) ··· 920 921 Branches: branchMap[key], 921 922 Ranks: pathRanks, 922 923 }, nil 924 + } 925 + 926 + func skippedLargeDoc(key fileKey, branchMap map[fileKey][]string, opts build.Options) zoekt.Document { 927 + return zoekt.Document{ 928 + SkipReason: fmt.Sprintf("file size exceeds maximum size %d", opts.SizeMax), 929 + Name: key.FullPath(), 930 + Branches: branchMap[key], 931 + SubRepositoryPath: key.SubRepoPath, 932 + } 923 933 } 924 934 925 935 func blobContents(blob *object.Blob) ([]byte, error) {