fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

gitindex: set filter for cat-file (#1026)

At Sourcegraph we do a sparse (partial) clone which excludes files above a
maximum file size. However, git cat-file will hydrate (fetch on demand) any
missing objects it is asked for. So we pass the same filter to cat-file to
avoid hydrating those files.

+235 -114
+32 -17
gitindex/catfile.go
··· 28 28 "github.com/go-git/go-git/v5/plumbing" 29 29 ) 30 30 31 + type catfileReaderOptions struct { 32 + filterSpec string 33 + } 34 + 31 35 // catfileReader provides streaming access to git blob objects via a pipelined 32 36 // "git cat-file --batch --buffer" process. A writer goroutine feeds all blob 33 37 // SHAs to stdin while the caller reads responses one at a time, similar to ··· 39 43 // 40 44 // Usage: 41 45 // 42 - // cr, err := newCatfileReader(repoDir, ids) 46 + // cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 43 47 // if err != nil { ... } 44 48 // defer cr.Close() 45 49 // 46 50 // for { 47 - // size, missing, err := cr.Next() 51 + // size, missing, excluded, err := cr.Next() 48 52 // if err == io.EOF { break } 49 53 // if missing { continue } 54 + // if excluded { continue } 50 55 // if size > maxSize { continue } // unread bytes auto-skipped 51 56 // content := make([]byte, size) 52 57 // io.ReadFull(cr, content) ··· 66 71 67 72 // newCatfileReader starts a "git cat-file --batch --buffer" process and feeds 68 73 // all ids to its stdin via a background goroutine. The caller must call Close 69 - // when done. 70 - func newCatfileReader(repoDir string, ids []plumbing.Hash) (*catfileReader, error) { 71 - cmd := exec.Command("git", "cat-file", "--batch", "--buffer") 74 + // when done. Pass a zero-value catfileReaderOptions when no options are needed. 75 + func newCatfileReader(repoDir string, ids []plumbing.Hash, opts catfileReaderOptions) (*catfileReader, error) { 76 + args := []string{"cat-file", "--batch", "--buffer"} 77 + if opts.filterSpec != "" { 78 + args = append(args, "--filter="+opts.filterSpec) 79 + } 80 + 81 + cmd := exec.Command("git", args...) 72 82 cmd.Dir = repoDir 73 83 74 84 stdin, err := cmd.StdinPipe() ··· 114 124 } 115 125 116 126 // Next advances to the next blob entry. It returns the blob's size and whether 117 - // it is missing. 
Any unread content from the previous entry is automatically 118 - // discarded. Returns io.EOF when all entries have been consumed. 127 + // it is missing or excluded by the configured filter. Any unread content from 128 + // the previous entry is automatically discarded. Returns io.EOF when all 129 + // entries have been consumed. 119 130 // 120 - // After Next returns successfully with missing=false, call Read to consume the 121 - // blob content, or call Next again to skip it. 122 - func (cr *catfileReader) Next() (size int, missing bool, err error) { 131 + // After Next returns successfully with missing=false and excluded=false, call 132 + // Read to consume the blob content, or call Next again to skip it. 133 + func (cr *catfileReader) Next() (size int, missing bool, excluded bool, err error) { 123 134 // Discard unread content from the previous entry. 124 135 if cr.pending > 0 { 125 136 if _, err := cr.reader.Discard(cr.pending); err != nil { 126 - return 0, false, fmt.Errorf("discard pending bytes: %w", err) 137 + return 0, false, false, fmt.Errorf("discard pending bytes: %w", err) 127 138 } 128 139 cr.pending = 0 129 140 } ··· 131 142 headerBytes, err := cr.reader.ReadBytes('\n') 132 143 if err != nil { 133 144 if err == io.EOF { 134 - return 0, false, io.EOF 145 + return 0, false, false, io.EOF 135 146 } 136 - return 0, false, fmt.Errorf("read header: %w", err) 147 + return 0, false, false, fmt.Errorf("read header: %w", err) 137 148 } 138 149 header := headerBytes[:len(headerBytes)-1] // trim \n 139 150 140 151 if bytes.HasSuffix(header, []byte(" missing")) { 141 - return 0, true, nil 152 + return 0, true, false, nil 153 + } 154 + 155 + if bytes.HasSuffix(header, []byte(" excluded")) { 156 + return 0, false, true, nil 142 157 } 143 158 144 159 // Parse size from "<oid> <type> <size>". 
145 160 lastSpace := bytes.LastIndexByte(header, ' ') 146 161 if lastSpace == -1 { 147 - return 0, false, fmt.Errorf("unexpected header: %q", header) 162 + return 0, false, false, fmt.Errorf("unexpected header: %q", header) 148 163 } 149 164 size, err = strconv.Atoi(string(header[lastSpace+1:])) 150 165 if err != nil { 151 - return 0, false, fmt.Errorf("parse size from %q: %w", header, err) 166 + return 0, false, false, fmt.Errorf("parse size from %q: %w", header, err) 152 167 } 153 168 154 169 // Track pending bytes: content + trailing LF. 155 170 cr.pending = size + 1 156 - return size, false, nil 171 + return size, false, false, nil 157 172 } 158 173 159 174 // Read reads from the current blob's content. Implements io.Reader. Returns
+3 -3
gitindex/catfile_bench_test.go
··· 130 130 var totalBytes int64 131 131 for b.Loop() { 132 132 totalBytes = 0 133 - cr, err := newCatfileReader(gitDir, subset) 133 + cr, err := newCatfileReader(gitDir, subset, catfileReaderOptions{}) 134 134 if err != nil { 135 135 b.Fatalf("newCatfileReader: %v", err) 136 136 } 137 137 for range subset { 138 - size, missing, err := cr.Next() 138 + size, missing, excluded, err := cr.Next() 139 139 if err != nil { 140 140 cr.Close() 141 141 b.Fatalf("Next: %v", err) 142 142 } 143 - if missing { 143 + if missing || excluded { 144 144 continue 145 145 } 146 146 content := make([]byte, size)
+76 -73
gitindex/catfile_hardening_test.go
··· 22 22 repoDir, blobs := createTestRepo(t) 23 23 ids := []plumbing.Hash{blobs["hello.txt"]} 24 24 25 - cr, err := newCatfileReader(repoDir, ids) 25 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 26 26 if err != nil { 27 27 t.Fatal(err) 28 28 } 29 29 30 30 // Consume the entry so the process can exit cleanly. 31 - if _, _, err := cr.Next(); err != nil { 31 + if _, _, _, err := cr.Next(); err != nil { 32 32 t.Fatal(err) 33 33 } 34 34 ··· 61 61 blobs["binary.bin"], 62 62 } 63 63 64 - cr, err := newCatfileReader(repoDir, ids) 64 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 65 65 if err != nil { 66 66 t.Fatal(err) 67 67 } 68 68 69 69 // Read one entry, leave two unconsumed. 70 - if _, _, err := cr.Next(); err != nil { 70 + if _, _, _, err := cr.Next(); err != nil { 71 71 t.Fatal(err) 72 72 } 73 73 ··· 111 111 blobs["empty.txt"], 112 112 } 113 113 114 - cr, err := newCatfileReader(repoDir, ids) 114 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 115 115 if err != nil { 116 116 t.Fatal(err) 117 117 } ··· 168 168 ids = append(ids, plumbing.NewHash(string(out[:len(out)-1]))) 169 169 } 170 170 171 - cr, err := newCatfileReader(repoDir, ids) 171 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 172 172 if err != nil { 173 173 t.Fatal(err) 174 174 } 175 175 176 176 // Read only 1 of 200 entries. 
177 - if _, _, err := cr.Next(); err != nil { 177 + if _, _, _, err := cr.Next(); err != nil { 178 178 t.Fatal(err) 179 179 } 180 180 ··· 207 207 repoDir, blobs := createTestRepo(t) 208 208 ids := []plumbing.Hash{blobs["hello.txt"]} 209 209 210 - cr, err := newCatfileReader(repoDir, ids) 210 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 211 211 if err != nil { 212 212 t.Fatal(err) 213 213 } ··· 227 227 repoDir, blobs := createTestRepo(t) 228 228 ids := []plumbing.Hash{blobs["hello.txt"]} 229 229 230 - cr, err := newCatfileReader(repoDir, ids) 230 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 231 231 if err != nil { 232 232 t.Fatal(err) 233 233 } 234 234 defer cr.Close() 235 235 236 - size, _, _ := cr.Next() 236 + size, _, _, _ := cr.Next() 237 237 content := make([]byte, size) 238 238 if _, err := io.ReadFull(cr, content); err != nil { 239 239 t.Fatal(err) ··· 256 256 repoDir, blobs := createTestRepo(t) 257 257 ids := []plumbing.Hash{blobs["hello.txt"]} 258 258 259 - cr, err := newCatfileReader(repoDir, ids) 259 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 260 260 if err != nil { 261 261 t.Fatal(err) 262 262 } 263 263 defer cr.Close() 264 264 265 - size, _, _ := cr.Next() 265 + size, _, _, _ := cr.Next() 266 266 267 267 var result []byte 268 268 buf := make([]byte, 1) ··· 297 297 blobs["binary.bin"], // variable, starts with 0x00 298 298 } 299 299 300 - cr, err := newCatfileReader(repoDir, ids) 300 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 301 301 if err != nil { 302 302 t.Fatal(err) 303 303 } 304 304 defer cr.Close() 305 305 306 306 // Read only 5 of 12 bytes from hello.txt. 307 - size, _, _ := cr.Next() 307 + size, _, _, _ := cr.Next() 308 308 if size != 12 { 309 309 t.Fatalf("hello.txt size = %d, want 12", size) 310 310 } ··· 317 317 } 318 318 319 319 // Advance — must discard remaining 7 content bytes + trailing LF. 
320 - size, _, err = cr.Next() 320 + size, _, _, err = cr.Next() 321 321 if err != nil { 322 322 t.Fatalf("Next binary.bin after partial read: %v", err) 323 323 } ··· 343 343 blobs["binary.bin"], // starts with 0x00 344 344 } 345 345 346 - cr, err := newCatfileReader(repoDir, ids) 346 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 347 347 if err != nil { 348 348 t.Fatal(err) 349 349 } 350 350 defer cr.Close() 351 351 352 - size, _, _ := cr.Next() 352 + size, _, _, _ := cr.Next() 353 353 // Read exactly size-1 bytes — leaves 1 content byte + trailing LF. 354 354 buf := make([]byte, size-1) 355 355 if _, err := io.ReadFull(cr, buf); err != nil { ··· 361 361 362 362 // Advance — pending should be 2 (1 content byte + 1 LF). The 363 363 // Discard call must handle this exact boundary correctly. 364 - size, missing, err := cr.Next() 364 + size, missing, excluded, err := cr.Next() 365 365 if err != nil { 366 366 t.Fatalf("Next after size-1 partial read: %v", err) 367 367 } 368 - if missing { 368 + if missing || excluded { 369 369 t.Fatal("binary.bin unexpectedly missing") 370 370 } 371 371 ··· 386 386 func TestCatfileReader_EmptyIds(t *testing.T) { 387 387 repoDir, _ := createTestRepo(t) 388 388 389 - cr, err := newCatfileReader(repoDir, nil) 389 + cr, err := newCatfileReader(repoDir, nil, catfileReaderOptions{}) 390 390 if err != nil { 391 391 t.Fatal(err) 392 392 } 393 393 defer cr.Close() 394 394 395 - _, _, err = cr.Next() 395 + _, _, _, err = cr.Next() 396 396 if err != io.EOF { 397 397 t.Fatalf("expected io.EOF for empty ids, got %v", err) 398 398 } ··· 408 408 emptyID := blobs["empty.txt"] 409 409 ids := []plumbing.Hash{emptyID, emptyID, emptyID, emptyID, emptyID} 410 410 411 - cr, err := newCatfileReader(repoDir, ids) 411 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 412 412 if err != nil { 413 413 t.Fatal(err) 414 414 } 415 415 defer cr.Close() 416 416 417 417 for i := range ids { 418 - size, missing, err := cr.Next() 
418 + size, missing, excluded, err := cr.Next() 419 419 if err != nil { 420 420 t.Fatalf("Next #%d: %v", i, err) 421 421 } 422 - if missing { 422 + if missing || excluded { 423 423 t.Fatalf("#%d unexpectedly missing", i) 424 424 } 425 425 if size != 0 { ··· 428 428 // Don't read — Next should discard the trailing LF for us. 429 429 } 430 430 431 - _, _, err = cr.Next() 431 + _, _, _, err = cr.Next() 432 432 if err != io.EOF { 433 433 t.Fatalf("expected EOF after %d empty blobs, got %v", len(ids), err) 434 434 } ··· 440 440 func TestCatfileReader_EmptyBlobRead(t *testing.T) { 441 441 repoDir, blobs := createTestRepo(t) 442 442 ids := []plumbing.Hash{ 443 - blobs["empty.txt"], // 0 bytes 444 - blobs["hello.txt"], // 12 bytes — sentinel 443 + blobs["empty.txt"], // 0 bytes 444 + blobs["hello.txt"], // 12 bytes — sentinel 445 445 } 446 446 447 - cr, err := newCatfileReader(repoDir, ids) 447 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 448 448 if err != nil { 449 449 t.Fatal(err) 450 450 } 451 451 defer cr.Close() 452 452 453 - size, _, _ := cr.Next() 453 + size, _, _, _ := cr.Next() 454 454 if size != 0 { 455 455 t.Fatalf("empty.txt size = %d", size) 456 456 } ··· 464 464 465 465 // The trailing LF must have been consumed. Verify by reading the 466 466 // next entry — if the LF leaked, the header parse would fail. 
467 - size, _, err = cr.Next() 467 + size, _, _, err = cr.Next() 468 468 if err != nil { 469 469 t.Fatalf("Next hello.txt after empty blob Read: %v", err) 470 470 } ··· 494 494 plumbing.NewHash("2222222222222222222222222222222222222222"), 495 495 } 496 496 497 - cr, err := newCatfileReader(repoDir, ids) 497 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 498 498 if err != nil { 499 499 t.Fatal(err) 500 500 } 501 501 defer cr.Close() 502 502 503 503 for i, id := range ids { 504 - _, missing, err := cr.Next() 504 + _, missing, excluded, err := cr.Next() 505 505 if err != nil { 506 506 t.Fatalf("Next #%d (%s): %v", i, id, err) 507 507 } 508 + if excluded { 509 + t.Errorf("expected #%d (%s) to be missing, not excluded", i, id) 510 + } 508 511 if !missing { 509 512 t.Errorf("expected #%d (%s) to be missing", i, id) 510 513 } 511 514 } 512 515 513 - _, _, err = cr.Next() 516 + _, _, _, err = cr.Next() 514 517 if err != io.EOF { 515 518 t.Fatalf("expected EOF after all missing, got %v", err) 516 519 } ··· 532 535 blobs["binary.bin"], 533 536 } 534 537 535 - cr, err := newCatfileReader(repoDir, ids) 538 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 536 539 if err != nil { 537 540 t.Fatal(err) 538 541 } 539 542 defer cr.Close() 540 543 541 544 // fake1 — missing 542 - _, missing, err := cr.Next() 543 - if err != nil || !missing { 544 - t.Fatalf("fake1: err=%v missing=%v", err, missing) 545 + _, missing, excluded, err := cr.Next() 546 + if err != nil || !missing || excluded { 547 + t.Fatalf("fake1: err=%v missing=%v excluded=%v", err, missing, excluded) 545 548 } 546 549 547 550 // hello.txt — present, read it 548 - size, missing, err := cr.Next() 549 - if err != nil || missing { 550 - t.Fatalf("hello.txt: err=%v missing=%v", err, missing) 551 + size, missing, excluded, err := cr.Next() 552 + if err != nil || missing || excluded { 553 + t.Fatalf("hello.txt: err=%v missing=%v excluded=%v", err, missing, excluded) 551 554 } 552 555 
content := make([]byte, size) 553 556 if _, err := io.ReadFull(cr, content); err != nil { ··· 558 561 } 559 562 560 563 // fake2 — missing 561 - _, missing, err = cr.Next() 562 - if err != nil || !missing { 563 - t.Fatalf("fake2: err=%v missing=%v", err, missing) 564 + _, missing, excluded, err = cr.Next() 565 + if err != nil || !missing || excluded { 566 + t.Fatalf("fake2: err=%v missing=%v excluded=%v", err, missing, excluded) 564 567 } 565 568 566 569 // empty.txt — present, skip it 567 - size, missing, err = cr.Next() 568 - if err != nil || missing { 569 - t.Fatalf("empty.txt: err=%v missing=%v", err, missing) 570 + size, missing, excluded, err = cr.Next() 571 + if err != nil || missing || excluded { 572 + t.Fatalf("empty.txt: err=%v missing=%v excluded=%v", err, missing, excluded) 570 573 } 571 574 if size != 0 { 572 575 t.Errorf("empty.txt size = %d", size) 573 576 } 574 577 575 578 // binary.bin — present, read it 576 - size, missing, err = cr.Next() 577 - if err != nil || missing { 578 - t.Fatalf("binary.bin: err=%v missing=%v", err, missing) 579 + size, missing, excluded, err = cr.Next() 580 + if err != nil || missing || excluded { 581 + t.Fatalf("binary.bin: err=%v missing=%v excluded=%v", err, missing, excluded) 579 582 } 580 583 binContent := make([]byte, size) 581 584 if _, err := io.ReadFull(cr, binContent); err != nil { ··· 585 588 t.Errorf("binary.bin[0] = 0x%02x, want 0x00", binContent[0]) 586 589 } 587 590 588 - _, _, err = cr.Next() 591 + _, _, _, err = cr.Next() 589 592 if err != io.EOF { 590 593 t.Fatalf("expected EOF, got %v", err) 591 594 } ··· 601 604 fake := plumbing.NewHash("deadbeefdeadbeefdeadbeefdeadbeefdeadbeef") 602 605 ids := []plumbing.Hash{ 603 606 fake, 604 - blobs["large.bin"], // 64KB — skip without reading 605 - blobs["hello.txt"], // sentinel — read to verify integrity 607 + blobs["large.bin"], // 64KB — skip without reading 608 + blobs["hello.txt"], // sentinel — read to verify integrity 606 609 } 607 610 608 - cr, err := 
newCatfileReader(repoDir, ids) 611 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 609 612 if err != nil { 610 613 t.Fatal(err) 611 614 } 612 615 defer cr.Close() 613 616 614 617 // missing 615 - _, missing, _ := cr.Next() 616 - if !missing { 618 + _, missing, excluded, _ := cr.Next() 619 + if !missing || excluded { 617 620 t.Fatal("expected missing") 618 621 } 619 622 620 623 // large.bin — skip 621 - size, missing, err := cr.Next() 622 - if err != nil || missing { 623 - t.Fatalf("large.bin: err=%v missing=%v", err, missing) 624 + size, missing, excluded, err := cr.Next() 625 + if err != nil || missing || excluded { 626 + t.Fatalf("large.bin: err=%v missing=%v excluded=%v", err, missing, excluded) 624 627 } 625 628 if size != 64*1024 { 626 629 t.Fatalf("large.bin size = %d", size) ··· 628 631 // deliberately don't read 629 632 630 633 // hello.txt — read after missing+skip 631 - size, missing, err = cr.Next() 632 - if err != nil || missing { 633 - t.Fatalf("hello.txt: err=%v missing=%v", err, missing) 634 + size, missing, excluded, err = cr.Next() 635 + if err != nil || missing || excluded { 636 + t.Fatalf("hello.txt: err=%v missing=%v excluded=%v", err, missing, excluded) 634 637 } 635 638 content := make([]byte, size) 636 639 if _, err := io.ReadFull(cr, content); err != nil { ··· 649 652 repoDir, blobs := createTestRepo(t) 650 653 ids := []plumbing.Hash{blobs["hello.txt"]} 651 654 652 - cr, err := newCatfileReader(repoDir, ids) 655 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 653 656 if err != nil { 654 657 t.Fatal(err) 655 658 } 656 659 defer cr.Close() 657 660 658 661 // Consume and skip the only entry. 659 - if _, _, err := cr.Next(); err != nil { 662 + if _, _, _, err := cr.Next(); err != nil { 660 663 t.Fatal(err) 661 664 } 662 665 663 666 // First EOF. 
664 - _, _, err = cr.Next() 667 + _, _, _, err = cr.Next() 665 668 if err != io.EOF { 666 669 t.Fatalf("first post-exhaust Next: %v, want io.EOF", err) 667 670 } 668 671 669 672 // Second and third EOF — must be stable. 670 673 for i := 0; i < 2; i++ { 671 - _, _, err = cr.Next() 674 + _, _, _, err = cr.Next() 672 675 if err != io.EOF { 673 676 t.Fatalf("Next #%d after EOF: %v, want io.EOF", i+2, err) 674 677 } ··· 684 687 repoDir, blobs := createTestRepo(t) 685 688 ids := []plumbing.Hash{blobs["large.bin"]} 686 689 687 - cr, err := newCatfileReader(repoDir, ids) 690 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 688 691 if err != nil { 689 692 t.Fatal(err) 690 693 } 691 694 defer cr.Close() 692 695 693 - size, _, err := cr.Next() 696 + size, _, _, err := cr.Next() 694 697 if err != nil { 695 698 t.Fatal(err) 696 699 } ··· 732 735 repoDir, blobs := createTestRepo(t) 733 736 ids := []plumbing.Hash{blobs["large.bin"]} 734 737 735 - cr, err := newCatfileReader(repoDir, ids) 738 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 736 739 if err != nil { 737 740 t.Fatal(err) 738 741 } 739 742 defer cr.Close() 740 743 741 - size, _, _ := cr.Next() 744 + size, _, _, _ := cr.Next() 742 745 if size != 64*1024 { 743 746 t.Fatalf("size = %d", size) 744 747 } ··· 780 783 sha := blobs["hello.txt"] 781 784 ids := []plumbing.Hash{sha, sha, sha} 782 785 783 - cr, err := newCatfileReader(repoDir, ids) 786 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 784 787 if err != nil { 785 788 t.Fatal(err) 786 789 } 787 790 defer cr.Close() 788 791 789 792 for i := 0; i < 3; i++ { 790 - size, missing, err := cr.Next() 793 + size, missing, excluded, err := cr.Next() 791 794 if err != nil { 792 795 t.Fatalf("Next #%d: %v", i, err) 793 796 } 794 - if missing { 797 + if missing || excluded { 795 798 t.Fatalf("#%d unexpectedly missing", i) 796 799 } 797 800 if size != 12 { ··· 806 809 } 807 810 } 808 811 809 - _, _, err = cr.Next() 812 + _, 
_, _, err = cr.Next() 810 813 if err != io.EOF { 811 814 t.Fatalf("expected EOF, got %v", err) 812 815 }
+72 -19
gitindex/catfile_test.go
··· 70 70 blobs["large.bin"], 71 71 } 72 72 73 - cr, err := newCatfileReader(repoDir, ids) 73 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 74 74 if err != nil { 75 75 t.Fatalf("newCatfileReader: %v", err) 76 76 } 77 77 defer cr.Close() 78 78 79 79 // hello.txt 80 - size, missing, err := cr.Next() 80 + size, missing, excluded, err := cr.Next() 81 81 if err != nil { 82 82 t.Fatalf("Next hello.txt: %v", err) 83 83 } 84 - if missing { 84 + if missing || excluded { 85 85 t.Fatal("hello.txt unexpectedly missing") 86 86 } 87 87 if size != 12 { ··· 96 96 } 97 97 98 98 // empty.txt 99 - size, missing, err = cr.Next() 99 + size, missing, excluded, err = cr.Next() 100 100 if err != nil { 101 101 t.Fatalf("Next empty.txt: %v", err) 102 102 } 103 + if missing || excluded { 104 + t.Fatal("empty.txt unexpectedly missing") 105 + } 103 106 if size != 0 { 104 107 t.Errorf("empty.txt size = %d, want 0", size) 105 108 } 106 109 107 110 // binary.bin — read content and verify binary data survives. 108 - size, missing, err = cr.Next() 111 + size, missing, excluded, err = cr.Next() 109 112 if err != nil { 110 113 t.Fatalf("Next binary.bin: %v", err) 114 + } 115 + if missing || excluded { 116 + t.Fatal("binary.bin unexpectedly missing") 111 117 } 112 118 binContent := make([]byte, size) 113 119 if _, err := io.ReadFull(cr, binContent); err != nil { ··· 118 124 } 119 125 120 126 // large.bin 121 - size, missing, err = cr.Next() 127 + size, missing, excluded, err = cr.Next() 122 128 if err != nil { 123 129 t.Fatalf("Next large.bin: %v", err) 124 130 } 131 + if missing || excluded { 132 + t.Fatal("large.bin unexpectedly missing") 133 + } 125 134 if size != 64*1024 { 126 135 t.Errorf("large.bin size = %d, want %d", size, 64*1024) 127 136 } ··· 131 140 } 132 141 133 142 // EOF after all entries. 
134 - _, _, err = cr.Next() 143 + _, _, _, err = cr.Next() 135 144 if err != io.EOF { 136 145 t.Errorf("expected io.EOF after last entry, got %v", err) 137 146 } ··· 146 155 blobs["binary.bin"], 147 156 } 148 157 149 - cr, err := newCatfileReader(repoDir, ids) 158 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 150 159 if err != nil { 151 160 t.Fatalf("newCatfileReader: %v", err) 152 161 } 153 162 defer cr.Close() 154 163 155 164 // Skip hello.txt by calling Next again without reading. 156 - _, _, err = cr.Next() 165 + _, _, _, err = cr.Next() 157 166 if err != nil { 158 167 t.Fatalf("Next hello.txt: %v", err) 159 168 } 160 169 161 170 // Skip large.bin too. 162 - size, _, err := cr.Next() 171 + size, _, _, err := cr.Next() 163 172 if err != nil { 164 173 t.Fatalf("Next large.bin: %v", err) 165 174 } ··· 168 177 } 169 178 170 179 // Read binary.bin after skipping two entries. 171 - size, _, err = cr.Next() 180 + size, _, _, err = cr.Next() 172 181 if err != nil { 173 182 t.Fatalf("Next binary.bin: %v", err) 174 183 } ··· 191 200 blobs["empty.txt"], 192 201 } 193 202 194 - cr, err := newCatfileReader(repoDir, ids) 203 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) 195 204 if err != nil { 196 205 t.Fatalf("newCatfileReader: %v", err) 197 206 } 198 207 defer cr.Close() 199 208 200 209 // hello.txt — read normally. 201 - size, missing, err := cr.Next() 202 - if err != nil || missing { 203 - t.Fatalf("Next hello.txt: err=%v missing=%v", err, missing) 210 + size, missing, excluded, err := cr.Next() 211 + if err != nil || missing || excluded { 212 + t.Fatalf("Next hello.txt: err=%v missing=%v excluded=%v", err, missing, excluded) 204 213 } 205 214 content := make([]byte, size) 206 215 if _, err := io.ReadFull(cr, content); err != nil { ··· 211 220 } 212 221 213 222 // fakeHash — missing. 
214 - _, missing, err = cr.Next() 223 + _, missing, excluded, err = cr.Next() 215 224 if err != nil { 216 225 t.Fatalf("Next fakeHash: %v", err) 217 226 } 227 + if excluded { 228 + t.Error("expected fakeHash to be missing, not excluded") 229 + } 218 230 if !missing { 219 231 t.Error("expected fakeHash to be missing") 220 232 } 221 233 222 234 // empty.txt — still works after missing entry. 223 - size, missing, err = cr.Next() 224 - if err != nil || missing { 225 - t.Fatalf("Next empty.txt: err=%v missing=%v", err, missing) 235 + size, missing, excluded, err = cr.Next() 236 + if err != nil || missing || excluded { 237 + t.Fatalf("Next empty.txt: err=%v missing=%v excluded=%v", err, missing, excluded) 226 238 } 227 239 if size != 0 { 228 240 t.Errorf("empty.txt size = %d, want 0", size) 229 241 } 230 242 } 243 + 244 + func TestCatfileReader_Excluded(t *testing.T) { 245 + repoDir, blobs := createTestRepo(t) 246 + 247 + ids := []plumbing.Hash{ 248 + blobs["large.bin"], 249 + blobs["hello.txt"], 250 + } 251 + 252 + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{filterSpec: "blob:limit=1k"}) 253 + if err != nil { 254 + t.Fatalf("newCatfileReader: %v", err) 255 + } 256 + defer cr.Close() 257 + 258 + _, missing, excluded, err := cr.Next() 259 + if err != nil { 260 + t.Fatalf("Next large.bin: %v", err) 261 + } 262 + if missing { 263 + t.Fatal("large.bin unexpectedly missing") 264 + } 265 + if !excluded { 266 + t.Fatal("large.bin unexpectedly included") 267 + } 268 + 269 + size, missing, excluded, err := cr.Next() 270 + if err != nil { 271 + t.Fatalf("Next hello.txt: %v", err) 272 + } 273 + if missing || excluded { 274 + t.Fatalf("hello.txt unexpectedly skipped: missing=%v excluded=%v", missing, excluded) 275 + } 276 + content := make([]byte, size) 277 + if _, err := io.ReadFull(cr, content); err != nil { 278 + t.Fatalf("ReadFull hello.txt: %v", err) 279 + } 280 + if string(content) != "hello world\n" { 281 + t.Errorf("hello.txt = %q", content) 282 + } 283 + }
+22 -2
gitindex/index.go
··· 636 636 // Stream main-repo blobs via pipelined cat-file --batch --buffer. 637 637 // Large blobs are skipped without reading content into memory. 638 638 if len(mainRepoIDs) > 0 { 639 - cr, err := newCatfileReader(opts.RepoDir, mainRepoIDs) 639 + crOpts := catfileReaderOptions{ 640 + filterSpec: catfileFilterSpec(opts), 641 + } 642 + cr, err := newCatfileReader(opts.RepoDir, mainRepoIDs, crOpts) 640 643 if err != nil { 641 644 return false, fmt.Errorf("newCatfileReader: %w", err) 642 645 } ··· 673 676 defer cr.Close() 674 677 675 678 for idx, key := range keys { 676 - size, missing, err := cr.Next() 679 + size, missing, excluded, err := cr.Next() 677 680 if err != nil { 678 681 return fmt.Errorf("cat-file next for %s: %w", key.FullPath(), err) 679 682 } ··· 686 689 // clone, or a race with git gc. Log a warning and skip. 687 690 log.Printf("warning: blob %s missing for %s", key.ID, key.FullPath()) 688 691 doc = skippedDoc(key, branches, index.SkipReasonMissing) 692 + } else if excluded { 693 + doc = skippedDoc(key, branches, index.SkipReasonTooLarge) 689 694 } else { 690 695 keyFullPath := key.FullPath() 691 696 if size > opts.BuildOptions.SizeMax && !opts.BuildOptions.IgnoreSizeMax(keyFullPath) { ··· 775 780 type noopCloser struct{} 776 781 777 782 func (noopCloser) Close() error { return nil } 783 + 784 + func catfileFilterSpec(opts Options) string { 785 + // Can't filter by size if we have large file exceptions 786 + if len(opts.BuildOptions.LargeFiles) > 0 { 787 + return "" 788 + } 789 + 790 + if opts.BuildOptions.SizeMax <= 0 { 791 + return "" 792 + } 793 + 794 + // Git's blob:limit filter excludes blobs whose size is >= the given limit, 795 + // while zoekt indexes files up to and including SizeMax bytes. 796 + return fmt.Sprintf("blob:limit=%d", int64(opts.BuildOptions.SizeMax)+1) 797 + } 778 798 779 799 func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) { 780 800 ignoreFile, err := tree.File(ignore.IgnoreFile)
+30
gitindex/index_test.go
··· 165 165 } 166 166 } 167 167 168 + func TestCatfileFilterSpec(t *testing.T) { 169 + for _, tc := range []struct { 170 + name string 171 + opts Options 172 + want string 173 + }{ 174 + { 175 + name: "size max", 176 + opts: Options{BuildOptions: index.Options{SizeMax: 1 << 20}}, 177 + want: "blob:limit=1048577", 178 + }, 179 + { 180 + name: "large file exception disables filter", 181 + opts: Options{BuildOptions: index.Options{SizeMax: 1 << 20, LargeFiles: []string{"*.bin"}}}, 182 + want: "", 183 + }, 184 + { 185 + name: "zero size max disables filter", 186 + opts: Options{BuildOptions: index.Options{SizeMax: 0}}, 187 + want: "", 188 + }, 189 + } { 190 + t.Run(tc.name, func(t *testing.T) { 191 + if got := catfileFilterSpec(tc.opts); got != tc.want { 192 + t.Fatalf("catfileFilterSpec() = %q, want %q", got, tc.want) 193 + } 194 + }) 195 + } 196 + } 197 + 168 198 func executeCommand(t *testing.T, dir string, cmd *exec.Cmd) *exec.Cmd { 169 199 cmd.Dir = dir 170 200 cmd.Env = []string{