fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

gitindex: optimize git index time by ~21% (#1036)

+137 -15
+3 -4
gitindex/index.go
··· 689 689 func indexCatfileBlobs(cr *catfileReader, keys []fileKey, repos map[fileKey]BlobLocation, opts Options, builder *index.Builder) error { 690 690 defer cr.Close() 691 691 692 + slab := newContentSlab(16 << 20) // 16 MB per slab 693 + 692 694 for idx, key := range keys { 693 695 size, missing, excluded, err := cr.Next() 694 696 if err != nil { ··· 711 713 // Skip without reading content into memory. 712 714 doc = skippedDoc(key, branches, index.SkipReasonTooLarge) 713 715 } else { 714 - // Pre-allocate and read the full blob content in one call. 715 - // io.ReadFull is preferred over io.LimitedReader here as it 716 - // avoids the intermediate allocation and the size is known. 717 - content := make([]byte, size) 716 + content := slab.alloc(size) 718 717 if _, err := io.ReadFull(cr, content); err != nil { 719 718 return fmt.Errorf("read blob %s: %w", keyFullPath, err) 720 719 }
+32
gitindex/slab.go
// contentSlab reduces per-file heap allocations by sub-slicing from a
// shared buffer ("slab"). Each returned slice has its capacity capped with
// a full (3-index) slice expression, so appending to one file's content
// reallocates instead of overwriting the adjacent file's bytes. Requests
// larger than the slab size get their own dedicated allocation.
//
// Slices handed out from one slab share its backing array, so a single
// live slice keeps the whole slab reachable for the garbage collector.
//
// A contentSlab is not safe for concurrent use.
type contentSlab struct {
	// buf is the current slab. len(buf) is the number of bytes already
	// handed out; cap(buf)-len(buf) is what is still available.
	buf []byte
	// cap is the size in bytes of every freshly allocated slab, and the
	// largest request that is served from a slab.
	cap int
}

// newContentSlab returns a contentSlab whose slabs are slabCap bytes each.
// A non-positive slabCap yields a slab with no shared storage: every
// non-empty alloc then gets its own allocation.
func newContentSlab(slabCap int) contentSlab {
	// make panics on a negative capacity; clamp so a bad size degrades to
	// "no slab" instead of crashing.
	slabCap = max(slabCap, 0)
	return contentSlab{
		buf: make([]byte, 0, slabCap),
		cap: slabCap,
	}
}

// alloc returns a byte slice with length and capacity n. The bytes are
// zero: slabs are created with make and no region is ever handed out twice.
//
// alloc panics if n is negative. The check happens before any state is
// touched, so the slab stays consistent if the caller recovers.
func (s *contentSlab) alloc(n int) []byte {
	if n < 0 {
		// Without this guard the reslice below would move len(s.buf)
		// backwards before the bounds check fires, and later allocations
		// would overlap slices that are already in use.
		panic("contentSlab.alloc: negative size")
	}
	if n > s.cap {
		// Too big for any slab; leave the current slab untouched.
		return make([]byte, n)
	}
	if n > cap(s.buf)-len(s.buf) {
		// Does not fit in the remainder: start a new slab and serve the
		// request from its front. The old slab's unused tail is abandoned.
		s.buf = make([]byte, n, s.cap)
		return s.buf[:n:n]
	}
	off := len(s.buf)
	s.buf = s.buf[:off+n]
	return s.buf[off : off+n : off+n]
}
+72
gitindex/slab_test.go
··· 1 + package gitindex 2 + 3 + import "testing" 4 + 5 + func TestContentSlab(t *testing.T) { 6 + t.Run("fits in slab", func(t *testing.T) { 7 + s := newContentSlab(1024) 8 + b := s.alloc(100) 9 + if len(b) != 100 { 10 + t.Fatalf("len = %d, want 100", len(b)) 11 + } 12 + if cap(b) != 100 { 13 + t.Fatalf("cap = %d, want 100 (3-index slice)", cap(b)) 14 + } 15 + }) 16 + 17 + t.Run("cap is capped so append cannot corrupt adjacent data", func(t *testing.T) { 18 + s := newContentSlab(1024) 19 + a := s.alloc(10) 20 + copy(a, []byte("aaaaaaaaaa")) 21 + 22 + b := s.alloc(10) 23 + copy(b, []byte("bbbbbbbbbb")) 24 + 25 + // Appending to a must not overwrite b. 26 + a = append(a, 'X') // triggers new backing array since cap==len 27 + if string(b) != "bbbbbbbbbb" { 28 + t.Fatalf("adjacent data corrupted: got %q", b) 29 + } 30 + _ = a 31 + }) 32 + 33 + t.Run("slab rollover", func(t *testing.T) { 34 + s := newContentSlab(64) 35 + a := s.alloc(60) 36 + if len(a) != 60 || cap(a) != 60 { 37 + t.Fatalf("a: len=%d cap=%d", len(a), cap(a)) 38 + } 39 + // Next alloc doesn't fit in remaining 4 bytes → new slab. 40 + b := s.alloc(10) 41 + if len(b) != 10 || cap(b) != 10 { 42 + t.Fatalf("b: len=%d cap=%d", len(b), cap(b)) 43 + } 44 + // a and b should not share backing arrays. 45 + copy(a, make([]byte, 60)) 46 + copy(b, []byte("0123456789")) 47 + if string(b) != "0123456789" { 48 + t.Fatal("rollover corrupted data") 49 + } 50 + }) 51 + 52 + t.Run("oversized allocation", func(t *testing.T) { 53 + s := newContentSlab(64) 54 + b := s.alloc(128) 55 + if len(b) != 128 { 56 + t.Fatalf("len = %d, want 128", len(b)) 57 + } 58 + // Oversized alloc should not consume slab space. 59 + c := s.alloc(32) 60 + if len(c) != 32 || cap(c) != 32 { 61 + t.Fatalf("c: len=%d cap=%d", len(c), cap(c)) 62 + } 63 + }) 64 + 65 + t.Run("zero size", func(t *testing.T) { 66 + s := newContentSlab(64) 67 + b := s.alloc(0) 68 + if len(b) != 0 { 69 + t.Fatalf("len = %d, want 0", len(b)) 70 + } 71 + }) 72 + }
+9 -4
index/builder.go
··· 37 37 38 38 "github.com/bmatcuk/doublestar" 39 39 "github.com/dustin/go-humanize" 40 - "github.com/go-enry/go-enry/v2" 41 40 "github.com/rs/xid" 42 41 "golang.org/x/sys/unix" 43 42 ··· 625 624 doc.SkipReason = skip 626 625 } 627 626 627 + // Pre-compute file category and language while content is still 628 + // available, before content is dropped for skipped documents. 629 + DetermineFileCategory(&doc) 630 + DetermineLanguageIfUnknown(&doc) 631 + 628 632 b.todo = append(b.todo, &doc) 629 633 630 634 if doc.SkipReason == SkipReasonNone { ··· 888 892 skipped = 1.0 889 893 } 890 894 895 + // Use pre-computed Category from DetermineFileCategory. 891 896 generated := 0.0 892 - if enry.IsGenerated(d.Name, d.Content) { 897 + if d.Category == FileCategoryGenerated { 893 898 generated = 1.0 894 899 } 895 900 896 901 vendor := 0.0 897 - if enry.IsVendor(d.Name) { 902 + if d.Category == FileCategoryVendored { 898 903 vendor = 1.0 899 904 } 900 905 901 906 test := 0.0 902 - if enry.IsTest(d.Name) { 907 + if d.Category == FileCategoryTest { 903 908 test = 1.0 904 909 } 905 910
+3
index/builder_test.go
··· 1084 1084 1085 1085 got := make([]*Document, len(c.docs)) 1086 1086 copy(got, c.docs) 1087 + for _, d := range got { 1088 + DetermineFileCategory(d) 1089 + } 1087 1090 sortDocuments(got) 1088 1091 1089 1092 print := func(ds []*Document) string {
-2
index/ctags.go
··· 55 55 continue 56 56 } 57 57 58 - DetermineLanguageIfUnknown(doc) 59 - 60 58 parserType := languageMap[normalizeLanguage(doc.Language)] 61 59 if parserType == ctags.NoCTags { 62 60 continue
+18 -5
index/shard_builder.go
··· 235 235 s.postings[ng] = pl 236 236 } 237 237 } 238 - m := binary.PutUvarint(buf[:], uint64(newOff-pl.lastOff)) 239 - pl.data = append(pl.data, buf[:m]...) 238 + delta := uint64(newOff - pl.lastOff) 239 + if delta < 0x80 { 240 + // Single-byte varint fast path: ~80% of deltas are < 128. 241 + // append(slice, byte) is cheaper than append(slice, slice...). 242 + pl.data = append(pl.data, byte(delta)) 243 + } else { 244 + m := binary.PutUvarint(buf[:], delta) 245 + pl.data = append(pl.data, buf[:m]...) 246 + } 240 247 pl.lastOff = newOff 241 248 } 242 249 s.runeCount += runeIndex ··· 536 543 537 544 // Add a file which only occurs in certain branches. 538 545 func (b *ShardBuilder) Add(doc Document) error { 539 - if index := bytes.IndexByte(doc.Content, 0); index > 0 { 540 - doc.SkipReason = SkipReasonBinary 546 + // Skip binary check if already computed (e.g., by Builder.Add 547 + // which calls DocChecker.Check before docs reach buildShard). 548 + if doc.Category == FileCategoryMissing { 549 + if index := bytes.IndexByte(doc.Content, 0); index > 0 { 550 + doc.SkipReason = SkipReasonBinary 551 + } 541 552 } 542 553 543 554 if doc.SkipReason != SkipReasonNone { ··· 547 558 } 548 559 549 560 DetermineLanguageIfUnknown(&doc) 550 - DetermineFileCategory(&doc) 561 + if doc.Category == FileCategoryMissing { 562 + DetermineFileCategory(&doc) 563 + } 551 564 552 565 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData}) 553 566 var last DocumentSection