query: binary encoder for repo branches (#53) · boltless.me/zoekt@809c853

+194

query/marshal.go

··· 1 + package query 2 + 3 + import ( 4 + "bytes" 5 + "encoding/binary" 6 + "errors" 7 + "fmt" 8 + "unsafe" 9 + ) 10 + 11 + // We implement a custom binary marshaller for a list of repos to 12 + // branches. When profiling Sourcegraph this is one of the dominant items. 13 + // 14 + // Wire-format of map[string][]string is pretty straightforward: 15 + // 16 + // byte(1) version 17 + // uvarint(len(map)) 18 + // for k, vs in map: 19 + // str(k) 20 + // byte(len(vs)) 21 + // for v in vs: 22 + // str(v) 23 + // 24 + // where str(v) is uvarint(len(v)) bytes(v) 25 + // 26 + // The above format gives about the same size encoding as gob does. However, 27 + // gob doesn't have a specialization for map[string][]string so we get to 28 + // avoid a lot of intermediate allocations. 29 + // 30 + // The only other specialization we add is treating []string{"HEAD"} as if it 31 + // was []string{}. This is the most common value for branches so avoids the 32 + // need to write it on the wire. This makes us beat gob for encoded size. 33 + // 34 + // The above adds up to a huge improvement, worth the extra complexity: 35 + // 36 + // name old time/op new time/op delta 37 + // RepoBranches_Encode-8 2.37ms ± 3% 0.62ms ± 0% -73.77% (p=0.000 n=10+8) 38 + // RepoBranches_Decode-8 4.19ms ± 2% 0.74ms ± 1% -82.37% (p=0.000 n=10+9) 39 + // 40 + // name old bytes new bytes delta 41 + // RepoBranches_Encode-8 393kB ± 0% 344kB ± 0% -12.48% (p=0.000 n=10+10) 42 + // 43 + // name old alloc/op new alloc/op delta 44 + // RepoBranches_Encode-8 726kB ± 0% 344kB ± 0% -52.60% (p=0.000 n=10+9) 45 + // RepoBranches_Decode-8 2.31MB ± 0% 1.44MB ± 0% -37.51% (p=0.000 n=9+10) 46 + // 47 + // name old allocs/op new allocs/op delta 48 + // RepoBranches_Encode-8 20.0k ± 0% 0.0k ± 0% -100.00% (p=0.000 n=10+10) 49 + // RepoBranches_Decode-8 50.6k ± 0% 0.4k ± 0% -99.26% (p=0.000 n=10+10) 50 + 51 + // repoBranchesEncode implements an efficient encoder for RepoBranches. 
// repoBranchesEncode serializes repoBranches into the version 1 binary
// format: a version byte, the number of repos as a uvarint, and then for each
// repo its length-prefixed name, a one byte branch count and the
// length-prefixed branch names.
//
// A repo whose branches are exactly []string{"HEAD"} is written with a branch
// count of 0 and no branch names. Because 0 is reserved for that case, a repo
// with an empty branch list is rejected, as is a repo with more than 255
// branches. Map iteration order is unspecified, so the order of repos in the
// output is too.
func repoBranchesEncode(repoBranches map[string][]string) ([]byte, error) {
	var (
		out     bytes.Buffer
		scratch [binary.MaxVarintLen64]byte
	)

	// uvarintLen leaves the uvarint encoding of n in scratch and returns how
	// many bytes of scratch it occupies.
	uvarintLen := func(n int) int {
		return binary.PutUvarint(scratch[:], uint64(n))
	}
	writeUvarint := func(n int) {
		out.Write(scratch[:uvarintLen(n)])
	}
	writeStr := func(s string) {
		writeUvarint(len(s))
		out.WriteString(s)
	}
	isHead := func(branches []string) bool {
		return len(branches) == 1 && branches[0] == "HEAD"
	}

	// First pass: validate the input and work out the exact output size so
	// the buffer is grown only once.
	total := 1 + uvarintLen(len(repoBranches)) // version byte + repo count
	for repo, branches := range repoBranches {
		total += uvarintLen(len(repo)) + len(repo) + 1 // name + branch count byte
		switch count := len(branches); {
		case isHead(branches):
			// Encoded as a bare 0 count, no branch names follow.
			continue
		case count == 0:
			// A count of 0 is reserved for the "HEAD" special case.
			return nil, fmt.Errorf("repo with no branches: %q", repo)
		case count > 255:
			// The branch count is written as a single byte (saves 11% cpu vs
			// varint). This is fine since Zoekt can only index up to 64
			// branches (it uses a bitmask on a 64bit int to encode branch
			// information for a document).
			return nil, fmt.Errorf("can't encode more than 255 branches: %d", count)
		}
		for _, branch := range branches {
			total += uvarintLen(len(branch)) + len(branch)
		}
	}
	out.Grow(total)

	// Second pass: emit the bytes.
	out.WriteByte(1) // format version
	writeUvarint(len(repoBranches))
	for repo, branches := range repoBranches {
		writeStr(repo)
		if isHead(branches) {
			out.WriteByte(0)
			continue
		}
		out.WriteByte(byte(len(branches)))
		for _, branch := range branches {
			writeStr(branch)
		}
	}

	return out.Bytes(), nil
}

// head is the most common slice of branches we search. We re-use it to avoid
// allocations when decoding. We know that zoekt never mutates the
// repoBranches slice, so it is safe to share this slice.
var head = []string{"HEAD"}

// repoBranchesDecode parses the version 1 binary encoding of a repo to
// branches map: a version byte, the number of repos as a uvarint, and then
// for each repo its length-prefixed name, a one byte branch count and the
// length-prefixed branch names. A branch count of 0 stands for
// []string{"HEAD"}; every such repo is given the shared head slice.
//
// b is copied before decoding and is not retained. On any error the returned
// map is nil. Truncated input, length prefixes that do not fit the remaining
// data and repo counts that cannot possibly fit in b are all reported as
// errors instead of panicking or allocating based on an untrusted length.
func repoBranchesDecode(b []byte) (map[string][]string, error) {
	// binaryReader returns strings pointing into its buffer to avoid
	// allocations. We don't own b, so the reader works on a private copy.
	r := binaryReader{b: append([]byte{}, b...)}

	// Version
	if v := r.byt(); v != 1 {
		return nil, fmt.Errorf("unsupported RepoBranches encoding version %d", v)
	}

	// Number of repos
	l := r.uvarint()
	if r.err != nil {
		return nil, r.err
	}
	// Every repo takes at least two bytes on the wire (the name length prefix
	// and the branch count). Reject impossible counts before using l as an
	// allocation hint and loop bound.
	if l > len(r.b)/2 {
		return nil, errors.New("malformed RepoBranches")
	}
	repoBranches := make(map[string][]string, l)

	for i := 0; i < l; i++ {
		name := r.str()
		branchesLen := int(r.byt())
		if r.err != nil {
			return nil, r.err
		}

		// Special case "HEAD"
		if branchesLen == 0 {
			repoBranches[name] = head
			continue
		}

		branches := make([]string, branchesLen)
		for j := range branches {
			branches[j] = r.str()
		}
		if r.err != nil {
			return nil, r.err
		}
		repoBranches[name] = branches
	}

	return repoBranches, nil
}

// binaryReader consumes values from the front of b. The first failed read
// stores an error in err and drops the remaining input, so callers may issue
// several reads and check err once afterwards.
type binaryReader struct {
	b   []byte
	err error
}

// fail records a decoding error and discards the remaining input, which makes
// every later read fail as well.
func (r *binaryReader) fail() {
	r.b = nil
	r.err = errors.New("malformed RepoBranches")
}

// uvarint reads an unsigned varint and returns it as an int. It fails, and
// returns 0, if the input ends before the value is complete, if the value
// overflows 64 bits, or if it does not fit in an int.
func (r *binaryReader) uvarint() int {
	x, n := binary.Uvarint(r.b)
	v := int(x)
	// n == 0: buffer too small, n < 0: value overflows 64 bits. The remaining
	// checks catch values that are negative or truncated once converted to int.
	if n <= 0 || v < 0 || uint64(v) != x {
		r.fail()
		return 0
	}
	r.b = r.b[n:]
	return v
}

// str reads a uvarint length followed by that many bytes. The returned string
// shares memory with the reader's buffer rather than copying it. It fails,
// and returns "", if the length is unreadable or exceeds the remaining input.
func (r *binaryReader) str() string {
	l := r.uvarint()
	if r.err != nil {
		return ""
	}
	if l > len(r.b) {
		r.fail()
		return ""
	}
	s := b2s(r.b[:l])
	r.b = r.b[l:]
	return s
}

// byt reads a single byte. It fails, and returns 0, if no input is left.
func (r *binaryReader) byt() byte {
	if len(r.b) < 1 {
		r.fail()
		return 0
	}
	x := r.b[0]
	r.b = r.b[1:]
	return x
}

// b2s reinterprets b as a string without copying. The result aliases b's
// backing array, so b must not be modified for as long as the string is in
// use.
func b2s(b []byte) string {
	return *(*string)(unsafe.Pointer(&b))
}

+106

query/marshal_test.go

··· 1 + package query 2 + 3 + import ( 4 + "bytes" 5 + "crypto/sha256" 6 + "encoding/binary" 7 + "encoding/gob" 8 + "fmt" 9 + "testing" 10 + 11 + "github.com/google/go-cmp/cmp" 12 + ) 13 + 14 + // We benchmark via Gob since that allows us to compare to no custom 15 + // marshalling. 16 + 17 + func BenchmarkRepoBranches_Encode(b *testing.B) { 18 + repoBranches := genRepoBranches() 19 + 20 + // do one write to amortize away the cost of gob registration 21 + w := &countWriter{} 22 + enc := gob.NewEncoder(w) 23 + if err := enc.Encode(repoBranches); err != nil { 24 + b.Fatal(err) 25 + } 26 + 27 + b.ResetTimer() 28 + b.ReportAllocs() 29 + 30 + b.ReportMetric(float64(w.n), "bytes") 31 + 32 + for n := 0; n < b.N; n++ { 33 + if err := enc.Encode(repoBranches); err != nil { 34 + b.Fatal(err) 35 + } 36 + } 37 + } 38 + 39 + func BenchmarkRepoBranches_Decode(b *testing.B) { 40 + repoBranches := genRepoBranches() 41 + 42 + var buf bytes.Buffer 43 + if err := gob.NewEncoder(&buf).Encode(repoBranches); err != nil { 44 + b.Fatal(err) 45 + } 46 + 47 + b.ResetTimer() 48 + b.ReportAllocs() 49 + 50 + for n := 0; n < b.N; n++ { 51 + // We need to include gob.NewDecoder cost to avoid measuring encoding. 
52 + var repoBranches RepoBranches 53 + if err := gob.NewDecoder(bytes.NewReader(buf.Bytes())).Decode(&repoBranches); err != nil { 54 + b.Fatal(err) 55 + } 56 + } 57 + } 58 + 59 + func TestRepoBranches_Marshal(t *testing.T) { 60 + want := genRepoBranches() 61 + 62 + var buf bytes.Buffer 63 + if err := gob.NewEncoder(&buf).Encode(want); err != nil { 64 + t.Fatal(err) 65 + } 66 + 67 + var got RepoBranches 68 + if err := gob.NewDecoder(bytes.NewReader(buf.Bytes())).Decode(&got); err != nil { 69 + t.Fatal(err) 70 + } 71 + 72 + if diff := cmp.Diff(want, &got); diff != "" { 73 + t.Fatalf("mismatch (-want +got):\n%s", diff) 74 + } 75 + } 76 + 77 + func genRepoBranches() *RepoBranches { 78 + genName := func(n int) string { 79 + bs := make([]byte, 8) 80 + binary.LittleEndian.PutUint64(bs, uint64(n)) 81 + return fmt.Sprintf("%x", sha256.Sum256(bs))[:10] 82 + } 83 + 84 + repoBranches := &RepoBranches{Set: map[string][]string{}} 85 + for i := 0; i < 100; i++ { 86 + org := genName(i) 87 + for j := 0; j < 100; j++ { 88 + name := "github.com/" + org + "/" + genName(i*2+j) 89 + repoBranches.Set[name] = []string{"HEAD"} 90 + if j%50 == 0 { 91 + repoBranches.Set[name] = append(repoBranches.Set[name], "more", "branches") 92 + } 93 + } 94 + } 95 + 96 + return repoBranches 97 + } 98 + 99 + type countWriter struct { 100 + n int 101 + } 102 + 103 + func (w *countWriter) Write(b []byte) (int, error) { 104 + w.n += len(b) 105 + return len(b), nil 106 + }

+12

query/query.go

··· 151 151 return fmt.Sprintf("(reposet %s)", detail) 152 152 } 153 153 154 + // MarshalBinary implements a specialized encoder for RepoBranches. 155 + func (q *RepoBranches) MarshalBinary() ([]byte, error) { 156 + return repoBranchesEncode(q.Set) 157 + } 158 + 159 + // UnmarshalBinary implements a specialized decoder for RepoBranches. 160 + func (q *RepoBranches) UnmarshalBinary(b []byte) error { 161 + var err error 162 + q.Set, err = repoBranchesDecode(b) 163 + return err 164 + } 165 + 154 166 // RepoSet is a list of repos to match. It is a Sourcegraph addition and only 155 167 // used in the RPC interface for efficient checking of large repo lists. 156 168 type RepoSet struct {

Configure Feed

Configure Feed