fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

query: introduce FileNameSet (#482)

Sourcegraph's new hybrid searcher sends around large ORs of filenames to
search for. FileNameSet provides an efficient implementation to
transport and query these sets of filenames.

We include an efficient gob implementation based on previous work around
encoding a more complicated map of strings.

Test Plan: Added unit tests. Additionally, updated sourcegraph to use
this implementation and ran its tests.

+251 -6
+24
index_test.go
··· 598 598 t.Fatal(diff) 599 599 } 600 600 }) 601 + 602 + t.Run("FileNameSet", func(t *testing.T) { 603 + sres := searchForTest(t, b, query.NewFileNameSet("banana"), chunkOpts) 604 + 605 + matches := sres.Files 606 + if len(matches) != 1 || len(matches[0].ChunkMatches) != 1 { 607 + t.Fatalf("got %v, want 1 match", matches) 608 + } 609 + 610 + got := matches[0].ChunkMatches[0] 611 + want := ChunkMatch{ 612 + Content: []byte("banana"), 613 + ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 614 + Ranges: []Range{{ 615 + Start: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 616 + End: Location{ByteOffset: 6, LineNumber: 1, Column: 7}, 617 + }}, 618 + FileName: true, 619 + } 620 + 621 + if diff := cmp.Diff(want, got); diff != "" { 622 + t.Fatal(diff) 623 + } 624 + }) 601 625 } 602 626 603 627 func TestFileCase(t *testing.T) {
+11
matchtree.go
··· 920 920 matchTree: subMT, 921 921 }, nil 922 922 923 + case *query.FileNameSet: 924 + return &docMatchTree{ 925 + reason: "FileNameSet", 926 + numDocs: d.numDocs(), 927 + predicate: func(docID uint32) bool { 928 + fileName := d.fileName(docID) 929 + _, ok := s.Set[string(fileName)] 930 + return ok 931 + }, 932 + }, nil 933 + 923 934 case *query.BranchesRepos: 924 935 reposBranchesWant := make([]uint64, len(d.repoMetaData)) 925 936 for repoIdx := range d.repoMetaData {
+92
query/marshal.go
··· 83 83 return brs, r.err 84 84 } 85 85 86 + // We implement a custom binary marshaller for a set of file names. See commit 87 + // 6c893ff323647b0419fac46ee462532401bf3283 for context on this code. 88 + // Additionally this code is based on that commit. 89 + // 90 + // Wire-format of map[string]struct{} is pretty straightforward: 91 + // 92 + // byte(1) version 93 + // uvarint(len(map)) 94 + // for k in map: 95 + // uvarint(len(k)) 96 + // bytes(k) 97 + // 98 + // The above format gives about the same size encoding as gob does. However, 99 + // gob doesn't have a specialization for map[string]struct{} so we get to 100 + // avoid a lot of intermediate allocations. 101 + // 102 + // The above adds up to a huge improvement, worth the extra complexity: 103 + // 104 + // name old time/op new time/op delta 105 + // FileNameSet_Encode-8 91.2µs ± 2% 36.8µs ± 1% -59.69% (p=0.000 n=10+9) 106 + // FileNameSet_Decode-8 143µs ± 1% 54µs ± 1% -61.96% (p=0.000 n=8+9) 107 + // 108 + // name old bytes new bytes delta 109 + // FileNameSet_Encode-8 12.1kB ± 0% 11.1kB ± 0% -8.63% (p=0.000 n=10+10) 110 + // 111 + // name old alloc/op new alloc/op delta 112 + // FileNameSet_Encode-8 16.0kB ± 0% 12.3kB ± 0% -23.20% (p=0.000 n=10+10) 113 + // FileNameSet_Decode-8 76.7kB ± 0% 72.3kB ± 0% -5.77% (p=0.000 n=10+10) 114 + // 115 + // name old allocs/op new allocs/op delta 116 + // FileNameSet_Encode-8 1.00k ± 0% 0.00k ± 0% -99.90% (p=0.000 n=10+10) 117 + // FileNameSet_Decode-8 1.20k ± 0% 0.18k ± 0% -85.27% (p=0.000 n=10+10) 118 + 119 + // stringSetEncode implements an efficient encoder for map[string]struct{}. 120 + func stringSetEncode(set map[string]struct{}) ([]byte, error) { 121 + var b bytes.Buffer 122 + var enc [binary.MaxVarintLen64]byte 123 + varint := func(n int) { 124 + m := binary.PutUvarint(enc[:], uint64(n)) 125 + b.Write(enc[:m]) 126 + } 127 + str := func(s string) { 128 + varint(len(s)) 129 + b.WriteString(s) 130 + } 131 + strSize := func(s string) int { 132 + return binary.PutUvarint(enc[:], uint64(len(s))) + len(s) 133 + } 134 + 135 + // Calculate size 136 + size := 1 // version 137 + size += binary.PutUvarint(enc[:], uint64(len(set))) 138 + for k := range set { 139 + size += strSize(k) 140 + } 141 + b.Grow(size) 142 + 143 + // Version 144 + b.WriteByte(1) 145 + 146 + // Length 147 + varint(len(set)) 148 + 149 + for k := range set { 150 + str(k) 151 + } 152 + 153 + return b.Bytes(), nil 154 + } 155 + 156 + // stringSetDecode implements an efficient decoder for map[string]struct{}. 157 + func stringSetDecode(b []byte) (map[string]struct{}, error) { 158 + // binaryReader returns strings pointing into b to avoid allocations. We 159 + // don't own b, so we create a copy of it. 160 + r := binaryReader{b: append([]byte{}, b...)} 161 + 162 + // Version 163 + if v := r.byt(); v != 1 { 164 + return nil, fmt.Errorf("unsupported stringSet encoding version %d", v) 165 + } 166 + 167 + // Length 168 + l := r.uvarint() 169 + set := make(map[string]struct{}, l) 170 + 171 + for i := 0; i < l; i++ { 172 + set[r.str()] = struct{}{} 173 + } 174 + 175 + return set, r.err 176 + } 177 + 86 178 type binaryReader struct { 87 179 b []byte 88 180 err error
+76 -6
query/marshal_test.go
··· 99 99 } 100 100 } 101 101 102 + func BenchmarkFileNameSet_Encode(b *testing.B) { 103 + set := genFileNameSet(1000) 104 + 105 + // do one write to amortize away the cost of gob registration 106 + w := &countWriter{} 107 + enc := gob.NewEncoder(w) 108 + if err := enc.Encode(set); err != nil { 109 + b.Fatal(err) 110 + } 111 + 112 + b.ResetTimer() 113 + b.ReportAllocs() 114 + 115 + b.ReportMetric(float64(w.n), "bytes") 116 + 117 + for n := 0; n < b.N; n++ { 118 + if err := enc.Encode(set); err != nil { 119 + b.Fatal(err) 120 + } 121 + } 122 + } 123 + 124 + func BenchmarkFileNameSet_Decode(b *testing.B) { 125 + set := genFileNameSet(1000) 126 + 127 + var buf bytes.Buffer 128 + if err := gob.NewEncoder(&buf).Encode(set); err != nil { 129 + b.Fatal(err) 130 + } 131 + 132 + b.ResetTimer() 133 + b.ReportAllocs() 134 + 135 + for n := 0; n < b.N; n++ { 136 + // We need to include gob.NewDecoder cost to avoid measuring encoding. 137 + var repoBranches FileNameSet 138 + if err := gob.NewDecoder(bytes.NewReader(buf.Bytes())).Decode(&repoBranches); err != nil { 139 + b.Fatal(err) 140 + } 141 + } 142 + } 143 + 144 + func TestFileNameSet_Marshal(t *testing.T) { 145 + for i := range []int{0, 1, 10, 100} { 146 + want := genFileNameSet(i) 147 + 148 + var buf bytes.Buffer 149 + if err := gob.NewEncoder(&buf).Encode(want); err != nil { 150 + t.Fatal(err) 151 + } 152 + 153 + var got FileNameSet 154 + if err := gob.NewDecoder(bytes.NewReader(buf.Bytes())).Decode(&got); err != nil { 155 + t.Fatal(err) 156 + } 157 + 158 + if diff := cmp.Diff(want, &got); diff != "" { 159 + t.Fatalf("mismatch for set size %d (-want +got):\n%s", i, diff) 160 + } 161 + } 162 + } 163 + 164 + func genFileNameSet(size int) *FileNameSet { 165 + set := make(map[string]struct{}, size) 166 + for i := 0; i < size; i++ { 167 + set[genName(i)] = struct{}{} 168 + } 169 + return &FileNameSet{Set: set} 170 + } 171 + 102 172 // Generating 5.5M repos slows down the benchmark setup time, so we cache things. 103 173 var genCache = map[string]interface{}{} 104 174 105 175 func genRepoBranches(n int) map[string][]string { 106 - genName := func(n int) string { 107 - bs := make([]byte, 8) 108 - binary.LittleEndian.PutUint64(bs, uint64(n)) 109 - return fmt.Sprintf("%x", sha256.Sum256(bs))[:10] 110 - } 111 - 112 176 repoBranches := map[string][]string{} 113 177 orgIndex := 0 114 178 repoIndex := 0 ··· 130 194 } 131 195 132 196 return repoBranches 197 + } 198 + 199 + func genName(n int) string { 200 + bs := make([]byte, 8) 201 + binary.LittleEndian.PutUint64(bs, uint64(n)) 202 + return fmt.Sprintf("%x", sha256.Sum256(bs))[:10] 133 203 } 134 204 135 205 func genBranchesRepos(n int) *BranchesRepos {
+47
query/query.go
··· 281 281 return s 282 282 } 283 283 284 + // FileNameSet is a list of file names to match. It is a Sourcegraph addition 285 + // and only used in the RPC interface for efficient checking of large file 286 + // lists. 287 + type FileNameSet struct { 288 + Set map[string]struct{} 289 + } 290 + 291 + // MarshalBinary implements a specialized encoder for FileNameSet. 292 + func (q *FileNameSet) MarshalBinary() ([]byte, error) { 293 + return stringSetEncode(q.Set) 294 + } 295 + 296 + // UnmarshalBinary implements a specialized decoder for FileNameSet. 297 + func (q *FileNameSet) UnmarshalBinary(b []byte) error { 298 + var err error 299 + q.Set, err = stringSetDecode(b) 300 + return err 301 + } 302 + 303 + func (q *FileNameSet) String() string { 304 + var detail string 305 + if len(q.Set) > 5 { 306 + // Large sets being output are not useful 307 + detail = fmt.Sprintf("size=%d", len(q.Set)) 308 + } else { 309 + values := make([]string, 0, len(q.Set)) 310 + for v := range q.Set { 311 + values = append(values, v) 312 + } 313 + sort.Strings(values) 314 + detail = strings.Join(values, " ") 315 + } 316 + return fmt.Sprintf("(filenameset %s)", detail) 317 + } 318 + 319 + func NewFileNameSet(fileNames ...string) *FileNameSet { 320 + s := &FileNameSet{Set: make(map[string]struct{})} 321 + for _, r := range fileNames { 322 + s.Set[r] = struct{}{} 323 + } 324 + return s 325 + } 326 + 284 327 const ( 285 328 TypeFileMatch uint8 = iota 286 329 TypeFileName ··· 613 656 return &Const{true} 614 657 } 615 658 case *RepoSet: 659 + if len(s.Set) == 0 { 660 + return &Const{false} 661 + } 662 + case *FileNameSet: 616 663 if len(s.Set) == 0 { 617 664 return &Const{false} 618 665 }
+1
rpc/rpc.go
··· 135 135 gobRegister(&query.BranchesRepos{}) 136 136 gobRegister(&query.Branch{}) 137 137 gobRegister(&query.Const{}) 138 + gobRegister(&query.FileNameSet{}) 138 139 gobRegister(&query.GobCache{}) 139 140 gobRegister(&query.Language{}) 140 141 gobRegister(&query.Not{})