fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

query: introduce space-optimized BranchesRepos (#160)

This commit introduces query.BranchesRepos, a space-optimized alternative to
query.RepoBranches. It is a slice of (branch, repo ID bitmap) tuples. This is
part of the effort to reduce memory spikes and OOMs in the Sourcegraph
frontend.

The benchmark diff is impressive at 5.5M repos:

name old time/op new time/op delta
BranchesRepos_Encode-32 1.03s ± 7% 0.00s ± 3% -99.99% (p=0.008 n=5+5)
BranchesRepos_Decode-32 915ms ± 4% 0ms ± 1% -99.98% (p=0.008 n=5+5)

name old bytes new bytes delta
BranchesRepos_Encode-32 189MB ± 0% 0MB ± 0% -99.77% (p=0.008 n=5+5)

name old alloc/op new alloc/op delta
BranchesRepos_Encode-32 189MB ± 0% 0MB ± 0% -99.76% (p=0.000 n=4+5)
BranchesRepos_Decode-32 766MB ± 0% 1MB ± 0% -99.88% (p=0.008 n=5+5)

name old allocs/op new allocs/op delta
BranchesRepos_Encode-32 1.60 ±88% 89.00 ± 0% +5462.50% (p=0.008 n=5+5)
BranchesRepos_Decode-32 134k ± 0% 0k ± 0% -99.67% (p=0.008 n=5+5)

Co-authored-by: Keegan Carruthers-Smith <keegan.csmith@gmail.com>

+466 -51
+4 -2
api.go
··· 290 290 return err 291 291 } 292 292 293 - id, _ := strconv.ParseUint(repo.RawConfig["repoid"], 10, 32) 294 - r.ID = uint32(id) 293 + if v, ok := repo.RawConfig["repoid"]; ok { 294 + id, _ := strconv.ParseUint(v, 10, 32) 295 + r.ID = uint32(id) 296 + } 295 297 296 298 return nil 297 299 }
+18 -5
eval.go
··· 42 42 // simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all 43 43 // repository names fulfill the predicate, Const(false) if none of them do, and q 44 44 // otherwise. 45 - func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(repoName string) bool) query.Q { 45 + func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*Repository) bool) query.Q { 46 46 count := 0 47 47 alive := len(d.repoMetaData) 48 - for i, md := range d.repoMetaData { 48 + for i := range d.repoMetaData { 49 49 if d.repoMetaData[i].Tombstone { 50 50 alive-- 51 - } else if predicate(md.Name) { 51 + } else if predicate(&d.repoMetaData[i]) { 52 52 count++ 53 53 } 54 54 } ··· 65 65 eval := query.Map(in, func(q query.Q) query.Q { 66 66 switch r := q.(type) { 67 67 case *query.Repo: 68 - return d.simplifyMultiRepo(q, func(name string) bool { return strings.Contains(name, r.Pattern) }) 68 + return d.simplifyMultiRepo(q, func(repo *Repository) bool { 69 + return strings.Contains(repo.Name, r.Pattern) 70 + }) 71 + case *query.BranchesRepos: 72 + for i := range d.repoMetaData { 73 + for _, br := range r.List { 74 + if br.Repos.Contains(d.repoMetaData[i].ID) { 75 + return q 76 + } 77 + } 78 + } 79 + return &query.Const{Value: false} 69 80 case *query.RepoBranches: 70 81 if len(d.repoMetaData) == 1 { 71 82 // Can simplify query now. compound too complicated since each repo ··· 79 90 } 80 91 return &query.Const{Value: false} 81 92 case *query.RepoSet: 82 - return d.simplifyMultiRepo(q, func(name string) bool { return r.Set[name] }) 93 + return d.simplifyMultiRepo(q, func(repo *Repository) bool { 94 + return r.Set[repo.Name] 95 + }) 83 96 case *query.Language: 84 97 _, has := d.metaData.LanguageMap[r.Language] 85 98 if !has {
+31 -1
eval_test.go
··· 15 15 package zoekt 16 16 17 17 import ( 18 + "hash/fnv" 18 19 "reflect" 19 20 "regexp/syntax" 20 21 "strings" 21 22 "testing" 22 23 24 + "github.com/RoaringBitmap/roaring" 23 25 "github.com/google/go-cmp/cmp" 24 26 "github.com/google/zoekt/query" 25 27 ) ··· 134 136 b := newIndexBuilder() 135 137 b.indexFormatVersion = NextIndexFormatVersion 136 138 for _, name := range names { 137 - if err := b.setRepository(&Repository{Name: name}); err != nil { 139 + if err := b.setRepository(&Repository{ID: hash(name), Name: name}); err != nil { 138 140 t.Fatal(err) 139 141 } 140 142 if err := b.AddFile(name+".txt", []byte(name+" content")); err != nil { ··· 206 208 } 207 209 } 208 210 211 + func TestSimplifyBranchesRepos(t *testing.T) { 212 + d := compoundReposShard(t, "foo", "bar") 213 + 214 + some := &query.BranchesRepos{ 215 + List: []query.BranchRepos{ 216 + {Branch: "branch1", Repos: roaring.BitmapOf(hash("bar"))}, 217 + }, 218 + } 219 + none := &query.Repo{"banana"} 220 + 221 + got := d.simplify(some) 222 + tr := cmp.Transformer("", func(b *roaring.Bitmap) []uint32 { return b.ToArray() }) 223 + if d := cmp.Diff(some, got, tr); d != "" { 224 + t.Fatalf("-want, +got:\n%s", d) 225 + } 226 + 227 + got = d.simplify(none) 228 + if d := cmp.Diff(&query.Const{Value: false}, got); d != "" { 229 + t.Fatalf("-want, +got:\n%s", d) 230 + } 231 + } 232 + 209 233 func TestSimplifyRepoBranchSimple(t *testing.T) { 210 234 d := compoundReposShard(t, "foo") 211 235 q := &query.RepoBranches{Set: map[string][]string{"foo": {"HEAD", "b1"}, "bar": {"HEAD"}}} ··· 223 247 t.Fatalf("-want, +got:\n%s", d) 224 248 } 225 249 } 250 + 251 + func hash(name string) uint32 { 252 + h := fnv.New32() 253 + h.Write([]byte(name)) 254 + return h.Sum32() 255 + }
+2
go.mod
··· 4 4 cloud.google.com/go v0.82.0 5 5 github.com/Microsoft/go-winio v0.5.0 // indirect 6 6 github.com/ProtonMail/go-crypto v0.0.0-20210707164159-52430bf6b52c // indirect 7 + github.com/RoaringBitmap/roaring v0.9.4 7 8 github.com/andygrunwald/go-gerrit v0.0.0-20191101112536-3f5e365ccf57 9 + github.com/bits-and-blooms/bitset v1.2.1 // indirect 8 10 github.com/bmatcuk/doublestar v1.3.4 9 11 github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd // indirect 10 12 github.com/fsnotify/fsnotify v1.4.9
+8
go.sum
··· 51 51 github.com/ProtonMail/go-crypto v0.0.0-20210428141323-04723f9f07d7/go.mod h1:z4/9nQmJSSwwds7ejkxaJwO37dru3geImFUdJlaLzQo= 52 52 github.com/ProtonMail/go-crypto v0.0.0-20210707164159-52430bf6b52c h1:FP7mMdsXy0ybzar1sJeIcZtaJka0U/ZmLTW4wRpolYk= 53 53 github.com/ProtonMail/go-crypto v0.0.0-20210707164159-52430bf6b52c/go.mod h1:z4/9nQmJSSwwds7ejkxaJwO37dru3geImFUdJlaLzQo= 54 + github.com/RoaringBitmap/roaring v0.9.4 h1:ckvZSX5gwCRaJYBNe7syNawCU5oruY9gQmjXlp4riwo= 55 + github.com/RoaringBitmap/roaring v0.9.4/go.mod h1:icnadbWcNyfEHlYdr+tDlOTih1Bf/h+rzPpv4sbomAA= 54 56 github.com/StackExchange/wmi v0.0.0-20180116203802-5d049714c4a6/go.mod h1:3eOhrUMpNV+6aFIbp5/iudMxNCF27Vw2OZgy4xEx0Fg= 55 57 github.com/acomagu/bufpipe v1.0.3 h1:fxAGrHZTgQ9w5QqVItgzwj235/uYZYgbXitB+dLupOk= 56 58 github.com/acomagu/bufpipe v1.0.3/go.mod h1:mxdxdup/WdsKVreO5GpW4+M/1CE2sMG4jeGJ2sYmHc4= ··· 69 71 github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= 70 72 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= 71 73 github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= 74 + github.com/bits-and-blooms/bitset v1.2.0 h1:Kn4yilvwNtMACtf1eYDlG8H77R07mZSPbMjLyS07ChA= 75 + github.com/bits-and-blooms/bitset v1.2.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA= 76 + github.com/bits-and-blooms/bitset v1.2.1 h1:M+/hrU9xlMp7t4TyTDQW97d3tRPVuKFC6zBEK16QnXY= 77 + github.com/bits-and-blooms/bitset v1.2.1/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA= 72 78 github.com/bmatcuk/doublestar v1.3.4 h1:gPypJ5xD31uhX6Tf54sDPUOBXTqKH4c9aPY66CyQrS0= 73 79 github.com/bmatcuk/doublestar v1.3.4/go.mod h1:wiQtGV+rzVYxB7WIlirSN++5HPtPlXEo9MEoZQC/PmE= 74 80 github.com/bombsimon/wsl/v2 v2.0.0/go.mod h1:mf25kr/SqFEPhhcxW1+7pxzGlW+hIl/hYTKY95VwV8U= ··· 357 363 github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= 358 364 github.com/mozilla/tls-observatory 
v0.0.0-20190404164649-a3c1b6cfecfd/go.mod h1:SrKMQvPiws7F7iqYp8/TX+IhxCYhzr6N/1yb8cwHsGk= 359 365 github.com/mozilla/tls-observatory v0.0.0-20200220173314-aae45faa4006/go.mod h1:SrKMQvPiws7F7iqYp8/TX+IhxCYhzr6N/1yb8cwHsGk= 366 + github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM= 367 + github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw= 360 368 github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= 361 369 github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= 362 370 github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw=
+19
matchtree.go
··· 905 905 matchTree: subMT, 906 906 }, nil 907 907 908 + case *query.BranchesRepos: 909 + reposBranchesWant := make([]uint64, len(d.repoMetaData)) 910 + for repoIdx := range d.repoMetaData { 911 + var mask uint64 912 + for _, br := range s.List { 913 + if br.Repos.Contains(d.repoMetaData[repoIdx].ID) { 914 + mask |= uint64(d.branchIDs[repoIdx][br.Branch]) 915 + } 916 + } 917 + reposBranchesWant[repoIdx] = mask 918 + } 919 + return &docMatchTree{ 920 + reason: "BranchesRepos", 921 + numDocs: d.numDocs(), 922 + predicate: func(docID uint32) bool { 923 + return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0 924 + }, 925 + }, nil 926 + 908 927 case *query.RepoBranches: 909 928 reposBranchesWant := make([]uint64, len(d.repoMetaData)) 910 929 for repoIdx, r := range d.repoMetaData {
+34
matchtree_test.go
··· 18 18 "reflect" 19 19 "testing" 20 20 21 + "github.com/RoaringBitmap/roaring" 21 22 "github.com/google/zoekt/query" 22 23 ) 23 24 ··· 295 296 t.Fatalf("expect %d documents, but got at least 1 more", len(want)) 296 297 } 297 298 } 299 + 300 + func TestBranchesRepos(t *testing.T) { 301 + d := &indexData{ 302 + repoMetaData: []Repository{ 303 + {ID: hash("foo"), Name: "foo"}, 304 + {ID: hash("bar"), Name: "bar"}, 305 + }, 306 + fileBranchMasks: []uint64{1, 1, 1, 2, 1, 2, 1}, 307 + repos: []uint16{0, 0, 1, 1, 1, 1, 1}, 308 + branchIDs: []map[string]uint{{"HEAD": 1}, {"HEAD": 1, "b1": 2}}, 309 + } 310 + 311 + mt, err := d.newMatchTree(&query.BranchesRepos{List: []query.BranchRepos{ 312 + {Branch: "b1", Repos: roaring.BitmapOf(hash("bar"))}, 313 + {Branch: "b2", Repos: roaring.BitmapOf(hash("bar"))}, 314 + }}) 315 + if err != nil { 316 + t.Fatal(err) 317 + } 318 + 319 + want := []uint32{3, 5} 320 + for i := 0; i < len(want); i++ { 321 + nextDoc := mt.nextDoc() 322 + if nextDoc != want[i] { 323 + t.Fatalf("want %d, got %d", want[i], nextDoc) 324 + } 325 + mt.prepare(nextDoc) 326 + } 327 + 328 + if mt.nextDoc() != maxUInt32 { 329 + t.Fatalf("expect %d documents, but got at least 1 more", len(want)) 330 + } 331 + }
+88
query/marshal.go
··· 5 5 "encoding/binary" 6 6 "errors" 7 7 "fmt" 8 + "io" 8 9 "unsafe" 10 + 11 + "github.com/RoaringBitmap/roaring" 9 12 ) 10 13 11 14 // We implement a custom binary marshaller for a list of repos to ··· 150 153 return repoBranches, r.err 151 154 } 152 155 156 + func branchesReposEncode(brs []BranchRepos) ([]byte, error) { 157 + var b bytes.Buffer 158 + var enc [binary.MaxVarintLen64]byte 159 + varint := func(n uint64) { 160 + m := binary.PutUvarint(enc[:], n) 161 + b.Write(enc[:m]) 162 + } 163 + str := func(s string) { 164 + varint(uint64(len(s))) 165 + b.WriteString(s) 166 + } 167 + strSize := func(s string) uint64 { 168 + return uint64(binary.PutUvarint(enc[:], uint64(len(s))) + len(s)) 169 + } 170 + 171 + // Calculate size 172 + size := uint64(1) // version 173 + size += uint64(binary.PutUvarint(enc[:], uint64(len(brs)))) 174 + for _, br := range brs { 175 + size += strSize(br.Branch) 176 + idsSize := br.Repos.GetSerializedSizeInBytes() 177 + size += uint64(binary.PutUvarint(enc[:], idsSize)) 178 + size += idsSize 179 + } 180 + 181 + b.Grow(int(size)) 182 + 183 + // Version 184 + b.WriteByte(1) 185 + 186 + // Length 187 + varint(uint64(len(brs))) 188 + 189 + for _, br := range brs { 190 + str(br.Branch) 191 + l := br.Repos.GetSerializedSizeInBytes() 192 + varint(l) 193 + 194 + n, err := br.Repos.WriteTo(&b) 195 + if err != nil { 196 + return nil, err 197 + } 198 + 199 + if uint64(n) != l { 200 + return nil, io.ErrShortWrite 201 + } 202 + } 203 + 204 + return b.Bytes(), nil 205 + } 206 + 207 + func branchesReposDecode(b []byte) ([]BranchRepos, error) { 208 + // binaryReader returns strings pointing into b to avoid allocations. We 209 + // don't own b, so we create a copy of it. 
210 + r := binaryReader{b: append(make([]byte, 0, len(b)), b...)} 211 + 212 + // Version 213 + if v := r.byt(); v != 1 { 214 + return nil, fmt.Errorf("unsupported BranchRepos encoding version %d", v) 215 + } 216 + 217 + l := r.uvarint() // Length 218 + brs := make([]BranchRepos, l) 219 + 220 + for i := 0; i < l; i++ { 221 + brs[i].Branch = r.str() 222 + brs[i].Repos = r.bitmap() 223 + } 224 + 225 + return brs, r.err 226 + } 227 + 153 228 type binaryReader struct { 154 229 b []byte 155 230 err error ··· 176 251 s := b2s(b.b[:l]) 177 252 b.b = b.b[l:] 178 253 return s 254 + } 255 + 256 + func (b *binaryReader) bitmap() *roaring.Bitmap { 257 + l := b.uvarint() 258 + if l > len(b.b) { 259 + b.b = nil 260 + b.err = errors.New("malformed BranchRepos") 261 + return nil 262 + } 263 + r := roaring.New() 264 + _, b.err = r.FromBuffer(b.b[:l]) 265 + b.b = b.b[l:] 266 + return r 179 267 } 180 268 181 269 func (b *binaryReader) byt() byte {
+135 -12
query/marshal_test.go
··· 6 6 "encoding/binary" 7 7 "encoding/gob" 8 8 "fmt" 9 + "sort" 9 10 "testing" 10 11 12 + "github.com/RoaringBitmap/roaring" 11 13 "github.com/google/go-cmp/cmp" 12 14 ) 13 15 ··· 15 17 // marshalling. 16 18 17 19 func BenchmarkRepoBranches_Encode(b *testing.B) { 18 - repoBranches := genRepoBranches() 20 + repoBranches := genRepoBranches(5_500_000) 19 21 20 22 // do one write to amortize away the cost of gob registration 21 23 w := &countWriter{} ··· 37 39 } 38 40 39 41 func BenchmarkRepoBranches_Decode(b *testing.B) { 40 - repoBranches := genRepoBranches() 42 + repoBranches := genRepoBranches(5_500_000) 41 43 42 44 var buf bytes.Buffer 43 45 if err := gob.NewEncoder(&buf).Encode(repoBranches); err != nil { ··· 57 59 } 58 60 59 61 func TestRepoBranches_Marshal(t *testing.T) { 60 - want := genRepoBranches() 62 + want := genRepoBranches(1000) 61 63 62 64 var buf bytes.Buffer 63 65 if err := gob.NewEncoder(&buf).Encode(want); err != nil { ··· 74 76 } 75 77 } 76 78 77 - func genRepoBranches() *RepoBranches { 79 + func BenchmarkBranchesRepos_Encode(b *testing.B) { 80 + brs := genBranchesRepos(5_500_000) 81 + 82 + // do one write to amortize away the cost of gob registration 83 + w := &countWriter{} 84 + enc := gob.NewEncoder(w) 85 + if err := enc.Encode(brs); err != nil { 86 + b.Fatal(err) 87 + } 88 + 89 + b.ResetTimer() 90 + b.ReportAllocs() 91 + 92 + b.ReportMetric(float64(w.n), "bytes") 93 + 94 + for n := 0; n < b.N; n++ { 95 + if err := enc.Encode(brs); err != nil { 96 + b.Fatal(err) 97 + } 98 + } 99 + } 100 + 101 + func BenchmarkBranchesRepos_Decode(b *testing.B) { 102 + brs := genBranchesRepos(5_500_000) 103 + 104 + var buf bytes.Buffer 105 + if err := gob.NewEncoder(&buf).Encode(brs); err != nil { 106 + b.Fatal(err) 107 + } 108 + 109 + b.ResetTimer() 110 + b.ReportAllocs() 111 + 112 + for n := 0; n < b.N; n++ { 113 + // We need to include gob.NewDecoder cost to avoid measuring encoding. 
114 + var brs BranchesRepos 115 + if err := gob.NewDecoder(bytes.NewReader(buf.Bytes())).Decode(&brs); err != nil { 116 + b.Fatal(err) 117 + } 118 + } 119 + } 120 + 121 + func TestBranchesRepos_Marshal(t *testing.T) { 122 + want := genBranchesRepos(1000) 123 + 124 + var buf bytes.Buffer 125 + if err := gob.NewEncoder(&buf).Encode(want); err != nil { 126 + t.Fatal(err) 127 + } 128 + 129 + var got BranchesRepos 130 + if err := gob.NewDecoder(bytes.NewReader(buf.Bytes())).Decode(&got); err != nil { 131 + t.Fatal(err) 132 + } 133 + 134 + tr := cmp.Transformer("", func(b *roaring.Bitmap) []uint32 { return b.ToArray() }) 135 + if diff := cmp.Diff(want, &got, tr); diff != "" { 136 + t.Fatalf("mismatch IDs (-want +got):\n%s", diff) 137 + } 138 + } 139 + 140 + // Generating 5.5M repos slows down the benchmark setup time, so we cache things. 141 + var genCache = map[string]interface{}{} 142 + 143 + func genRepoBranches(n int) *RepoBranches { 144 + key := fmt.Sprintf("RepoBranches:%d", n) 145 + val, ok := genCache[key] 146 + if ok { 147 + return val.(*RepoBranches) 148 + } 149 + 78 150 genName := func(n int) string { 79 151 bs := make([]byte, 8) 80 152 binary.LittleEndian.PutUint64(bs, uint64(n)) ··· 82 154 } 83 155 84 156 repoBranches := &RepoBranches{Set: map[string][]string{}} 85 - for i := 0; i < 100; i++ { 86 - org := genName(i) 87 - for j := 0; j < 100; j++ { 88 - name := "github.com/" + org + "/" + genName(i*2+j) 89 - repoBranches.Set[name] = []string{"HEAD"} 90 - if j%50 == 0 { 91 - repoBranches.Set[name] = append(repoBranches.Set[name], "more", "branches") 157 + orgIndex := 0 158 + repoIndex := 0 159 + 160 + for i := 0; i < n; i++ { 161 + org := genName(orgIndex) 162 + name := "github.com/" + org + "/" + genName(orgIndex*2+repoIndex) 163 + repoBranches.Set[name] = []string{"HEAD"} 164 + if repoIndex%50 == 0 { 165 + repoBranches.Set[name] = append(repoBranches.Set[name], "more", "branches") 166 + } 167 + 168 + if i%1000 == 0 { 169 + orgIndex++ 170 + repoIndex = 0 171 
+ } 172 + 173 + repoIndex++ 174 + } 175 + 176 + genCache[key] = repoBranches 177 + return repoBranches 178 + } 179 + 180 + func genBranchesRepos(n int) *BranchesRepos { 181 + key := fmt.Sprintf("BranchesRepos:%d", n) 182 + val, ok := genCache[key] 183 + if ok { 184 + return val.(*BranchesRepos) 185 + } 186 + 187 + set := genRepoBranches(n).Set 188 + br := map[string]*roaring.Bitmap{} 189 + id := uint32(1) 190 + 191 + for _, branches := range set { 192 + for _, branch := range branches { 193 + ids, ok := br[branch] 194 + if !ok { 195 + ids = roaring.New() 196 + br[branch] = ids 92 197 } 198 + ids.Add(id) 93 199 } 200 + id++ 94 201 } 95 202 96 - return repoBranches 203 + brs := make([]BranchRepos, 0, len(br)) 204 + for branch, ids := range br { 205 + ids.RunOptimize() 206 + brs = append(brs, BranchRepos{Branch: branch, Repos: ids}) 207 + } 208 + 209 + sort.Slice(brs, func(i, j int) bool { 210 + return brs[i].Branch < brs[j].Branch 211 + }) 212 + 213 + q := &BranchesRepos{ 214 + List: brs, 215 + } 216 + 217 + genCache[key] = q 218 + 219 + return q 97 220 } 98 221 99 222 type countWriter struct {
+45
query/query.go
··· 23 23 "reflect" 24 24 "regexp/syntax" 25 25 "sort" 26 + "strconv" 26 27 "strings" 27 28 "sync" 29 + 30 + "github.com/RoaringBitmap/roaring" 28 31 ) 29 32 30 33 var _ = log.Println ··· 167 170 168 171 func (q *Repo) String() string { 169 172 return fmt.Sprintf("repo:%s", q.Pattern) 173 + } 174 + 175 + // BranchesRepos is a slice of BranchRepos to match. It is a Sourcegraph 176 + // addition and only used in the RPC interface for efficient checking of large 177 + // repo lists. 178 + type BranchesRepos struct { 179 + List []BranchRepos 180 + } 181 + 182 + func (q *BranchesRepos) String() string { 183 + var sb strings.Builder 184 + 185 + sb.WriteString("(branchesrepos") 186 + 187 + for _, br := range q.List { 188 + if size := br.Repos.GetCardinality(); size > 1 { 189 + sb.WriteString(" " + br.Branch + ":" + strconv.FormatUint(size, 64)) 190 + } else { 191 + sb.WriteString(" " + br.Branch + "=" + br.Repos.String()) 192 + } 193 + } 194 + 195 + sb.WriteString(")") 196 + return sb.String() 197 + } 198 + 199 + // MarshalBinary implements a specialized encoder for BranchesRepos. 200 + func (q BranchesRepos) MarshalBinary() ([]byte, error) { 201 + return branchesReposEncode(q.List) 202 + } 203 + 204 + // UnmarshalBinary implements a specialized decoder for BranchesRepos. 205 + func (q *BranchesRepos) UnmarshalBinary(b []byte) (err error) { 206 + q.List, err = branchesReposDecode(b) 207 + return err 208 + } 209 + 210 + // BranchRepos is a (branch, sourcegraph repo ids bitmap) tuple. It is a 211 + // Sourcegraph addition. 212 + type BranchRepos struct { 213 + Branch string 214 + Repos *roaring.Bitmap 170 215 } 171 216 172 217 // RepoBranches is a list of branches in repos to match. It is a Sourcegraph
+2
rpc/rpc.go
··· 133 133 gob.Register(&query.Not{}) 134 134 gob.Register(&query.Or{}) 135 135 gob.Register(&query.Regexp{}) 136 + gob.Register(&query.BranchRepos{}) 137 + gob.Register(&query.BranchesRepos{}) 136 138 gob.Register(&query.RepoBranches{}) 137 139 gob.Register(&query.RepoSet{}) 138 140 gob.Register(&query.Repo{})
+1 -1
shards/eval_test.go
··· 13 13 ss := newShardedSearcher(2) 14 14 nextShardNum := 1 15 15 addShard := func(docs ...zoekt.Document) { 16 - b := testIndexBuilder(t, &zoekt.Repository{Name: "reponame"}, docs...) 16 + b := testIndexBuilder(t, &zoekt.Repository{ID: 1, Name: "reponame"}, docs...) 17 17 shard := searcherForTest(t, b) 18 18 ss.replace(fmt.Sprintf("key-%d", nextShardNum), shard) 19 19 nextShardNum++
+57 -25
shards/shards.go
··· 176 176 // We have out of band ranking on compound shards which can change even if 177 177 // the shard file does not. So we compute a rank in getShards. We store 178 178 // names here to avoid the cost of List in the search request path. 179 - names []string 179 + repos []*zoekt.Repository 180 180 } 181 181 182 182 type shardedSearcher struct { ··· 280 280 // (and (repobranches ...) (q)) 281 281 // (and (repobranches ...) (q)) 282 282 283 - hasReposForPredicate := func(pred func(name string) bool) func(names []string) (any, all bool) { 284 - return func(names []string) (any, all bool) { 283 + hasReposForPredicate := func(pred func(repo *zoekt.Repository) bool) func(repos []*zoekt.Repository) (any, all bool) { 284 + return func(repos []*zoekt.Repository) (any, all bool) { 285 285 any = false 286 286 all = true 287 - for _, name := range names { 288 - b := pred(name) 287 + for _, repo := range repos { 288 + b := pred(repo) 289 289 any = any || b 290 290 all = all && b 291 291 } ··· 295 295 296 296 for i, c := range and.Children { 297 297 var setSize int 298 - var hasRepos func([]string) (bool, bool) 298 + var hasRepos func([]*zoekt.Repository) (bool, bool) 299 299 switch setQuery := c.(type) { 300 300 case *query.RepoSet: 301 301 setSize = len(setQuery.Set) 302 - hasRepos = hasReposForPredicate(func(name string) bool { return setQuery.Set[name] }) 302 + hasRepos = hasReposForPredicate(func(repo *zoekt.Repository) bool { 303 + return setQuery.Set[repo.Name] 304 + }) 305 + case *query.BranchesRepos: 306 + for _, br := range setQuery.List { 307 + setSize += int(br.Repos.GetCardinality()) 308 + } 309 + 310 + hasRepos = hasReposForPredicate(func(repo *zoekt.Repository) bool { 311 + for _, br := range setQuery.List { 312 + if br.Repos.Contains(repo.ID) { 313 + return true 314 + } 315 + } 316 + return false 317 + }) 303 318 case *query.RepoBranches: 304 319 setSize = len(setQuery.Set) 305 - hasRepos = hasReposForPredicate(func(name string) bool { return 
len(setQuery.Set[name]) > 0 }) 320 + hasRepos = hasReposForPredicate(func(repo *zoekt.Repository) bool { 321 + return len(setQuery.Set[repo.Name]) > 0 322 + }) 306 323 default: 307 324 continue 308 325 } ··· 317 334 filteredAll := true 318 335 319 336 for _, s := range shards { 320 - if any, all := hasRepos(s.names); any { 337 + if any, all := hasRepos(s.repos); any { 321 338 filtered = append(filtered, s) 322 339 filteredAll = filteredAll && all 323 340 } ··· 342 359 // then at this point filtered is [foo bar] and q is the same. For each 343 360 // shard indexData.simplify will simplify to (and true (content baz)) -> 344 361 // (content baz). This work can be done now once, rather than per shard. 345 - if _, ok := c.(*query.RepoSet); ok { 362 + switch c := c.(type) { 363 + case *query.RepoSet: 346 364 and.Children[i] = &query.Const{Value: true} 347 365 return filtered, query.Simplify(and) 348 - } 349 - if b, ok := c.(*query.RepoBranches); ok { 366 + 367 + case *query.BranchesRepos: 368 + // We can only replace if all the repos want the same branches. We 369 + // simplify and just check that we are requesting 1 branch. The common 370 + // case is just asking for HEAD, so this should be effective. 371 + if len(c.List) != 1 { 372 + return filtered, and 373 + } 374 + 375 + // Every repo wants the same branches, so we can replace RepoBranches 376 + // with a list of branch queries. 377 + and.Children[i] = &query.Branch{Pattern: c.List[0].Branch, Exact: true} 378 + return filtered, query.Simplify(and) 379 + 380 + case *query.RepoBranches: 350 381 // We can only replace if all the repos want the same branches. 
351 - want := b.Set[filtered[0].names[0]] 382 + want := c.Set[filtered[0].repos[0].Name] 352 383 for _, s := range filtered { 353 - for _, name := range s.names { 354 - if !strSliceEqual(want, b.Set[name]) { 384 + for _, repo := range s.repos { 385 + if !strSliceEqual(want, c.Set[repo.Name]) { 355 386 return filtered, and 356 387 } 357 388 } ··· 359 390 360 391 // Every repo wants the same branches, so we can replace RepoBranches 361 392 // with a list of branch queries. 362 - and.Children[i] = b.Branches(filtered[0].names[0]) 393 + and.Children[i] = c.Branches(filtered[0].repos[0].Name) 363 394 return filtered, query.Simplify(and) 364 395 } 365 396 ··· 777 808 if priorityDiff != 0 { 778 809 return priorityDiff > 0 779 810 } 780 - if len(res[i].names) == 0 || len(res[j].names) == 0 { 811 + if len(res[i].repos) == 0 || len(res[j].repos) == 0 { 781 812 // Protect against empty names which can happen if we fail to List or 782 813 // the shard is full of tombstones. Prefer the shard which has names. 
783 - return len(res[i].names) >= len(res[j].names) 814 + return len(res[i].repos) >= len(res[j].repos) 784 815 } 785 - return res[i].names[0] < res[j].names[0] 816 + return res[i].repos[0].Name < res[j].repos[0].Name 786 817 }) 787 818 788 819 s.ranked = res ··· 802 833 803 834 var ( 804 835 maxPriority float64 805 - names = make([]string, 0, len(result.Repos)) 836 + repos = make([]*zoekt.Repository, 0, len(result.Repos)) 806 837 ) 807 - for _, r := range result.Repos { 808 - names = append(names, r.Repository.Name) 809 - if r.Repository.RawConfig != nil { 810 - priority, _ := strconv.ParseFloat(r.Repository.RawConfig["priority"], 64) 838 + for i := range result.Repos { 839 + repo := &result.Repos[i].Repository 840 + repos = append(repos, repo) 841 + if repo.RawConfig != nil { 842 + priority, _ := strconv.ParseFloat(repo.RawConfig["priority"], 64) 811 843 if priority > maxPriority { 812 844 maxPriority = priority 813 845 } ··· 816 848 817 849 return rankedShard{ 818 850 Searcher: s, 819 - names: names, 851 + repos: repos, 820 852 priority: maxPriority, 821 853 } 822 854 }
+22 -5
shards/shards_test.go
··· 18 18 "bytes" 19 19 "context" 20 20 "fmt" 21 + "hash/fnv" 21 22 "log" 22 23 "math" 23 24 "os" ··· 26 27 "testing" 27 28 "time" 28 29 30 + "github.com/RoaringBitmap/roaring" 29 31 "github.com/google/go-cmp/cmp" 30 32 "github.com/google/go-cmp/cmp/cmpopts" 31 33 "github.com/google/zoekt" ··· 180 182 } 181 183 182 184 ss.replace(shardName, &rankSearcher{ 183 - repo: &zoekt.Repository{Name: repoName}, 185 + repo: &zoekt.Repository{ID: hash(repoName), Name: repoName}, 184 186 rank: uint16(n - i), 185 187 }) 186 188 } ··· 193 195 t.Fatalf("no reposet: got %d results, want %d", len(res.Files), n) 194 196 } 195 197 196 - repoBranches := &query.RepoBranches{Set: make(map[string][]string)} 198 + repoBranchesSet := &query.RepoBranches{Set: make(map[string][]string)} 199 + branchesRepos := &query.BranchesRepos{List: []query.BranchRepos{ 200 + {Branch: "HEAD", Repos: roaring.New()}, 201 + }} 202 + 197 203 for _, name := range repoSetNames { 198 - repoBranches.Set[name] = []string{"HEAD"} 204 + repoBranchesSet.Set[name] = []string{"HEAD"} 205 + branchesRepos.List[0].Repos.Add(hash(name)) 199 206 } 200 207 201 208 set := query.NewRepoSet(repoSetNames...) ··· 206 213 // Test with the same reposet again 207 214 query.NewAnd(set, sub), 208 215 209 - query.NewAnd(repoBranches, sub), 216 + query.NewAnd(repoBranchesSet, sub), 210 217 // Test with the same repoBranches again 211 - query.NewAnd(repoBranches, sub), 218 + query.NewAnd(repoBranchesSet, sub), 219 + 220 + query.NewAnd(branchesRepos, sub), 221 + // Test with the same repoBranches with IDs again 222 + query.NewAnd(branchesRepos, sub), 212 223 } 213 224 214 225 for _, q := range queries { ··· 222 233 t.Fatalf("%s: got %d results, want %d", q, len(res.Files), len(repoSetNames)) 223 234 } 224 235 } 236 + } 237 + 238 + func hash(name string) uint32 { 239 + h := fnv.New32() 240 + h.Write([]byte(name)) 241 + return h.Sum32() 225 242 } 226 243 227 244 type memSeeker struct {