fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

stats: fixed wrong size reporting for repos from compound shards (#352)

The boundaries slice keeps the offsets at which text contents of files start within a shard,
one entry per file and one sentinel entry in the end. Therefore, for simple shards the last
entry in this slice can be used to determine the total size of the files in the shard.
The accurate calculation (boundaries[len(boundaries) - 1] - boundaries[0]) is simplified
by the relativeIndex() trick, so boundaries[0] equals 0, and the expression we were using to
calculate the file size was boundaries[len(boundaries) - 1].

Later we added support for compound shards where different files in a shard can belong to
different repos, although files from a single repo always occupy a contiguous subrange in
all data structures where direct indexing by file number makes sense. If the files span the
subrange of indexes [l, r) the correct expression for their total size is boundaries[r] - boundaries[l].
However, the old code had a mistake where we used, by analogy with the simple shard case,
the one-term expression boundaries[r]. This resulted in overestimation of the total repo sizes for
repos that ended up in compound shards.

+202 -25
+10 -11
eval_test.go
··· 171 171 }) 172 172 } 173 173 174 - // compoundReposShard returns a compound shard where each repo has 1 document. 175 174 func compoundReposShard(t *testing.T, names ...string) *indexData { 176 175 t.Helper() 177 - b := newIndexBuilder() 178 - b.indexFormatVersion = NextIndexFormatVersion 176 + 177 + repos := make([]*Repository, 0, len(names)) 178 + docs := make([][]Document, 0, len(names)) 179 179 for _, name := range names { 180 - if err := b.setRepository(&Repository{ID: hash(name), Name: name}); err != nil { 181 - t.Fatal(err) 180 + repos = append(repos, &Repository{ID: hash(name), Name: name}) 181 + ds := []Document{ 182 + Document{Name: name + ".txt", Content: []byte(name + " content")}, 183 + Document{Name: name + ".2.txt", Content: []byte(name + " content 2")}, 182 184 } 183 - if err := b.AddFile(name+".txt", []byte(name+" content")); err != nil { 184 - t.Fatal(err) 185 - } 186 - if err := b.AddFile(name+".2.txt", []byte(name+" content 2")); err != nil { 187 - t.Fatal(err) 188 - } 185 + docs = append(docs, ds) 189 186 } 187 + 188 + b := testIndexBuilderCompound(t, repos, docs) 190 189 s := searcherForTest(t, b) 191 190 return s.(*indexData) 192 191 }
+186
index_test.go
··· 61 61 return b 62 62 } 63 63 64 + func testIndexBuilderCompound(t *testing.T, repos []*Repository, docs [][]Document) *IndexBuilder { 65 + t.Helper() 66 + 67 + b := newIndexBuilder() 68 + b.indexFormatVersion = NextIndexFormatVersion 69 + b.contentBloom.bits = b.contentBloom.bits[:bloomSizeTest] 70 + b.nameBloom.bits = b.nameBloom.bits[:bloomSizeTest] 71 + 72 + if len(repos) != len(docs) { 73 + t.Fatalf("testIndexBuilderCompound: repos must be the same length as docs, got: len(repos)=%d len(docs)=%d", len(repos), len(docs)) 74 + } 75 + 76 + for i, repo := range repos { 77 + if err := b.setRepository(repo); err != nil { 78 + t.Fatal(err) 79 + } 80 + for j, d := range docs[i] { 81 + if err := b.Add(d); err != nil { 82 + t.Fatalf("Add %d %d: %v", i, j, err) 83 + } 84 + } 85 + } 86 + 87 + return b 88 + } 89 + 64 90 func TestBoundary(t *testing.T) { 65 91 b := testIndexBuilder(t, nil, 66 92 Document{Name: "f1", Content: []byte("x the")}, ··· 2176 2202 res = searchForTest(t, b, &query.Language{Language: "C++"}) 2177 2203 wantSingleMatch(res, "hello.h") 2178 2204 } 2205 + 2206 + func TestStats(t *testing.T) { 2207 + ignored := []cmp.Option{ 2208 + cmpopts.EquateEmpty(), 2209 + cmpopts.IgnoreFields(RepoListEntry{}, "Repository"), 2210 + cmpopts.IgnoreFields(RepoListEntry{}, "IndexMetadata"), 2211 + cmpopts.IgnoreFields(RepoStats{}, "IndexBytes"), 2212 + } 2213 + 2214 + repoListEntries := func(b *IndexBuilder) []RepoListEntry { 2215 + searcher := searcherForTest(t, b) 2216 + indexdata := searcher.(*indexData) 2217 + return indexdata.repoListEntry 2218 + } 2219 + 2220 + t.Run("one empty repo", func(t *testing.T) { 2221 + b := testIndexBuilder(t, nil) 2222 + got := repoListEntries(b) 2223 + want := []RepoListEntry{ 2224 + RepoListEntry{ 2225 + Stats: RepoStats{ 2226 + Repos: 0, 2227 + Shards: 1, 2228 + Documents: 0, 2229 + IndexBytes: 20, 2230 + ContentBytes: 0, 2231 + NewLinesCount: 0, 2232 + DefaultBranchNewLinesCount: 0, 2233 + OtherBranchesNewLinesCount: 0, 2234 + }, 2235 + }, 2236 + } 2237 + 2238 + if diff := cmp.Diff(want, got, ignored...); diff != "" { 2239 + t.Fatalf("mismatch (-want +got):\n%s", diff) 2240 + } 2241 + 2242 + }) 2243 + 2244 + t.Run("one simple shard", func(t *testing.T) { 2245 + b := testIndexBuilder(t, nil, 2246 + Document{Name: "doc 0", Content: []byte("content 0")}, 2247 + Document{Name: "doc 1", Content: []byte("content 1")}, 2248 + ) 2249 + got := repoListEntries(b) 2250 + want := []RepoListEntry{ 2251 + RepoListEntry{ 2252 + Stats: RepoStats{ 2253 + Repos: 0, 2254 + Shards: 1, 2255 + Documents: 2, 2256 + IndexBytes: 224, 2257 + ContentBytes: 28, 2258 + NewLinesCount: 0, 2259 + DefaultBranchNewLinesCount: 0, 2260 + OtherBranchesNewLinesCount: 0, 2261 + }, 2262 + }, 2263 + } 2264 + 2265 + if diff := cmp.Diff(want, got, ignored...); diff != "" { 2266 + t.Fatalf("mismatch (-want +got):\n%s", diff) 2267 + } 2268 + 2269 + }) 2270 + 2271 + t.Run("one compound shard", func(t *testing.T) { 2272 + b := testIndexBuilderCompound(t, 2273 + []*Repository{ 2274 + &Repository{Name: "repo 0"}, 2275 + &Repository{Name: "repo 1"}, 2276 + }, 2277 + [][]Document{ 2278 + []Document{ 2279 + Document{Name: "doc 0", Content: []byte("content 0")}, 2280 + Document{Name: "doc 1", Content: []byte("content 1")}, 2281 + }, 2282 + []Document{ 2283 + Document{Name: "doc 2", Content: []byte("content 2")}, 2284 + Document{Name: "doc 3", Content: []byte("content 3")}, 2285 + }, 2286 + }, 2287 + ) 2288 + got := repoListEntries(b) 2289 + want := []RepoListEntry{ 2290 + RepoListEntry{ 2291 + Stats: RepoStats{ 2292 + Repos: 0, 2293 + Shards: 1, 2294 + Documents: 2, 2295 + IndexBytes: 180, 2296 + ContentBytes: 28, 2297 + NewLinesCount: 0, 2298 + DefaultBranchNewLinesCount: 0, 2299 + OtherBranchesNewLinesCount: 0, 2300 + }, 2301 + }, 2302 + RepoListEntry{ 2303 + Stats: RepoStats{ 2304 + Repos: 0, 2305 + Shards: 1, 2306 + Documents: 2, 2307 + IndexBytes: 180, 2308 + ContentBytes: 28, 2309 + NewLinesCount: 0, 2310 + DefaultBranchNewLinesCount: 0, 2311 + OtherBranchesNewLinesCount: 0, 2312 + }, 2313 + }, 2314 + } 2315 + 2316 + if diff := cmp.Diff(want, got, ignored...); diff != "" { 2317 + t.Fatalf("mismatch (-want +got):\n%s", diff) 2318 + } 2319 + }) 2320 + 2321 + t.Run("compound shard with empty repos", func(t *testing.T) { 2322 + b := testIndexBuilderCompound(t, 2323 + []*Repository{ 2324 + &Repository{Name: "repo 0"}, 2325 + &Repository{Name: "repo 1"}, 2326 + &Repository{Name: "repo 2"}, 2327 + &Repository{Name: "repo 3"}, 2328 + &Repository{Name: "repo 4"}, 2329 + }, 2330 + [][]Document{ 2331 + []Document{Document{Name: "doc 0", Content: []byte("content 0")}}, 2332 + nil, 2333 + []Document{Document{Name: "doc 1", Content: []byte("content 1")}}, 2334 + nil, 2335 + nil, 2336 + }, 2337 + ) 2338 + got := repoListEntries(b) 2339 + 2340 + entryEmpty := RepoListEntry{Stats: RepoStats{ 2341 + Shards: 1, 2342 + Documents: 0, 2343 + ContentBytes: 0, 2344 + }} 2345 + entryNonEmpty := RepoListEntry{Stats: RepoStats{ 2346 + Shards: 1, 2347 + Documents: 1, 2348 + ContentBytes: 14, 2349 + }} 2350 + 2351 + want := []RepoListEntry{ 2352 + entryNonEmpty, 2353 + entryEmpty, 2354 + entryNonEmpty, 2355 + entryEmpty, 2356 + entryEmpty, 2357 + } 2358 + 2359 + if diff := cmp.Diff(want, got, ignored...); diff != "" { 2360 + t.Fatalf("mismatch (-want +got):\n%s", diff) 2361 + } 2362 + 2363 + }) 2364 + }
+6 -14
indexdata.go
··· 179 179 // calculates stats for files in the range [start, end). 180 180 func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats { 181 181 if start >= end { 182 + // An empty shard for an empty repository. 182 183 return RepoStats{ 183 - IndexBytes: int64(d.memoryUse()), 184 - Shards: 1, 184 + Shards: 1, 185 185 } 186 186 } 187 187 188 - var last uint32 189 - if len(d.boundaries) > int(end) { 190 - last += d.boundaries[end] 191 - } 192 - 193 - lastFN := last 194 - if len(d.fileNameIndex) > int(end) { 195 - lastFN = d.fileNameIndex[end] 196 - } 197 - 188 + bytesContent := d.boundaries[end] - d.boundaries[start] 189 + bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start] 198 190 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end) 199 191 200 192 // CR keegan for stefan: I think we may want to restructure RepoListEntry so ··· 204 196 // after aggregation. For now I will move forward with this until we can 205 197 // chat more. 206 198 return RepoStats{ 207 - ContentBytes: int64(int(last) + int(lastFN)), 199 + ContentBytes: int64(bytesContent) + int64(bytesFN), 208 200 Documents: int(end - start), 209 201 // CR keegan for stefan: our shard count is going to go out of whack, 210 202 // since we will aggregate these. So we will report more shards than are ··· 221 213 func (d *indexData) calculateStats() error { 222 214 d.repoListEntry = make([]RepoListEntry, 0, len(d.repoMetaData)) 223 215 var start, end uint32 216 + 224 217 for repoID, md := range d.repoMetaData { 225 218 // determine the file range for repo i 226 219 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) { 227 220 end++ 228 221 } 229 - 230 222 if start < end && d.repos[start] != uint16(repoID) { 231 223 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID) 232 224 }