stats: fixed wrong size reporting for repos from compound shards (#352) · boltless.me/zoekt@52951bd

+10 -11

eval_test.go

··· 171 171 }) 172 172 } 173 173 174 - // compoundReposShard returns a compound shard where each repo has 1 document. 175 174 func compoundReposShard(t *testing.T, names ...string) *indexData { 176 175 t.Helper() 177 - b := newIndexBuilder() 178 - b.indexFormatVersion = NextIndexFormatVersion 176 + 177 + repos := make([]*Repository, 0, len(names)) 178 + docs := make([][]Document, 0, len(names)) 179 179 for _, name := range names { 180 - if err := b.setRepository(&Repository{ID: hash(name), Name: name}); err != nil { 181 - t.Fatal(err) 180 + repos = append(repos, &Repository{ID: hash(name), Name: name}) 181 + ds := []Document{ 182 + Document{Name: name + ".txt", Content: []byte(name + " content")}, 183 + Document{Name: name + ".2.txt", Content: []byte(name + " content 2")}, 182 184 } 183 - if err := b.AddFile(name+".txt", []byte(name+" content")); err != nil { 184 - t.Fatal(err) 185 - } 186 - if err := b.AddFile(name+".2.txt", []byte(name+" content 2")); err != nil { 187 - t.Fatal(err) 188 - } 185 + docs = append(docs, ds) 189 186 } 187 + 188 + b := testIndexBuilderCompound(t, repos, docs) 190 189 s := searcherForTest(t, b) 191 190 return s.(*indexData) 192 191 }

+186

index_test.go

··· 61 61 return b 62 62 } 63 63 64 + func testIndexBuilderCompound(t *testing.T, repos []*Repository, docs [][]Document) *IndexBuilder { 65 + t.Helper() 66 + 67 + b := newIndexBuilder() 68 + b.indexFormatVersion = NextIndexFormatVersion 69 + b.contentBloom.bits = b.contentBloom.bits[:bloomSizeTest] 70 + b.nameBloom.bits = b.nameBloom.bits[:bloomSizeTest] 71 + 72 + if len(repos) != len(docs) { 73 + t.Fatalf("testIndexBuilderCompound: repos must be the same length as docs, got: len(repos)=%d len(docs)=%d", len(repos), len(docs)) 74 + } 75 + 76 + for i, repo := range repos { 77 + if err := b.setRepository(repo); err != nil { 78 + t.Fatal(err) 79 + } 80 + for j, d := range docs[i] { 81 + if err := b.Add(d); err != nil { 82 + t.Fatalf("Add %d %d: %v", i, j, err) 83 + } 84 + } 85 + } 86 + 87 + return b 88 + } 89 + 64 90 func TestBoundary(t *testing.T) { 65 91 b := testIndexBuilder(t, nil, 66 92 Document{Name: "f1", Content: []byte("x the")}, ··· 2176 2202 res = searchForTest(t, b, &query.Language{Language: "C++"}) 2177 2203 wantSingleMatch(res, "hello.h") 2178 2204 } 2205 + 2206 + func TestStats(t *testing.T) { 2207 + ignored := []cmp.Option{ 2208 + cmpopts.EquateEmpty(), 2209 + cmpopts.IgnoreFields(RepoListEntry{}, "Repository"), 2210 + cmpopts.IgnoreFields(RepoListEntry{}, "IndexMetadata"), 2211 + cmpopts.IgnoreFields(RepoStats{}, "IndexBytes"), 2212 + } 2213 + 2214 + repoListEntries := func(b *IndexBuilder) []RepoListEntry { 2215 + searcher := searcherForTest(t, b) 2216 + indexdata := searcher.(*indexData) 2217 + return indexdata.repoListEntry 2218 + } 2219 + 2220 + t.Run("one empty repo", func(t *testing.T) { 2221 + b := testIndexBuilder(t, nil) 2222 + got := repoListEntries(b) 2223 + want := []RepoListEntry{ 2224 + RepoListEntry{ 2225 + Stats: RepoStats{ 2226 + Repos: 0, 2227 + Shards: 1, 2228 + Documents: 0, 2229 + IndexBytes: 20, 2230 + ContentBytes: 0, 2231 + NewLinesCount: 0, 2232 + DefaultBranchNewLinesCount: 0, 2233 + OtherBranchesNewLinesCount: 0, 2234 + }, 2235 + }, 2236 + } 2237 + 2238 + if diff := cmp.Diff(want, got, ignored...); diff != "" { 2239 + t.Fatalf("mismatch (-want +got):\n%s", diff) 2240 + } 2241 + 2242 + }) 2243 + 2244 + t.Run("one simple shard", func(t *testing.T) { 2245 + b := testIndexBuilder(t, nil, 2246 + Document{Name: "doc 0", Content: []byte("content 0")}, 2247 + Document{Name: "doc 1", Content: []byte("content 1")}, 2248 + ) 2249 + got := repoListEntries(b) 2250 + want := []RepoListEntry{ 2251 + RepoListEntry{ 2252 + Stats: RepoStats{ 2253 + Repos: 0, 2254 + Shards: 1, 2255 + Documents: 2, 2256 + IndexBytes: 224, 2257 + ContentBytes: 28, 2258 + NewLinesCount: 0, 2259 + DefaultBranchNewLinesCount: 0, 2260 + OtherBranchesNewLinesCount: 0, 2261 + }, 2262 + }, 2263 + } 2264 + 2265 + if diff := cmp.Diff(want, got, ignored...); diff != "" { 2266 + t.Fatalf("mismatch (-want +got):\n%s", diff) 2267 + } 2268 + 2269 + }) 2270 + 2271 + t.Run("one compound shard", func(t *testing.T) { 2272 + b := testIndexBuilderCompound(t, 2273 + []*Repository{ 2274 + &Repository{Name: "repo 0"}, 2275 + &Repository{Name: "repo 1"}, 2276 + }, 2277 + [][]Document{ 2278 + []Document{ 2279 + Document{Name: "doc 0", Content: []byte("content 0")}, 2280 + Document{Name: "doc 1", Content: []byte("content 1")}, 2281 + }, 2282 + []Document{ 2283 + Document{Name: "doc 2", Content: []byte("content 2")}, 2284 + Document{Name: "doc 3", Content: []byte("content 3")}, 2285 + }, 2286 + }, 2287 + ) 2288 + got := repoListEntries(b) 2289 + want := []RepoListEntry{ 2290 + RepoListEntry{ 2291 + Stats: RepoStats{ 2292 + Repos: 0, 2293 + Shards: 1, 2294 + Documents: 2, 2295 + IndexBytes: 180, 2296 + ContentBytes: 28, 2297 + NewLinesCount: 0, 2298 + DefaultBranchNewLinesCount: 0, 2299 + OtherBranchesNewLinesCount: 0, 2300 + }, 2301 + }, 2302 + RepoListEntry{ 2303 + Stats: RepoStats{ 2304 + Repos: 0, 2305 + Shards: 1, 2306 + Documents: 2, 2307 + IndexBytes: 180, 2308 + ContentBytes: 28, 2309 + NewLinesCount: 0, 2310 + DefaultBranchNewLinesCount: 0, 2311 + OtherBranchesNewLinesCount: 0, 2312 + }, 2313 + }, 2314 + } 2315 + 2316 + if diff := cmp.Diff(want, got, ignored...); diff != "" { 2317 + t.Fatalf("mismatch (-want +got):\n%s", diff) 2318 + } 2319 + }) 2320 + 2321 + t.Run("compound shard with empty repos", func(t *testing.T) { 2322 + b := testIndexBuilderCompound(t, 2323 + []*Repository{ 2324 + &Repository{Name: "repo 0"}, 2325 + &Repository{Name: "repo 1"}, 2326 + &Repository{Name: "repo 2"}, 2327 + &Repository{Name: "repo 3"}, 2328 + &Repository{Name: "repo 4"}, 2329 + }, 2330 + [][]Document{ 2331 + []Document{Document{Name: "doc 0", Content: []byte("content 0")}}, 2332 + nil, 2333 + []Document{Document{Name: "doc 1", Content: []byte("content 1")}}, 2334 + nil, 2335 + nil, 2336 + }, 2337 + ) 2338 + got := repoListEntries(b) 2339 + 2340 + entryEmpty := RepoListEntry{Stats: RepoStats{ 2341 + Shards: 1, 2342 + Documents: 0, 2343 + ContentBytes: 0, 2344 + }} 2345 + entryNonEmpty := RepoListEntry{Stats: RepoStats{ 2346 + Shards: 1, 2347 + Documents: 1, 2348 + ContentBytes: 14, 2349 + }} 2350 + 2351 + want := []RepoListEntry{ 2352 + entryNonEmpty, 2353 + entryEmpty, 2354 + entryNonEmpty, 2355 + entryEmpty, 2356 + entryEmpty, 2357 + } 2358 + 2359 + if diff := cmp.Diff(want, got, ignored...); diff != "" { 2360 + t.Fatalf("mismatch (-want +got):\n%s", diff) 2361 + } 2362 + 2363 + }) 2364 + }

+6 -14

indexdata.go

··· 179 179 // calculates stats for files in the range [start, end). 180 180 func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats { 181 181 if start >= end { 182 + // An empty shard for an empty repository. 182 183 return RepoStats{ 183 - IndexBytes: int64(d.memoryUse()), 184 - Shards: 1, 184 + Shards: 1, 185 185 } 186 186 } 187 187 188 - var last uint32 189 - if len(d.boundaries) > int(end) { 190 - last += d.boundaries[end] 191 - } 192 - 193 - lastFN := last 194 - if len(d.fileNameIndex) > int(end) { 195 - lastFN = d.fileNameIndex[end] 196 - } 197 - 188 + bytesContent := d.boundaries[end] - d.boundaries[start] 189 + bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start] 198 190 count, defaultCount, otherCount := d.calculateNewLinesStats(start, end) 199 191 200 192 // CR keegan for stefan: I think we may want to restructure RepoListEntry so ··· 204 196 // after aggregation. For now I will move forward with this until we can 205 197 // chat more. 206 198 return RepoStats{ 207 - ContentBytes: int64(int(last) + int(lastFN)), 199 + ContentBytes: int64(bytesContent) + int64(bytesFN), 208 200 Documents: int(end - start), 209 201 // CR keegan for stefan: our shard count is going to go out of whack, 210 202 // since we will aggregate these. So we will report more shards than are ··· 221 213 func (d *indexData) calculateStats() error { 222 214 d.repoListEntry = make([]RepoListEntry, 0, len(d.repoMetaData)) 223 215 var start, end uint32 216 + 224 217 for repoID, md := range d.repoMetaData { 225 218 // determine the file range for repo i 226 219 for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) { 227 220 end++ 228 221 } 229 - 230 222 if start < end && d.repos[start] != uint16(repoID) { 231 223 return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID) 232 224 }

Configure Feed

Configure Feed