fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Skip other sections when reading metadata (#826)

Looking at heap profiles, the `ReadMetadata` function creates a ton of garbage
objects. The main contributor is the other sections of the TOC, specifically
decoding `compoundSection.offsets`. However, to read metadata, we only really
need to parse the metadata sections.

This PR introduces a `skip` method that skips over a section without reading
it. This greatly reduces the allocations from `ReadMetadata`.

+104 -27
+50 -24
read.go
··· 21 21 "hash/crc64" 22 22 "log" 23 23 "os" 24 + "slices" 24 25 "sort" 25 26 26 27 "github.com/rs/xid" ··· 94 95 } 95 96 96 97 func (r *reader) readTOC(toc *indexTOC) error { 97 - sz, err := r.r.Size() 98 - if err != nil { 99 - return err 100 - } 101 - r.off = sz - 8 102 - 103 - var tocSection simpleSection 104 - if err := tocSection.read(r); err != nil { 105 - return err 106 - } 98 + return r.readTOCSections(toc, []string{}) 99 + } 107 100 108 - r.seek(tocSection.off) 109 - 110 - sectionCount, err := r.U32() 101 + // readTOCSections reads the table of contents of the index file. 102 + // 103 + // If the tags parameter is non-empty, it reads only those tagged sections for efficiency 104 + // and does not populate the other sections. 105 + func (r *reader) readTOCSections(toc *indexTOC, tags []string) error { 106 + tocSection, sectionCount, err := r.readHeader() 111 107 if err != nil { 112 108 return err 113 109 } ··· 126 122 return err 127 123 } 128 124 125 + skipSection := len(tags) > 0 && !slices.Contains(tags, tag) 129 126 sec := secs[tag] 130 127 if sec == nil || sec.kind() != sectionKind(kind) { 131 128 // If we don't recognize the section, we may be reading a newer index than the current version. Use 132 129 // a "dummy section" struct to skip over it. 
133 - log.Printf("encountered unrecognized index section (%s), skipping over it", tag) 130 + skipSection = true 131 + log.Printf("encountered malformed index section (%s), skipping over it", tag) 132 + 134 133 switch sectionKind(kind) { 135 134 case sectionKindSimple: 136 135 sec = &simpleSection{} ··· 143 142 } 144 143 } 145 144 146 - if err := sec.read(r); err != nil { 147 - return err 145 + if skipSection { 146 + if err := sec.skip(r); err != nil { 147 + return err 148 + } 149 + } else { 150 + if err := sec.read(r); err != nil { 151 + return err 152 + } 148 153 } 149 154 } 150 155 } else { ··· 169 174 return nil 170 175 } 171 176 177 + func (r *reader) readHeader() (simpleSection, uint32, error) { 178 + sz, err := r.r.Size() 179 + if err != nil { 180 + return simpleSection{}, 0, err 181 + } 182 + r.off = sz - 8 183 + 184 + var tocSection simpleSection 185 + if err := tocSection.read(r); err != nil { 186 + return simpleSection{}, 0, err 187 + } 188 + 189 + r.seek(tocSection.off) 190 + 191 + sectionCount, err := r.U32() 192 + if err != nil { 193 + return simpleSection{}, 0, err 194 + } 195 + return tocSection, sectionCount, nil 196 + } 197 + 172 198 func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) { 173 199 return r.file.Read(sec.off, sec.sz) 174 200 } ··· 205 231 return arr, nil 206 232 } 207 233 208 - func (r *reader) readJSON(data interface{}, sec *simpleSection) error { 234 + func (r *reader) readJSON(data interface{}, sec simpleSection) error { 209 235 blob, err := r.r.Read(sec.off, sec.sz) 210 236 if err != nil { 211 237 return err ··· 228 254 branchNames: []map[uint]string{}, 229 255 } 230 256 231 - repos, md, err := r.readMetadata(toc) 257 + repos, md, err := r.parseMetadata(toc.metaData, toc.repoMetaData) 232 258 if md != nil && !canReadVersion(md) { 233 259 return nil, fmt.Errorf("file is v%d, want v%d", md.IndexFormatVersion, IndexFormatVersion) 234 260 } else if err != nil { ··· 395 421 return &d, nil 396 422 } 397 423 398 - func 
(r *reader) readMetadata(toc *indexTOC) ([]*Repository, *IndexMetadata, error) { 424 + func (r *reader) parseMetadata(metaData simpleSection, repoMetaData simpleSection) ([]*Repository, *IndexMetadata, error) { 399 425 var md IndexMetadata 400 - if err := r.readJSON(&md, &toc.metaData); err != nil { 426 + if err := r.readJSON(&md, metaData); err != nil { 401 427 return nil, nil, err 402 428 } 403 429 ··· 410 436 } 411 437 412 438 if len(blob) == 0 { 413 - blob, err = r.r.Read(toc.repoMetaData.off, toc.repoMetaData.sz) 439 + blob, err = r.r.Read(repoMetaData.off, repoMetaData.sz) 414 440 if err != nil { 415 441 return nil, &md, err 416 442 } ··· 573 599 func ReadMetadata(inf IndexFile) ([]*Repository, *IndexMetadata, error) { 574 600 rd := &reader{r: inf} 575 601 var toc indexTOC 576 - if err := rd.readTOC(&toc); err != nil { 602 + err := rd.readTOCSections(&toc, []string{"metaData", "repoMetaData"}) 603 + if err != nil { 577 604 return nil, nil, err 578 605 } 579 - 580 - return rd.readMetadata(&toc) 606 + return rd.parseMetadata(toc.metaData, toc.repoMetaData) 581 607 } 582 608 583 609 // ReadMetadataPathAlive is like ReadMetadataPath except that it only returns
+29 -1
read_test.go
··· 32 32 33 33 "github.com/google/go-cmp/cmp" 34 34 "github.com/google/go-cmp/cmp/cmpopts" 35 - 36 35 "github.com/sourcegraph/zoekt/query" 37 36 ) 38 37 ··· 467 466 return true 468 467 }, nil) 469 468 } 469 + 470 + func BenchmarkReadMetadata(b *testing.B) { 471 + file, err := os.Open("testdata/benchmark/zoekt_v16.00000.zoekt") 472 + if err != nil { 473 + b.Fatalf("Failed to open test file: %v", err) 474 + } 475 + defer file.Close() 476 + 477 + indexFile, err := NewIndexFile(file) 478 + if err != nil { 479 + b.Fatalf("could not open index: %v", err) 480 + } 481 + 482 + b.ReportAllocs() 483 + b.ResetTimer() 484 + 485 + for i := 0; i < b.N; i++ { 486 + repos, metadata, err := ReadMetadata(indexFile) 487 + if err != nil { 488 + b.Fatalf("ReadMetadata failed: %v", err) 489 + } 490 + if len(repos) != 1 { 491 + b.Fatalf("expected 1 repository") 492 + } 493 + if metadata == nil { 494 + b.Fatalf("expected non-nil metadata") 495 + } 496 + } 497 + }
+25 -2
section.go
··· 127 127 // section is a range of bytes in the index file. 128 128 type section interface { 129 129 read(*reader) error 130 + // skip advances over the data in the section without reading it. 131 + // NOTE: the section will not contain valid data after this call, and it should not be used. 132 + skip(*reader) error 130 133 write(*writer) 131 - kind() sectionKind // simple or complex, used in serialization 134 + // kind encodes whether the section is simple or compound, and is used in serialization 135 + kind() sectionKind 132 136 } 133 137 134 138 type sectionKind int ··· 156 160 return err 157 161 } 158 162 s.sz, err = r.U32() 163 + return err 164 + } 165 + 166 + func (s *simpleSection) skip(r *reader) error { 167 + var err error 168 + _, err = r.U32() 159 169 if err != nil { 160 170 return err 161 171 } 162 - return nil 172 + _, err = r.U32() 173 + return err 163 174 } 164 175 165 176 func (s *simpleSection) write(w *writer) { ··· 212 223 } 213 224 var err error 214 225 s.offsets, err = readSectionU32(r.r, s.index) 226 + return err 227 + } 228 + 229 + func (s *compoundSection) skip(r *reader) error { 230 + if err := s.data.skip(r); err != nil { 231 + return err 232 + } 233 + if err := s.index.read(r); err != nil { 234 + return err 235 + } 236 + 237 + _, err := r.r.Read(s.index.off, s.index.sz) 215 238 return err 216 239 } 217 240