fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

toc: modify disk format to include names for backwards/forwards compatibility.

Including section names in the table of contents permits simpler
forwards and backwards compatibility. Instead of having to bump
the entire IndexFormatVersion when a new section is added, there is
simply a new section present in the table of contents.

Older versions can read TOCs with unknown sections and skip over them
with a warning. This is useful to permit downgrades without always
requiring a reindex.

Newer versions can read TOCs from older versions with missing sections
and handle them gracefully, by checking for empty sections when loading
an index file and implementing whatever fallback code is necessary.

Section evolution is possible by having a new name for a tagged section,
adding the old section to the CompatibilityList, and writing the
conversion code when loading the file, or modifying the users of the
section to use whichever one is loaded.

Change-Id: I9aa05f29eb9d64fd0fff218f008d2031f1a15c8c

+314 -46
+8 -7
api.go
··· 339 339 // IndexMetadata holds metadata stored in the index file. It contains 340 340 // data generated by the core indexing library. 341 341 type IndexMetadata struct { 342 - IndexFormatVersion int 343 - IndexFeatureVersion int 344 - IndexTime time.Time 345 - PlainASCII bool 346 - LanguageMap map[string]byte 347 - ZoektVersion string 348 - ID string 342 + IndexFormatVersion int 343 + IndexFeatureVersion int 344 + IndexMinReaderVersion int 345 + IndexTime time.Time 346 + PlainASCII bool 347 + LanguageMap map[string]byte 348 + ZoektVersion string 349 + ID string 349 350 } 350 351 351 352 // Statistics of a (collection of) repositories.
+1 -1
build/builder.go
··· 302 302 FeatureVersion: zoekt.FeatureVersion, 303 303 }, { 304 304 IndexFormatVersion: zoekt.NextIndexFormatVersion, 305 - FeatureVersion: zoekt.NextFeatureVersion, 305 + FeatureVersion: zoekt.FeatureVersion, 306 306 }} 307 307 308 308 // IncrementalSkipIndexing returns true if the index present on disk matches
+13 -5
build/builder_test.go
··· 13 13 "github.com/google/zoekt" 14 14 ) 15 15 16 + var update = flag.Bool("update", false, "update golden file") 17 + 16 18 // ensure we don't regress on how we build v16 17 19 func TestBuildv16(t *testing.T) { 18 20 dir := t.TempDir() ··· 57 59 58 60 gotP := filepath.Join(dir, "repo_v16.00000.zoekt") 59 61 60 - // uncomment to update 61 - //err = os.Rename(gotP, wantP) 62 - //if err != nil { 63 - // t.Fatal(err) 64 - //} 62 + if *update { 63 + data, err := os.ReadFile(gotP) 64 + if err != nil { 65 + t.Fatal(err) 66 + } 67 + err = os.WriteFile(wantP, data, 0644) 68 + if err != nil { 69 + t.Fatal(err) 70 + } 71 + return 72 + } 65 73 66 74 got, err := os.ReadFile(gotP) 67 75 if err != nil {
-1
eval_test.go
··· 133 133 t.Helper() 134 134 b := newIndexBuilder() 135 135 b.indexFormatVersion = NextIndexFormatVersion 136 - b.featureVersion = NextFeatureVersion 137 136 for _, name := range names { 138 137 if err := b.setRepository(&Repository{Name: name}); err != nil { 139 138 t.Fatal(err)
-1
merge.go
··· 90 90 91 91 ib := newIndexBuilder() 92 92 ib.indexFormatVersion = NextIndexFormatVersion 93 - ib.featureVersion = NextFeatureVersion 94 93 95 94 for _, d := range ds { 96 95 lastRepoID := -1
+88 -10
read.go
··· 19 19 "encoding/json" 20 20 "fmt" 21 21 "hash/crc64" 22 + "log" 22 23 "os" 23 24 "sort" 24 25 ··· 62 63 return binary.BigEndian.Uint64(b), nil 63 64 } 64 65 66 + func (r *reader) ReadByte() (byte, error) { 67 + b, err := r.r.Read(r.off, 1) 68 + r.off += 1 69 + if err != nil { 70 + return 0, err 71 + } 72 + return b[0], nil 73 + } 74 + 75 + func (r *reader) Varint() (uint64, error) { 76 + v, err := binary.ReadUvarint(r) 77 + if err != nil { 78 + return 0, err 79 + } 80 + return v, nil 81 + } 82 + 83 + func (r *reader) Str() (string, error) { 84 + slen, err := r.Varint() 85 + if err != nil { 86 + return "", err 87 + } 88 + b, err := r.r.Read(r.off, uint32(slen)) 89 + if err != nil { 90 + return "", err 91 + } 92 + r.off += uint32(slen) 93 + return string(b), nil 94 + } 95 + 65 96 func (r *reader) readTOC(toc *indexTOC) error { 66 97 sz, err := r.r.Size() 67 98 if err != nil { ··· 81 112 return err 82 113 } 83 114 84 - secs := toc.sections() 85 - if len(secs) != int(sectionCount) { 86 - secs = toc.sectionsNext() 87 - } 115 + if sectionCount == 0 { 116 + // tagged sections are indicated by a 0 sectionCount, 117 + // and then a list of string-tagged type-indicated sections. 
118 + secs := toc.sectionsTagged() 119 + for r.off < tocSection.off+tocSection.sz { 120 + tag, err := r.Str() 121 + if err != nil { 122 + return err 123 + } 124 + kind, err := r.Varint() 125 + if err != nil { 126 + return err 127 + } 128 + sec := secs[tag] 129 + if sec != nil && sec.kind() == sectionKind(kind) { 130 + // happy path 131 + if err := sec.read(r); err != nil { 132 + return err 133 + } 134 + continue 135 + } 136 + // error case: skip over unknown section 137 + if sec == nil { 138 + log.Printf("file %s TOC has unknown section %q", r.r.Name(), tag) 139 + } else { 140 + return fmt.Errorf("file %s TOC section %q expects kind %d, got kind %d", r.r.Name(), tag, 141 + kind, sec.kind()) 142 + } 143 + if kind == 0 { 144 + (&simpleSection{}).read(r) 145 + } else if kind == 1 { 146 + (&compoundSection{}).read(r) 147 + } 148 + } 149 + } else { 150 + // TODO: Remove this branch when ReaderMinFeatureVersion >= 10 88 151 89 - if len(secs) != int(sectionCount) { 90 - return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs)) 91 - } 152 + secs := toc.sections() 92 153 93 - for _, s := range secs { 94 - if err := s.read(r); err != nil { 95 - return err 154 + if len(secs) != int(sectionCount) { 155 + secs = toc.sectionsNext() 156 + } 157 + 158 + if len(secs) != int(sectionCount) { 159 + return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs)) 160 + } 161 + 162 + for _, s := range secs { 163 + if err := s.read(r); err != nil { 164 + return err 165 + } 96 166 } 97 167 } 98 168 return nil ··· 169 239 d.repoMetaData = make([]Repository, 0, len(repos)) 170 240 for _, r := range repos { 171 241 d.repoMetaData = append(d.repoMetaData, *r) 242 + } 243 + 244 + if d.metaData.IndexFeatureVersion < ReadMinFeatureVersion { 245 + return nil, fmt.Errorf("file is feature version %d, want feature version >= %d", d.metaData.IndexFeatureVersion, ReadMinFeatureVersion) 246 + } 247 + 248 + if d.metaData.IndexMinReaderVersion > 
FeatureVersion { 249 + return nil, fmt.Errorf("file needs read feature version >= %d, have read feature version %d", d.metaData.IndexMinReaderVersion, FeatureVersion) 172 250 } 173 251 174 252 d.boundariesStart = toc.fileContents.data.off
+72
read_test.go
··· 20 20 "encoding/json" 21 21 "flag" 22 22 "fmt" 23 + "io/fs" 23 24 "io/ioutil" 24 25 "os" 26 + "path" 25 27 "path/filepath" 26 28 "reflect" 27 29 "strconv" ··· 265 267 } 266 268 }) 267 269 } 270 + } 268 271 272 + func TestBackwardsCompat(t *testing.T) { 273 + if *update { 274 + b, err := NewIndexBuilder(nil) 275 + if err != nil { 276 + t.Fatalf("NewIndexBuilder: %v", err) 277 + } 278 + 279 + if err := b.AddFile("filename", []byte("abcde")); err != nil { 280 + t.Fatalf("AddFile: %v", err) 281 + } 282 + 283 + var buf bytes.Buffer 284 + b.Write(&buf) 285 + 286 + outname := fmt.Sprintf("testdata/backcompat/new_v%d.%05d.zoekt", IndexFormatVersion, 0) 287 + t.Log("writing new file", outname) 288 + 289 + err = os.WriteFile(outname, buf.Bytes(), 0644) 290 + if err != nil { 291 + t.Fatalf("Creating output file: %v", err) 292 + } 293 + } 294 + 295 + compatibleFiles, err := fs.Glob(os.DirFS("."), "testdata/backcompat/*.zoekt") 296 + if err != nil { 297 + t.Fatalf("fs.Glob: %v", err) 298 + } 299 + 300 + for _, fname := range compatibleFiles { 301 + t.Run(path.Base(fname), 302 + func(t *testing.T) { 303 + f, err := os.Open(fname) 304 + if err != nil { 305 + t.Fatal("os.Open", err) 306 + } 307 + idx, err := NewIndexFile(f) 308 + if err != nil { 309 + t.Fatal("NewIndexFile", err) 310 + } 311 + r := reader{r: idx} 312 + 313 + var toc indexTOC 314 + err = r.readTOC(&toc) 315 + 316 + if err != nil { 317 + t.Errorf("got read error %v", err) 318 + } 319 + if toc.fileContents.data.sz != 5 { 320 + t.Errorf("got contents size %d, want 5", toc.fileContents.data.sz) 321 + } 322 + 323 + data, err := r.readIndexData(&toc) 324 + if err != nil { 325 + t.Fatalf("readIndexData: %v", err) 326 + } 327 + if got := data.fileName(0); string(got) != "filename" { 328 + t.Errorf("got filename %q, want %q", got, "filename") 329 + } 330 + 331 + if len(data.ngrams.DumpMap()) != 3 { 332 + t.Fatalf("got ngrams %v, want 3 ngrams", data.ngrams) 333 + } 334 + 335 + if sec := 
data.ngrams.Get(stringToNGram("bcq")); sec.sz > 0 { 336 + t.Errorf("found ngram bcd in %v", data.ngrams) 337 + } 338 + }, 339 + ) 340 + } 269 341 } 270 342 271 343 func TestBackfillIDIsDeterministic(t *testing.T) {
+27
section.go
··· 65 65 w.Write(enc[:m]) 66 66 } 67 67 68 + func (w *writer) String(s string) { 69 + b := []byte(s) 70 + w.Varint(uint32(len(b))) 71 + w.Write(b) 72 + } 73 + 68 74 func (s *simpleSection) start(w *writer) { 69 75 s.off = w.Off() 70 76 } ··· 77 83 type section interface { 78 84 read(*reader) error 79 85 write(*writer) 86 + kind() sectionKind // simple or complex, used in serialization 80 87 } 81 88 89 + type sectionKind int 90 + 91 + const ( 92 + sectionKindSimple sectionKind = 0 93 + sectionKindCompound sectionKind = 1 94 + sectionKindCompoundLazy sectionKind = 2 95 + ) 96 + 82 97 // simpleSection is a simple range of bytes. 83 98 type simpleSection struct { 84 99 off uint32 85 100 sz uint32 101 + } 102 + 103 + func (s *simpleSection) kind() sectionKind { 104 + return sectionKindSimple 86 105 } 87 106 88 107 func (s *simpleSection) read(r *reader) error { ··· 110 129 111 130 offsets []uint32 112 131 index simpleSection 132 + } 133 + 134 + func (s *compoundSection) kind() sectionKind { 135 + return sectionKindCompound 113 136 } 114 137 115 138 func (s *compoundSection) start(w *writer) { ··· 162 185 163 186 type lazyCompoundSection struct { 164 187 compoundSection 188 + } 189 + 190 + func (s *lazyCompoundSection) kind() sectionKind { 191 + return sectionKindCompoundLazy 165 192 } 166 193 167 194 func (s *lazyCompoundSection) read(r *reader) error {
+2 -2
testdata/golden/TestReadSearch/repo17_v17.00000.golden
··· 1 1 { 2 2 "FormatVersion": 17, 3 - "FeatureVersion": 1, 3 + "FeatureVersion": 10, 4 4 "FileMatches": [ 5 5 [ 6 6 { ··· 71 71 null, 72 72 null 73 73 ] 74 - } 74 + }
+2 -2
testdata/golden/TestReadSearch/repo_v16.00000.golden
··· 1 1 { 2 2 "FormatVersion": 16, 3 - "FeatureVersion": 9, 3 + "FeatureVersion": 10, 4 4 "FileMatches": [ 5 5 [ 6 6 { ··· 71 71 null, 72 72 null 73 73 ] 74 - } 74 + }
testdata/shards/repo17_v17.00000.zoekt

This is a binary file and will not be displayed.

testdata/shards/repo_v16.00000.zoekt

This is a binary file and will not be displayed.

+76 -2
toc.go
··· 40 40 // 7: Record skip reasons in the index. 41 41 // 8: Record source path in the index. 42 42 // 9: Store ctags metadata & bump default max file size 43 - const FeatureVersion = 9 43 + // 10: Compound shards; more flexible TOC format. 44 + const FeatureVersion = 10 45 + 46 + // WriteMinFeatureVersion and ReadMinFeatureVersion constrain forwards and backwards 47 + // compatibility. For example, if a new way to encode filenameNgrams on disk is 48 + // added using a new section but the old one is retained, this would only bump 49 + // FeatureVersion, since the previous version can read the file and ignore the 50 + // new section, but the index files should be regenerated. 51 + // When the new encoding is fully rolled out and stable, the section with the old 52 + // encoding and the associated reader can be removed, and WriteMinFeatureVersion and 53 + // ReadMinFeatureVersion can be set to the current FeatureVersion, indicating 54 + // that the reader must handle the new version and that older versions are no 55 + // longer valid. 56 + // In this way, compatibility with arbitrary version offsets can be indicated. 57 + 58 + // WriteMinFeatureVersion constrains forwards compatibility by emitting files 59 + // that won't load in zoekt with a FeatureVersion below it. 60 + const WriteMinFeatureVersion = 10 61 + 62 + // ReadMinFeatureVersion constrains backwards compatibility by refusing to 63 + // load a file with a FeatureVersion below it. 64 + const ReadMinFeatureVersion = 8 44 65 45 66 // 17: compound shard (multi repo) 46 67 const NextIndexFormatVersion = 17 47 - const NextFeatureVersion = 1 48 68 49 69 type indexTOC struct { 50 70 fileContents compoundSection ··· 78 98 } 79 99 80 100 func (t *indexTOC) sections() []section { 101 + // This old sections list is only needed to maintain backwards compatibility, 102 + // and can be removed when a migration to tagged sections is complete. 
81 103 return []section{ 82 104 // This must be first, so it can be reliably read across 83 105 // file format versions. ··· 110 132 func (t *indexTOC) sectionsNext() []section { 111 133 return append(t.sections(), &t.repos) 112 134 } 135 + 136 + type taggedSection struct { 137 + tag string 138 + sec section 139 + } 140 + 141 + func (t *indexTOC) sectionsTagged() map[string]section { 142 + out := map[string]section{} 143 + for _, ent := range t.sectionsTaggedList() { 144 + out[ent.tag] = ent.sec 145 + } 146 + for _, ent := range t.sectionsTaggedCompatibilityList() { 147 + out[ent.tag] = ent.sec 148 + } 149 + return out 150 + } 151 + 152 + func (t *indexTOC) sectionsTaggedList() []taggedSection { 153 + return []taggedSection{ 154 + {"metaData", &t.metaData}, 155 + {"repoMetaData", &t.repoMetaData}, 156 + {"fileContents", &t.fileContents}, 157 + {"fileNames", &t.fileNames}, 158 + {"fileSections", &t.fileSections}, 159 + {"fileEndSymbol", &t.fileEndSymbol}, 160 + {"symbolMap", &t.symbolMap}, 161 + {"symbolKindMap", &t.symbolKindMap}, 162 + {"symbolMetaData", &t.symbolMetaData}, 163 + {"newlines", &t.newlines}, 164 + {"ngramText", &t.ngramText}, 165 + {"postings", &t.postings}, 166 + {"nameNgramText", &t.nameNgramText}, 167 + {"namePostings", &t.namePostings}, 168 + {"branchMasks", &t.branchMasks}, 169 + {"subRepos", &t.subRepos}, 170 + {"runeOffsets", &t.runeOffsets}, 171 + {"nameRuneOffsets", &t.nameRuneOffsets}, 172 + {"fileEndRunes", &t.fileEndRunes}, 173 + {"nameEndRunes", &t.nameEndRunes}, 174 + {"contentChecksums", &t.contentChecksums}, 175 + {"languages", &t.languages}, 176 + {"runeDocSections", &t.runeDocSections}, 177 + {"repos", &t.repos}, 178 + } 179 + } 180 + 181 + // sectionsTaggedCompatibilityList returns a list of sections that will be 182 + // handled or converted for backwards compatiblity, but aren't written by 183 + // the current iteration of the indexer. 
184 + func (t *indexTOC) sectionsTaggedCompatibilityList() []taggedSection { 185 + return []taggedSection{} 186 + }
+25 -15
write.go
··· 25 25 "time" 26 26 ) 27 27 28 - func (w *writer) writeTOC(secs []section) { 29 - w.U32(uint32(len(secs))) 28 + func (w *writer) writeTOC(toc *indexTOC) { 29 + // Tagged sections are indicated with a 0 section count. 30 + // Tagged sections allow easier forwards and backwards 31 + // compatibility when evolving zoekt index files with new 32 + // sections. 33 + // 34 + // A tagged section is: 35 + // Varint TagLen, Tag String, Varint Kind, Section 36 + // 37 + // Section kind is indicated because simpleSections and 38 + // compoundSections have different lengths. 39 + w.U32(0) 40 + secs := toc.sectionsTaggedList() 30 41 for _, s := range secs { 31 - s.write(w) 42 + w.String(s.tag) 43 + w.Varint(uint32(s.sec.kind())) 44 + s.sec.write(w) 32 45 } 33 46 } 34 47 ··· 161 174 } 162 175 163 176 if err := b.writeJSON(&IndexMetadata{ 164 - IndexFormatVersion: b.indexFormatVersion, 165 - IndexTime: indexTime, 166 - IndexFeatureVersion: b.featureVersion, 167 - PlainASCII: b.contentPostings.isPlainASCII && b.namePostings.isPlainASCII, 168 - LanguageMap: b.languageMap, 169 - ZoektVersion: Version, 170 - ID: b.ID, 177 + IndexFormatVersion: b.indexFormatVersion, 178 + IndexTime: indexTime, 179 + IndexFeatureVersion: b.featureVersion, 180 + IndexMinReaderVersion: WriteMinFeatureVersion, 181 + PlainASCII: b.contentPostings.isPlainASCII && b.namePostings.isPlainASCII, 182 + LanguageMap: b.languageMap, 183 + ZoektVersion: Version, 184 + ID: b.ID, 171 185 }, &toc.metaData, w); err != nil { 172 186 return err 173 187 } ··· 188 202 var tocSection simpleSection 189 203 190 204 tocSection.start(w) 191 - if next { 192 - w.writeTOC(toc.sectionsNext()) 193 - } else { 194 - w.writeTOC(toc.sections()) 195 - } 205 + w.writeTOC(&toc) 196 206 tocSection.end(w) 197 207 tocSection.write(w) 198 208 return w.err