fork of https://github.com/sourcegraph/zoekt
1// Copyright 2017 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
// Version constants for the index: the on-disk format version, the feature
// version, and the bounds used for forwards and backwards compatibility.
const (
	// IndexFormatVersion is the version number of the on-disk index format.
	// It goes up every time that format changes.
	//
	// 5: subrepositories.
	// 6: remove size prefix for posting varint list.
	// 7: move subrepos into Repository struct.
	// 8: move repoMetaData out of indexMetadata
	// 9: use bigendian uint64 for trigrams.
	// 10: sections for rune offsets.
	// 11: file ends in rune offsets.
	// 12: 64-bit branchmasks.
	// 13: content checksums
	// 14: languages
	// 15: rune based symbol sections
	// 16: ctags metadata
	IndexFormatVersion = 16

	// FeatureVersion goes up when a feature is added that requires data to be
	// reindexed even though the format version stays the same.
	//
	// 2: Rank field for shards.
	// 3: Rank documents within shards
	// 4: Dedup file bugfix
	// 5: Remove max line size limit
	// 6: Include '#' into the LineFragment template
	// 7: Record skip reasons in the index.
	// 8: Record source path in the index.
	// 9: Store ctags metadata & bump default max file size
	// 10: Compound shards; more flexible TOC format.
	// 11: Bloom filters for file names & contents
	// 12: go-enry for identifying file languages
	FeatureVersion = 12

	// WriteMinFeatureVersion and ReadMinFeatureVersion together bound forwards
	// and backwards compatibility.
	//
	// Example: suppose a new on-disk encoding for filenameNgrams is introduced
	// in a new section while the old section is kept. Only FeatureVersion is
	// bumped, because the previous version can still read the file and ignore
	// the new section, although index files should be regenerated. Once the
	// new encoding is fully rolled out and stable, the old section and its
	// reader can be removed, and both WriteMinFeatureVersion and
	// ReadMinFeatureVersion can be raised to the current FeatureVersion. That
	// signals that readers must understand the new version and that older
	// versions are no longer valid.
	//
	// This scheme can express compatibility across arbitrary version offsets.

	// WriteMinFeatureVersion constrains forwards compatibility: files are
	// emitted such that they won't load in a zoekt whose FeatureVersion is
	// below this value.
	WriteMinFeatureVersion = 10

	// ReadMinFeatureVersion constrains backwards compatibility: a file whose
	// FeatureVersion is below this value is refused at load time.
	ReadMinFeatureVersion = 8

	// NextIndexFormatVersion is the index format version that follows
	// IndexFormatVersion.
	//
	// 17: compound shard (multi repo)
	NextIndexFormatVersion = 17
)
70
71type indexTOC struct {
72 fileContents compoundSection
73 fileNames compoundSection
74 fileSections compoundSection
75 postings compoundSection
76 newlines compoundSection
77 ngramText simpleSection
78 runeOffsets simpleSection
79 fileEndRunes simpleSection
80 languages simpleSection
81
82 fileEndSymbol simpleSection
83 symbolMap lazyCompoundSection
84 symbolKindMap compoundSection
85 symbolMetaData simpleSection
86
87 branchMasks simpleSection
88 subRepos simpleSection
89
90 nameNgramText simpleSection
91 namePostings compoundSection
92 nameRuneOffsets simpleSection
93 metaData simpleSection
94 repoMetaData simpleSection
95 nameEndRunes simpleSection
96 contentChecksums simpleSection
97 runeDocSections simpleSection
98
99 repos simpleSection
100
101 ranks simpleSection
102}
103
104func (t *indexTOC) sections() []section {
105 // This old sections list is only needed to maintain backwards compatibility,
106 // and can be removed when a migration to tagged sections is complete.
107 return []section{
108 // This must be first, so it can be reliably read across
109 // file format versions.
110 &t.metaData,
111 &t.repoMetaData,
112 &t.fileContents,
113 &t.fileNames,
114 &t.fileSections,
115 &t.fileEndSymbol,
116 &t.symbolMap,
117 &t.symbolKindMap,
118 &t.symbolMetaData,
119 &t.newlines,
120 &t.ngramText,
121 &t.postings,
122 &t.nameNgramText,
123 &t.namePostings,
124 &t.branchMasks,
125 &t.subRepos,
126 &t.runeOffsets,
127 &t.nameRuneOffsets,
128 &t.fileEndRunes,
129 &t.nameEndRunes,
130 &t.contentChecksums,
131 &t.languages,
132 &t.runeDocSections,
133 }
134}
135
136func (t *indexTOC) sectionsNext() []section {
137 return append(t.sections(), &t.repos)
138}
139
140type taggedSection struct {
141 tag string
142 sec section
143}
144
145func (t *indexTOC) sectionsTagged() map[string]section {
146 out := map[string]section{}
147 for _, ent := range t.sectionsTaggedList() {
148 out[ent.tag] = ent.sec
149 }
150 for _, ent := range t.sectionsTaggedCompatibilityList() {
151 out[ent.tag] = ent.sec
152 }
153 return out
154}
155
156func (t *indexTOC) sectionsTaggedList() []taggedSection {
157 var unusedSimple simpleSection
158
159 return []taggedSection{
160 {"metaData", &t.metaData},
161 {"repoMetaData", &t.repoMetaData},
162 {"fileContents", &t.fileContents},
163 {"fileNames", &t.fileNames},
164 {"fileSections", &t.fileSections},
165 {"fileEndSymbol", &t.fileEndSymbol},
166 {"symbolMap", &t.symbolMap},
167 {"symbolKindMap", &t.symbolKindMap},
168 {"symbolMetaData", &t.symbolMetaData},
169 {"newlines", &t.newlines},
170 {"ngramText", &t.ngramText},
171 {"postings", &t.postings},
172 {"nameNgramText", &t.nameNgramText},
173 {"namePostings", &t.namePostings},
174 {"branchMasks", &t.branchMasks},
175 {"subRepos", &t.subRepos},
176 {"runeOffsets", &t.runeOffsets},
177 {"nameRuneOffsets", &t.nameRuneOffsets},
178 {"fileEndRunes", &t.fileEndRunes},
179 {"nameEndRunes", &t.nameEndRunes},
180 {"contentChecksums", &t.contentChecksums},
181 {"languages", &t.languages},
182 {"runeDocSections", &t.runeDocSections},
183 {"repos", &t.repos},
184
185 // We no longer write these sections, but we still return them here to avoid
186 // warnings about unknown sections.
187 {"nameBloom", &unusedSimple},
188 {"contentBloom", &unusedSimple},
189 {"ranks", &unusedSimple},
190 }
191}
192
193// sectionsTaggedCompatibilityList returns a list of sections that will be
194// handled or converted for backwards compatiblity, but aren't written by
195// the current iteration of the indexer.
196func (t *indexTOC) sectionsTaggedCompatibilityList() []taggedSection {
197 return []taggedSection{}
198}