// Fork of https://github.com/sourcegraph/zoekt

// Copyright 2017 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
15package index
16
// IndexFormatVersion is the version number of the on-disk index format. It is
// increased every time the on-disk index format is changed.
//
// History:
// 5: subrepositories.
// 6: remove size prefix for posting varint list.
// 7: move subrepos into Repository struct.
// 8: move repoMetaData out of indexMetadata
// 9: use bigendian uint64 for trigrams.
// 10: sections for rune offsets.
// 11: file ends in rune offsets.
// 12: 64-bit branchmasks.
// 13: content checksums
// 14: languages
// 15: rune based symbol sections
// 16: ctags metadata
const IndexFormatVersion = 16
32
// FeatureVersion is increased if a feature is added that requires reindexing
// data without changing the format version.
//
// History:
// 2: Rank field for shards.
// 3: Rank documents within shards
// 4: Dedup file bugfix
// 5: Remove max line size limit
// 6: Include '#' into the LineFragment template
// 7: Record skip reasons in the index.
// 8: Record source path in the index.
// 9: Store ctags metadata & bump default max file size
// 10: Compound shards; more flexible TOC format.
// 11: Bloom filters for file names & contents
// 12: go-enry for identifying file languages
const FeatureVersion = 12
47
// WriteMinFeatureVersion and ReadMinFeatureVersion constrain forwards and backwards
// compatibility. For example, if a new way to encode filenameNgrams on disk is
// added using a new section but the old one is retained, this would only bump
// FeatureVersion, since the previous version can read the file and ignore the
// new section, but the index files should be regenerated.
// When the new encoding is fully rolled out and stable, the section with the old
// encoding and the associated reader can be removed, and WriteMinFeatureVersion and
// ReadMinFeatureVersion can be set to the current FeatureVersion, indicating
// that the reader must handle the new version and that older versions are no
// longer valid.
// In this way, compatibility with arbitrary version offsets can be indicated.
59
// WriteMinFeatureVersion constrains forwards compatibility by emitting files
// that won't load in zoekt with a FeatureVersion below it.
const WriteMinFeatureVersion = 10
63
// ReadMinFeatureVersion constrains backwards compatibility by refusing to
// load a file with a FeatureVersion below it.
const ReadMinFeatureVersion = 8
67
// NextIndexFormatVersion is the index format version number that comes after
// the one recorded in the IndexFormatVersion history.
// 17: compound shard (multi repo)
const NextIndexFormatVersion = 17
70
71type indexTOC struct {
72 fileContents compoundSection
73 fileNames compoundSection
74 fileSections compoundSection
75 postings compoundSection
76 newlines compoundSection
77 ngramText simpleSection
78 runeOffsets simpleSection
79 fileEndRunes simpleSection
80 languages simpleSection
81 categories simpleSection
82
83 fileEndSymbol simpleSection
84 symbolMap lazyCompoundSection
85 symbolKindMap compoundSection
86 symbolMetaData simpleSection
87
88 branchMasks simpleSection
89 subRepos simpleSection
90
91 nameNgramText simpleSection
92 namePostings compoundSection
93 nameRuneOffsets simpleSection
94 metaData simpleSection
95 repoMetaData simpleSection
96 nameEndRunes simpleSection
97 contentChecksums simpleSection
98 runeDocSections simpleSection
99
100 repos simpleSection
101 reposIDsBitmap simpleSection
102
103 ranks simpleSection
104}
105
106func (t *indexTOC) sections() []section {
107 // This old sections list is only needed to maintain backwards compatibility,
108 // and can be removed when a migration to tagged sections is complete.
109 return []section{
110 // This must be first, so it can be reliably read across
111 // file format versions.
112 &t.metaData,
113 &t.repoMetaData,
114 &t.fileContents,
115 &t.fileNames,
116 &t.fileSections,
117 &t.fileEndSymbol,
118 &t.symbolMap,
119 &t.symbolKindMap,
120 &t.symbolMetaData,
121 &t.newlines,
122 &t.ngramText,
123 &t.postings,
124 &t.nameNgramText,
125 &t.namePostings,
126 &t.branchMasks,
127 &t.subRepos,
128 &t.runeOffsets,
129 &t.nameRuneOffsets,
130 &t.fileEndRunes,
131 &t.nameEndRunes,
132 &t.contentChecksums,
133 &t.languages,
134 &t.runeDocSections,
135 }
136}
137
138func (t *indexTOC) sectionsNext() []section {
139 return append(t.sections(), &t.repos)
140}
141
142type taggedSection struct {
143 tag string
144 sec section
145}
146
147func (t *indexTOC) sectionsTagged() map[string]section {
148 out := map[string]section{}
149 for _, ent := range t.sectionsTaggedList() {
150 out[ent.tag] = ent.sec
151 }
152 for _, ent := range t.sectionsTaggedCompatibilityList() {
153 out[ent.tag] = ent.sec
154 }
155 return out
156}
157
158func (t *indexTOC) sectionsTaggedList() []taggedSection {
159 var unusedSimple simpleSection
160
161 return []taggedSection{
162 {"metaData", &t.metaData},
163 {"repoMetaData", &t.repoMetaData},
164 {"fileContents", &t.fileContents},
165 {"fileNames", &t.fileNames},
166 {"fileSections", &t.fileSections},
167 {"fileEndSymbol", &t.fileEndSymbol},
168 {"symbolMap", &t.symbolMap},
169 {"symbolKindMap", &t.symbolKindMap},
170 {"symbolMetaData", &t.symbolMetaData},
171 {"newlines", &t.newlines},
172 {"ngramText", &t.ngramText},
173 {"postings", &t.postings},
174 {"nameNgramText", &t.nameNgramText},
175 {"namePostings", &t.namePostings},
176 {"branchMasks", &t.branchMasks},
177 {"subRepos", &t.subRepos},
178 {"runeOffsets", &t.runeOffsets},
179 {"nameRuneOffsets", &t.nameRuneOffsets},
180 {"fileEndRunes", &t.fileEndRunes},
181 {"nameEndRunes", &t.nameEndRunes},
182 {"contentChecksums", &t.contentChecksums},
183 {"languages", &t.languages},
184 {"categories", &t.categories},
185 {"runeDocSections", &t.runeDocSections},
186 {"repos", &t.repos},
187 {"reposIDsBitmap", &t.reposIDsBitmap},
188
189 // We no longer write these sections, but we still return them here to avoid
190 // warnings about unknown sections.
191 {"nameBloom", &unusedSimple},
192 {"contentBloom", &unusedSimple},
193 {"ranks", &unusedSimple},
194 }
195}
196
197// sectionsTaggedCompatibilityList returns a list of sections that will be
198// handled or converted for backwards compatiblity, but aren't written by
199// the current iteration of the indexer.
200func (t *indexTOC) sectionsTaggedCompatibilityList() []taggedSection {
201 return []taggedSection{}
202}