fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "bufio"
19 "bytes"
20 "encoding/binary"
21 "encoding/json"
22 "fmt"
23 "io"
24 "sort"
25 "time"
26
27 "github.com/sourcegraph/zoekt"
28
29 "github.com/RoaringBitmap/roaring"
30)
31
32func (w *writer) writeTOC(toc *indexTOC) {
33 // Tagged sections are indicated with a 0 section count.
34 // Tagged sections allow easier forwards and backwards
35 // compatibility when evolving zoekt index files with new
36 // sections.
37 //
38 // A tagged section is:
39 // Varint TagLen, Tag String, Varint Kind, Section
40 //
41 // Section kind is indicated because simpleSections and
42 // compoundSections have different lengths.
43 w.U32(0)
44 secs := toc.sectionsTaggedList()
45 for _, s := range secs {
46 w.String(s.tag)
47 w.Varint(uint32(s.sec.kind()))
48 s.sec.write(w)
49 }
50}
51
52func (s *compoundSection) writeStrings(w *writer, strs []*searchableString) {
53 s.start(w)
54 for _, f := range strs {
55 s.addItem(w, f.data)
56 }
57 s.end(w)
58}
59
60func (s *compoundSection) writeMap(w *writer, m map[string]uint32) {
61 keys := make([]*searchableString, 0, len(m))
62 for k := range m {
63 keys = append(keys, &searchableString{
64 data: []byte(k),
65 })
66 }
67 sort.Slice(keys, func(i, j int) bool {
68 return m[string(keys[i].data)] < m[string(keys[j].data)]
69 })
70 s.writeStrings(w, keys)
71}
72
73func writeUint32Bitmap(w *writer, dat []uint32) {
74 rb := roaring.BitmapOf(dat...)
75 rb.RunOptimize()
76 rb.WriteTo(w)
77}
78
79func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection,
80 charOffsets *simpleSection, postings *compoundSection, endRunes *simpleSection,
81) {
82 keys := make(ngramSlice, 0, len(s.postings))
83 for k := range s.postings {
84 keys = append(keys, k)
85 }
86 sort.Sort(keys)
87
88 ngramText.start(w)
89 for _, k := range keys {
90 var buf [8]byte
91 binary.BigEndian.PutUint64(buf[:], uint64(k))
92 w.Write(buf[:])
93 }
94 ngramText.end(w)
95
96 postings.start(w)
97 for _, k := range keys {
98 postings.addItem(w, s.postings[k])
99 }
100 postings.end(w)
101
102 charOffsets.start(w)
103 w.Write(toSizedDeltas(s.runeOffsets))
104 charOffsets.end(w)
105
106 endRunes.start(w)
107 w.Write(toSizedDeltas(s.endRunes))
108 endRunes.end(w)
109}
110
111func (b *ShardBuilder) Write(out io.Writer) error {
112 next := b.indexFormatVersion == NextIndexFormatVersion
113
114 buffered := bufio.NewWriterSize(out, 1<<20)
115 defer buffered.Flush()
116
117 w := &writer{w: buffered}
118 toc := indexTOC{}
119
120 toc.fileContents.writeStrings(w, b.contentStrings)
121 toc.newlines.start(w)
122 for _, f := range b.contentStrings {
123 toc.newlines.addItem(w, toSizedDeltas(newLinesIndices(f.data)))
124 }
125 toc.newlines.end(w)
126
127 toc.fileEndSymbol.start(w)
128 for _, m := range b.fileEndSymbol {
129 w.U32(m)
130 }
131 toc.fileEndSymbol.end(w)
132
133 toc.symbolMap.writeMap(w, b.symIndex)
134 toc.symbolKindMap.writeMap(w, b.symKindIndex)
135 toc.symbolMetaData.start(w)
136 for _, m := range b.symMetaData {
137 w.U32(m)
138 }
139 toc.symbolMetaData.end(w)
140
141 toc.branchMasks.start(w)
142 for _, m := range b.branchMasks {
143 w.U64(m)
144 }
145 toc.branchMasks.end(w)
146
147 toc.fileSections.start(w)
148 for _, s := range b.docSections {
149 toc.fileSections.addItem(w, marshalDocSections(s))
150 }
151 toc.fileSections.end(w)
152
153 writePostings(w, b.contentPostings, &toc.ngramText, &toc.runeOffsets, &toc.postings, &toc.fileEndRunes)
154
155 // names.
156 toc.fileNames.writeStrings(w, b.nameStrings)
157
158 writePostings(w, b.namePostings, &toc.nameNgramText, &toc.nameRuneOffsets, &toc.namePostings, &toc.nameEndRunes)
159
160 toc.subRepos.start(w)
161 w.Write(toSizedDeltas(b.subRepos))
162 toc.subRepos.end(w)
163
164 toc.contentChecksums.start(w)
165 w.Write(b.checksums)
166 toc.contentChecksums.end(w)
167
168 toc.languages.start(w)
169 w.Write(b.languages)
170 toc.languages.end(w)
171
172 toc.categories.start(w)
173 w.Write(b.categories)
174 toc.categories.end(w)
175
176 toc.runeDocSections.start(w)
177 w.Write(marshalDocSections(b.runeDocSections))
178 toc.runeDocSections.end(w)
179
180 if next {
181 toc.repos.start(w)
182 w.Write(toSizedDeltas16(b.repos))
183 toc.repos.end(w)
184 }
185
186 if repoIDs, ok := b.repoIDs(); ok && next {
187 toc.reposIDsBitmap.start(w)
188 writeUint32Bitmap(w, repoIDs)
189 toc.reposIDsBitmap.end(w)
190 }
191
192 indexTime := b.IndexTime
193 if indexTime.IsZero() {
194 indexTime = time.Now().UTC()
195 }
196
197 if err := b.writeJSON(&zoekt.IndexMetadata{
198 IndexFormatVersion: b.indexFormatVersion,
199 IndexTime: indexTime,
200 IndexFeatureVersion: b.featureVersion,
201 IndexMinReaderVersion: WriteMinFeatureVersion,
202 PlainASCII: b.contentPostings.isPlainASCII && b.namePostings.isPlainASCII,
203 LanguageMap: b.languageMap,
204 ZoektVersion: Version,
205 ID: b.ID,
206 }, &toc.metaData, w); err != nil {
207 return err
208 }
209
210 if next {
211 if err := b.writeJSON(b.repoList, &toc.repoMetaData, w); err != nil {
212 return err
213 }
214 } else {
215 if len(b.repoList) != 1 {
216 return fmt.Errorf("have %d repos, but only support 1 in index format version %d", len(b.repoList), b.indexFormatVersion)
217 }
218 if err := b.writeJSON(b.repoList[0], &toc.repoMetaData, w); err != nil {
219 return err
220 }
221 }
222
223 var tocSection simpleSection
224
225 tocSection.start(w)
226 w.writeTOC(&toc)
227 tocSection.end(w)
228 tocSection.write(w)
229 return w.err
230}
231
232func (b *ShardBuilder) writeJSON(data interface{}, sec *simpleSection, w *writer) error {
233 blob, err := json.Marshal(data)
234 if err != nil {
235 return err
236 }
237 sec.start(w)
238 w.Write(blob)
239 sec.end(w)
240 return nil
241}
242
243func newLinesIndices(in []byte) []uint32 {
244 out := make([]uint32, 0, bytes.Count(in, []byte{'\n'}))
245 for i, c := range in {
246 if c == '\n' {
247 out = append(out, uint32(i))
248 }
249 }
250 return out
251}