fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bufio"
19 "bytes"
20 "encoding/binary"
21 "encoding/json"
22 "fmt"
23 "io"
24 "sort"
25 "time"
26)
27
28func (w *writer) writeTOC(toc *indexTOC) {
29 // Tagged sections are indicated with a 0 section count.
30 // Tagged sections allow easier forwards and backwards
31 // compatibility when evolving zoekt index files with new
32 // sections.
33 //
34 // A tagged section is:
35 // Varint TagLen, Tag String, Varint Kind, Section
36 //
37 // Section kind is indicated because simpleSections and
38 // compoundSections have different lengths.
39 w.U32(0)
40 secs := toc.sectionsTaggedList()
41 for _, s := range secs {
42 w.String(s.tag)
43 w.Varint(uint32(s.sec.kind()))
44 s.sec.write(w)
45 }
46}
47
48func (s *compoundSection) writeStrings(w *writer, strs []*searchableString) {
49 s.start(w)
50 for _, f := range strs {
51 s.addItem(w, f.data)
52 }
53 s.end(w)
54}
55
56func (s *compoundSection) writeMap(w *writer, m map[string]uint32) {
57 keys := make([]*searchableString, 0, len(m))
58 for k := range m {
59 keys = append(keys, &searchableString{
60 data: []byte(k),
61 })
62 }
63 sort.Slice(keys, func(i, j int) bool {
64 return m[string(keys[i].data)] < m[string(keys[j].data)]
65 })
66 s.writeStrings(w, keys)
67}
68
69func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection,
70 charOffsets *simpleSection, postings *compoundSection, endRunes *simpleSection) {
71 keys := make(ngramSlice, 0, len(s.postings))
72 for k := range s.postings {
73 keys = append(keys, k)
74 }
75 sort.Sort(keys)
76
77 ngramText.start(w)
78 for _, k := range keys {
79 var buf [8]byte
80 binary.BigEndian.PutUint64(buf[:], uint64(k))
81 w.Write(buf[:])
82 }
83 ngramText.end(w)
84
85 postings.start(w)
86 for _, k := range keys {
87 postings.addItem(w, s.postings[k])
88 }
89 postings.end(w)
90
91 charOffsets.start(w)
92 w.Write(toSizedDeltas(s.runeOffsets))
93 charOffsets.end(w)
94
95 endRunes.start(w)
96 w.Write(toSizedDeltas(s.endRunes))
97 endRunes.end(w)
98}
99
100func (b *IndexBuilder) Write(out io.Writer) error {
101 next := b.indexFormatVersion == NextIndexFormatVersion
102
103 buffered := bufio.NewWriterSize(out, 1<<20)
104 defer buffered.Flush()
105
106 w := &writer{w: buffered}
107 toc := indexTOC{}
108
109 toc.fileContents.writeStrings(w, b.contentStrings)
110 toc.newlines.start(w)
111 for _, f := range b.contentStrings {
112 toc.newlines.addItem(w, toSizedDeltas(newLinesIndices(f.data)))
113 }
114 toc.newlines.end(w)
115
116 toc.fileEndSymbol.start(w)
117 for _, m := range b.fileEndSymbol {
118 w.U32(m)
119 }
120 toc.fileEndSymbol.end(w)
121
122 toc.symbolMap.writeMap(w, b.symIndex)
123 toc.symbolKindMap.writeMap(w, b.symKindIndex)
124 toc.symbolMetaData.start(w)
125 for _, m := range b.symMetaData {
126 w.U32(m)
127 }
128 toc.symbolMetaData.end(w)
129
130 toc.branchMasks.start(w)
131 for _, m := range b.branchMasks {
132 w.U64(m)
133 }
134 toc.branchMasks.end(w)
135
136 toc.fileSections.start(w)
137 for _, s := range b.docSections {
138 toc.fileSections.addItem(w, marshalDocSections(s))
139 }
140 toc.fileSections.end(w)
141
142 writePostings(w, b.contentPostings, &toc.ngramText, &toc.runeOffsets, &toc.postings, &toc.fileEndRunes)
143
144 // names.
145 toc.fileNames.writeStrings(w, b.nameStrings)
146
147 writePostings(w, b.namePostings, &toc.nameNgramText, &toc.nameRuneOffsets, &toc.namePostings, &toc.nameEndRunes)
148
149 toc.subRepos.start(w)
150 w.Write(toSizedDeltas(b.subRepos))
151 toc.subRepos.end(w)
152
153 toc.contentChecksums.start(w)
154 w.Write(b.checksums)
155 toc.contentChecksums.end(w)
156
157 toc.languages.start(w)
158 w.Write(b.languages)
159 toc.languages.end(w)
160
161 toc.runeDocSections.start(w)
162 w.Write(marshalDocSections(b.runeDocSections))
163 toc.runeDocSections.end(w)
164
165 if next {
166 toc.repos.start(w)
167 w.Write(toSizedDeltas16(b.repos))
168 toc.repos.end(w)
169 }
170
171 indexTime := b.IndexTime
172 if indexTime.IsZero() {
173 indexTime = time.Now().UTC()
174 }
175
176 if err := b.writeJSON(&IndexMetadata{
177 IndexFormatVersion: b.indexFormatVersion,
178 IndexTime: indexTime,
179 IndexFeatureVersion: b.featureVersion,
180 IndexMinReaderVersion: WriteMinFeatureVersion,
181 PlainASCII: b.contentPostings.isPlainASCII && b.namePostings.isPlainASCII,
182 LanguageMap: b.languageMap,
183 ZoektVersion: Version,
184 ID: b.ID,
185 }, &toc.metaData, w); err != nil {
186 return err
187 }
188
189 if next {
190 if err := b.writeJSON(b.repoList, &toc.repoMetaData, w); err != nil {
191 return err
192 }
193 } else {
194 if len(b.repoList) != 1 {
195 return fmt.Errorf("have %d repos, but only support 1 in index format version %d", len(b.repoList), b.indexFormatVersion)
196 }
197 if err := b.writeJSON(b.repoList[0], &toc.repoMetaData, w); err != nil {
198 return err
199 }
200 }
201
202 toc.ranks.start(w)
203 if err := encodeRanks(w, b.ranks); err != nil {
204 return err
205 }
206 toc.ranks.end(w)
207
208 var tocSection simpleSection
209
210 tocSection.start(w)
211 w.writeTOC(&toc)
212 tocSection.end(w)
213 tocSection.write(w)
214 return w.err
215}
216
217func (b *IndexBuilder) writeJSON(data interface{}, sec *simpleSection, w *writer) error {
218 blob, err := json.Marshal(data)
219 if err != nil {
220 return err
221 }
222 sec.start(w)
223 w.Write(blob)
224 sec.end(w)
225 return nil
226}
227
// newLinesIndices returns the byte offsets of every '\n' in in, in
// ascending order. The result is always non-nil; it is empty when in
// contains no newline.
func newLinesIndices(in []byte) []uint32 {
	// Size the result exactly so appending never reallocates.
	offsets := make([]uint32, 0, bytes.Count(in, []byte{'\n'}))

	// Jump from newline to newline; pos is where the next search starts.
	for pos := 0; pos < len(in); {
		rel := bytes.IndexByte(in[pos:], '\n')
		if rel < 0 {
			break
		}
		offsets = append(offsets, uint32(pos+rel))
		pos += rel + 1
	}
	return offsets
}