fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bufio"
19 "bytes"
20 "encoding/binary"
21 "encoding/json"
22 "fmt"
23 "io"
24 "sort"
25 "time"
26)
27
28func (w *writer) writeTOC(toc *indexTOC) {
29 // Tagged sections are indicated with a 0 section count.
30 // Tagged sections allow easier forwards and backwards
31 // compatibility when evolving zoekt index files with new
32 // sections.
33 //
34 // A tagged section is:
35 // Varint TagLen, Tag String, Varint Kind, Section
36 //
37 // Section kind is indicated because simpleSections and
38 // compoundSections have different lengths.
39 w.U32(0)
40 secs := toc.sectionsTaggedList()
41 for _, s := range secs {
42 w.String(s.tag)
43 w.Varint(uint32(s.sec.kind()))
44 s.sec.write(w)
45 }
46}
47
48func (s *compoundSection) writeStrings(w *writer, strs []*searchableString) {
49 s.start(w)
50 for _, f := range strs {
51 s.addItem(w, f.data)
52 }
53 s.end(w)
54}
55
56func (s *compoundSection) writeMap(w *writer, m map[string]uint32) {
57 keys := make([]*searchableString, 0, len(m))
58 for k := range m {
59 keys = append(keys, &searchableString{
60 data: []byte(k),
61 })
62 }
63 sort.Slice(keys, func(i, j int) bool {
64 return m[string(keys[i].data)] < m[string(keys[j].data)]
65 })
66 s.writeStrings(w, keys)
67}
68
69func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection,
70 charOffsets *simpleSection, postings *compoundSection, endRunes *simpleSection,
71) {
72 keys := make(ngramSlice, 0, len(s.postings))
73 for k := range s.postings {
74 keys = append(keys, k)
75 }
76 sort.Sort(keys)
77
78 ngramText.start(w)
79 for _, k := range keys {
80 var buf [8]byte
81 binary.BigEndian.PutUint64(buf[:], uint64(k))
82 w.Write(buf[:])
83 }
84 ngramText.end(w)
85
86 postings.start(w)
87 for _, k := range keys {
88 postings.addItem(w, s.postings[k])
89 }
90 postings.end(w)
91
92 charOffsets.start(w)
93 w.Write(toSizedDeltas(s.runeOffsets))
94 charOffsets.end(w)
95
96 endRunes.start(w)
97 w.Write(toSizedDeltas(s.endRunes))
98 endRunes.end(w)
99}
100
101func (b *IndexBuilder) Write(out io.Writer) error {
102 next := b.indexFormatVersion == NextIndexFormatVersion
103
104 buffered := bufio.NewWriterSize(out, 1<<20)
105 defer buffered.Flush()
106
107 w := &writer{w: buffered}
108 toc := indexTOC{}
109
110 toc.fileContents.writeStrings(w, b.contentStrings)
111 toc.newlines.start(w)
112 for _, f := range b.contentStrings {
113 toc.newlines.addItem(w, toSizedDeltas(newLinesIndices(f.data)))
114 }
115 toc.newlines.end(w)
116
117 toc.fileEndSymbol.start(w)
118 for _, m := range b.fileEndSymbol {
119 w.U32(m)
120 }
121 toc.fileEndSymbol.end(w)
122
123 toc.symbolMap.writeMap(w, b.symIndex)
124 toc.symbolKindMap.writeMap(w, b.symKindIndex)
125 toc.symbolMetaData.start(w)
126 for _, m := range b.symMetaData {
127 w.U32(m)
128 }
129 toc.symbolMetaData.end(w)
130
131 toc.branchMasks.start(w)
132 for _, m := range b.branchMasks {
133 w.U64(m)
134 }
135 toc.branchMasks.end(w)
136
137 toc.fileSections.start(w)
138 for _, s := range b.docSections {
139 toc.fileSections.addItem(w, marshalDocSections(s))
140 }
141 toc.fileSections.end(w)
142
143 writePostings(w, b.contentPostings, &toc.ngramText, &toc.runeOffsets, &toc.postings, &toc.fileEndRunes)
144
145 // names.
146 toc.fileNames.writeStrings(w, b.nameStrings)
147
148 writePostings(w, b.namePostings, &toc.nameNgramText, &toc.nameRuneOffsets, &toc.namePostings, &toc.nameEndRunes)
149
150 toc.subRepos.start(w)
151 w.Write(toSizedDeltas(b.subRepos))
152 toc.subRepos.end(w)
153
154 toc.contentChecksums.start(w)
155 w.Write(b.checksums)
156 toc.contentChecksums.end(w)
157
158 toc.languages.start(w)
159 w.Write(b.languages)
160 toc.languages.end(w)
161
162 toc.runeDocSections.start(w)
163 w.Write(marshalDocSections(b.runeDocSections))
164 toc.runeDocSections.end(w)
165
166 if next {
167 toc.repos.start(w)
168 w.Write(toSizedDeltas16(b.repos))
169 toc.repos.end(w)
170 }
171
172 indexTime := b.IndexTime
173 if indexTime.IsZero() {
174 indexTime = time.Now().UTC()
175 }
176
177 if err := b.writeJSON(&IndexMetadata{
178 IndexFormatVersion: b.indexFormatVersion,
179 IndexTime: indexTime,
180 IndexFeatureVersion: b.featureVersion,
181 IndexMinReaderVersion: WriteMinFeatureVersion,
182 PlainASCII: b.contentPostings.isPlainASCII && b.namePostings.isPlainASCII,
183 LanguageMap: b.languageMap,
184 ZoektVersion: Version,
185 ID: b.ID,
186 }, &toc.metaData, w); err != nil {
187 return err
188 }
189
190 if next {
191 if err := b.writeJSON(b.repoList, &toc.repoMetaData, w); err != nil {
192 return err
193 }
194 } else {
195 if len(b.repoList) != 1 {
196 return fmt.Errorf("have %d repos, but only support 1 in index format version %d", len(b.repoList), b.indexFormatVersion)
197 }
198 if err := b.writeJSON(b.repoList[0], &toc.repoMetaData, w); err != nil {
199 return err
200 }
201 }
202
203 toc.ranks.start(w)
204 if err := encodeRanks(w, b.ranks); err != nil {
205 return err
206 }
207 toc.ranks.end(w)
208
209 var tocSection simpleSection
210
211 tocSection.start(w)
212 w.writeTOC(&toc)
213 tocSection.end(w)
214 tocSection.write(w)
215 return w.err
216}
217
218func (b *IndexBuilder) writeJSON(data interface{}, sec *simpleSection, w *writer) error {
219 blob, err := json.Marshal(data)
220 if err != nil {
221 return err
222 }
223 sec.start(w)
224 w.Write(blob)
225 sec.end(w)
226 return nil
227}
228
// newLinesIndices returns the byte offsets of every '\n' in in, in
// ascending order. The result is never nil; it is empty when in contains
// no newline.
func newLinesIndices(in []byte) []uint32 {
	indices := make([]uint32, 0, bytes.Count(in, []byte{'\n'}))
	for pos := 0; pos < len(in); {
		rel := bytes.IndexByte(in[pos:], '\n')
		if rel < 0 {
			break
		}
		indices = append(indices, uint32(pos+rel))
		// Resume scanning just past the newline that was found.
		pos += rel + 1
	}
	return indices
}
237}