fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bufio"
19 "bytes"
20 "encoding/binary"
21 "encoding/json"
22 "fmt"
23 "io"
24 "sort"
25 "time"
26)
27
28func (w *writer) writeTOC(toc *indexTOC) {
29 // Tagged sections are indicated with a 0 section count.
30 // Tagged sections allow easier forwards and backwards
31 // compatibility when evolving zoekt index files with new
32 // sections.
33 //
34 // A tagged section is:
35 // Varint TagLen, Tag String, Varint Kind, Section
36 //
37 // Section kind is indicated because simpleSections and
38 // compoundSections have different lengths.
39 w.U32(0)
40 secs := toc.sectionsTaggedList()
41 for _, s := range secs {
42 w.String(s.tag)
43 w.Varint(uint32(s.sec.kind()))
44 s.sec.write(w)
45 }
46}
47
48func (s *compoundSection) writeStrings(w *writer, strs []*searchableString) {
49 s.start(w)
50 for _, f := range strs {
51 s.addItem(w, f.data)
52 }
53 s.end(w)
54}
55
56func (s *compoundSection) writeMap(w *writer, m map[string]uint32) {
57 keys := make([]*searchableString, 0, len(m))
58 for k := range m {
59 keys = append(keys, &searchableString{
60 data: []byte(k),
61 })
62 }
63 sort.Slice(keys, func(i, j int) bool {
64 return m[string(keys[i].data)] < m[string(keys[j].data)]
65 })
66 s.writeStrings(w, keys)
67}
68
69func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection,
70 charOffsets *simpleSection, postings *compoundSection, endRunes *simpleSection,
71) {
72 keys := make(ngramSlice, 0, len(s.postings))
73 for k := range s.postings {
74 keys = append(keys, k)
75 }
76 sort.Sort(keys)
77
78 ngramText.start(w)
79 for _, k := range keys {
80 var buf [8]byte
81 binary.BigEndian.PutUint64(buf[:], uint64(k))
82 w.Write(buf[:])
83 }
84 ngramText.end(w)
85
86 postings.start(w)
87 for _, k := range keys {
88 postings.addItem(w, s.postings[k])
89 }
90 postings.end(w)
91
92 charOffsets.start(w)
93 w.Write(toSizedDeltas(s.runeOffsets))
94 charOffsets.end(w)
95
96 endRunes.start(w)
97 w.Write(toSizedDeltas(s.endRunes))
98 endRunes.end(w)
99}
100
101func (b *IndexBuilder) Write(out io.Writer) error {
102 next := b.indexFormatVersion == NextIndexFormatVersion
103
104 buffered := bufio.NewWriterSize(out, 1<<20)
105 defer buffered.Flush()
106
107 w := &writer{w: buffered}
108 toc := indexTOC{}
109
110 toc.fileContents.writeStrings(w, b.contentStrings)
111 toc.newlines.start(w)
112 for _, f := range b.contentStrings {
113 toc.newlines.addItem(w, toSizedDeltas(newLinesIndices(f.data)))
114 }
115 toc.newlines.end(w)
116
117 toc.fileEndSymbol.start(w)
118 for _, m := range b.fileEndSymbol {
119 w.U32(m)
120 }
121 toc.fileEndSymbol.end(w)
122
123 toc.symbolMap.writeMap(w, b.symIndex)
124 toc.symbolKindMap.writeMap(w, b.symKindIndex)
125 toc.symbolMetaData.start(w)
126 for _, m := range b.symMetaData {
127 w.U32(m)
128 }
129 toc.symbolMetaData.end(w)
130
131 toc.branchMasks.start(w)
132 for _, m := range b.branchMasks {
133 w.U64(m)
134 }
135 toc.branchMasks.end(w)
136
137 toc.fileSections.start(w)
138 for _, s := range b.docSections {
139 toc.fileSections.addItem(w, marshalDocSections(s))
140 }
141 toc.fileSections.end(w)
142
143 writePostings(w, b.contentPostings, &toc.ngramText, &toc.runeOffsets, &toc.postings, &toc.fileEndRunes)
144
145 // names.
146 toc.fileNames.writeStrings(w, b.nameStrings)
147
148 writePostings(w, b.namePostings, &toc.nameNgramText, &toc.nameRuneOffsets, &toc.namePostings, &toc.nameEndRunes)
149
150 toc.subRepos.start(w)
151 w.Write(toSizedDeltas(b.subRepos))
152 toc.subRepos.end(w)
153
154 toc.contentChecksums.start(w)
155 w.Write(b.checksums)
156 toc.contentChecksums.end(w)
157
158 toc.languages.start(w)
159 w.Write(b.languages)
160 toc.languages.end(w)
161
162 toc.runeDocSections.start(w)
163 w.Write(marshalDocSections(b.runeDocSections))
164 toc.runeDocSections.end(w)
165
166 if next {
167 toc.repos.start(w)
168 w.Write(toSizedDeltas16(b.repos))
169 toc.repos.end(w)
170 }
171
172 indexTime := b.IndexTime
173 if indexTime.IsZero() {
174 indexTime = time.Now().UTC()
175 }
176
177 if err := b.writeJSON(&IndexMetadata{
178 IndexFormatVersion: b.indexFormatVersion,
179 IndexTime: indexTime,
180 IndexFeatureVersion: b.featureVersion,
181 IndexMinReaderVersion: WriteMinFeatureVersion,
182 PlainASCII: b.contentPostings.isPlainASCII && b.namePostings.isPlainASCII,
183 LanguageMap: b.languageMap,
184 ZoektVersion: Version,
185 ID: b.ID,
186 }, &toc.metaData, w); err != nil {
187 return err
188 }
189
190 if next {
191 if err := b.writeJSON(b.repoList, &toc.repoMetaData, w); err != nil {
192 return err
193 }
194 } else {
195 if len(b.repoList) != 1 {
196 return fmt.Errorf("have %d repos, but only support 1 in index format version %d", len(b.repoList), b.indexFormatVersion)
197 }
198 if err := b.writeJSON(b.repoList[0], &toc.repoMetaData, w); err != nil {
199 return err
200 }
201 }
202
203 var tocSection simpleSection
204
205 tocSection.start(w)
206 w.writeTOC(&toc)
207 tocSection.end(w)
208 tocSection.write(w)
209 return w.err
210}
211
212func (b *IndexBuilder) writeJSON(data interface{}, sec *simpleSection, w *writer) error {
213 blob, err := json.Marshal(data)
214 if err != nil {
215 return err
216 }
217 sec.start(w)
218 w.Write(blob)
219 sec.end(w)
220 return nil
221}
222
// newLinesIndices returns the byte offset of every '\n' in in, in ascending
// order. The result is never nil: input without newlines yields an empty
// slice.
func newLinesIndices(in []byte) []uint32 {
	offsets := make([]uint32, 0, bytes.Count(in, []byte{'\n'}))
	for pos := 0; pos < len(in); {
		rel := bytes.IndexByte(in[pos:], '\n')
		if rel < 0 {
			break
		}
		offsets = append(offsets, uint32(pos+rel))
		// Resume the scan just past the newline that was found.
		pos += rel + 1
	}
	return offsets
}