fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "bufio"
19 "bytes"
20 "encoding/binary"
21 "encoding/json"
22 "fmt"
23 "io"
24 "sort"
25 "time"
26
27 "github.com/sourcegraph/zoekt"
28)
29
30func (w *writer) writeTOC(toc *indexTOC) {
31 // Tagged sections are indicated with a 0 section count.
32 // Tagged sections allow easier forwards and backwards
33 // compatibility when evolving zoekt index files with new
34 // sections.
35 //
36 // A tagged section is:
37 // Varint TagLen, Tag String, Varint Kind, Section
38 //
39 // Section kind is indicated because simpleSections and
40 // compoundSections have different lengths.
41 w.U32(0)
42 secs := toc.sectionsTaggedList()
43 for _, s := range secs {
44 w.String(s.tag)
45 w.Varint(uint32(s.sec.kind()))
46 s.sec.write(w)
47 }
48}
49
50func (s *compoundSection) writeStrings(w *writer, strs []*searchableString) {
51 s.start(w)
52 for _, f := range strs {
53 s.addItem(w, f.data)
54 }
55 s.end(w)
56}
57
58func (s *compoundSection) writeMap(w *writer, m map[string]uint32) {
59 keys := make([]*searchableString, 0, len(m))
60 for k := range m {
61 keys = append(keys, &searchableString{
62 data: []byte(k),
63 })
64 }
65 sort.Slice(keys, func(i, j int) bool {
66 return m[string(keys[i].data)] < m[string(keys[j].data)]
67 })
68 s.writeStrings(w, keys)
69}
70
71func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection,
72 charOffsets *simpleSection, postings *compoundSection, endRunes *simpleSection,
73) {
74 keys := make(ngramSlice, 0, len(s.postings))
75 for k := range s.postings {
76 keys = append(keys, k)
77 }
78 sort.Sort(keys)
79
80 ngramText.start(w)
81 for _, k := range keys {
82 var buf [8]byte
83 binary.BigEndian.PutUint64(buf[:], uint64(k))
84 w.Write(buf[:])
85 }
86 ngramText.end(w)
87
88 postings.start(w)
89 for _, k := range keys {
90 postings.addItem(w, s.postings[k])
91 }
92 postings.end(w)
93
94 charOffsets.start(w)
95 w.Write(toSizedDeltas(s.runeOffsets))
96 charOffsets.end(w)
97
98 endRunes.start(w)
99 w.Write(toSizedDeltas(s.endRunes))
100 endRunes.end(w)
101}
102
103func (b *ShardBuilder) Write(out io.Writer) error {
104 next := b.indexFormatVersion == NextIndexFormatVersion
105
106 buffered := bufio.NewWriterSize(out, 1<<20)
107 defer buffered.Flush()
108
109 w := &writer{w: buffered}
110 toc := indexTOC{}
111
112 toc.fileContents.writeStrings(w, b.contentStrings)
113 toc.newlines.start(w)
114 for _, f := range b.contentStrings {
115 toc.newlines.addItem(w, toSizedDeltas(newLinesIndices(f.data)))
116 }
117 toc.newlines.end(w)
118
119 toc.fileEndSymbol.start(w)
120 for _, m := range b.fileEndSymbol {
121 w.U32(m)
122 }
123 toc.fileEndSymbol.end(w)
124
125 toc.symbolMap.writeMap(w, b.symIndex)
126 toc.symbolKindMap.writeMap(w, b.symKindIndex)
127 toc.symbolMetaData.start(w)
128 for _, m := range b.symMetaData {
129 w.U32(m)
130 }
131 toc.symbolMetaData.end(w)
132
133 toc.branchMasks.start(w)
134 for _, m := range b.branchMasks {
135 w.U64(m)
136 }
137 toc.branchMasks.end(w)
138
139 toc.fileSections.start(w)
140 for _, s := range b.docSections {
141 toc.fileSections.addItem(w, marshalDocSections(s))
142 }
143 toc.fileSections.end(w)
144
145 writePostings(w, b.contentPostings, &toc.ngramText, &toc.runeOffsets, &toc.postings, &toc.fileEndRunes)
146
147 // names.
148 toc.fileNames.writeStrings(w, b.nameStrings)
149
150 writePostings(w, b.namePostings, &toc.nameNgramText, &toc.nameRuneOffsets, &toc.namePostings, &toc.nameEndRunes)
151
152 toc.subRepos.start(w)
153 w.Write(toSizedDeltas(b.subRepos))
154 toc.subRepos.end(w)
155
156 toc.contentChecksums.start(w)
157 w.Write(b.checksums)
158 toc.contentChecksums.end(w)
159
160 toc.languages.start(w)
161 w.Write(b.languages)
162 toc.languages.end(w)
163
164 toc.runeDocSections.start(w)
165 w.Write(marshalDocSections(b.runeDocSections))
166 toc.runeDocSections.end(w)
167
168 if next {
169 toc.repos.start(w)
170 w.Write(toSizedDeltas16(b.repos))
171 toc.repos.end(w)
172 }
173
174 indexTime := b.IndexTime
175 if indexTime.IsZero() {
176 indexTime = time.Now().UTC()
177 }
178
179 if err := b.writeJSON(&zoekt.IndexMetadata{
180 IndexFormatVersion: b.indexFormatVersion,
181 IndexTime: indexTime,
182 IndexFeatureVersion: b.featureVersion,
183 IndexMinReaderVersion: WriteMinFeatureVersion,
184 PlainASCII: b.contentPostings.isPlainASCII && b.namePostings.isPlainASCII,
185 LanguageMap: b.languageMap,
186 ZoektVersion: Version,
187 ID: b.ID,
188 }, &toc.metaData, w); err != nil {
189 return err
190 }
191
192 if next {
193 if err := b.writeJSON(b.repoList, &toc.repoMetaData, w); err != nil {
194 return err
195 }
196 } else {
197 if len(b.repoList) != 1 {
198 return fmt.Errorf("have %d repos, but only support 1 in index format version %d", len(b.repoList), b.indexFormatVersion)
199 }
200 if err := b.writeJSON(b.repoList[0], &toc.repoMetaData, w); err != nil {
201 return err
202 }
203 }
204
205 var tocSection simpleSection
206
207 tocSection.start(w)
208 w.writeTOC(&toc)
209 tocSection.end(w)
210 tocSection.write(w)
211 return w.err
212}
213
214func (b *ShardBuilder) writeJSON(data interface{}, sec *simpleSection, w *writer) error {
215 blob, err := json.Marshal(data)
216 if err != nil {
217 return err
218 }
219 sec.start(w)
220 w.Write(blob)
221 sec.end(w)
222 return nil
223}
224
// newLinesIndices returns the byte offset of every '\n' in in, in
// ascending order. The result is never nil; it is empty when in
// contains no newline.
func newLinesIndices(in []byte) []uint32 {
	offsets := make([]uint32, 0, bytes.Count(in, []byte{'\n'}))
	for pos := 0; pos < len(in); {
		rel := bytes.IndexByte(in[pos:], '\n')
		if rel < 0 {
			break
		}
		offsets = append(offsets, uint32(pos+rel))
		pos += rel + 1
	}
	return offsets
}
233}