fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "bufio"
19 "bytes"
20 "cmp"
21 "encoding/binary"
22 "encoding/json"
23 "fmt"
24 "io"
25 "slices"
26 "sort"
27 "time"
28
29 "github.com/sourcegraph/zoekt"
30
31 "github.com/RoaringBitmap/roaring"
32)
33
34func (w *writer) writeTOC(toc *indexTOC) {
35 // Tagged sections are indicated with a 0 section count.
36 // Tagged sections allow easier forwards and backwards
37 // compatibility when evolving zoekt index files with new
38 // sections.
39 //
40 // A tagged section is:
41 // Varint TagLen, Tag String, Varint Kind, Section
42 //
43 // Section kind is indicated because simpleSections and
44 // compoundSections have different lengths.
45 w.U32(0)
46 secs := toc.sectionsTaggedList()
47 for _, s := range secs {
48 w.String(s.tag)
49 w.Varint(uint32(s.sec.kind()))
50 s.sec.write(w)
51 }
52}
53
54func (s *compoundSection) writeStrings(w *writer, strs []*searchableString) {
55 s.start(w)
56 for _, f := range strs {
57 s.addItem(w, f.data)
58 }
59 s.end(w)
60}
61
62func (s *compoundSection) writeMap(w *writer, m map[string]uint32) {
63 keys := make([]*searchableString, 0, len(m))
64 for k := range m {
65 keys = append(keys, &searchableString{
66 data: []byte(k),
67 })
68 }
69 sort.Slice(keys, func(i, j int) bool {
70 return m[string(keys[i].data)] < m[string(keys[j].data)]
71 })
72 s.writeStrings(w, keys)
73}
74
75func writeUint32Bitmap(w *writer, dat []uint32) {
76 rb := roaring.BitmapOf(dat...)
77 rb.RunOptimize()
78 rb.WriteTo(w)
79}
80
81func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection,
82 charOffsets *simpleSection, postings *compoundSection, endRunes *simpleSection,
83) {
84 // Collect ngrams from both the ASCII direct-indexed array and the
85 // non-ASCII map, then sort by ngram value.
86 type ngramPosting struct {
87 ng ngram
88 pl *postingList
89 }
90 all := make([]ngramPosting, 0, len(s.asciiPopulated)+len(s.postings))
91 for _, idx := range s.asciiPopulated {
92 pl := s.asciiPostings[idx]
93 if len(pl.data) > 0 {
94 all = append(all, ngramPosting{asciiIndexToNgram(idx), pl})
95 }
96 }
97 for k, pl := range s.postings {
98 if len(pl.data) > 0 {
99 all = append(all, ngramPosting{k, pl})
100 }
101 }
102 slices.SortFunc(all, func(a, b ngramPosting) int { return cmp.Compare(a.ng, b.ng) })
103
104 ngramText.start(w)
105 for _, np := range all {
106 var buf [8]byte
107 binary.BigEndian.PutUint64(buf[:], uint64(np.ng))
108 w.Write(buf[:])
109 }
110 ngramText.end(w)
111
112 postings.start(w)
113 for _, np := range all {
114 postings.addItem(w, np.pl.data)
115 }
116 postings.end(w)
117
118 charOffsets.start(w)
119 w.Write(toSizedDeltas(s.runeOffsets))
120 charOffsets.end(w)
121
122 endRunes.start(w)
123 w.Write(toSizedDeltas(s.endRunes))
124 endRunes.end(w)
125}
126
127func (b *ShardBuilder) Write(out io.Writer) error {
128 next := b.indexFormatVersion == NextIndexFormatVersion
129
130 buffered := bufio.NewWriterSize(out, 1<<20)
131 defer buffered.Flush()
132
133 w := &writer{w: buffered}
134 toc := indexTOC{}
135
136 toc.fileContents.writeStrings(w, b.contentStrings)
137 toc.newlines.start(w)
138 for _, f := range b.contentStrings {
139 toc.newlines.addItem(w, toSizedDeltas(newLinesIndices(f.data)))
140 }
141 toc.newlines.end(w)
142
143 toc.fileEndSymbol.start(w)
144 for _, m := range b.fileEndSymbol {
145 w.U32(m)
146 }
147 toc.fileEndSymbol.end(w)
148
149 toc.symbolMap.writeMap(w, b.symIndex)
150 toc.symbolKindMap.writeMap(w, b.symKindIndex)
151 toc.symbolMetaData.start(w)
152 for _, m := range b.symMetaData {
153 w.U32(m)
154 }
155 toc.symbolMetaData.end(w)
156
157 toc.branchMasks.start(w)
158 for _, m := range b.branchMasks {
159 w.U64(m)
160 }
161 toc.branchMasks.end(w)
162
163 toc.fileSections.start(w)
164 for _, s := range b.docSections {
165 toc.fileSections.addItem(w, marshalDocSections(s))
166 }
167 toc.fileSections.end(w)
168
169 writePostings(w, b.contentPostings, &toc.ngramText, &toc.runeOffsets, &toc.postings, &toc.fileEndRunes)
170
171 // names.
172 toc.fileNames.writeStrings(w, b.nameStrings)
173
174 writePostings(w, b.namePostings, &toc.nameNgramText, &toc.nameRuneOffsets, &toc.namePostings, &toc.nameEndRunes)
175
176 toc.subRepos.start(w)
177 w.Write(toSizedDeltas(b.subRepos))
178 toc.subRepos.end(w)
179
180 toc.contentChecksums.start(w)
181 w.Write(b.checksums)
182 toc.contentChecksums.end(w)
183
184 toc.languages.start(w)
185 w.Write(b.languages)
186 toc.languages.end(w)
187
188 toc.categories.start(w)
189 w.Write(b.categories)
190 toc.categories.end(w)
191
192 toc.runeDocSections.start(w)
193 w.Write(marshalDocSections(b.runeDocSections))
194 toc.runeDocSections.end(w)
195
196 if next {
197 toc.repos.start(w)
198 w.Write(toSizedDeltas16(b.repos))
199 toc.repos.end(w)
200 }
201
202 if repoIDs, ok := b.repoIDs(); ok && next {
203 toc.reposIDsBitmap.start(w)
204 writeUint32Bitmap(w, repoIDs)
205 toc.reposIDsBitmap.end(w)
206 }
207
208 indexTime := b.IndexTime
209 if indexTime.IsZero() {
210 indexTime = time.Now().UTC()
211 }
212
213 if err := b.writeJSON(&zoekt.IndexMetadata{
214 IndexFormatVersion: b.indexFormatVersion,
215 IndexTime: indexTime,
216 IndexFeatureVersion: b.featureVersion,
217 IndexMinReaderVersion: WriteMinFeatureVersion,
218 PlainASCII: b.contentPostings.isPlainASCII && b.namePostings.isPlainASCII,
219 LanguageMap: b.languageMap,
220 ZoektVersion: Version,
221 ID: b.ID,
222 }, &toc.metaData, w); err != nil {
223 return err
224 }
225
226 if next {
227 if err := b.writeJSON(b.repoList, &toc.repoMetaData, w); err != nil {
228 return err
229 }
230 } else {
231 if len(b.repoList) != 1 {
232 return fmt.Errorf("have %d repos, but only support 1 in index format version %d", len(b.repoList), b.indexFormatVersion)
233 }
234 if err := b.writeJSON(b.repoList[0], &toc.repoMetaData, w); err != nil {
235 return err
236 }
237 }
238
239 var tocSection simpleSection
240
241 tocSection.start(w)
242 w.writeTOC(&toc)
243 tocSection.end(w)
244 tocSection.write(w)
245 return w.err
246}
247
248func (b *ShardBuilder) writeJSON(data any, sec *simpleSection, w *writer) error {
249 blob, err := json.Marshal(data)
250 if err != nil {
251 return err
252 }
253 sec.start(w)
254 w.Write(blob)
255 sec.end(w)
256 return nil
257}
258
259func newLinesIndices(in []byte) []uint32 {
260 out := make([]uint32, 0, bytes.Count(in, []byte{'\n'}))
261 for i, c := range in {
262 if c == '\n' {
263 out = append(out, uint32(i))
264 }
265 }
266 return out
267}