fork of https://github.com/sourcegraph/zoekt
1package zoekt
2
3import (
4 "crypto/sha1"
5 "fmt"
6 "io"
7 "log"
8 "net/url"
9 "os"
10 "path/filepath"
11 "runtime"
12 "sort"
13
14 "github.com/sourcegraph/zoekt/internal/tenant"
15)
16
17// Merge files into a compound shard in dstDir. Merge returns tmpName and a
18// dstName. It is the responsibility of the caller to delete the input shards and
19// rename the temporary compound shard from tmpName to dstName.
20func Merge(dstDir string, files ...IndexFile) (tmpName, dstName string, _ error) {
21 var ds []*indexData
22 for _, f := range files {
23 searcher, err := NewSearcher(f)
24 if err != nil {
25 return "", "", err
26 }
27 ds = append(ds, searcher.(*indexData))
28 }
29
30 ib, err := merge(ds...)
31 if err != nil {
32 return "", "", err
33 }
34
35 hasher := sha1.New()
36 for _, d := range ds {
37 for i, md := range d.repoMetaData {
38 if d.repoMetaData[i].Tombstone {
39 continue
40 }
41 hasher.Write([]byte(md.Name))
42 hasher.Write([]byte{0})
43 }
44 }
45
46 dstName = filepath.Join(dstDir, fmt.Sprintf("compound-%x_v%d.%05d.zoekt", hasher.Sum(nil), NextIndexFormatVersion, 0))
47 tmpName = dstName + ".tmp"
48 if err := builderWriteAll(tmpName, ib); err != nil {
49 return "", "", err
50 }
51 return tmpName, dstName, nil
52}
53
54func builderWriteAll(fn string, ib *IndexBuilder) error {
55 dir := filepath.Dir(fn)
56 if err := os.MkdirAll(dir, 0o700); err != nil {
57 return err
58 }
59
60 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
61 if err != nil {
62 return err
63 }
64 if runtime.GOOS != "windows" {
65 // umask?
66 if err := f.Chmod(0o666); err != nil {
67 return err
68 }
69 }
70
71 defer f.Close()
72 if err := ib.Write(f); err != nil {
73 return err
74 }
75 fi, err := f.Stat()
76 if err != nil {
77 return err
78 }
79 if err := f.Close(); err != nil {
80 return err
81 }
82
83 if err := os.Rename(f.Name(), fn); err != nil {
84 return err
85 }
86
87 log.Printf("finished shard %s: %d index bytes (overhead %3.1f)", fn, fi.Size(),
88 float64(fi.Size())/float64(ib.ContentSize()+1))
89
90 return nil
91}
92
93func merge(ds ...*indexData) (*IndexBuilder, error) {
94 if len(ds) == 0 {
95 return nil, fmt.Errorf("need 1 or more indexData to merge")
96 }
97
98 sort.Slice(ds, func(i, j int) bool {
99 return ds[i].repoMetaData[0].priority > ds[j].repoMetaData[0].priority
100 })
101
102 ib := newIndexBuilder()
103 ib.indexFormatVersion = NextIndexFormatVersion
104
105 for _, d := range ds {
106 lastRepoID := -1
107 for docID := uint32(0); int(docID) < len(d.fileBranchMasks); docID++ {
108 repoID := int(d.repos[docID])
109
110 if d.repoMetaData[repoID].Tombstone {
111 continue
112 }
113
114 if repoID != lastRepoID {
115 if lastRepoID > repoID {
116 return nil, fmt.Errorf("non-contiguous repo ids in %s for document %d: old=%d current=%d", d.String(), docID, lastRepoID, repoID)
117 }
118 lastRepoID = repoID
119
120 // TODO we are losing empty repos on merging since we only get here if
121 // there is an associated document.
122
123 if err := ib.setRepository(&d.repoMetaData[repoID]); err != nil {
124 return nil, err
125 }
126 }
127
128 if err := addDocument(d, ib, repoID, docID); err != nil {
129 return nil, err
130 }
131 }
132 }
133
134 return ib, nil
135}
136
137// Explode takes an IndexFile f and creates 1 simple shard per repository
138// contained in f. Explode returns a map of tmpName -> dstName. It is the
139// responsibility of the caller to rename the temporary shard(s) and delete the
140// input shard.
141func Explode(dstDir string, f IndexFile) (map[string]string, error) {
142 return explode(dstDir, f)
143}
144
145type indexBuilderFunc func(ib *IndexBuilder)
146
147// explode offers a richer signature compared to Explode for testing. You
148// probably want to call Explode instead.
149func explode(dstDir string, f IndexFile, ibFuncs ...indexBuilderFunc) (map[string]string, error) {
150 searcher, err := NewSearcher(f)
151 if err != nil {
152 return nil, err
153 }
154 d := searcher.(*indexData)
155
156 shardNames := make(map[string]string, len(d.repoMetaData))
157
158 writeShard := func(ib *IndexBuilder) error {
159 if len(ib.repoList) != 1 {
160 return fmt.Errorf("expected ib to contain exactly 1 repository")
161 }
162 for _, ibFunc := range ibFuncs {
163 ibFunc(ib)
164 }
165
166 prefix := ""
167 if tenant.EnforceTenant() {
168 prefix = tenant.SrcPrefix(ib.repoList[0].TenantID, ib.repoList[0].ID)
169 } else {
170 prefix = ib.repoList[0].Name
171 }
172
173 fn := filepath.Join(dstDir, shardName(prefix, ib.indexFormatVersion, 0))
174 fnTmp := fn + ".tmp"
175 shardNames[fnTmp] = fn
176 return builderWriteAll(fnTmp, ib)
177 }
178
179 var ib *IndexBuilder
180 lastRepoID := -1
181 for docID := uint32(0); int(docID) < len(d.fileBranchMasks); docID++ {
182 repoID := int(d.repos[docID])
183
184 if d.repoMetaData[repoID].Tombstone {
185 continue
186 }
187
188 if repoID != lastRepoID {
189 if lastRepoID > repoID {
190 return shardNames, fmt.Errorf("non-contiguous repo ids in %s for document %d: old=%d current=%d", d.String(), docID, lastRepoID, repoID)
191 }
192 lastRepoID = repoID
193
194 if ib != nil {
195 if err := writeShard(ib); err != nil {
196 return shardNames, err
197 }
198 }
199
200 ib = newIndexBuilder()
201 ib.indexFormatVersion = IndexFormatVersion
202 if err := ib.setRepository(&d.repoMetaData[repoID]); err != nil {
203 return shardNames, err
204 }
205 }
206
207 err := addDocument(d, ib, repoID, docID)
208 if err != nil {
209 return shardNames, err
210 }
211 }
212
213 if ib != nil {
214 if err := writeShard(ib); err != nil {
215 return shardNames, err
216 }
217 }
218
219 return shardNames, nil
220}
221
222func addDocument(d *indexData, ib *IndexBuilder, repoID int, docID uint32) error {
223 doc := Document{
224 Name: string(d.fileName(docID)),
225 // Content set below since it can return an error
226 // Branches set below since it requires lookups
227 SubRepositoryPath: d.subRepoPaths[repoID][d.subRepos[docID]],
228 Language: d.languageMap[d.getLanguage(docID)],
229 // SkipReason not set, will be part of content from original indexer.
230 }
231
232 var err error
233 if doc.Content, err = d.readContents(docID); err != nil {
234 return err
235 }
236
237 if doc.Symbols, _, err = d.readDocSections(docID, nil); err != nil {
238 return err
239 }
240
241 doc.SymbolsMetaData = make([]*Symbol, len(doc.Symbols))
242 for i := range doc.SymbolsMetaData {
243 doc.SymbolsMetaData[i] = d.symbols.data(d.fileEndSymbol[docID] + uint32(i))
244 }
245
246 // calculate branches
247 {
248 mask := d.fileBranchMasks[docID]
249 id := uint32(1)
250 for mask != 0 {
251 if mask&0x1 != 0 {
252 doc.Branches = append(doc.Branches, d.branchNames[repoID][uint(id)])
253 }
254 id <<= 1
255 mask >>= 1
256 }
257 }
258 return ib.Add(doc)
259}
260
261// copied from builder package to avoid circular imports.
262func hashString(s string) string {
263 h := sha1.New()
264 _, _ = io.WriteString(h, s)
265 return fmt.Sprintf("%x", h.Sum(nil))
266}
267
268// copied from builder package to avoid circular imports.
269func shardName(prefix string, version, n int) string {
270 abs := url.QueryEscape(prefix)
271 if len(abs) > 200 {
272 abs = abs[:200] + hashString(abs)[:8]
273 }
274 return fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n)
275}