fork of https://github.com/sourcegraph/zoekt
1package zoekt
2
3import (
4 "crypto/sha1"
5 "fmt"
6 "io"
7 "log"
8 "os"
9 "path/filepath"
10 "runtime"
11 "sort"
12
13 "github.com/sourcegraph/zoekt/internal/tenant"
14)
15
16// Merge files into a compound shard in dstDir. Merge returns tmpName and a
17// dstName. It is the responsibility of the caller to delete the input shards and
18// rename the temporary compound shard from tmpName to dstName.
19func Merge(dstDir string, files ...IndexFile) (tmpName, dstName string, _ error) {
20 var ds []*indexData
21 for _, f := range files {
22 searcher, err := NewSearcher(f)
23 if err != nil {
24 return "", "", err
25 }
26 ds = append(ds, searcher.(*indexData))
27 }
28
29 ib, err := merge(ds...)
30 if err != nil {
31 return "", "", err
32 }
33
34 hasher := sha1.New()
35 for _, d := range ds {
36 for i, md := range d.repoMetaData {
37 if d.repoMetaData[i].Tombstone {
38 continue
39 }
40 hasher.Write([]byte(md.Name))
41 hasher.Write([]byte{0})
42 }
43 }
44
45 dstName = filepath.Join(dstDir, fmt.Sprintf("compound-%x_v%d.%05d.zoekt", hasher.Sum(nil), NextIndexFormatVersion, 0))
46 tmpName = dstName + ".tmp"
47 if err := builderWriteAll(tmpName, ib); err != nil {
48 return "", "", err
49 }
50 return tmpName, dstName, nil
51}
52
53func builderWriteAll(fn string, ib *IndexBuilder) error {
54 dir := filepath.Dir(fn)
55 if err := os.MkdirAll(dir, 0o700); err != nil {
56 return err
57 }
58
59 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
60 if err != nil {
61 return err
62 }
63 if runtime.GOOS != "windows" {
64 // umask?
65 if err := f.Chmod(0o666); err != nil {
66 return err
67 }
68 }
69
70 defer f.Close()
71 if err := ib.Write(f); err != nil {
72 return err
73 }
74 fi, err := f.Stat()
75 if err != nil {
76 return err
77 }
78 if err := f.Close(); err != nil {
79 return err
80 }
81
82 if err := os.Rename(f.Name(), fn); err != nil {
83 return err
84 }
85
86 log.Printf("finished shard %s: %d index bytes (overhead %3.1f)", fn, fi.Size(),
87 float64(fi.Size())/float64(ib.ContentSize()+1))
88
89 return nil
90}
91
92func merge(ds ...*indexData) (*IndexBuilder, error) {
93 if len(ds) == 0 {
94 return nil, fmt.Errorf("need 1 or more indexData to merge")
95 }
96
97 sort.Slice(ds, func(i, j int) bool {
98 return ds[i].repoMetaData[0].priority > ds[j].repoMetaData[0].priority
99 })
100
101 ib := newIndexBuilder()
102 ib.indexFormatVersion = NextIndexFormatVersion
103
104 for _, d := range ds {
105 lastRepoID := -1
106 for docID := uint32(0); int(docID) < len(d.fileBranchMasks); docID++ {
107 repoID := int(d.repos[docID])
108
109 if d.repoMetaData[repoID].Tombstone {
110 continue
111 }
112
113 if repoID != lastRepoID {
114 if lastRepoID > repoID {
115 return nil, fmt.Errorf("non-contiguous repo ids in %s for document %d: old=%d current=%d", d.String(), docID, lastRepoID, repoID)
116 }
117 lastRepoID = repoID
118
119 // TODO we are losing empty repos on merging since we only get here if
120 // there is an associated document.
121
122 if err := ib.setRepository(&d.repoMetaData[repoID]); err != nil {
123 return nil, err
124 }
125 }
126
127 if err := addDocument(d, ib, repoID, docID); err != nil {
128 return nil, err
129 }
130 }
131 }
132
133 return ib, nil
134}
135
136// Explode takes an IndexFile f and creates 1 simple shard per repository
137// contained in f. Explode returns a map of tmpName -> dstName. It is the
138// responsibility of the caller to rename the temporary shard(s) and delete the
139// input shard.
140func Explode(dstDir string, f IndexFile) (map[string]string, error) {
141 return explode(dstDir, f)
142}
143
144type indexBuilderFunc func(ib *IndexBuilder)
145
146// explode offers a richer signature compared to Explode for testing. You
147// probably want to call Explode instead.
148func explode(dstDir string, f IndexFile, ibFuncs ...indexBuilderFunc) (map[string]string, error) {
149 searcher, err := NewSearcher(f)
150 if err != nil {
151 return nil, err
152 }
153 d := searcher.(*indexData)
154
155 shardNames := make(map[string]string, len(d.repoMetaData))
156
157 writeShard := func(ib *IndexBuilder) error {
158 if len(ib.repoList) != 1 {
159 return fmt.Errorf("expected ib to contain exactly 1 repository")
160 }
161 for _, ibFunc := range ibFuncs {
162 ibFunc(ib)
163 }
164
165 prefix := ""
166 if tenant.EnforceTenant() {
167 prefix = tenant.SrcPrefix(ib.repoList[0].TenantID, ib.repoList[0].ID)
168 } else {
169 prefix = ib.repoList[0].Name
170 }
171
172 shardName := ShardName(dstDir, prefix, ib.indexFormatVersion, 0)
173 shardNameTmp := shardName + ".tmp"
174 shardNames[shardNameTmp] = shardName
175 return builderWriteAll(shardNameTmp, ib)
176 }
177
178 var ib *IndexBuilder
179 lastRepoID := -1
180 for docID := uint32(0); int(docID) < len(d.fileBranchMasks); docID++ {
181 repoID := int(d.repos[docID])
182
183 if d.repoMetaData[repoID].Tombstone {
184 continue
185 }
186
187 if repoID != lastRepoID {
188 if lastRepoID > repoID {
189 return shardNames, fmt.Errorf("non-contiguous repo ids in %s for document %d: old=%d current=%d", d.String(), docID, lastRepoID, repoID)
190 }
191 lastRepoID = repoID
192
193 if ib != nil {
194 if err := writeShard(ib); err != nil {
195 return shardNames, err
196 }
197 }
198
199 ib = newIndexBuilder()
200 ib.indexFormatVersion = IndexFormatVersion
201 if err := ib.setRepository(&d.repoMetaData[repoID]); err != nil {
202 return shardNames, err
203 }
204 }
205
206 err := addDocument(d, ib, repoID, docID)
207 if err != nil {
208 return shardNames, err
209 }
210 }
211
212 if ib != nil {
213 if err := writeShard(ib); err != nil {
214 return shardNames, err
215 }
216 }
217
218 return shardNames, nil
219}
220
221func addDocument(d *indexData, ib *IndexBuilder, repoID int, docID uint32) error {
222 doc := Document{
223 Name: string(d.fileName(docID)),
224 // Content set below since it can return an error
225 // Branches set below since it requires lookups
226 SubRepositoryPath: d.subRepoPaths[repoID][d.subRepos[docID]],
227 Language: d.languageMap[d.getLanguage(docID)],
228 // SkipReason not set, will be part of content from original indexer.
229 }
230
231 var err error
232 if doc.Content, err = d.readContents(docID); err != nil {
233 return err
234 }
235
236 if doc.Symbols, _, err = d.readDocSections(docID, nil); err != nil {
237 return err
238 }
239
240 doc.SymbolsMetaData = make([]*Symbol, len(doc.Symbols))
241 for i := range doc.SymbolsMetaData {
242 doc.SymbolsMetaData[i] = d.symbols.data(d.fileEndSymbol[docID] + uint32(i))
243 }
244
245 // calculate branches
246 {
247 mask := d.fileBranchMasks[docID]
248 id := uint32(1)
249 for mask != 0 {
250 if mask&0x1 != 0 {
251 doc.Branches = append(doc.Branches, d.branchNames[repoID][uint(id)])
252 }
253 id <<= 1
254 mask >>= 1
255 }
256 }
257 return ib.Add(doc)
258}
259
// hashString returns the SHA-1 digest of s as a lower-case hexadecimal string.
//
// Copied from the builder package to avoid circular imports.
func hashString(s string) string {
	digest := sha1.New()
	// Writing to a hash.Hash never returns an error.
	_, _ = io.WriteString(digest, s)
	sum := digest.Sum(nil)
	return fmt.Sprintf("%x", sum)
}
265}