fork of https://github.com/sourcegraph/zoekt
1package zoekt
2
3import (
4 "crypto/sha1"
5 "fmt"
6 "io"
7 "log"
8 "net/url"
9 "os"
10 "path/filepath"
11 "runtime"
12 "sort"
13)
14
15// Merge files into a compound shard in dstDir. Merge returns tmpName and a
16// dstName. It is the responsibility of the caller to delete the input shards and
17// rename the temporary compound shard from tmpName to dstName.
18func Merge(dstDir string, files ...IndexFile) (tmpName, dstName string, _ error) {
19 var ds []*indexData
20 for _, f := range files {
21 searcher, err := NewSearcher(f)
22 if err != nil {
23 return "", "", err
24 }
25 ds = append(ds, searcher.(*indexData))
26 }
27
28 ib, err := merge(ds...)
29 if err != nil {
30 return "", "", err
31 }
32
33 hasher := sha1.New()
34 for _, d := range ds {
35 for i, md := range d.repoMetaData {
36 if d.repoMetaData[i].Tombstone {
37 continue
38 }
39 hasher.Write([]byte(md.Name))
40 hasher.Write([]byte{0})
41 }
42 }
43
44 dstName = filepath.Join(dstDir, fmt.Sprintf("compound-%x_v%d.%05d.zoekt", hasher.Sum(nil), NextIndexFormatVersion, 0))
45 tmpName = dstName + ".tmp"
46 if err := builderWriteAll(tmpName, ib); err != nil {
47 return "", "", err
48 }
49 return tmpName, dstName, nil
50}
51
52func builderWriteAll(fn string, ib *IndexBuilder) error {
53 dir := filepath.Dir(fn)
54 if err := os.MkdirAll(dir, 0o700); err != nil {
55 return err
56 }
57
58 f, err := os.CreateTemp(dir, filepath.Base(fn)+".*.tmp")
59 if err != nil {
60 return err
61 }
62 if runtime.GOOS != "windows" {
63 // umask?
64 if err := f.Chmod(0o666); err != nil {
65 return err
66 }
67 }
68
69 defer f.Close()
70 if err := ib.Write(f); err != nil {
71 return err
72 }
73 fi, err := f.Stat()
74 if err != nil {
75 return err
76 }
77 if err := f.Close(); err != nil {
78 return err
79 }
80
81 if err := os.Rename(f.Name(), fn); err != nil {
82 return err
83 }
84
85 log.Printf("finished shard %s: %d index bytes (overhead %3.1f)", fn, fi.Size(),
86 float64(fi.Size())/float64(ib.ContentSize()+1))
87
88 return nil
89}
90
91func merge(ds ...*indexData) (*IndexBuilder, error) {
92 if len(ds) == 0 {
93 return nil, fmt.Errorf("need 1 or more indexData to merge")
94 }
95
96 sort.Slice(ds, func(i, j int) bool {
97 return ds[i].repoMetaData[0].priority > ds[j].repoMetaData[0].priority
98 })
99
100 ib := newIndexBuilder()
101 ib.indexFormatVersion = NextIndexFormatVersion
102
103 for _, d := range ds {
104 lastRepoID := -1
105 for docID := uint32(0); int(docID) < len(d.fileBranchMasks); docID++ {
106 repoID := int(d.repos[docID])
107
108 if d.repoMetaData[repoID].Tombstone {
109 continue
110 }
111
112 if repoID != lastRepoID {
113 if lastRepoID > repoID {
114 return nil, fmt.Errorf("non-contiguous repo ids in %s for document %d: old=%d current=%d", d.String(), docID, lastRepoID, repoID)
115 }
116 lastRepoID = repoID
117
118 // TODO we are losing empty repos on merging since we only get here if
119 // there is an associated document.
120
121 if err := ib.setRepository(&d.repoMetaData[repoID]); err != nil {
122 return nil, err
123 }
124 }
125
126 if err := addDocument(d, ib, repoID, docID); err != nil {
127 return nil, err
128 }
129 }
130 }
131
132 return ib, nil
133}
134
135// Explode takes an IndexFile f and creates 1 simple shard per repository
136// contained in f. Explode returns a map of tmpName -> dstName. It is the
137// responsibility of the caller to rename the temporary shard(s) and delete the
138// input shard.
139func Explode(dstDir string, f IndexFile) (map[string]string, error) {
140 return explode(dstDir, f)
141}
142
143type indexBuilderFunc func(ib *IndexBuilder)
144
145// explode offers a richer signature compared to Explode for testing. You
146// probably want to call Explode instead.
147func explode(dstDir string, f IndexFile, ibFuncs ...indexBuilderFunc) (map[string]string, error) {
148 searcher, err := NewSearcher(f)
149 if err != nil {
150 return nil, err
151 }
152 d := searcher.(*indexData)
153
154 shardNames := make(map[string]string, len(d.repoMetaData))
155
156 writeShard := func(ib *IndexBuilder) error {
157 if len(ib.repoList) != 1 {
158 return fmt.Errorf("expected ib to contain exactly 1 repository")
159 }
160 for _, ibFunc := range ibFuncs {
161 ibFunc(ib)
162 }
163 fn := filepath.Join(dstDir, shardName(ib.repoList[0].Name, ib.indexFormatVersion, 0))
164 fnTmp := fn + ".tmp"
165 shardNames[fnTmp] = fn
166 return builderWriteAll(fnTmp, ib)
167 }
168
169 var ib *IndexBuilder
170 lastRepoID := -1
171 for docID := uint32(0); int(docID) < len(d.fileBranchMasks); docID++ {
172 repoID := int(d.repos[docID])
173
174 if d.repoMetaData[repoID].Tombstone {
175 continue
176 }
177
178 if repoID != lastRepoID {
179 if lastRepoID > repoID {
180 return shardNames, fmt.Errorf("non-contiguous repo ids in %s for document %d: old=%d current=%d", d.String(), docID, lastRepoID, repoID)
181 }
182 lastRepoID = repoID
183
184 if ib != nil {
185 if err := writeShard(ib); err != nil {
186 return shardNames, err
187 }
188 }
189
190 ib = newIndexBuilder()
191 ib.indexFormatVersion = IndexFormatVersion
192 if err := ib.setRepository(&d.repoMetaData[repoID]); err != nil {
193 return shardNames, err
194 }
195 }
196
197 err := addDocument(d, ib, repoID, docID)
198 if err != nil {
199 return shardNames, err
200 }
201 }
202
203 if ib != nil {
204 if err := writeShard(ib); err != nil {
205 return shardNames, err
206 }
207 }
208
209 return shardNames, nil
210}
211
212func addDocument(d *indexData, ib *IndexBuilder, repoID int, docID uint32) error {
213 doc := Document{
214 Name: string(d.fileName(docID)),
215 // Content set below since it can return an error
216 // Branches set below since it requires lookups
217 SubRepositoryPath: d.subRepoPaths[repoID][d.subRepos[docID]],
218 Language: d.languageMap[d.getLanguage(docID)],
219 // SkipReason not set, will be part of content from original indexer.
220 }
221
222 var err error
223 if doc.Content, err = d.readContents(docID); err != nil {
224 return err
225 }
226
227 if doc.Symbols, _, err = d.readDocSections(docID, nil); err != nil {
228 return err
229 }
230
231 doc.SymbolsMetaData = make([]*Symbol, len(doc.Symbols))
232 for i := range doc.SymbolsMetaData {
233 doc.SymbolsMetaData[i] = d.symbols.data(d.fileEndSymbol[docID] + uint32(i))
234 }
235
236 // calculate branches
237 {
238 mask := d.fileBranchMasks[docID]
239 id := uint32(1)
240 for mask != 0 {
241 if mask&0x1 != 0 {
242 doc.Branches = append(doc.Branches, d.branchNames[repoID][uint(id)])
243 }
244 id <<= 1
245 mask >>= 1
246 }
247 }
248 return ib.Add(doc)
249}
250
// hashString returns the SHA-1 digest of s as a lowercase hexadecimal string.
// It is a copy of the helper in the builder package, duplicated here to avoid
// a circular import.
func hashString(s string) string {
	digest := sha1.New()
	// The write result is deliberately discarded.
	_, _ = io.WriteString(digest, s)
	sum := digest.Sum(nil)
	return fmt.Sprintf("%x", sum)
}
257
258// copied from builder package to avoid circular imports.
259func shardName(name string, version, n int) string {
260 abs := url.QueryEscape(name)
261 if len(abs) > 200 {
262 abs = abs[:200] + hashString(abs)[:8]
263 }
264 return fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n)
265}