fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package build
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "strings"
22 "sync"
23 "time"
24
25 "golang.org/x/exp/slices"
26
27 "github.com/sourcegraph/zoekt"
28 "github.com/sourcegraph/zoekt/ctags"
29)
30
31// Make sure all names are lowercase here, since they are normalized
32var enryLanguageMappings = map[string]string{
33 "c#": "c_sharp",
34}
35
36func normalizeLanguage(filetype string) string {
37 normalized := strings.ToLower(filetype)
38 if mapped, ok := enryLanguageMappings[normalized]; ok {
39 normalized = mapped
40 }
41
42 return normalized
43}
44
45func ctagsAddSymbolsParserMap(todo []*zoekt.Document, languageMap ctags.LanguageMap, parserMap ctags.ParserMap) error {
46 monitor := newMonitor()
47 defer monitor.Stop()
48
49 var tagsToSections tagsToSections
50
51 for _, doc := range todo {
52 if doc.Symbols != nil {
53 continue
54 }
55
56 zoekt.DetermineLanguageIfUnknown(doc)
57
58 parserKind := languageMap[normalizeLanguage(doc.Language)]
59 if parserKind == ctags.NoCTags {
60 continue
61 }
62
63 parser := parserMap[parserKind]
64 if parser == nil {
65 parser = parserMap[ctags.UniversalCTags]
66 if parser == nil {
67 // this happens if CTagsMustSucceed is not true and we didn't find universal-ctags
68 continue
69 }
70 }
71
72 monitor.BeginParsing(doc)
73 es, err := parser.Parse(doc.Name, doc.Content)
74 monitor.EndParsing(es)
75
76 if err != nil {
77 return err
78 }
79 if len(es) == 0 {
80 continue
81 }
82
83 symOffsets, symMetaData, err := tagsToSections.Convert(doc.Content, es)
84 if err != nil {
85 return fmt.Errorf("%s: %v", doc.Name, err)
86 }
87 doc.Symbols = symOffsets
88 doc.SymbolsMetaData = symMetaData
89 }
90
91 return nil
92}
93
94// overlaps finds the proper position to insert a zoekt.DocumentSection with
95// "start and "end" into "symOffsets". It returns -1 if the new section overlaps
96// with one of the existing ones.
97func overlaps(symOffsets []zoekt.DocumentSection, start, end uint32) int {
98 i := 0
99 for i = len(symOffsets) - 1; i >= 0; i-- {
100 // The most common case is that we exit here, because symOffsets is sorted by
101 // construction and start is in many cases monotonically increasing.
102 if start >= symOffsets[i].End {
103 break
104 }
105 if end <= symOffsets[i].Start {
106 continue
107 }
108 // overlap
109 return -1
110 }
111 return i + 1
112}
113
114// tagsToSections contains buffers to be reused between conversions of bytes
115// ranges to metadata. This is done to reduce pressure on the garbage
116// collector.
117type tagsToSections struct {
118 nlsBuf []uint32
119}
120
121// Convert ctags entries to byte ranges (zoekt.DocumentSection) with
122// corresponding metadata (zoekt.Symbol).
123//
124// This can not be called concurrently.
125func (t *tagsToSections) Convert(content []byte, tags []*ctags.Entry) ([]zoekt.DocumentSection, []*zoekt.Symbol, error) {
126 nls := t.newLinesIndices(content)
127 symOffsets := make([]zoekt.DocumentSection, 0, len(tags))
128 symMetaData := make([]*zoekt.Symbol, 0, len(tags))
129
130 for _, t := range tags {
131 if t.Line <= 0 {
132 // Observed this with a .JS file.
133 continue
134 }
135 lineIdx := t.Line - 1
136 if lineIdx >= len(nls) {
137 return nil, nil, fmt.Errorf("linenum for entry out of range %v", t)
138 }
139
140 lineOff := uint32(0)
141 if lineIdx > 0 {
142 lineOff = nls[lineIdx-1] + 1
143 }
144
145 end := nls[lineIdx]
146 line := content[lineOff:end]
147
148 // This is best-effort only. For short symbol names, we will often determine the
149 // wrong offset.
150 intraOff := bytes.Index(line, []byte(t.Name))
151 if intraOff < 0 {
152 // for Go code, this is very common, since
153 // ctags barfs on multi-line declarations
154 continue
155 }
156
157 start := lineOff + uint32(intraOff)
158 endSym := start + uint32(len(t.Name))
159
160 i := overlaps(symOffsets, start, endSym)
161 if i == -1 {
162 // Detected an overlap. Give up.
163 continue
164 }
165
166 symOffsets = slices.Insert(symOffsets, i, zoekt.DocumentSection{
167 Start: start,
168 End: endSym,
169 })
170 symMetaData = slices.Insert(symMetaData, i, &zoekt.Symbol{
171 Sym: t.Name,
172 Kind: t.Kind,
173 Parent: t.Parent,
174 ParentKind: t.ParentKind,
175 })
176 }
177
178 return symOffsets, symMetaData, nil
179}
180
181// newLinesIndices returns an array of all indexes of '\n' aswell as a final
182// value for the length of the document.
183func (t *tagsToSections) newLinesIndices(in []byte) []uint32 {
184 // reuse nlsBuf between calls to tagsToSections.Convert
185 out := t.nlsBuf
186 if out == nil {
187 out = make([]uint32, 0, len(in)/30)
188 }
189
190 finalEntry := uint32(len(in))
191 off := uint32(0)
192 for len(in) > 0 {
193 i := bytes.IndexByte(in, '\n')
194 if i < 0 {
195 out = append(out, finalEntry)
196 break
197 }
198
199 off += uint32(i)
200 out = append(out, off)
201
202 in = in[i+1:]
203 off++
204 }
205
206 // save buffer for reuse
207 t.nlsBuf = out[:0]
208
209 return out
210}
211
212// monitorReportStuck is how long we need to be analysing a document before
213// reporting it to stdout.
214const monitorReportStuck = 10 * time.Second
215
216// monitorReportStatus is how often we given status updates
217const monitorReportStatus = time.Minute
218
219type monitor struct {
220 mu sync.Mutex
221
222 lastUpdate time.Time
223
224 start time.Time
225 totalSize int
226 totalSymbols int
227
228 currentDocName string
229 currentDocSize int
230 currentDocStuckCount int
231
232 done chan struct{}
233}
234
235func newMonitor() *monitor {
236 start := time.Now()
237 m := &monitor{
238 start: start,
239 lastUpdate: start,
240 done: make(chan struct{}),
241 }
242 go m.run()
243 return m
244}
245
246func (m *monitor) BeginParsing(doc *zoekt.Document) {
247 now := time.Now()
248 m.mu.Lock()
249 m.lastUpdate = now
250
251 // set current doc
252 m.currentDocName = doc.Name
253 m.currentDocSize = len(doc.Content)
254
255 m.mu.Unlock()
256}
257
258func (m *monitor) EndParsing(entries []*ctags.Entry) {
259 now := time.Now()
260 m.mu.Lock()
261 m.lastUpdate = now
262
263 // update aggregate stats
264 m.totalSize += m.currentDocSize
265 m.totalSymbols += len(entries)
266
267 // inform done if we warned about current document
268 if m.currentDocStuckCount > 0 {
269 log.Printf("symbol analysis for %s (size %d bytes) is done and found %d symbols", m.currentDocName, m.currentDocSize, len(entries))
270 m.currentDocStuckCount = 0
271 }
272
273 // unset current document
274 m.currentDocName = ""
275 m.currentDocSize = 0
276
277 m.mu.Unlock()
278}
279
280func (m *monitor) Stop() {
281 close(m.done)
282}
283
284func (m *monitor) run() {
285 stuckTicker := time.NewTicker(monitorReportStuck / 2) // half due to sampling theorem (nyquist)
286 statusTicker := time.NewTicker(monitorReportStatus)
287
288 defer stuckTicker.Stop()
289 defer statusTicker.Stop()
290
291 for {
292 select {
293 case <-m.done:
294 now := time.Now()
295 m.mu.Lock()
296 log.Printf("symbol analysis finished for shard statistics: duration=%v symbols=%d bytes=%d", now.Sub(m.start).Truncate(time.Second), m.totalSymbols, m.totalSize)
297 m.mu.Unlock()
298 return
299
300 case <-stuckTicker.C:
301 now := time.Now()
302 m.mu.Lock()
303 running := now.Sub(m.lastUpdate).Truncate(time.Second)
304 report := monitorReportStuck * (1 << m.currentDocStuckCount) // double the amount of time each time we report
305 if m.currentDocName != "" && running >= report {
306 m.currentDocStuckCount++
307 log.Printf("WARN: symbol analysis for %s (%d bytes) has been running for %v", m.currentDocName, m.currentDocSize, running)
308 }
309 m.mu.Unlock()
310
311 case <-statusTicker.C:
312 now := time.Now()
313 m.mu.Lock()
314 log.Printf("DEBUG: symbol analysis still running for shard statistics: duration=%v symbols=%d bytes=%d", now.Sub(m.start).Truncate(time.Second), m.totalSymbols, m.totalSize)
315 m.mu.Unlock()
316 }
317 }
318}