fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package build
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "strings"
22 "sync"
23 "time"
24
25 "golang.org/x/exp/slices"
26
27 "github.com/sourcegraph/zoekt"
28 "github.com/sourcegraph/zoekt/ctags"
29)
30
31// Make sure all names are lowercase here, since they are normalized
32var enryLanguageMappings = map[string]string{
33 "c#": "c_sharp",
34}
35
36func normalizeLanguage(filetype string) string {
37 normalized := strings.ToLower(filetype)
38 if mapped, ok := enryLanguageMappings[normalized]; ok {
39 normalized = mapped
40 }
41
42 return normalized
43}
44
45func ctagsAddSymbolsParserMap(todo []*zoekt.Document, languageMap ctags.LanguageMap, parserMap ctags.ParserMap) error {
46 monitor := newMonitor()
47 defer monitor.Stop()
48
49 var tagsToSections tagsToSections
50
51 for _, doc := range todo {
52 if len(doc.Content) == 0 || doc.Symbols != nil {
53 continue
54 }
55
56 zoekt.DetermineLanguageIfUnknown(doc)
57
58 parserKind := languageMap[normalizeLanguage(doc.Language)]
59 if parserKind == ctags.NoCTags {
60 continue
61 }
62
63 // If the parser kind is unknown, default to universal-ctags
64 if parserKind == ctags.UnknownCTags {
65 parserKind = ctags.UniversalCTags
66 }
67
68 parser := parserMap[parserKind]
69 if parser == nil {
70 // this happens if CTagsMustSucceed is false and we didn't find the binary
71 continue
72 }
73
74 monitor.BeginParsing(doc)
75 es, err := parser.Parse(doc.Name, doc.Content)
76 monitor.EndParsing(es)
77
78 if err != nil {
79 return err
80 }
81 if len(es) == 0 {
82 continue
83 }
84
85 symOffsets, symMetaData, err := tagsToSections.Convert(doc.Content, es)
86 if err != nil {
87 return fmt.Errorf("%s: %v", doc.Name, err)
88 }
89 doc.Symbols = symOffsets
90 doc.SymbolsMetaData = symMetaData
91 }
92
93 return nil
94}
95
96// overlaps finds the proper position to insert a zoekt.DocumentSection with
97// "start and "end" into "symOffsets". It returns -1 if the new section overlaps
98// with one of the existing ones.
99func overlaps(symOffsets []zoekt.DocumentSection, start, end uint32) int {
100 i := 0
101 for i = len(symOffsets) - 1; i >= 0; i-- {
102 // The most common case is that we exit here, because symOffsets is sorted by
103 // construction and start is in many cases monotonically increasing.
104 if start >= symOffsets[i].End {
105 break
106 }
107 if end <= symOffsets[i].Start {
108 continue
109 }
110 // overlap
111 return -1
112 }
113 return i + 1
114}
115
116// tagsToSections contains buffers to be reused between conversions of bytes
117// ranges to metadata. This is done to reduce pressure on the garbage
118// collector.
119type tagsToSections struct {
120 nlsBuf []uint32
121}
122
123// Convert ctags entries to byte ranges (zoekt.DocumentSection) with
124// corresponding metadata (zoekt.Symbol).
125//
126// This can not be called concurrently.
127func (t *tagsToSections) Convert(content []byte, tags []*ctags.Entry) ([]zoekt.DocumentSection, []*zoekt.Symbol, error) {
128 nls := t.newLinesIndices(content)
129 symOffsets := make([]zoekt.DocumentSection, 0, len(tags))
130 symMetaData := make([]*zoekt.Symbol, 0, len(tags))
131
132 for _, t := range tags {
133 if t.Line <= 0 {
134 // Observed this with a .JS file.
135 continue
136 }
137 lineIdx := t.Line - 1
138 if lineIdx >= len(nls) {
139 // Observed this with a .TS file.
140 continue
141 }
142
143 lineOff := uint32(0)
144 if lineIdx > 0 {
145 lineOff = nls[lineIdx-1] + 1
146 }
147
148 end := nls[lineIdx]
149 line := content[lineOff:end]
150
151 // This is best-effort only. For short symbol names, we will often determine the
152 // wrong offset.
153 intraOff := bytes.Index(line, []byte(t.Name))
154 if intraOff < 0 {
155 // for Go code, this is very common, since
156 // ctags barfs on multi-line declarations
157 continue
158 }
159
160 start := lineOff + uint32(intraOff)
161 endSym := start + uint32(len(t.Name))
162
163 i := overlaps(symOffsets, start, endSym)
164 if i == -1 {
165 // Detected an overlap. Give up.
166 continue
167 }
168
169 symOffsets = slices.Insert(symOffsets, i, zoekt.DocumentSection{
170 Start: start,
171 End: endSym,
172 })
173 symMetaData = slices.Insert(symMetaData, i, &zoekt.Symbol{
174 Sym: t.Name,
175 Kind: t.Kind,
176 Parent: t.Parent,
177 ParentKind: t.ParentKind,
178 })
179 }
180
181 return symOffsets, symMetaData, nil
182}
183
184// newLinesIndices returns an array of all indexes of '\n' aswell as a final
185// value for the length of the document.
186func (t *tagsToSections) newLinesIndices(in []byte) []uint32 {
187 // reuse nlsBuf between calls to tagsToSections.Convert
188 out := t.nlsBuf
189 if out == nil {
190 out = make([]uint32, 0, len(in)/30)
191 }
192
193 finalEntry := uint32(len(in))
194 off := uint32(0)
195 for len(in) > 0 {
196 i := bytes.IndexByte(in, '\n')
197 if i < 0 {
198 out = append(out, finalEntry)
199 break
200 }
201
202 off += uint32(i)
203 out = append(out, off)
204
205 in = in[i+1:]
206 off++
207 }
208
209 // save buffer for reuse
210 t.nlsBuf = out[:0]
211
212 return out
213}
214
215// monitorReportStuck is how long we need to be analysing a document before
216// reporting it to stdout.
217const monitorReportStuck = 10 * time.Second
218
219// monitorReportStatus is how often we given status updates
220const monitorReportStatus = time.Minute
221
222type monitor struct {
223 mu sync.Mutex
224
225 lastUpdate time.Time
226
227 start time.Time
228 totalSize int
229 totalSymbols int
230
231 currentDocName string
232 currentDocSize int
233 currentDocStuckCount int
234
235 done chan struct{}
236}
237
238func newMonitor() *monitor {
239 start := time.Now()
240 m := &monitor{
241 start: start,
242 lastUpdate: start,
243 done: make(chan struct{}),
244 }
245 go m.run()
246 return m
247}
248
249func (m *monitor) BeginParsing(doc *zoekt.Document) {
250 now := time.Now()
251 m.mu.Lock()
252 m.lastUpdate = now
253
254 // set current doc
255 m.currentDocName = doc.Name
256 m.currentDocSize = len(doc.Content)
257
258 m.mu.Unlock()
259}
260
261func (m *monitor) EndParsing(entries []*ctags.Entry) {
262 now := time.Now()
263 m.mu.Lock()
264 m.lastUpdate = now
265
266 // update aggregate stats
267 m.totalSize += m.currentDocSize
268 m.totalSymbols += len(entries)
269
270 // inform done if we warned about current document
271 if m.currentDocStuckCount > 0 {
272 log.Printf("symbol analysis for %s (size %d bytes) is done and found %d symbols", m.currentDocName, m.currentDocSize, len(entries))
273 m.currentDocStuckCount = 0
274 }
275
276 // unset current document
277 m.currentDocName = ""
278 m.currentDocSize = 0
279
280 m.mu.Unlock()
281}
282
283func (m *monitor) Stop() {
284 close(m.done)
285}
286
287func (m *monitor) run() {
288 stuckTicker := time.NewTicker(monitorReportStuck / 2) // half due to sampling theorem (nyquist)
289 statusTicker := time.NewTicker(monitorReportStatus)
290
291 defer stuckTicker.Stop()
292 defer statusTicker.Stop()
293
294 for {
295 select {
296 case <-m.done:
297 now := time.Now()
298 m.mu.Lock()
299 log.Printf("symbol analysis finished for shard statistics: duration=%v symbols=%d bytes=%d", now.Sub(m.start).Truncate(time.Second), m.totalSymbols, m.totalSize)
300 m.mu.Unlock()
301 return
302
303 case <-stuckTicker.C:
304 now := time.Now()
305 m.mu.Lock()
306 running := now.Sub(m.lastUpdate).Truncate(time.Second)
307 report := monitorReportStuck * (1 << m.currentDocStuckCount) // double the amount of time each time we report
308 if m.currentDocName != "" && running >= report {
309 m.currentDocStuckCount++
310 log.Printf("WARN: symbol analysis for %s (%d bytes) has been running for %v", m.currentDocName, m.currentDocSize, running)
311 }
312 m.mu.Unlock()
313
314 case <-statusTicker.C:
315 now := time.Now()
316 m.mu.Lock()
317 log.Printf("DEBUG: symbol analysis still running for shard statistics: duration=%v symbols=%d bytes=%d", now.Sub(m.start).Truncate(time.Second), m.totalSymbols, m.totalSize)
318 m.mu.Unlock()
319 }
320 }
321}