fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package build
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "strings"
22 "sync"
23 "time"
24
25 "golang.org/x/exp/slices"
26
27 "github.com/sourcegraph/zoekt"
28 "github.com/sourcegraph/zoekt/ctags"
29)
30
31// Make sure all names are lowercase here, since they are normalized
32var enryLanguageMappings = map[string]string{
33 "c#": "c_sharp",
34}
35
36func normalizeLanguage(filetype string) string {
37 normalized := strings.ToLower(filetype)
38 if mapped, ok := enryLanguageMappings[normalized]; ok {
39 normalized = mapped
40 }
41
42 return normalized
43}
44
45func parseSymbols(todo []*zoekt.Document, languageMap ctags.LanguageMap, parserBins ctags.ParserBinMap) error {
46 monitor := newMonitor()
47 defer monitor.Stop()
48
49 var tagsToSections tagsToSections
50
51 parser := ctags.NewCTagsParser(parserBins)
52 defer parser.Close()
53
54 for _, doc := range todo {
55 if len(doc.Content) == 0 || doc.Symbols != nil {
56 continue
57 }
58
59 zoekt.DetermineLanguageIfUnknown(doc)
60
61 parserType := languageMap[normalizeLanguage(doc.Language)]
62 if parserType == ctags.NoCTags {
63 continue
64 }
65
66 // If the parser type is unknown, default to universal-ctags
67 if parserType == ctags.UnknownCTags {
68 parserType = ctags.UniversalCTags
69 }
70
71 monitor.BeginParsing(doc)
72 es, err := parser.Parse(doc.Name, doc.Content, parserType)
73 monitor.EndParsing(es)
74
75 if err != nil {
76 return err
77 }
78 if len(es) == 0 {
79 continue
80 }
81
82 symOffsets, symMetaData, err := tagsToSections.Convert(doc.Content, es)
83 if err != nil {
84 return fmt.Errorf("%s: %v", doc.Name, err)
85 }
86 doc.Symbols = symOffsets
87 doc.SymbolsMetaData = symMetaData
88 }
89
90 return nil
91}
92
93// overlaps finds the proper position to insert a zoekt.DocumentSection with
94// "start and "end" into "symOffsets". It returns -1 if the new section overlaps
95// with one of the existing ones.
96func overlaps(symOffsets []zoekt.DocumentSection, start, end uint32) int {
97 i := 0
98 for i = len(symOffsets) - 1; i >= 0; i-- {
99 // The most common case is that we exit here, because symOffsets is sorted by
100 // construction and start is in many cases monotonically increasing.
101 if start >= symOffsets[i].End {
102 break
103 }
104 if end <= symOffsets[i].Start {
105 continue
106 }
107 // overlap
108 return -1
109 }
110 return i + 1
111}
112
113// tagsToSections contains buffers to be reused between conversions of bytes
114// ranges to metadata. This is done to reduce pressure on the garbage
115// collector.
116type tagsToSections struct {
117 nlsBuf []uint32
118}
119
120// Convert ctags entries to byte ranges (zoekt.DocumentSection) with
121// corresponding metadata (zoekt.Symbol).
122//
123// This can not be called concurrently.
124func (t *tagsToSections) Convert(content []byte, tags []*ctags.Entry) ([]zoekt.DocumentSection, []*zoekt.Symbol, error) {
125 nls := t.newLinesIndices(content)
126 symOffsets := make([]zoekt.DocumentSection, 0, len(tags))
127 symMetaData := make([]*zoekt.Symbol, 0, len(tags))
128
129 for _, t := range tags {
130 if t.Line <= 0 {
131 // Observed this with a .JS file.
132 continue
133 }
134 lineIdx := t.Line - 1
135 if lineIdx >= len(nls) {
136 // Observed this with a .TS file.
137 continue
138 }
139
140 lineOff := uint32(0)
141 if lineIdx > 0 {
142 lineOff = nls[lineIdx-1] + 1
143 }
144
145 end := nls[lineIdx]
146 line := content[lineOff:end]
147
148 // This is best-effort only. For short symbol names, we will often determine the
149 // wrong offset.
150 intraOff := bytes.Index(line, []byte(t.Name))
151 if intraOff < 0 {
152 // for Go code, this is very common, since
153 // ctags barfs on multi-line declarations
154 continue
155 }
156
157 start := lineOff + uint32(intraOff)
158 endSym := start + uint32(len(t.Name))
159
160 i := overlaps(symOffsets, start, endSym)
161 if i == -1 {
162 // Detected an overlap. Give up.
163 continue
164 }
165
166 symOffsets = slices.Insert(symOffsets, i, zoekt.DocumentSection{
167 Start: start,
168 End: endSym,
169 })
170 symMetaData = slices.Insert(symMetaData, i, &zoekt.Symbol{
171 Sym: t.Name,
172 Kind: t.Kind,
173 Parent: t.Parent,
174 ParentKind: t.ParentKind,
175 })
176 }
177
178 return symOffsets, symMetaData, nil
179}
180
181// newLinesIndices returns an array of all indexes of '\n' aswell as a final
182// value for the length of the document.
183func (t *tagsToSections) newLinesIndices(in []byte) []uint32 {
184 // reuse nlsBuf between calls to tagsToSections.Convert
185 out := t.nlsBuf
186 if out == nil {
187 out = make([]uint32, 0, len(in)/30)
188 }
189
190 finalEntry := uint32(len(in))
191 off := uint32(0)
192 for len(in) > 0 {
193 i := bytes.IndexByte(in, '\n')
194 if i < 0 {
195 out = append(out, finalEntry)
196 break
197 }
198
199 off += uint32(i)
200 out = append(out, off)
201
202 in = in[i+1:]
203 off++
204 }
205
206 // save buffer for reuse
207 t.nlsBuf = out[:0]
208
209 return out
210}
211
212// monitorReportStuck is how long we need to be analysing a document before
213// reporting it to stdout.
214const monitorReportStuck = 10 * time.Second
215
216// monitorReportStatus is how often we given status updates
217const monitorReportStatus = time.Minute
218
219type monitor struct {
220 mu sync.Mutex
221
222 lastUpdate time.Time
223
224 start time.Time
225 totalSize int
226 totalSymbols int
227
228 currentDocName string
229 currentDocSize int
230 currentDocStuckCount int
231
232 done chan struct{}
233}
234
235func newMonitor() *monitor {
236 start := time.Now()
237 m := &monitor{
238 start: start,
239 lastUpdate: start,
240 done: make(chan struct{}),
241 }
242 go m.run()
243 return m
244}
245
246func (m *monitor) BeginParsing(doc *zoekt.Document) {
247 now := time.Now()
248 m.mu.Lock()
249 m.lastUpdate = now
250
251 // set current doc
252 m.currentDocName = doc.Name
253 m.currentDocSize = len(doc.Content)
254
255 m.mu.Unlock()
256}
257
258func (m *monitor) EndParsing(entries []*ctags.Entry) {
259 now := time.Now()
260 m.mu.Lock()
261 m.lastUpdate = now
262
263 // update aggregate stats
264 m.totalSize += m.currentDocSize
265 m.totalSymbols += len(entries)
266
267 // inform done if we warned about current document
268 if m.currentDocStuckCount > 0 {
269 log.Printf("symbol analysis for %s (size %d bytes) is done and found %d symbols", m.currentDocName, m.currentDocSize, len(entries))
270 m.currentDocStuckCount = 0
271 }
272
273 // unset current document
274 m.currentDocName = ""
275 m.currentDocSize = 0
276
277 m.mu.Unlock()
278}
279
280func (m *monitor) Stop() {
281 close(m.done)
282}
283
284func (m *monitor) run() {
285 stuckTicker := time.NewTicker(monitorReportStuck / 2) // half due to sampling theorem (nyquist)
286 statusTicker := time.NewTicker(monitorReportStatus)
287
288 defer stuckTicker.Stop()
289 defer statusTicker.Stop()
290
291 for {
292 select {
293 case <-m.done:
294 now := time.Now()
295 m.mu.Lock()
296 log.Printf("symbol analysis finished for shard statistics: duration=%v symbols=%d bytes=%d", now.Sub(m.start).Truncate(time.Second), m.totalSymbols, m.totalSize)
297 m.mu.Unlock()
298 return
299
300 case <-stuckTicker.C:
301 now := time.Now()
302 m.mu.Lock()
303 running := now.Sub(m.lastUpdate).Truncate(time.Second)
304 report := monitorReportStuck * (1 << m.currentDocStuckCount) // double the amount of time each time we report
305 if m.currentDocName != "" && running >= report {
306 m.currentDocStuckCount++
307 log.Printf("WARN: symbol analysis for %s (%d bytes) has been running for %v", m.currentDocName, m.currentDocSize, running)
308 }
309 m.mu.Unlock()
310
311 case <-statusTicker.C:
312 now := time.Now()
313 m.mu.Lock()
314 log.Printf("DEBUG: symbol analysis still running for shard statistics: duration=%v symbols=%d bytes=%d", now.Sub(m.start).Truncate(time.Second), m.totalSymbols, m.totalSize)
315 m.mu.Unlock()
316 }
317 }
318}