fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "slices"
22 "strings"
23 "sync"
24 "time"
25
26 "github.com/sourcegraph/zoekt"
27 "github.com/sourcegraph/zoekt/internal/ctags"
28)
29
30// Make sure all names are lowercase here, since they are normalized
31var enryLanguageMappings = map[string]string{
32 "c#": "c_sharp",
33}
34
35func normalizeLanguage(filetype string) string {
36 normalized := strings.ToLower(filetype)
37 if mapped, ok := enryLanguageMappings[normalized]; ok {
38 normalized = mapped
39 }
40
41 return normalized
42}
43
44func parseSymbols(todo []*Document, languageMap ctags.LanguageMap, parserBins ctags.ParserBinMap) error {
45 monitor := newMonitor()
46 defer monitor.Stop()
47
48 var tagsToSections tagsToSections
49
50 parser := ctags.NewCTagsParser(parserBins)
51 defer parser.Close()
52
53 for _, doc := range todo {
54 if len(doc.Content) == 0 || doc.Symbols != nil {
55 continue
56 }
57
58 DetermineLanguageIfUnknown(doc)
59
60 parserType := languageMap[normalizeLanguage(doc.Language)]
61 if parserType == ctags.NoCTags {
62 continue
63 }
64
65 // If the parser type is unknown, default to universal-ctags
66 if parserType == ctags.UnknownCTags {
67 parserType = ctags.UniversalCTags
68 }
69
70 monitor.BeginParsing(doc)
71 es, err := parser.Parse(doc.Name, doc.Content, parserType)
72 monitor.EndParsing(es)
73
74 if err != nil {
75 return err
76 }
77 if len(es) == 0 {
78 continue
79 }
80
81 symOffsets, symMetaData, err := tagsToSections.Convert(doc.Content, es)
82 if err != nil {
83 return fmt.Errorf("%s: %v", doc.Name, err)
84 }
85 doc.Symbols = symOffsets
86 doc.SymbolsMetaData = symMetaData
87 }
88
89 return nil
90}
91
92// overlaps finds the proper position to insert a zoekt.DocumentSection with
93// "start and "end" into "symOffsets". It returns -1 if the new section overlaps
94// with one of the existing ones.
95func overlaps(symOffsets []DocumentSection, start, end uint32) int {
96 i := 0
97 for i = len(symOffsets) - 1; i >= 0; i-- {
98 // The most common case is that we exit here, because symOffsets is sorted by
99 // construction and start is in many cases monotonically increasing.
100 if start >= symOffsets[i].End {
101 break
102 }
103 if end <= symOffsets[i].Start {
104 continue
105 }
106 // overlap
107 return -1
108 }
109 return i + 1
110}
111
112// tagsToSections contains buffers to be reused between conversions of bytes
113// ranges to metadata. This is done to reduce pressure on the garbage
114// collector.
115type tagsToSections struct {
116 nlsBuf []uint32
117}
118
119// Convert ctags entries to byte ranges (zoekt.DocumentSection) with
120// corresponding metadata (zoekt.Symbol).
121//
122// This can not be called concurrently.
123func (t *tagsToSections) Convert(content []byte, tags []*ctags.Entry) ([]DocumentSection, []*zoekt.Symbol, error) {
124 nls := t.newLinesIndices(content)
125 symOffsets := make([]DocumentSection, 0, len(tags))
126 symMetaData := make([]*zoekt.Symbol, 0, len(tags))
127
128 for _, t := range tags {
129 if t.Line <= 0 {
130 // Observed this with a .JS file.
131 continue
132 }
133 lineIdx := t.Line - 1
134 if lineIdx >= len(nls) {
135 // Observed this with a .TS file.
136 continue
137 }
138
139 lineOff := uint32(0)
140 if lineIdx > 0 {
141 lineOff = nls[lineIdx-1] + 1
142 }
143
144 end := nls[lineIdx]
145 line := content[lineOff:end]
146
147 // This is best-effort only. For short symbol names, we will often determine the
148 // wrong offset.
149 intraOff := bytes.Index(line, []byte(t.Name))
150 if intraOff < 0 {
151 // for Go code, this is very common, since
152 // ctags barfs on multi-line declarations
153 continue
154 }
155
156 start := lineOff + uint32(intraOff)
157 endSym := start + uint32(len(t.Name))
158
159 i := overlaps(symOffsets, start, endSym)
160 if i == -1 {
161 // Detected an overlap. Give up.
162 continue
163 }
164
165 symOffsets = slices.Insert(symOffsets, i, DocumentSection{
166 Start: start,
167 End: endSym,
168 })
169 symMetaData = slices.Insert(symMetaData, i, &zoekt.Symbol{
170 Sym: t.Name,
171 Kind: t.Kind,
172 Parent: t.Parent,
173 ParentKind: t.ParentKind,
174 })
175 }
176
177 return symOffsets, symMetaData, nil
178}
179
180// newLinesIndices returns an array of all indexes of '\n' aswell as a final
181// value for the length of the document.
182func (t *tagsToSections) newLinesIndices(in []byte) []uint32 {
183 // reuse nlsBuf between calls to tagsToSections.Convert
184 out := t.nlsBuf
185 if out == nil {
186 out = make([]uint32, 0, len(in)/30)
187 }
188
189 finalEntry := uint32(len(in))
190 off := uint32(0)
191 for len(in) > 0 {
192 i := bytes.IndexByte(in, '\n')
193 if i < 0 {
194 out = append(out, finalEntry)
195 break
196 }
197
198 off += uint32(i)
199 out = append(out, off)
200
201 in = in[i+1:]
202 off++
203 }
204
205 // save buffer for reuse
206 t.nlsBuf = out[:0]
207
208 return out
209}
210
211// monitorReportStuck is how long we need to be analysing a document before
212// reporting it to stdout.
213const monitorReportStuck = 10 * time.Second
214
215// monitorReportStatus is how often we given status updates
216const monitorReportStatus = time.Minute
217
218type monitor struct {
219 mu sync.Mutex
220
221 lastUpdate time.Time
222
223 start time.Time
224 totalSize int
225 totalSymbols int
226
227 currentDocName string
228 currentDocSize int
229 currentDocStuckCount int
230
231 done chan struct{}
232}
233
234func newMonitor() *monitor {
235 start := time.Now()
236 m := &monitor{
237 start: start,
238 lastUpdate: start,
239 done: make(chan struct{}),
240 }
241 go m.run()
242 return m
243}
244
245func (m *monitor) BeginParsing(doc *Document) {
246 now := time.Now()
247 m.mu.Lock()
248 m.lastUpdate = now
249
250 // set current doc
251 m.currentDocName = doc.Name
252 m.currentDocSize = len(doc.Content)
253
254 m.mu.Unlock()
255}
256
257func (m *monitor) EndParsing(entries []*ctags.Entry) {
258 now := time.Now()
259 m.mu.Lock()
260 m.lastUpdate = now
261
262 // update aggregate stats
263 m.totalSize += m.currentDocSize
264 m.totalSymbols += len(entries)
265
266 // inform done if we warned about current document
267 if m.currentDocStuckCount > 0 {
268 log.Printf("symbol analysis for %s (size %d bytes) is done and found %d symbols", m.currentDocName, m.currentDocSize, len(entries))
269 m.currentDocStuckCount = 0
270 }
271
272 // unset current document
273 m.currentDocName = ""
274 m.currentDocSize = 0
275
276 m.mu.Unlock()
277}
278
279func (m *monitor) Stop() {
280 close(m.done)
281}
282
283func (m *monitor) run() {
284 stuckTicker := time.NewTicker(monitorReportStuck / 2) // half due to sampling theorem (nyquist)
285 statusTicker := time.NewTicker(monitorReportStatus)
286
287 defer stuckTicker.Stop()
288 defer statusTicker.Stop()
289
290 for {
291 select {
292 case <-m.done:
293 now := time.Now()
294 m.mu.Lock()
295 log.Printf("symbol analysis finished for shard statistics: duration=%v symbols=%d bytes=%d", now.Sub(m.start).Truncate(time.Second), m.totalSymbols, m.totalSize)
296 m.mu.Unlock()
297 return
298
299 case <-stuckTicker.C:
300 now := time.Now()
301 m.mu.Lock()
302 running := now.Sub(m.lastUpdate).Truncate(time.Second)
303 report := monitorReportStuck * (1 << m.currentDocStuckCount) // double the amount of time each time we report
304 if m.currentDocName != "" && running >= report {
305 m.currentDocStuckCount++
306 log.Printf("WARN: symbol analysis for %s (%d bytes) has been running for %v", m.currentDocName, m.currentDocSize, running)
307 }
308 m.mu.Unlock()
309
310 case <-statusTicker.C:
311 now := time.Now()
312 m.mu.Lock()
313 log.Printf("DEBUG: symbol analysis still running for shard statistics: duration=%v symbols=%d bytes=%d", now.Sub(m.start).Truncate(time.Second), m.totalSymbols, m.totalSize)
314 m.mu.Unlock()
315 }
316 }
317}