fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "slices"
22 "strings"
23 "sync"
24 "time"
25
26 "github.com/sourcegraph/zoekt"
27 "github.com/sourcegraph/zoekt/internal/ctags"
28)
29
30// Make sure all names are lowercase here, since they are normalized
31var enryLanguageMappings = map[string]string{
32 "c#": "c_sharp",
33}
34
35func normalizeLanguage(filetype string) string {
36 normalized := strings.ToLower(filetype)
37 if mapped, ok := enryLanguageMappings[normalized]; ok {
38 normalized = mapped
39 }
40
41 return normalized
42}
43
44func parseSymbols(todo []*Document, languageMap ctags.LanguageMap, parserBins ctags.ParserBinMap) error {
45 monitor := newMonitor()
46 defer monitor.Stop()
47
48 var tagsToSections tagsToSections
49
50 parser := ctags.NewCTagsParser(parserBins)
51 defer parser.Close()
52
53 for _, doc := range todo {
54 if len(doc.Content) == 0 || doc.Symbols != nil {
55 continue
56 }
57
58 parserType := languageMap[normalizeLanguage(doc.Language)]
59 if parserType == ctags.NoCTags {
60 continue
61 }
62
63 // If the parser type is unknown, default to universal-ctags
64 if parserType == ctags.UnknownCTags {
65 parserType = ctags.UniversalCTags
66 }
67
68 monitor.BeginParsing(doc)
69 es, err := parser.Parse(doc.Name, doc.Content, parserType)
70 monitor.EndParsing(es)
71
72 if err != nil {
73 return err
74 }
75 if len(es) == 0 {
76 continue
77 }
78
79 symOffsets, symMetaData, err := tagsToSections.Convert(doc.Content, es)
80 if err != nil {
81 return fmt.Errorf("%s: %v", doc.Name, err)
82 }
83 doc.Symbols = symOffsets
84 doc.SymbolsMetaData = symMetaData
85 }
86
87 return nil
88}
89
90// overlaps finds the proper position to insert a zoekt.DocumentSection with
91// "start and "end" into "symOffsets". It returns -1 if the new section overlaps
92// with one of the existing ones.
93func overlaps(symOffsets []DocumentSection, start, end uint32) int {
94 i := 0
95 for i = len(symOffsets) - 1; i >= 0; i-- {
96 // The most common case is that we exit here, because symOffsets is sorted by
97 // construction and start is in many cases monotonically increasing.
98 if start >= symOffsets[i].End {
99 break
100 }
101 if end <= symOffsets[i].Start {
102 continue
103 }
104 // overlap
105 return -1
106 }
107 return i + 1
108}
109
110// tagsToSections contains buffers to be reused between conversions of bytes
111// ranges to metadata. This is done to reduce pressure on the garbage
112// collector.
113type tagsToSections struct {
114 nlsBuf []uint32
115}
116
117// Convert ctags entries to byte ranges (zoekt.DocumentSection) with
118// corresponding metadata (zoekt.Symbol).
119//
120// This can not be called concurrently.
121func (t *tagsToSections) Convert(content []byte, tags []*ctags.Entry) ([]DocumentSection, []*zoekt.Symbol, error) {
122 nls := t.newLinesIndices(content)
123 symOffsets := make([]DocumentSection, 0, len(tags))
124 symMetaData := make([]*zoekt.Symbol, 0, len(tags))
125
126 for _, t := range tags {
127 if t.Line <= 0 {
128 // Observed this with a .JS file.
129 continue
130 }
131 lineIdx := t.Line - 1
132 if lineIdx >= len(nls) {
133 // Observed this with a .TS file.
134 continue
135 }
136
137 lineOff := uint32(0)
138 if lineIdx > 0 {
139 lineOff = nls[lineIdx-1] + 1
140 }
141
142 end := nls[lineIdx]
143 line := content[lineOff:end]
144
145 // This is best-effort only. For short symbol names, we will often determine the
146 // wrong offset.
147 intraOff := bytes.Index(line, []byte(t.Name))
148 if intraOff < 0 {
149 // for Go code, this is very common, since
150 // ctags barfs on multi-line declarations
151 continue
152 }
153
154 start := lineOff + uint32(intraOff)
155 endSym := start + uint32(len(t.Name))
156
157 i := overlaps(symOffsets, start, endSym)
158 if i == -1 {
159 // Detected an overlap. Give up.
160 continue
161 }
162
163 symOffsets = slices.Insert(symOffsets, i, DocumentSection{
164 Start: start,
165 End: endSym,
166 })
167 symMetaData = slices.Insert(symMetaData, i, &zoekt.Symbol{
168 Sym: t.Name,
169 Kind: t.Kind,
170 Parent: t.Parent,
171 ParentKind: t.ParentKind,
172 })
173 }
174
175 return symOffsets, symMetaData, nil
176}
177
178// newLinesIndices returns an array of all indexes of '\n' aswell as a final
179// value for the length of the document.
180func (t *tagsToSections) newLinesIndices(in []byte) []uint32 {
181 // reuse nlsBuf between calls to tagsToSections.Convert
182 out := t.nlsBuf
183 if out == nil {
184 out = make([]uint32, 0, len(in)/30)
185 }
186
187 finalEntry := uint32(len(in))
188 off := uint32(0)
189 for len(in) > 0 {
190 i := bytes.IndexByte(in, '\n')
191 if i < 0 {
192 out = append(out, finalEntry)
193 break
194 }
195
196 off += uint32(i)
197 out = append(out, off)
198
199 in = in[i+1:]
200 off++
201 }
202
203 // save buffer for reuse
204 t.nlsBuf = out[:0]
205
206 return out
207}
208
209// monitorReportStuck is how long we need to be analysing a document before
210// reporting it to stdout.
211const monitorReportStuck = 10 * time.Second
212
213// monitorReportStatus is how often we given status updates
214const monitorReportStatus = time.Minute
215
216type monitor struct {
217 mu sync.Mutex
218
219 lastUpdate time.Time
220
221 start time.Time
222 totalSize int
223 totalSymbols int
224
225 currentDocName string
226 currentDocSize int
227 currentDocStuckCount int
228
229 done chan struct{}
230}
231
232func newMonitor() *monitor {
233 start := time.Now()
234 m := &monitor{
235 start: start,
236 lastUpdate: start,
237 done: make(chan struct{}),
238 }
239 go m.run()
240 return m
241}
242
243func (m *monitor) BeginParsing(doc *Document) {
244 now := time.Now()
245 m.mu.Lock()
246 m.lastUpdate = now
247
248 // set current doc
249 m.currentDocName = doc.Name
250 m.currentDocSize = len(doc.Content)
251
252 m.mu.Unlock()
253}
254
255func (m *monitor) EndParsing(entries []*ctags.Entry) {
256 now := time.Now()
257 m.mu.Lock()
258 m.lastUpdate = now
259
260 // update aggregate stats
261 m.totalSize += m.currentDocSize
262 m.totalSymbols += len(entries)
263
264 // inform done if we warned about current document
265 if m.currentDocStuckCount > 0 {
266 log.Printf("symbol analysis for %s (size %d bytes) is done and found %d symbols", m.currentDocName, m.currentDocSize, len(entries))
267 m.currentDocStuckCount = 0
268 }
269
270 // unset current document
271 m.currentDocName = ""
272 m.currentDocSize = 0
273
274 m.mu.Unlock()
275}
276
277func (m *monitor) Stop() {
278 close(m.done)
279}
280
281func (m *monitor) run() {
282 stuckTicker := time.NewTicker(monitorReportStuck / 2) // half due to sampling theorem (nyquist)
283 statusTicker := time.NewTicker(monitorReportStatus)
284
285 defer stuckTicker.Stop()
286 defer statusTicker.Stop()
287
288 for {
289 select {
290 case <-m.done:
291 now := time.Now()
292 m.mu.Lock()
293 log.Printf("symbol analysis finished for shard statistics: duration=%v symbols=%d bytes=%d", now.Sub(m.start).Truncate(time.Second), m.totalSymbols, m.totalSize)
294 m.mu.Unlock()
295 return
296
297 case <-stuckTicker.C:
298 now := time.Now()
299 m.mu.Lock()
300 running := now.Sub(m.lastUpdate).Truncate(time.Second)
301 report := monitorReportStuck * (1 << m.currentDocStuckCount) // double the amount of time each time we report
302 if m.currentDocName != "" && running >= report {
303 m.currentDocStuckCount++
304 log.Printf("WARN: symbol analysis for %s (%d bytes) has been running for %v", m.currentDocName, m.currentDocSize, running)
305 }
306 m.mu.Unlock()
307
308 case <-statusTicker.C:
309 now := time.Now()
310 m.mu.Lock()
311 log.Printf("DEBUG: symbol analysis still running for shard statistics: duration=%v symbols=%d bytes=%d", now.Sub(m.start).Truncate(time.Second), m.totalSymbols, m.totalSize)
312 m.mu.Unlock()
313 }
314 }
315}