build/ctags.go at b93ad775693a9b41e6e3e9dccc2349d333817da2 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / build / ctags.go
at b93ad775693a9b41e6e3e9dccc2349d333817da2 7.9 kB View raw
Julie Tibshirani ctags: require scip-ctags to be present (#700) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package build
 16
 17import (
 18	"bytes"
 19	"fmt"
 20	"log"
 21	"strings"
 22	"sync"
 23	"time"
 24
 25	"golang.org/x/exp/slices"
 26
 27	"github.com/sourcegraph/zoekt"
 28	"github.com/sourcegraph/zoekt/ctags"
 29)
 30
 31// Make sure all names are lowercase here, since they are normalized
 32var enryLanguageMappings = map[string]string{
 33	"c#": "c_sharp",
 34}
 35
 36func normalizeLanguage(filetype string) string {
 37	normalized := strings.ToLower(filetype)
 38	if mapped, ok := enryLanguageMappings[normalized]; ok {
 39		normalized = mapped
 40	}
 41
 42	return normalized
 43}
 44
 45func ctagsAddSymbolsParserMap(todo []*zoekt.Document, languageMap ctags.LanguageMap, parserMap ctags.ParserMap) error {
 46	monitor := newMonitor()
 47	defer monitor.Stop()
 48
 49	var tagsToSections tagsToSections
 50
 51	for _, doc := range todo {
 52		if len(doc.Content) == 0 || doc.Symbols != nil {
 53			continue
 54		}
 55
 56		zoekt.DetermineLanguageIfUnknown(doc)
 57
 58		parserKind := languageMap[normalizeLanguage(doc.Language)]
 59		if parserKind == ctags.NoCTags {
 60			continue
 61		}
 62
 63		// If the parser kind is unknown, default to universal-ctags
 64		if parserKind == ctags.UnknownCTags {
 65			parserKind = ctags.UniversalCTags
 66		}
 67
 68		parser := parserMap[parserKind]
 69		if parser == nil {
 70			// this happens if CTagsMustSucceed is false and we didn't find the binary
 71			continue
 72		}
 73
 74		monitor.BeginParsing(doc)
 75		es, err := parser.Parse(doc.Name, doc.Content)
 76		monitor.EndParsing(es)
 77
 78		if err != nil {
 79			return err
 80		}
 81		if len(es) == 0 {
 82			continue
 83		}
 84
 85		symOffsets, symMetaData, err := tagsToSections.Convert(doc.Content, es)
 86		if err != nil {
 87			return fmt.Errorf("%s: %v", doc.Name, err)
 88		}
 89		doc.Symbols = symOffsets
 90		doc.SymbolsMetaData = symMetaData
 91	}
 92
 93	return nil
 94}
 95
 96// overlaps finds the proper position to insert a zoekt.DocumentSection with
 97// "start and "end" into "symOffsets". It returns -1 if the new section overlaps
 98// with one of the existing ones.
 99func overlaps(symOffsets []zoekt.DocumentSection, start, end uint32) int {
100	i := 0
101	for i = len(symOffsets) - 1; i >= 0; i-- {
102		// The most common case is that we exit here, because symOffsets is sorted by
103		// construction and start is in many cases monotonically increasing.
104		if start >= symOffsets[i].End {
105			break
106		}
107		if end <= symOffsets[i].Start {
108			continue
109		}
110		// overlap
111		return -1
112	}
113	return i + 1
114}
115
116// tagsToSections contains buffers to be reused between conversions of bytes
117// ranges to metadata. This is done to reduce pressure on the garbage
118// collector.
119type tagsToSections struct {
120	nlsBuf []uint32
121}
122
123// Convert ctags entries to byte ranges (zoekt.DocumentSection) with
124// corresponding metadata (zoekt.Symbol).
125//
126// This can not be called concurrently.
127func (t *tagsToSections) Convert(content []byte, tags []*ctags.Entry) ([]zoekt.DocumentSection, []*zoekt.Symbol, error) {
128	nls := t.newLinesIndices(content)
129	symOffsets := make([]zoekt.DocumentSection, 0, len(tags))
130	symMetaData := make([]*zoekt.Symbol, 0, len(tags))
131
132	for _, t := range tags {
133		if t.Line <= 0 {
134			// Observed this with a .JS file.
135			continue
136		}
137		lineIdx := t.Line - 1
138		if lineIdx >= len(nls) {
139			// Observed this with a .TS file.
140			continue
141		}
142
143		lineOff := uint32(0)
144		if lineIdx > 0 {
145			lineOff = nls[lineIdx-1] + 1
146		}
147
148		end := nls[lineIdx]
149		line := content[lineOff:end]
150
151		// This is best-effort only. For short symbol names, we will often determine the
152		// wrong offset.
153		intraOff := bytes.Index(line, []byte(t.Name))
154		if intraOff < 0 {
155			// for Go code, this is very common, since
156			// ctags barfs on multi-line declarations
157			continue
158		}
159
160		start := lineOff + uint32(intraOff)
161		endSym := start + uint32(len(t.Name))
162
163		i := overlaps(symOffsets, start, endSym)
164		if i == -1 {
165			// Detected an overlap. Give up.
166			continue
167		}
168
169		symOffsets = slices.Insert(symOffsets, i, zoekt.DocumentSection{
170			Start: start,
171			End:   endSym,
172		})
173		symMetaData = slices.Insert(symMetaData, i, &zoekt.Symbol{
174			Sym:        t.Name,
175			Kind:       t.Kind,
176			Parent:     t.Parent,
177			ParentKind: t.ParentKind,
178		})
179	}
180
181	return symOffsets, symMetaData, nil
182}
183
184// newLinesIndices returns an array of all indexes of '\n' aswell as a final
185// value for the length of the document.
186func (t *tagsToSections) newLinesIndices(in []byte) []uint32 {
187	// reuse nlsBuf between calls to tagsToSections.Convert
188	out := t.nlsBuf
189	if out == nil {
190		out = make([]uint32, 0, len(in)/30)
191	}
192
193	finalEntry := uint32(len(in))
194	off := uint32(0)
195	for len(in) > 0 {
196		i := bytes.IndexByte(in, '\n')
197		if i < 0 {
198			out = append(out, finalEntry)
199			break
200		}
201
202		off += uint32(i)
203		out = append(out, off)
204
205		in = in[i+1:]
206		off++
207	}
208
209	// save buffer for reuse
210	t.nlsBuf = out[:0]
211
212	return out
213}
214
215// monitorReportStuck is how long we need to be analysing a document before
216// reporting it to stdout.
217const monitorReportStuck = 10 * time.Second
218
219// monitorReportStatus is how often we given status updates
220const monitorReportStatus = time.Minute
221
222type monitor struct {
223	mu sync.Mutex
224
225	lastUpdate time.Time
226
227	start        time.Time
228	totalSize    int
229	totalSymbols int
230
231	currentDocName       string
232	currentDocSize       int
233	currentDocStuckCount int
234
235	done chan struct{}
236}
237
238func newMonitor() *monitor {
239	start := time.Now()
240	m := &monitor{
241		start:      start,
242		lastUpdate: start,
243		done:       make(chan struct{}),
244	}
245	go m.run()
246	return m
247}
248
249func (m *monitor) BeginParsing(doc *zoekt.Document) {
250	now := time.Now()
251	m.mu.Lock()
252	m.lastUpdate = now
253
254	// set current doc
255	m.currentDocName = doc.Name
256	m.currentDocSize = len(doc.Content)
257
258	m.mu.Unlock()
259}
260
261func (m *monitor) EndParsing(entries []*ctags.Entry) {
262	now := time.Now()
263	m.mu.Lock()
264	m.lastUpdate = now
265
266	// update aggregate stats
267	m.totalSize += m.currentDocSize
268	m.totalSymbols += len(entries)
269
270	// inform done if we warned about current document
271	if m.currentDocStuckCount > 0 {
272		log.Printf("symbol analysis for %s (size %d bytes) is done and found %d symbols", m.currentDocName, m.currentDocSize, len(entries))
273		m.currentDocStuckCount = 0
274	}
275
276	// unset current document
277	m.currentDocName = ""
278	m.currentDocSize = 0
279
280	m.mu.Unlock()
281}
282
283func (m *monitor) Stop() {
284	close(m.done)
285}
286
287func (m *monitor) run() {
288	stuckTicker := time.NewTicker(monitorReportStuck / 2) // half due to sampling theorem (nyquist)
289	statusTicker := time.NewTicker(monitorReportStatus)
290
291	defer stuckTicker.Stop()
292	defer statusTicker.Stop()
293
294	for {
295		select {
296		case <-m.done:
297			now := time.Now()
298			m.mu.Lock()
299			log.Printf("symbol analysis finished for shard statistics: duration=%v symbols=%d bytes=%d", now.Sub(m.start).Truncate(time.Second), m.totalSymbols, m.totalSize)
300			m.mu.Unlock()
301			return
302
303		case <-stuckTicker.C:
304			now := time.Now()
305			m.mu.Lock()
306			running := now.Sub(m.lastUpdate).Truncate(time.Second)
307			report := monitorReportStuck * (1 << m.currentDocStuckCount) // double the amount of time each time we report
308			if m.currentDocName != "" && running >= report {
309				m.currentDocStuckCount++
310				log.Printf("WARN: symbol analysis for %s (%d bytes) has been running for %v", m.currentDocName, m.currentDocSize, running)
311			}
312			m.mu.Unlock()
313
314		case <-statusTicker.C:
315			now := time.Now()
316			m.mu.Lock()
317			log.Printf("DEBUG: symbol analysis still running for shard statistics: duration=%v symbols=%d bytes=%d", now.Sub(m.start).Truncate(time.Second), m.totalSymbols, m.totalSize)
318			m.mu.Unlock()
319		}
320	}
321}
Configure Feed

Configure Feed