fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package build 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "slices" 22 "strings" 23 "sync" 24 "time" 25 26 "github.com/sourcegraph/zoekt" 27 "github.com/sourcegraph/zoekt/ctags" 28) 29 30// Make sure all names are lowercase here, since they are normalized 31var enryLanguageMappings = map[string]string{ 32 "c#": "c_sharp", 33} 34 35func normalizeLanguage(filetype string) string { 36 normalized := strings.ToLower(filetype) 37 if mapped, ok := enryLanguageMappings[normalized]; ok { 38 normalized = mapped 39 } 40 41 return normalized 42} 43 44func parseSymbols(todo []*zoekt.Document, languageMap ctags.LanguageMap, parserBins ctags.ParserBinMap) error { 45 monitor := newMonitor() 46 defer monitor.Stop() 47 48 var tagsToSections tagsToSections 49 50 parser := ctags.NewCTagsParser(parserBins) 51 defer parser.Close() 52 53 for _, doc := range todo { 54 if len(doc.Content) == 0 || doc.Symbols != nil { 55 continue 56 } 57 58 zoekt.DetermineLanguageIfUnknown(doc) 59 60 parserType := languageMap[normalizeLanguage(doc.Language)] 61 if parserType == ctags.NoCTags { 62 continue 63 } 64 65 // If the parser type is unknown, default to universal-ctags 66 if parserType == ctags.UnknownCTags { 67 parserType = ctags.UniversalCTags 68 } 69 70 monitor.BeginParsing(doc) 71 es, err := parser.Parse(doc.Name, doc.Content, parserType) 72 monitor.EndParsing(es) 73 74 if err != nil { 75 return err 76 } 77 if len(es) == 0 { 78 continue 79 } 80 81 symOffsets, symMetaData, err := tagsToSections.Convert(doc.Content, es) 82 if err != nil { 83 return fmt.Errorf("%s: %v", doc.Name, err) 84 } 85 doc.Symbols = symOffsets 86 doc.SymbolsMetaData = symMetaData 87 } 88 89 return nil 90} 91 92// overlaps finds the proper position to insert a zoekt.DocumentSection with 93// "start and "end" into "symOffsets". It returns -1 if the new section overlaps 94// with one of the existing ones. 95func overlaps(symOffsets []zoekt.DocumentSection, start, end uint32) int { 96 i := 0 97 for i = len(symOffsets) - 1; i >= 0; i-- { 98 // The most common case is that we exit here, because symOffsets is sorted by 99 // construction and start is in many cases monotonically increasing. 100 if start >= symOffsets[i].End { 101 break 102 } 103 if end <= symOffsets[i].Start { 104 continue 105 } 106 // overlap 107 return -1 108 } 109 return i + 1 110} 111 112// tagsToSections contains buffers to be reused between conversions of bytes 113// ranges to metadata. This is done to reduce pressure on the garbage 114// collector. 115type tagsToSections struct { 116 nlsBuf []uint32 117} 118 119// Convert ctags entries to byte ranges (zoekt.DocumentSection) with 120// corresponding metadata (zoekt.Symbol). 121// 122// This can not be called concurrently. 123func (t *tagsToSections) Convert(content []byte, tags []*ctags.Entry) ([]zoekt.DocumentSection, []*zoekt.Symbol, error) { 124 nls := t.newLinesIndices(content) 125 symOffsets := make([]zoekt.DocumentSection, 0, len(tags)) 126 symMetaData := make([]*zoekt.Symbol, 0, len(tags)) 127 128 for _, t := range tags { 129 if t.Line <= 0 { 130 // Observed this with a .JS file. 131 continue 132 } 133 lineIdx := t.Line - 1 134 if lineIdx >= len(nls) { 135 // Observed this with a .TS file. 136 continue 137 } 138 139 lineOff := uint32(0) 140 if lineIdx > 0 { 141 lineOff = nls[lineIdx-1] + 1 142 } 143 144 end := nls[lineIdx] 145 line := content[lineOff:end] 146 147 // This is best-effort only. For short symbol names, we will often determine the 148 // wrong offset. 149 intraOff := bytes.Index(line, []byte(t.Name)) 150 if intraOff < 0 { 151 // for Go code, this is very common, since 152 // ctags barfs on multi-line declarations 153 continue 154 } 155 156 start := lineOff + uint32(intraOff) 157 endSym := start + uint32(len(t.Name)) 158 159 i := overlaps(symOffsets, start, endSym) 160 if i == -1 { 161 // Detected an overlap. Give up. 162 continue 163 } 164 165 symOffsets = slices.Insert(symOffsets, i, zoekt.DocumentSection{ 166 Start: start, 167 End: endSym, 168 }) 169 symMetaData = slices.Insert(symMetaData, i, &zoekt.Symbol{ 170 Sym: t.Name, 171 Kind: t.Kind, 172 Parent: t.Parent, 173 ParentKind: t.ParentKind, 174 }) 175 } 176 177 return symOffsets, symMetaData, nil 178} 179 180// newLinesIndices returns an array of all indexes of '\n' aswell as a final 181// value for the length of the document. 182func (t *tagsToSections) newLinesIndices(in []byte) []uint32 { 183 // reuse nlsBuf between calls to tagsToSections.Convert 184 out := t.nlsBuf 185 if out == nil { 186 out = make([]uint32, 0, len(in)/30) 187 } 188 189 finalEntry := uint32(len(in)) 190 off := uint32(0) 191 for len(in) > 0 { 192 i := bytes.IndexByte(in, '\n') 193 if i < 0 { 194 out = append(out, finalEntry) 195 break 196 } 197 198 off += uint32(i) 199 out = append(out, off) 200 201 in = in[i+1:] 202 off++ 203 } 204 205 // save buffer for reuse 206 t.nlsBuf = out[:0] 207 208 return out 209} 210 211// monitorReportStuck is how long we need to be analysing a document before 212// reporting it to stdout. 213const monitorReportStuck = 10 * time.Second 214 215// monitorReportStatus is how often we given status updates 216const monitorReportStatus = time.Minute 217 218type monitor struct { 219 mu sync.Mutex 220 221 lastUpdate time.Time 222 223 start time.Time 224 totalSize int 225 totalSymbols int 226 227 currentDocName string 228 currentDocSize int 229 currentDocStuckCount int 230 231 done chan struct{} 232} 233 234func newMonitor() *monitor { 235 start := time.Now() 236 m := &monitor{ 237 start: start, 238 lastUpdate: start, 239 done: make(chan struct{}), 240 } 241 go m.run() 242 return m 243} 244 245func (m *monitor) BeginParsing(doc *zoekt.Document) { 246 now := time.Now() 247 m.mu.Lock() 248 m.lastUpdate = now 249 250 // set current doc 251 m.currentDocName = doc.Name 252 m.currentDocSize = len(doc.Content) 253 254 m.mu.Unlock() 255} 256 257func (m *monitor) EndParsing(entries []*ctags.Entry) { 258 now := time.Now() 259 m.mu.Lock() 260 m.lastUpdate = now 261 262 // update aggregate stats 263 m.totalSize += m.currentDocSize 264 m.totalSymbols += len(entries) 265 266 // inform done if we warned about current document 267 if m.currentDocStuckCount > 0 { 268 log.Printf("symbol analysis for %s (size %d bytes) is done and found %d symbols", m.currentDocName, m.currentDocSize, len(entries)) 269 m.currentDocStuckCount = 0 270 } 271 272 // unset current document 273 m.currentDocName = "" 274 m.currentDocSize = 0 275 276 m.mu.Unlock() 277} 278 279func (m *monitor) Stop() { 280 close(m.done) 281} 282 283func (m *monitor) run() { 284 stuckTicker := time.NewTicker(monitorReportStuck / 2) // half due to sampling theorem (nyquist) 285 statusTicker := time.NewTicker(monitorReportStatus) 286 287 defer stuckTicker.Stop() 288 defer statusTicker.Stop() 289 290 for { 291 select { 292 case <-m.done: 293 now := time.Now() 294 m.mu.Lock() 295 log.Printf("symbol analysis finished for shard statistics: duration=%v symbols=%d bytes=%d", now.Sub(m.start).Truncate(time.Second), m.totalSymbols, m.totalSize) 296 m.mu.Unlock() 297 return 298 299 case <-stuckTicker.C: 300 now := time.Now() 301 m.mu.Lock() 302 running := now.Sub(m.lastUpdate).Truncate(time.Second) 303 report := monitorReportStuck * (1 << m.currentDocStuckCount) // double the amount of time each time we report 304 if m.currentDocName != "" && running >= report { 305 m.currentDocStuckCount++ 306 log.Printf("WARN: symbol analysis for %s (%d bytes) has been running for %v", m.currentDocName, m.currentDocSize, running) 307 } 308 m.mu.Unlock() 309 310 case <-statusTicker.C: 311 now := time.Now() 312 m.mu.Lock() 313 log.Printf("DEBUG: symbol analysis still running for shard statistics: duration=%v symbols=%d bytes=%d", now.Sub(m.start).Truncate(time.Second), m.totalSymbols, m.totalSize) 314 m.mu.Unlock() 315 } 316 } 317}