fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 7.7 kB View raw
1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "slices" 22 "strings" 23 "sync" 24 "time" 25 26 "github.com/sourcegraph/zoekt" 27 "github.com/sourcegraph/zoekt/internal/ctags" 28) 29 30// Make sure all names are lowercase here, since they are normalized 31var enryLanguageMappings = map[string]string{ 32 "c#": "c_sharp", 33} 34 35func normalizeLanguage(filetype string) string { 36 normalized := strings.ToLower(filetype) 37 if mapped, ok := enryLanguageMappings[normalized]; ok { 38 normalized = mapped 39 } 40 41 return normalized 42} 43 44func parseSymbols(todo []*Document, languageMap ctags.LanguageMap, parserBins ctags.ParserBinMap) error { 45 monitor := newMonitor() 46 defer monitor.Stop() 47 48 var tagsToSections tagsToSections 49 50 parser := ctags.NewCTagsParser(parserBins) 51 defer parser.Close() 52 53 for _, doc := range todo { 54 if len(doc.Content) == 0 || doc.Symbols != nil { 55 continue 56 } 57 58 parserType := languageMap[normalizeLanguage(doc.Language)] 59 if parserType == ctags.NoCTags { 60 continue 61 } 62 63 // If the parser type is unknown, default to universal-ctags 64 if parserType == ctags.UnknownCTags { 65 parserType = ctags.UniversalCTags 66 } 67 68 monitor.BeginParsing(doc) 69 es, err := parser.Parse(doc.Name, doc.Content, parserType) 70 monitor.EndParsing(es) 71 72 if err != nil { 73 return err 74 } 75 if len(es) == 0 { 76 continue 77 } 78 79 symOffsets, symMetaData, err := tagsToSections.Convert(doc.Content, es) 80 if err != nil { 81 return fmt.Errorf("%s: %v", doc.Name, err) 82 } 83 doc.Symbols = symOffsets 84 doc.SymbolsMetaData = symMetaData 85 } 86 87 return nil 88} 89 90// overlaps finds the proper position to insert a zoekt.DocumentSection with 91// "start and "end" into "symOffsets". It returns -1 if the new section overlaps 92// with one of the existing ones. 93func overlaps(symOffsets []DocumentSection, start, end uint32) int { 94 i := 0 95 for i = len(symOffsets) - 1; i >= 0; i-- { 96 // The most common case is that we exit here, because symOffsets is sorted by 97 // construction and start is in many cases monotonically increasing. 98 if start >= symOffsets[i].End { 99 break 100 } 101 if end <= symOffsets[i].Start { 102 continue 103 } 104 // overlap 105 return -1 106 } 107 return i + 1 108} 109 110// tagsToSections contains buffers to be reused between conversions of bytes 111// ranges to metadata. This is done to reduce pressure on the garbage 112// collector. 113type tagsToSections struct { 114 nlsBuf []uint32 115} 116 117// Convert ctags entries to byte ranges (zoekt.DocumentSection) with 118// corresponding metadata (zoekt.Symbol). 119// 120// This can not be called concurrently. 121func (t *tagsToSections) Convert(content []byte, tags []*ctags.Entry) ([]DocumentSection, []*zoekt.Symbol, error) { 122 nls := t.newLinesIndices(content) 123 symOffsets := make([]DocumentSection, 0, len(tags)) 124 symMetaData := make([]*zoekt.Symbol, 0, len(tags)) 125 126 for _, t := range tags { 127 if t.Line <= 0 { 128 // Observed this with a .JS file. 129 continue 130 } 131 lineIdx := t.Line - 1 132 if lineIdx >= len(nls) { 133 // Observed this with a .TS file. 134 continue 135 } 136 137 lineOff := uint32(0) 138 if lineIdx > 0 { 139 lineOff = nls[lineIdx-1] + 1 140 } 141 142 end := nls[lineIdx] 143 line := content[lineOff:end] 144 145 // This is best-effort only. For short symbol names, we will often determine the 146 // wrong offset. 147 intraOff := bytes.Index(line, []byte(t.Name)) 148 if intraOff < 0 { 149 // for Go code, this is very common, since 150 // ctags barfs on multi-line declarations 151 continue 152 } 153 154 start := lineOff + uint32(intraOff) 155 endSym := start + uint32(len(t.Name)) 156 157 i := overlaps(symOffsets, start, endSym) 158 if i == -1 { 159 // Detected an overlap. Give up. 160 continue 161 } 162 163 symOffsets = slices.Insert(symOffsets, i, DocumentSection{ 164 Start: start, 165 End: endSym, 166 }) 167 symMetaData = slices.Insert(symMetaData, i, &zoekt.Symbol{ 168 Sym: t.Name, 169 Kind: t.Kind, 170 Parent: t.Parent, 171 ParentKind: t.ParentKind, 172 }) 173 } 174 175 return symOffsets, symMetaData, nil 176} 177 178// newLinesIndices returns an array of all indexes of '\n' aswell as a final 179// value for the length of the document. 180func (t *tagsToSections) newLinesIndices(in []byte) []uint32 { 181 // reuse nlsBuf between calls to tagsToSections.Convert 182 out := t.nlsBuf 183 if out == nil { 184 out = make([]uint32, 0, len(in)/30) 185 } 186 187 finalEntry := uint32(len(in)) 188 off := uint32(0) 189 for len(in) > 0 { 190 i := bytes.IndexByte(in, '\n') 191 if i < 0 { 192 out = append(out, finalEntry) 193 break 194 } 195 196 off += uint32(i) 197 out = append(out, off) 198 199 in = in[i+1:] 200 off++ 201 } 202 203 // save buffer for reuse 204 t.nlsBuf = out[:0] 205 206 return out 207} 208 209// monitorReportStuck is how long we need to be analysing a document before 210// reporting it to stdout. 211const monitorReportStuck = 10 * time.Second 212 213// monitorReportStatus is how often we given status updates 214const monitorReportStatus = time.Minute 215 216type monitor struct { 217 mu sync.Mutex 218 219 lastUpdate time.Time 220 221 start time.Time 222 totalSize int 223 totalSymbols int 224 225 currentDocName string 226 currentDocSize int 227 currentDocStuckCount int 228 229 done chan struct{} 230} 231 232func newMonitor() *monitor { 233 start := time.Now() 234 m := &monitor{ 235 start: start, 236 lastUpdate: start, 237 done: make(chan struct{}), 238 } 239 go m.run() 240 return m 241} 242 243func (m *monitor) BeginParsing(doc *Document) { 244 now := time.Now() 245 m.mu.Lock() 246 m.lastUpdate = now 247 248 // set current doc 249 m.currentDocName = doc.Name 250 m.currentDocSize = len(doc.Content) 251 252 m.mu.Unlock() 253} 254 255func (m *monitor) EndParsing(entries []*ctags.Entry) { 256 now := time.Now() 257 m.mu.Lock() 258 m.lastUpdate = now 259 260 // update aggregate stats 261 m.totalSize += m.currentDocSize 262 m.totalSymbols += len(entries) 263 264 // inform done if we warned about current document 265 if m.currentDocStuckCount > 0 { 266 log.Printf("symbol analysis for %s (size %d bytes) is done and found %d symbols", m.currentDocName, m.currentDocSize, len(entries)) 267 m.currentDocStuckCount = 0 268 } 269 270 // unset current document 271 m.currentDocName = "" 272 m.currentDocSize = 0 273 274 m.mu.Unlock() 275} 276 277func (m *monitor) Stop() { 278 close(m.done) 279} 280 281func (m *monitor) run() { 282 stuckTicker := time.NewTicker(monitorReportStuck / 2) // half due to sampling theorem (nyquist) 283 statusTicker := time.NewTicker(monitorReportStatus) 284 285 defer stuckTicker.Stop() 286 defer statusTicker.Stop() 287 288 for { 289 select { 290 case <-m.done: 291 now := time.Now() 292 m.mu.Lock() 293 log.Printf("symbol analysis finished for shard statistics: duration=%v symbols=%d bytes=%d", now.Sub(m.start).Truncate(time.Second), m.totalSymbols, m.totalSize) 294 m.mu.Unlock() 295 return 296 297 case <-stuckTicker.C: 298 now := time.Now() 299 m.mu.Lock() 300 running := now.Sub(m.lastUpdate).Truncate(time.Second) 301 report := monitorReportStuck * (1 << m.currentDocStuckCount) // double the amount of time each time we report 302 if m.currentDocName != "" && running >= report { 303 m.currentDocStuckCount++ 304 log.Printf("WARN: symbol analysis for %s (%d bytes) has been running for %v", m.currentDocName, m.currentDocSize, running) 305 } 306 m.mu.Unlock() 307 308 case <-statusTicker.C: 309 now := time.Now() 310 m.mu.Lock() 311 log.Printf("DEBUG: symbol analysis still running for shard statistics: duration=%v symbols=%d bytes=%d", now.Sub(m.start).Truncate(time.Second), m.totalSymbols, m.totalSize) 312 m.mu.Unlock() 313 } 314 } 315}