fork of https://github.com/sourcegraph/zoekt
1package languages
2
3import (
4 "path/filepath"
5 "slices"
6 "strings"
7
8 "github.com/go-enry/go-enry/v2"
9 enrydata "github.com/go-enry/go-enry/v2/data"
10)
11
12// GetLanguageByNameOrAlias returns the standardized name for
13// a language based on its name (in which case this is an identity operation)
14// or based on its alias, which is potentially an alternate name for
15// the language.
16//
17// Aliases are fully lowercase, and map N-1 to languages.
18//
19// For example,
20//
21// GetLanguageByNameOrAlias("ada") == "Ada", true
22// GetLanguageByNameOrAlias("ada95") == "Ada", true
23//
24// Historical note: This function was added for replacing usages of
25// enry.GetLanguageByAlias, which, unlike the name suggests, also
26// handles non-normalized names such as those with spaces.
27func GetLanguageByNameOrAlias(nameOrAlias string) (lang string, ok bool) {
28 alias := convertToAliasKey(nameOrAlias)
29 if lang, ok = unsupportedByEnryAliasMap[alias]; ok {
30 return lang, true
31 }
32
33 return enry.GetLanguageByAlias(alias)
34}
35
36// GetLanguageExtensions returns the list of file extensions for a given
37// language. Returned extensions are always prefixed with a '.'.
38//
39// The returned slice will be empty iff the language is not known.
40//
41// Handles more languages than enry.GetLanguageExtensions.
42//
43// Mutually consistent with getLanguagesByExtension, see the tests
44// for the exact invariants.
45func GetLanguageExtensions(language string) []string {
46 if langs, ok := unsupportedByEnryNameToExtensionMap[language]; ok {
47 return langs
48 }
49
50 ignoreExts, isNiche := nicheExtensionUsages[language]
51 // Force a copy to avoid accidentally modifying the global variable
52 exts := slices.Clone(enry.GetLanguageExtensions(language))
53 for ext, lang := range sgExtraLangsForExts { // Map is tiny, so linear lookup is fine
54 if language == lang {
55 exts = append(exts, ext)
56 }
57 }
58 if !isNiche {
59 return exts
60 }
61 return slices.DeleteFunc(exts, func(ext string) bool {
62 _, shouldIgnore := ignoreExts[ext]
63 return shouldIgnore
64 })
65}
66
67// getLanguagesByExtension is a replacement for enry.GetLanguagesByExtension
68// to work around the following limitations:
69// - For some extensions which are overwhelmingly used by a certain file type
70// in practice, such as '.ts', '.md' and '.yaml', it returns ambiguous results.
71// - It does not provide any information about binary files.
72// - Some languages are not supported by enry yet (e.g. Magik)
73func getLanguagesByExtension(path string) (candidates []string, isLikelyBinaryFile bool) {
74 // Lowercase extension before lookups to match enry's behavior.
75 ext := strings.ToLower(filepath.Ext(path))
76 if ext == "" {
77 return nil, false
78 }
79 if lang, ok := unsupportedByEnryExtensionToNameMap[ext]; ok {
80 return []string{lang}, false
81 }
82 if _, ok := commonBinaryFileExtensions[ext[1:]]; ok {
83 return nil, true
84 }
85 if lang, ok := overrideAmbiguousExtensionsMap[ext]; ok {
86 return []string{lang}, false
87 }
88 candidates = enry.GetLanguagesByExtension(path, nil, nil)
89 if extra, ok := sgExtraLangsForExts[ext]; ok {
90 candidates = append(candidates, extra)
91 }
92 return candidates, false
93}
94
95var commonBinaryFileExtensions = func() map[string]struct{} {
96 m := map[string]struct{}{}
97 for _, s := range commonBinaryFileExtensionsList {
98 m[s] = struct{}{}
99 }
100 return m
101}()
102
// sgExtraLangsForExts maps an extension (lowercase, with leading '.') to an
// additional language candidate for it. getLanguagesByExtension appends the
// language to the candidates reported by enry, and GetLanguageExtensions
// appends the extension to the extensions reported by enry.
var sgExtraLangsForExts = map[string]string{
	".c": "C++",
	// NOTE: Downstream code does linear lookups on this map, so
	// be careful if you're adding lots of entries here.
}
108
// sgExtraContentHeuristics maps an extension to the enry content heuristics
// to use for it. The '.c' entry reuses the heuristics that enry defines
// for '.h'.
//
// NOTE(review): presumably consulted when disambiguating the extra candidates
// from sgExtraLangsForExts; the consumer is not in this file — confirm.
var sgExtraContentHeuristics = map[string]*enrydata.Heuristics{
	".c": enrydata.ContentHeuristics[".h"],
}
112
// overrideAmbiguousExtensionsMap pins a single language for extensions that
// enry considers ambiguous but that are unambiguous for Sourcegraph.
//
// Entries are listed alphabetically by extension.
var overrideAmbiguousExtensionsMap = map[string]string{
	// '.cs' is also (uncommonly) used for Smalltalk; that usage is ignored.
	".cs": "C#",
	// Filterscript, Forth and GLSL also use '.fs'. Forth and GLSL commonly
	// use other extensions, and Filterscript is niche, so it is ignored.
	".fs": "F#",
	// '.html' is also (uncommonly) used for Ecmarkup; that usage is ignored.
	".html": "HTML",
	// Other JSON variants, such as OASv2-json and OASv3-json, are ignored.
	".json": "JSON",
	// "GCC Machine Description" is not considered.
	".md": "Markdown",
	// The PR adding Pkl support also listed another language called Pickle in
	// its heuristics, but doesn't have any real support for it. Just ignore
	// it.
	// https://github.com/github-linguist/linguist/pull/6730/files#diff-c2d2d7946540ab501a5ef7a7f54a57c530d8da599e41c2beb0fd2f5635d2fd50R539
	".pkl": "Pkl",
	// RenderScript is the other main language using '.rs', but it is deprecated.
	// See https://developer.android.com/guide/topics/renderscript/compute
	".rs": "Rust",
	// In i18n contexts, there are XML files with '.ts' and '.tsx' extensions,
	// but those are ignored for now to avoid penalizing the common case.
	".ts":  "TypeScript",
	".tsx": "TSX",
	// "Adblock Filter List" and "Vim Help File" are ignored.
	".txt": "Text",
	// Other YAML variants, such as MiniYAML, OASv2-yaml and OASv3-yaml, are ignored.
	".yaml": "YAML",
	".yml":  "YAML",
}
146
// unsupportedByEnryExtensionToNameMap holds extension->name mappings
// for languages that go-enry does not track. Extensions are lowercase
// and carry a leading '.'.
var unsupportedByEnryExtensionToNameMap = map[string]string{
	// Apex programming language.
	// See https://developer.salesforce.com/docs/atlas.en-us.apexcode.meta/apexcode/apex_dev_guide.htm
	".apex":    "Apex",
	".apxc":    "Apex",
	".apxt":    "Apex",
	".trigger": "Apex",

	// Magik programming language.
	".magik": "Magik",
}
158
159// nicheExtensionUsage keeps track of which (lang, extension) mappings
160// should not be considered.
161//
162// We cannot wholesale ignore these languages, as this list includes
163// languages like XML, but it can contain unusual extensions like '.tsx'
164// which we generally want to classify as TypeScript.
165var nicheExtensionUsages = func() map[string]map[string]struct{} {
166 niche := map[string]map[string]struct{}{}
167 considered := map[string]struct{}{}
168 for _, lang := range overrideAmbiguousExtensionsMap {
169 considered[lang] = struct{}{}
170 }
171 for ext := range overrideAmbiguousExtensionsMap {
172 langs := enry.GetLanguagesByExtension("foo"+ext, nil, nil)
173 for _, lang := range langs {
174 if _, found := considered[lang]; !found {
175 if m, hasMap := niche[lang]; hasMap {
176 m[ext] = struct{}{}
177 } else {
178 niche[lang] = map[string]struct{}{ext: {}}
179 }
180 }
181 }
182 }
183 for specialOverrideExt, lang := range unsupportedByEnryExtensionToNameMap {
184 considered[lang] = struct{}{}
185 langs := enry.GetLanguagesByExtension("foo"+specialOverrideExt, nil, nil)
186 for _, lang := range langs {
187 if _, found := considered[lang]; !found {
188 if m, hasMap := niche[lang]; hasMap {
189 m[specialOverrideExt] = struct{}{}
190 } else {
191 niche[lang] = map[string]struct{}{specialOverrideExt: {}}
192 }
193 }
194 }
195 }
196 return niche
197}()
198
// unsupportedByEnryNameToExtensionMap contains language->extension mappings
// for languages not tracked by go-enry.
//
// It is the inverse of unsupportedByEnryExtensionToNameMap, built once at
// package initialization: each language name maps to all of the extensions
// that point at it.
var unsupportedByEnryNameToExtensionMap = reverseMap(unsupportedByEnryExtensionToNameMap)
202
203// unsupportedByEnryAliasMap maps alias -> language name for languages
204// not tracked by go-enry.
205var unsupportedByEnryAliasMap = func() map[string]string {
206 out := map[string]string{}
207 for _, lang := range unsupportedByEnryExtensionToNameMap {
208 out[convertToAliasKey(lang)] = lang
209 }
210 return out
211}()
212
// reverseMap inverts m: every value of m becomes a key in the result, mapped
// to the list of all keys of m that pointed at that value.
//
// Each list is sorted so that the output is deterministic regardless of Go's
// randomized map iteration order. An empty or nil m yields an empty, non-nil
// map.
func reverseMap(m map[string]string) map[string][]string {
	n := make(map[string][]string, len(m))
	for k, v := range m {
		n[v] = append(n[v], k)
	}
	for _, keys := range n {
		// Sorts in place; the slice header stored in n shares this backing array.
		slices.Sort(keys)
	}
	return n
}
220
// commonBinaryFileExtensionsList lists file extensions that commonly belong
// to binary files. Entries are kept verbatim from the upstream list,
// including their original case (e.g. "DS_Store"), and without a leading '.'.
//
// Source: https://github.com/sindresorhus/binary-extensions/blob/main/binary-extensions.json
// License: https://github.com/sindresorhus/binary-extensions/blob/main/license
// Replace the contents with
// curl -L https://raw.githubusercontent.com/sindresorhus/binary-extensions/main/binary-extensions.json | jq '.[]' | awk '{print $1 ","}'
//
// Not adding a leading '.' here to make it easier to update/compare the list.
var commonBinaryFileExtensionsList = []string{
	"3dm",
	"3ds",
	"3g2",
	"3gp",
	"7z",
	"a",
	"aac",
	"adp",
	"afdesign",
	"afphoto",
	"afpub",
	"ai",
	"aif",
	"aiff",
	"alz",
	"ape",
	"apk",
	"appimage",
	"ar",
	"arj",
	"asf",
	"au",
	"avi",
	"bak",
	"baml",
	"bh",
	"bin",
	"bk",
	"bmp",
	"btif",
	"bz2",
	"bzip2",
	"cab",
	"caf",
	"cgm",
	"class",
	"cmx",
	"cpio",
	"cr2",
	"cur",
	"dat",
	"dcm",
	"deb",
	"dex",
	"djvu",
	"dll",
	"dmg",
	"dng",
	"doc",
	"docm",
	"docx",
	"dot",
	"dotm",
	"dra",
	"DS_Store",
	"dsk",
	"dts",
	"dtshd",
	"dvb",
	"dwg",
	"dxf",
	"ecelp4800",
	"ecelp7470",
	"ecelp9600",
	"egg",
	"eol",
	"eot",
	"epub",
	"exe",
	"f4v",
	"fbs",
	"fh",
	"fla",
	"flac",
	"flatpak",
	"fli",
	"flv",
	"fpx",
	"fst",
	"fvt",
	"g3",
	"gh",
	"gif",
	"graffle",
	"gz",
	"gzip",
	"h261",
	"h263",
	"h264",
	"icns",
	"ico",
	"ief",
	"img",
	"ipa",
	"iso",
	"jar",
	"jpeg",
	"jpg",
	"jpgv",
	"jpm",
	"jxr",
	"key",
	"ktx",
	"lha",
	"lib",
	"lvp",
	"lz",
	"lzh",
	"lzma",
	"lzo",
	"m3u",
	"m4a",
	"m4v",
	"mar",
	"mdi",
	"mht",
	"mid",
	"midi",
	"mj2",
	"mka",
	"mkv",
	"mmr",
	"mng",
	"mobi",
	"mov",
	"movie",
	"mp3",
	"mp4",
	"mp4a",
	"mpeg",
	"mpg",
	"mpga",
	"mxu",
	"nef",
	"npx",
	"numbers",
	"nupkg",
	"o",
	"odp",
	"ods",
	"odt",
	"oga",
	"ogg",
	"ogv",
	"otf",
	"ott",
	"pages",
	"pbm",
	"pcx",
	"pdb",
	"pdf",
	"pea",
	"pgm",
	"pic",
	"png",
	"pnm",
	"pot",
	"potm",
	"potx",
	"ppa",
	"ppam",
	"ppm",
	"pps",
	"ppsm",
	"ppsx",
	"ppt",
	"pptm",
	"pptx",
	"psd",
	"pya",
	"pyc",
	"pyo",
	"pyv",
	"qt",
	"rar",
	"ras",
	"raw",
	"resources",
	"rgb",
	"rip",
	"rlc",
	"rmf",
	"rmvb",
	"rpm",
	"rtf",
	"rz",
	"s3m",
	"s7z",
	"scpt",
	"sgi",
	"shar",
	"snap",
	"sil",
	"sketch",
	"slk",
	"smv",
	"snk",
	"so",
	"stl",
	"suo",
	"sub",
	"swf",
	"tar",
	"tbz",
	"tbz2",
	"tga",
	"tgz",
	"thmx",
	"tif",
	"tiff",
	"tlz",
	"ttc",
	"ttf",
	"txz",
	"udf",
	"uvh",
	"uvi",
	"uvm",
	"uvp",
	"uvs",
	"uvu",
	"viv",
	"vob",
	"war",
	"wav",
	"wax",
	"wbmp",
	"wdp",
	"weba",
	"webm",
	"webp",
	"whl",
	"wim",
	"wm",
	"wma",
	"wmv",
	"wmx",
	"woff",
	"woff2",
	"wrm",
	"wvx",
	"xbm",
	"xif",
	"xla",
	"xlam",
	"xls",
	"xlsb",
	"xlsm",
	"xlsx",
	"xlt",
	"xltm",
	"xltx",
	"xm",
	"xmind",
	"xpi",
	"xpm",
	"xwd",
	"xz",
	"z",
	"zip",
	"zipx",
}