fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 10 kB View raw
1package languages 2 3import ( 4 "path/filepath" 5 "slices" 6 "strings" 7 8 "github.com/go-enry/go-enry/v2" 9 enrydata "github.com/go-enry/go-enry/v2/data" 10) 11 12// GetLanguageByNameOrAlias returns the standardized name for 13// a language based on its name (in which case this is an identity operation) 14// or based on its alias, which is potentially an alternate name for 15// the language. 16// 17// Aliases are fully lowercase, and map N-1 to languages. 18// 19// For example, 20// 21// GetLanguageByNameOrAlias("ada") == "Ada", true 22// GetLanguageByNameOrAlias("ada95") == "Ada", true 23// 24// Historical note: This function was added for replacing usages of 25// enry.GetLanguageByAlias, which, unlike the name suggests, also 26// handles non-normalized names such as those with spaces. 27func GetLanguageByNameOrAlias(nameOrAlias string) (lang string, ok bool) { 28 alias := convertToAliasKey(nameOrAlias) 29 if lang, ok = unsupportedByEnryAliasMap[alias]; ok { 30 return lang, true 31 } 32 33 return enry.GetLanguageByAlias(alias) 34} 35 36// GetLanguageExtensions returns the list of file extensions for a given 37// language. Returned extensions are always prefixed with a '.'. 38// 39// The returned slice will be empty iff the language is not known. 40// 41// Handles more languages than enry.GetLanguageExtensions. 42// 43// Mutually consistent with getLanguagesByExtension, see the tests 44// for the exact invariants. 45func GetLanguageExtensions(language string) []string { 46 if langs, ok := unsupportedByEnryNameToExtensionMap[language]; ok { 47 return langs 48 } 49 50 ignoreExts, isNiche := nicheExtensionUsages[language] 51 // Force a copy to avoid accidentally modifying the global variable 52 exts := slices.Clone(enry.GetLanguageExtensions(language)) 53 for ext, lang := range sgExtraLangsForExts { // Map is tiny, so linear lookup is fine 54 if language == lang { 55 exts = append(exts, ext) 56 } 57 } 58 if !isNiche { 59 return exts 60 } 61 return slices.DeleteFunc(exts, func(ext string) bool { 62 _, shouldIgnore := ignoreExts[ext] 63 return shouldIgnore 64 }) 65} 66 67// getLanguagesByExtension is a replacement for enry.GetLanguagesByExtension 68// to work around the following limitations: 69// - For some extensions which are overwhelmingly used by a certain file type 70// in practice, such as '.ts', '.md' and '.yaml', it returns ambiguous results. 71// - It does not provide any information about binary files. 72// - Some languages are not supported by enry yet (e.g. Magik) 73func getLanguagesByExtension(path string) (candidates []string, isLikelyBinaryFile bool) { 74 // Lowercase extension before lookups to match enry's behavior. 75 ext := strings.ToLower(filepath.Ext(path)) 76 if ext == "" { 77 return nil, false 78 } 79 if lang, ok := unsupportedByEnryExtensionToNameMap[ext]; ok { 80 return []string{lang}, false 81 } 82 if _, ok := commonBinaryFileExtensions[ext[1:]]; ok { 83 return nil, true 84 } 85 if lang, ok := overrideAmbiguousExtensionsMap[ext]; ok { 86 return []string{lang}, false 87 } 88 candidates = enry.GetLanguagesByExtension(path, nil, nil) 89 if extra, ok := sgExtraLangsForExts[ext]; ok { 90 candidates = append(candidates, extra) 91 } 92 return candidates, false 93} 94 95var commonBinaryFileExtensions = func() map[string]struct{} { 96 m := map[string]struct{}{} 97 for _, s := range commonBinaryFileExtensionsList { 98 m[s] = struct{}{} 99 } 100 return m 101}() 102 103var sgExtraLangsForExts = map[string]string{ 104 ".c": "C++", 105 // NOTE: Downstream code does linear lookups on this map, so 106 // be careful if you're adding lots of entries here. 107} 108 109var sgExtraContentHeuristics = map[string]*enrydata.Heuristics{ 110 ".c": enrydata.ContentHeuristics[".h"], 111} 112 113// overrideAmbiguousExtensionsMap represents extensions which are ambiguous according to 114// enry but not for Sourcegraph. 115var overrideAmbiguousExtensionsMap = map[string]string{ 116 // Ignoring the uncommon usage of '.cs' for Smalltalk. 117 ".cs": "C#", 118 // The other languages are Filterscript, Forth, GLSL. Out of that, 119 // Forth and GLSL commonly use other extensions. Ignore Filterscript 120 // as it is niche. 121 ".fs": "F#", 122 // Ignoring the uncommon usage of '.html' for Ecmarkup. 123 ".html": "HTML", 124 // Ignoring other variants of JSON, such as OASv2-json and OASv3-json 125 ".json": "JSON", 126 // Not considering "GCC Machine Description". 127 ".md": "Markdown", 128 // The other main language using '.rs' is RenderScript, but that's deprecated. 129 // See https://developer.android.com/guide/topics/renderscript/compute 130 ".rs": "Rust", 131 // In i18n contexts, there are XML files with '.ts' and '.tsx' extensions, 132 // but we ignore those for now to avoid penalizing the common case. 133 ".tsx": "TSX", 134 ".ts": "TypeScript", 135 // Ignoring "Adblock Filter List" and "Vim Help File". 136 ".txt": "Text", 137 // Ignoring other variants of YAML, such as MiniYAML, OASv2-yaml, OASv3-yaml. 138 ".yaml": "YAML", 139 ".yml": "YAML", 140 // The PR adding Pkl support also listed another language called Pickle in 141 // its heuristics, but doesn't have any real support for it. Just ignore 142 // it. 143 // https://github.com/github-linguist/linguist/pull/6730/files#diff-c2d2d7946540ab501a5ef7a7f54a57c530d8da599e41c2beb0fd2f5635d2fd50R539 144 ".pkl": "Pkl", 145} 146 147// unsupportedByEnryExtensionToNameMap contains extension->name mappings 148// for languages not tracked by go-enry. 149var unsupportedByEnryExtensionToNameMap = map[string]string{ 150 // Extensions for the Apex programming language 151 // See https://developer.salesforce.com/docs/atlas.en-us.apexcode.meta/apexcode/apex_dev_guide.htm 152 ".apex": "Apex", 153 ".apxt": "Apex", 154 ".apxc": "Apex", 155 ".trigger": "Apex", 156 ".magik": "Magik", 157} 158 159// nicheExtensionUsage keeps track of which (lang, extension) mappings 160// should not be considered. 161// 162// We cannot wholesale ignore these languages, as this list includes 163// languages like XML, but it can contain unusual extensions like '.tsx' 164// which we generally want to classify as TypeScript. 165var nicheExtensionUsages = func() map[string]map[string]struct{} { 166 niche := map[string]map[string]struct{}{} 167 considered := map[string]struct{}{} 168 for _, lang := range overrideAmbiguousExtensionsMap { 169 considered[lang] = struct{}{} 170 } 171 for ext := range overrideAmbiguousExtensionsMap { 172 langs := enry.GetLanguagesByExtension("foo"+ext, nil, nil) 173 for _, lang := range langs { 174 if _, found := considered[lang]; !found { 175 if m, hasMap := niche[lang]; hasMap { 176 m[ext] = struct{}{} 177 } else { 178 niche[lang] = map[string]struct{}{ext: {}} 179 } 180 } 181 } 182 } 183 for specialOverrideExt, lang := range unsupportedByEnryExtensionToNameMap { 184 considered[lang] = struct{}{} 185 langs := enry.GetLanguagesByExtension("foo"+specialOverrideExt, nil, nil) 186 for _, lang := range langs { 187 if _, found := considered[lang]; !found { 188 if m, hasMap := niche[lang]; hasMap { 189 m[specialOverrideExt] = struct{}{} 190 } else { 191 niche[lang] = map[string]struct{}{specialOverrideExt: {}} 192 } 193 } 194 } 195 } 196 return niche 197}() 198 199// unsupportedByEnryNameToExtensionMap contains language->extension mappings 200// for languages not tracked by go-enry. 201var unsupportedByEnryNameToExtensionMap = reverseMap(unsupportedByEnryExtensionToNameMap) 202 203// unsupportedByEnryAliasMap maps alias -> language name for languages 204// not tracked by go-enry. 205var unsupportedByEnryAliasMap = func() map[string]string { 206 out := map[string]string{} 207 for _, lang := range unsupportedByEnryExtensionToNameMap { 208 out[convertToAliasKey(lang)] = lang 209 } 210 return out 211}() 212 213func reverseMap(m map[string]string) map[string][]string { 214 n := make(map[string][]string, len(m)) 215 for k, v := range m { 216 n[v] = append(n[v], k) 217 } 218 return n 219} 220 221// Source: https://github.com/sindresorhus/binary-extensions/blob/main/binary-extensions.json 222// License: https://github.com/sindresorhus/binary-extensions/blob/main/license 223// Replace the contents with 224// curl -L https://raw.githubusercontent.com/sindresorhus/binary-extensions/main/binary-extensions.json | jq '.[]' | awk '{print $1 ","}' 225// 226// Not adding a leading '.' here to make it easier to update/compare the list. 227var commonBinaryFileExtensionsList = []string{ 228 "3dm", 229 "3ds", 230 "3g2", 231 "3gp", 232 "7z", 233 "a", 234 "aac", 235 "adp", 236 "afdesign", 237 "afphoto", 238 "afpub", 239 "ai", 240 "aif", 241 "aiff", 242 "alz", 243 "ape", 244 "apk", 245 "appimage", 246 "ar", 247 "arj", 248 "asf", 249 "au", 250 "avi", 251 "bak", 252 "baml", 253 "bh", 254 "bin", 255 "bk", 256 "bmp", 257 "btif", 258 "bz2", 259 "bzip2", 260 "cab", 261 "caf", 262 "cgm", 263 "class", 264 "cmx", 265 "cpio", 266 "cr2", 267 "cur", 268 "dat", 269 "dcm", 270 "deb", 271 "dex", 272 "djvu", 273 "dll", 274 "dmg", 275 "dng", 276 "doc", 277 "docm", 278 "docx", 279 "dot", 280 "dotm", 281 "dra", 282 "DS_Store", 283 "dsk", 284 "dts", 285 "dtshd", 286 "dvb", 287 "dwg", 288 "dxf", 289 "ecelp4800", 290 "ecelp7470", 291 "ecelp9600", 292 "egg", 293 "eol", 294 "eot", 295 "epub", 296 "exe", 297 "f4v", 298 "fbs", 299 "fh", 300 "fla", 301 "flac", 302 "flatpak", 303 "fli", 304 "flv", 305 "fpx", 306 "fst", 307 "fvt", 308 "g3", 309 "gh", 310 "gif", 311 "graffle", 312 "gz", 313 "gzip", 314 "h261", 315 "h263", 316 "h264", 317 "icns", 318 "ico", 319 "ief", 320 "img", 321 "ipa", 322 "iso", 323 "jar", 324 "jpeg", 325 "jpg", 326 "jpgv", 327 "jpm", 328 "jxr", 329 "key", 330 "ktx", 331 "lha", 332 "lib", 333 "lvp", 334 "lz", 335 "lzh", 336 "lzma", 337 "lzo", 338 "m3u", 339 "m4a", 340 "m4v", 341 "mar", 342 "mdi", 343 "mht", 344 "mid", 345 "midi", 346 "mj2", 347 "mka", 348 "mkv", 349 "mmr", 350 "mng", 351 "mobi", 352 "mov", 353 "movie", 354 "mp3", 355 "mp4", 356 "mp4a", 357 "mpeg", 358 "mpg", 359 "mpga", 360 "mxu", 361 "nef", 362 "npx", 363 "numbers", 364 "nupkg", 365 "o", 366 "odp", 367 "ods", 368 "odt", 369 "oga", 370 "ogg", 371 "ogv", 372 "otf", 373 "ott", 374 "pages", 375 "pbm", 376 "pcx", 377 "pdb", 378 "pdf", 379 "pea", 380 "pgm", 381 "pic", 382 "png", 383 "pnm", 384 "pot", 385 "potm", 386 "potx", 387 "ppa", 388 "ppam", 389 "ppm", 390 "pps", 391 "ppsm", 392 "ppsx", 393 "ppt", 394 "pptm", 395 "pptx", 396 "psd", 397 "pya", 398 "pyc", 399 "pyo", 400 "pyv", 401 "qt", 402 "rar", 403 "ras", 404 "raw", 405 "resources", 406 "rgb", 407 "rip", 408 "rlc", 409 "rmf", 410 "rmvb", 411 "rpm", 412 "rtf", 413 "rz", 414 "s3m", 415 "s7z", 416 "scpt", 417 "sgi", 418 "shar", 419 "snap", 420 "sil", 421 "sketch", 422 "slk", 423 "smv", 424 "snk", 425 "so", 426 "stl", 427 "suo", 428 "sub", 429 "swf", 430 "tar", 431 "tbz", 432 "tbz2", 433 "tga", 434 "tgz", 435 "thmx", 436 "tif", 437 "tiff", 438 "tlz", 439 "ttc", 440 "ttf", 441 "txz", 442 "udf", 443 "uvh", 444 "uvi", 445 "uvm", 446 "uvp", 447 "uvs", 448 "uvu", 449 "viv", 450 "vob", 451 "war", 452 "wav", 453 "wax", 454 "wbmp", 455 "wdp", 456 "weba", 457 "webm", 458 "webp", 459 "whl", 460 "wim", 461 "wm", 462 "wma", 463 "wmv", 464 "wmx", 465 "woff", 466 "woff2", 467 "wrm", 468 "wvx", 469 "xbm", 470 "xif", 471 "xla", 472 "xlam", 473 "xls", 474 "xlsb", 475 "xlsm", 476 "xlsx", 477 "xlt", 478 "xltm", 479 "xltx", 480 "xm", 481 "xmind", 482 "xpi", 483 "xpm", 484 "xwd", 485 "xz", 486 "z", 487 "zip", 488 "zipx", 489}