fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 5.1 kB View raw
1// Package languages provides enhanced language detection capabilities on top of 2// go-enry, with additional heuristics and mappings for better accuracy. 3package languages 4 5import ( 6 "path/filepath" 7 "slices" 8 "strings" 9 10 "github.com/go-enry/go-enry/v2" 11) 12 13// Make sure all names are lowercase here, since they are normalized 14var enryLanguageMappings = map[string]string{ 15 "c++": "cpp", 16 "c#": "c_sharp", 17} 18 19// NormalizeLanguage converts the language name to lowercase and maps known 20// aliases to their canonical names. 21func NormalizeLanguage(filetype string) string { 22 normalized := strings.ToLower(filetype) 23 if mapped, ok := enryLanguageMappings[normalized]; ok { 24 normalized = mapped 25 } 26 27 return normalized 28} 29 30// GetLanguages is a replacement for enry.GetLanguages which 31// avoids incorrect fallback behavior that is present in DefaultStrategies, 32// where it will misclassify '.h' header files as C when file contents 33// are not available. 34// 35// The content can be optionally passed via a callback instead of directly, so 36// that in the common case, the caller can avoid fetching the content. The full 37// content returned by getContent will be used for language detection. 38// 39// getContent is not called if the file is likely to be a binary file, 40// as enry only covers programming languages. 41// 42// The buffer provided by the getContent callback is not modified. 43// 44// Returns: 45// - An error if the getContent func returns an error 46// - An empty slice if language detection failed 47// - A single-element slice if the language was determined exactly 48// - A multi-element slice if the language was ambiguous. For example, 49// for simple `.h` files with just comments and macros, they may 50// be valid C, C++ or any of their derivative languages (e.g. Objective-C). 51func GetLanguages(path string, getContent func() ([]byte, error)) ([]string, error) { 52 impl := func() ([]string, error) { 53 langs := enry.GetLanguagesByFilename(path, nil, nil) 54 if len(langs) == 1 { 55 return langs, nil 56 } 57 newLangs, isLikelyBinaryFile := getLanguagesByExtension(path) 58 if isLikelyBinaryFile { 59 return nil, nil 60 } 61 switch len(newLangs) { 62 case 0: 63 break 64 case 1: 65 return newLangs, nil 66 default: 67 langs = newLangs 68 } 69 if getContent == nil { 70 return langs, nil 71 } 72 content, err := getContent() 73 if err != nil { 74 return nil, err 75 } 76 if len(content) == 0 { 77 return langs, nil 78 } 79 if enry.IsBinary(content) { 80 return nil, nil 81 } 82 83 // enry doesn't expose a way to call GetLanguages with a specific set of 84 // strategies, so just hand-roll that code here. 85 var languages = langs 86 for _, strategy := range []enry.Strategy{enry.GetLanguagesByModeline, getLanguagesByShebang, getLanguagesByContent, enry.GetLanguagesByClassifier} { 87 candidates := strategy(path, content, languages) 88 switch len(candidates) { 89 case 0: 90 continue 91 case 1: 92 return candidates, nil 93 default: 94 languages = candidates 95 } 96 } 97 98 return languages, nil 99 } 100 101 langs, err := impl() 102 return slices.Clone(langs), err 103} 104 105// GetLanguagesFromContent is a convenience wrapper around GetLanguages that 106// allows passing the content directly instead of a callback. 107func GetLanguagesFromContent(path string, content []byte) (langs []string) { 108 // We can ignore the error here, because the callback will never return an error 109 langs, _ = GetLanguages(path, func() ([]byte, error) { return content, nil }) 110 return 111} 112 113// getLanguagesByContent is a wrapper for enry.GetLanguagesByContent. 114// 115// It applies additional heuristics for file extensions that need special handling. 116func getLanguagesByContent(path string, content []byte, candidates []string) []string { 117 ext := strings.ToLower(filepath.Ext(path)) 118 if heuristic, ok := sgExtraContentHeuristics[ext]; ok { 119 return heuristic.Match(content) 120 } 121 return enry.GetLanguagesByContent(path, content, candidates) 122} 123 124// getLanguagesByShebang is a replacement for enry.GetLanguagesByShebang. 125// 126// The enry function considers non-programming languages such as 'Pod'/'Pod 6' 127// also for shebangs, so work around that. 128func getLanguagesByShebang(path string, content []byte, candidates []string) []string { 129 languages := enry.GetLanguagesByShebang(path, content, candidates) 130 if len(languages) == 2 { 131 // See https://sourcegraph.com/github.com/go-enry/go-enry@40f2a1e5b90eec55c20441c2a5911dcfc298a447/-/blob/data/interpreter.go?L95-96 132 if slices.Equal(languages, []string{"Perl", "Pod"}) { 133 return []string{"Perl"} 134 } 135 if slices.Equal(languages, []string{"Pod 6", "Raku"}) { 136 return []string{"Raku"} 137 } 138 } 139 return slices.Clone(languages) 140} 141 142// IsLikelyVendoredFile returns true if the file is likely to be a vendored file. 143// 144// 1. This method is not 100% foolproof, as it relies on conventions 145// around file paths which may or may not be followed. 146// 2. The caller must not pass a directory path to this function 147// for short-circuiting, as there is no guarantee that if a path 148// p1 returns true, then Join(p1, p2) also returns true. 149func IsLikelyVendoredFile(path string) bool { 150 return enry.IsVendor(path) 151}