// Fork of https://github.com/sourcegraph/zoekt

1// Package languages provides enhanced language detection capabilities on top of
2// go-enry, with additional heuristics and mappings for better accuracy.
3package languages
4
5import (
6 "path/filepath"
7 "slices"
8 "strings"
9
10 "github.com/go-enry/go-enry/v2"
11)
12
// enryLanguageMappings translates language names into the canonical
// identifiers returned by NormalizeLanguage.
//
// Keys must be written in lowercase: NormalizeLanguage lowercases its input
// before consulting this table, so a key containing an uppercase letter would
// never match.
var enryLanguageMappings = map[string]string{
	"c#":  "c_sharp",
	"c++": "cpp",
}
18
19// NormalizeLanguage converts the language name to lowercase and maps known
20// aliases to their canonical names.
21func NormalizeLanguage(filetype string) string {
22 normalized := strings.ToLower(filetype)
23 if mapped, ok := enryLanguageMappings[normalized]; ok {
24 normalized = mapped
25 }
26
27 return normalized
28}
29
30// GetLanguages is a replacement for enry.GetLanguages which
31// avoids incorrect fallback behavior that is present in DefaultStrategies,
32// where it will misclassify '.h' header files as C when file contents
33// are not available.
34//
35// The content can be optionally passed via a callback instead of directly, so
36// that in the common case, the caller can avoid fetching the content. The full
37// content returned by getContent will be used for language detection.
38//
39// getContent is not called if the file is likely to be a binary file,
40// as enry only covers programming languages.
41//
42// The buffer provided by the getContent callback is not modified.
43//
44// Returns:
45// - An error if the getContent func returns an error
46// - An empty slice if language detection failed
47// - A single-element slice if the language was determined exactly
48// - A multi-element slice if the language was ambiguous. For example,
49// for simple `.h` files with just comments and macros, they may
50// be valid C, C++ or any of their derivative languages (e.g. Objective-C).
51func GetLanguages(path string, getContent func() ([]byte, error)) ([]string, error) {
52 impl := func() ([]string, error) {
53 langs := enry.GetLanguagesByFilename(path, nil, nil)
54 if len(langs) == 1 {
55 return langs, nil
56 }
57 newLangs, isLikelyBinaryFile := getLanguagesByExtension(path)
58 if isLikelyBinaryFile {
59 return nil, nil
60 }
61 switch len(newLangs) {
62 case 0:
63 break
64 case 1:
65 return newLangs, nil
66 default:
67 langs = newLangs
68 }
69 if getContent == nil {
70 return langs, nil
71 }
72 content, err := getContent()
73 if err != nil {
74 return nil, err
75 }
76 if len(content) == 0 {
77 return langs, nil
78 }
79 if enry.IsBinary(content) {
80 return nil, nil
81 }
82
83 // enry doesn't expose a way to call GetLanguages with a specific set of
84 // strategies, so just hand-roll that code here.
85 var languages = langs
86 for _, strategy := range []enry.Strategy{enry.GetLanguagesByModeline, getLanguagesByShebang, getLanguagesByContent, enry.GetLanguagesByClassifier} {
87 candidates := strategy(path, content, languages)
88 switch len(candidates) {
89 case 0:
90 continue
91 case 1:
92 return candidates, nil
93 default:
94 languages = candidates
95 }
96 }
97
98 return languages, nil
99 }
100
101 langs, err := impl()
102 return slices.Clone(langs), err
103}
104
105// GetLanguagesFromContent is a convenience wrapper around GetLanguages that
106// allows passing the content directly instead of a callback.
107func GetLanguagesFromContent(path string, content []byte) (langs []string) {
108 // We can ignore the error here, because the callback will never return an error
109 langs, _ = GetLanguages(path, func() ([]byte, error) { return content, nil })
110 return
111}
112
113// getLanguagesByContent is a wrapper for enry.GetLanguagesByContent.
114//
115// It applies additional heuristics for file extensions that need special handling.
116func getLanguagesByContent(path string, content []byte, candidates []string) []string {
117 ext := strings.ToLower(filepath.Ext(path))
118 if heuristic, ok := sgExtraContentHeuristics[ext]; ok {
119 return heuristic.Match(content)
120 }
121 return enry.GetLanguagesByContent(path, content, candidates)
122}
123
124// getLanguagesByShebang is a replacement for enry.GetLanguagesByShebang.
125//
126// The enry function considers non-programming languages such as 'Pod'/'Pod 6'
127// also for shebangs, so work around that.
128func getLanguagesByShebang(path string, content []byte, candidates []string) []string {
129 languages := enry.GetLanguagesByShebang(path, content, candidates)
130 if len(languages) == 2 {
131 // See https://sourcegraph.com/github.com/go-enry/go-enry@40f2a1e5b90eec55c20441c2a5911dcfc298a447/-/blob/data/interpreter.go?L95-96
132 if slices.Equal(languages, []string{"Perl", "Pod"}) {
133 return []string{"Perl"}
134 }
135 if slices.Equal(languages, []string{"Pod 6", "Raku"}) {
136 return []string{"Raku"}
137 }
138 }
139 return slices.Clone(languages)
140}
141
142// IsLikelyVendoredFile returns true if the file is likely to be a vendored file.
143//
144// 1. This method is not 100% foolproof, as it relies on conventions
145// around file paths which may or may not be followed.
146// 2. The caller must not pass a directory path to this function
147// for short-circuiting, as there is no guarantee that if a path
148// p1 returns true, then Join(p1, p2) also returns true.
149func IsLikelyVendoredFile(path string) bool {
150 return enry.IsVendor(path)
151}