fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Add support for Magik and Pkl languages that are not handled by Linguist (#790)

Add fallbacks for languages not yet supported by Linguist or go-enry

+169 -4
+4
index_test.go
··· 3449 3449 Document{Name: "apex.cls", Content: []byte("public class Car extends Vehicle {")}, 3450 3450 Document{Name: "tex.cls", Content: []byte(`\DeclareOption*{`)}, 3451 3451 Document{Name: "hello.h", Content: []byte(`#include <stdio.h>`)}, 3452 + Document{Name: "be.magik", Content: []byte(`_package unicorn`)}, 3452 3453 ) 3453 3454 3454 3455 t.Log(b.languageMap) ··· 3485 3486 3486 3487 res = searchForTest(t, b, &query.Language{Language: "C"}) 3487 3488 wantSingleMatch(res, "hello.h") 3489 + 3490 + res = searchForTest(t, b, &query.Language{Language: "Magik"}) 3491 + wantSingleMatch(res, "be.magik") 3488 3492 3489 3493 // test fallback language search by pretending it's an older index version 3490 3494 res = searchForTest(t, b, &query.Language{Language: "C++"})
+2 -2
indexbuilder.go
··· 27 27 "time" 28 28 "unicode/utf8" 29 29 30 - "github.com/go-enry/go-enry/v2" 30 + "github.com/sourcegraph/zoekt/internal/languages" 31 31 ) 32 32 33 33 var _ = log.Println ··· 397 397 398 398 func DetermineLanguageIfUnknown(doc *Document) { 399 399 if doc.Language == "" { 400 - doc.Language = enry.GetLanguage(doc.Name, doc.Content) 400 + doc.Language = languages.GetLanguage(doc.Name, doc.Content) 401 401 } 402 402 } 403 403
+66
internal/languages/language.go
··· 1 + // This file wraps the logic of go-enry (https://github.com/go-enry/go-enry) to support additional languages. 2 + // go-enry is based off of a package called Linguist (https://github.com/github/linguist) 3 + // and sometimes programming languages may not be supported by Linguist 4 + // or may take a while to get merged in and make it into go-enry. This wrapper 5 + // gives us flexibility to support languages in those cases. We list additional languages 6 + // in this file and remove them once they make it into Linguist and go-enry. 7 + // This logic is similar to what we have in the sourcegraph/sourcegraph repo; in the future 8 + // we plan to refactor both into a common library to share between the two repos. 9 + package languages 10 + 11 + import ( 12 + "path/filepath" 13 + "strings" 14 + 15 + "github.com/go-enry/go-enry/v2" 16 + ) 17 + 18 + var unsupportedByLinguistAliasMap = map[string]string{ 19 + // Pkl Configuration Language (https://pkl-lang.org/) 20 + // Added to Linguist on 6/7/24 21 + // can remove once go-enry package updates 22 + // to that Linguist version 23 + "pkl": "Pkl", 24 + // Magik Language 25 + "magik": "Magik", 26 + } 27 + 28 + var unsupportedByLinguistExtensionToNameMap = map[string]string{ 29 + // Pkl Configuration Language (https://pkl-lang.org/) 30 + ".pkl": "Pkl", 31 + // Magik Language 32 + ".magik": "Magik", 33 + } 34 + 35 + // GetLanguageByAlias is a replacement for enry.GetLanguageByAlias. 36 + // It also resolves lower-cased aliases of languages that are missing in Linguist. 37 + func GetLanguageByAlias(alias string) (language string, ok bool) { 38 + language, ok = enry.GetLanguageByAlias(alias) 39 + if !ok { 40 + normalizedAlias := strings.ToLower(alias) 41 + language, ok = unsupportedByLinguistAliasMap[normalizedAlias] 42 + } 43 + 44 + return 45 + } 46 + 47 + // GetLanguage is a replacement for enry.GetLanguage 48 + // to find out the most probable language to return but includes support 49 + // for languages missing from Linguist 50 + func 
GetLanguage(filename string, content []byte) (language string) { 51 + language = enry.GetLanguage(filename, content) 52 + 53 + // If go-enry failed to find language, fall back on our 54 + // internal check for languages missing in linguist 55 + if language == "" { 56 + ext := filepath.Ext(filename) 57 + normalizedExt := strings.ToLower(ext) 58 + if ext == "" { 59 + return 60 + } 61 + if lang, ok := unsupportedByLinguistExtensionToNameMap[normalizedExt]; ok { 62 + language = lang 63 + } 64 + } 65 + return 66 + }
+95
internal/languages/language_test.go
··· 1 + package languages 2 + 3 + import "testing" 4 + 5 + func TestGetLanguageByAlias(t *testing.T) { 6 + tests := []struct { 7 + name string 8 + alias string 9 + want string 10 + wantOk bool 11 + }{ 12 + { 13 + name: "empty alias", 14 + alias: "", 15 + want: "", 16 + wantOk: false, 17 + }, 18 + { 19 + name: "unknown alias", 20 + alias: "unknown", 21 + want: "", 22 + wantOk: false, 23 + }, 24 + { 25 + name: "supported alias", 26 + alias: "go", 27 + want: "Go", 28 + wantOk: true, 29 + }, 30 + { 31 + name: "unsupported by linguist alias", 32 + alias: "magik", 33 + want: "Magik", 34 + wantOk: true, 35 + }, 36 + { 37 + name: "unsupported by linguist alias normalized", 38 + alias: "mAgIk", 39 + want: "Magik", 40 + wantOk: true, 41 + }, 42 + } 43 + 44 + for _, tt := range tests { 45 + t.Run(tt.name, func(t *testing.T) { 46 + got, ok := GetLanguageByAlias(tt.alias) 47 + if got != tt.want || ok != tt.wantOk { 48 + t.Errorf("GetLanguageByAlias(%q) = %q, %t, want %q, %t", tt.alias, got, ok, tt.want, tt.wantOk) 49 + } 50 + }) 51 + } 52 + } 53 + 54 + func TestGetLanguage(t *testing.T) { 55 + tests := []struct { 56 + name string 57 + filename string 58 + content []byte 59 + want string 60 + }{ 61 + { 62 + name: "empty filename", 63 + filename: "", 64 + content: []byte(""), 65 + want: "", 66 + }, 67 + { 68 + name: "unknown extension", 69 + filename: "file.unknown", 70 + content: []byte(""), 71 + want: "", 72 + }, 73 + { 74 + name: "supported extension", 75 + filename: "file.go", 76 + content: []byte("package main"), 77 + want: "Go", 78 + }, 79 + { 80 + name: "unsupported by linguist extension", 81 + filename: "file.magik", 82 + content: []byte(""), 83 + want: "Magik", 84 + }, 85 + } 86 + 87 + for _, tt := range tests { 88 + t.Run(tt.name, func(t *testing.T) { 89 + got := GetLanguage(tt.filename, tt.content) 90 + if got != tt.want { 91 + t.Errorf("GetLanguage(%q, %q) = %q, want %q", tt.filename, tt.content, got, tt.want) 92 + } 93 + }) 94 + } 95 + }
+2 -2
query/parse.go
··· 20 20 "log" 21 21 "regexp/syntax" 22 22 23 - "github.com/go-enry/go-enry/v2" 24 23 "github.com/grafana/regexp" 24 + "github.com/sourcegraph/zoekt/internal/languages" 25 25 ) 26 26 27 27 var _ = log.Printf ··· 172 172 } 173 173 expr = q 174 174 case tokLang: 175 - canonical, ok := enry.GetLanguageByAlias(text) 175 + canonical, ok := languages.GetLanguageByAlias(text) 176 176 if !ok { 177 177 expr = &Const{false} 178 178 } else {