fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

compute precise language information with go-enry for lang: queries (#220)

Use go-enry to compute more precise language information than ctags provides.

Make `lang:` queries fall back to filename-extension matching for index files written by older index versions.

+152 -32
+1 -1
api.go
··· 381 381 IndexMinReaderVersion int 382 382 IndexTime time.Time 383 383 PlainASCII bool 384 - LanguageMap map[string]byte 384 + LanguageMap map[string]uint16 385 385 ZoektVersion string 386 386 ID string 387 387 }
-5
build/ctags.go
··· 21 21 "os" 22 22 "os/exec" 23 23 "path/filepath" 24 - "strings" 25 24 "time" 26 25 27 26 "github.com/google/zoekt" ··· 152 151 if len(es) == 0 { 153 152 continue 154 153 } 155 - doc.Language = strings.ToLower(es[0].Language) 156 154 157 155 symOffsets, symMetaData, err := tagsToSections(doc.Content, es) 158 156 if err != nil { ··· 205 203 } 206 204 todo[pathIndices[k]].Symbols = symOffsets 207 205 todo[pathIndices[k]].SymbolsMetaData = symMetaData 208 - if len(tags) > 0 { 209 - todo[pathIndices[k]].Language = strings.ToLower(tags[0].Language) 210 - } 211 206 } 212 207 return nil 213 208 }
+33 -1
eval.go
··· 18 18 "context" 19 19 "fmt" 20 20 "log" 21 + "regexp" 21 22 "regexp/syntax" 22 23 "sort" 23 24 "strings" 24 25 26 + enry_data "github.com/go-enry/go-enry/v2/data" 25 27 "github.com/google/zoekt/query" 26 28 ) 27 29 ··· 98 100 }) 99 101 case *query.Language: 100 102 _, has := d.metaData.LanguageMap[r.Language] 103 + if !has && d.metaData.IndexFeatureVersion < 12 { 104 + // For index files that haven't been re-indexed by go-enry, 105 + // fall back to file-based matching and continue even if this 106 + // repo doesn't have the specific language present. 107 + extsForLang := enry_data.ExtensionsByLanguage[r.Language] 108 + if extsForLang != nil { 109 + extFrags := make([]string, 0, len(extsForLang)) 110 + for _, ext := range extsForLang { 111 + extFrags = append(extFrags, regexp.QuoteMeta(ext)) 112 + } 113 + if len(extFrags) > 0 { 114 + pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|")) 115 + // inlined copy of query.regexpQuery 116 + re, err := syntax.Parse(pattern, syntax.Perl) 117 + if err != nil { 118 + return &query.Const{Value: false} 119 + } 120 + if re.Op == syntax.OpLiteral { 121 + return &query.Substring{ 122 + Pattern: string(re.Rune), 123 + FileName: true, 124 + } 125 + } 126 + return &query.Regexp{ 127 + Regexp: re, 128 + FileName: true, 129 + } 130 + } 131 + } 132 + } 101 133 if !has { 102 134 return &query.Const{Value: false} 103 135 } ··· 238 270 RepositoryPriority: md.priority, 239 271 FileName: string(d.fileName(nextDoc)), 240 272 Checksum: d.getChecksum(nextDoc), 241 - Language: d.languageMap[d.languages[nextDoc]], 273 + Language: d.languageMap[d.getLanguage(nextDoc)], 242 274 } 243 275 244 276 if s := d.subRepos[nextDoc]; s > 0 {
+1
go.mod
··· 11 11 github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd // indirect 12 12 github.com/fsnotify/fsnotify v1.4.9 13 13 github.com/gfleury/go-bitbucket-v1 v0.0.0-20200312180434-e5170e3280fb 14 + github.com/go-enry/go-enry/v2 v2.8.0 14 15 github.com/go-git/go-git/v5 v5.4.2 15 16 github.com/gobwas/glob v0.2.3 16 17 github.com/google/go-cmp v0.5.5
+4
go.sum
··· 131 131 github.com/gliderlabs/ssh v0.2.2 h1:6zsha5zo/TWhRhwqCD3+EarCAgZ2yN28ipRnGPnwkI0= 132 132 github.com/gliderlabs/ssh v0.2.2/go.mod h1:U7qILu1NlMHj9FlMhZLlkCdDnU1DBEAqr0aevW3Awn0= 133 133 github.com/go-critic/go-critic v0.4.1/go.mod h1:7/14rZGnZbY6E38VEGk2kVhoq6itzc1E68facVDK23g= 134 + github.com/go-enry/go-enry/v2 v2.8.0 h1:KMW4mSG+8uUF6FaD3iPkFqyfC5tF8gRrsYImq6yhHzo= 135 + github.com/go-enry/go-enry/v2 v2.8.0/go.mod h1:GVzIiAytiS5uT/QiuakK7TF1u4xDab87Y8V5EJRpsIQ= 136 + github.com/go-enry/go-oniguruma v1.2.1 h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo= 137 + github.com/go-enry/go-oniguruma v1.2.1/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4= 134 138 github.com/go-git/gcfg v1.5.0 h1:Q5ViNfGF8zFgyJWPqYwA7qGFoMTEiBmdlkcfRmpIMa4= 135 139 github.com/go-git/gcfg v1.5.0/go.mod h1:5m20vg6GwYabIxaOonVkTdrILxQMpEShl1xiMF4ua+E= 136 140 github.com/go-git/go-billy/v5 v5.2.0/go.mod h1:pmpqyWchKfYfrkb/UVH4otLvyi/5gJlGI4Hb3ZqZ3W0=
+55
index_test.go
··· 55 55 t.Fatalf("Add %d: %v", i, err) 56 56 } 57 57 } 58 + 58 59 return b 59 60 } 60 61 ··· 2137 2138 }) 2138 2139 wantSingleMatch(res, "f2") 2139 2140 } 2141 + 2142 + func TestSearchTypeLanguage(t *testing.T) { 2143 + b := testIndexBuilder(t, &Repository{ 2144 + Name: "reponame", 2145 + }, 2146 + Document{Name: "apex.cls", Content: []byte("public class Car extends Vehicle {")}, 2147 + Document{Name: "tex.cls", Content: []byte(`\DeclareOption*{`)}, 2148 + Document{Name: "hello.h", Content: []byte(`#include <stdio.h>`)}, 2149 + ) 2150 + 2151 + t.Log(b.languageMap) 2152 + 2153 + wantSingleMatch := func(res *SearchResult, want string) { 2154 + t.Helper() 2155 + fmatches := res.Files 2156 + if len(fmatches) != 1 { 2157 + t.Errorf("got %v, want 1 matches", len(fmatches)) 2158 + return 2159 + } 2160 + if len(fmatches[0].LineMatches) != 1 { 2161 + t.Errorf("got %d line matches", len(fmatches[0].LineMatches)) 2162 + return 2163 + } 2164 + var got string 2165 + if fmatches[0].LineMatches[0].FileName { 2166 + got = fmatches[0].FileName 2167 + } else { 2168 + got = fmt.Sprintf("%s:%d", fmatches[0].FileName, fmatches[0].LineMatches[0].LineFragments[0].Offset) 2169 + } 2170 + 2171 + if got != want { 2172 + t.Errorf("got %s, want %s", got, want) 2173 + } 2174 + } 2175 + 2176 + res := searchForTest(t, b, &query.Language{Language: "Apex"}) 2177 + wantSingleMatch(res, "apex.cls") 2178 + 2179 + res = searchForTest(t, b, &query.Language{Language: "TeX"}) 2180 + wantSingleMatch(res, "tex.cls") 2181 + 2182 + res = searchForTest(t, b, &query.Language{Language: "C"}) 2183 + wantSingleMatch(res, "hello.h") 2184 + 2185 + // test fallback language search by pretending it's an older index version 2186 + res = searchForTest(t, b, &query.Language{Language: "C++"}) 2187 + if len(res.Files) != 0 { 2188 + t.Errorf("got %d results for C++, want 0", len(res.Files)) 2189 + } 2190 + 2191 + b.featureVersion = 11 // force fallback 2192 + res = searchForTest(t, b, &query.Language{Language: "C++"}) 
2193 + wantSingleMatch(res, "hello.h") 2194 + }
+18 -7
indexbuilder.go
··· 25 25 "sort" 26 26 "time" 27 27 "unicode/utf8" 28 + 29 + "github.com/go-enry/go-enry/v2" 28 30 ) 29 31 30 32 var _ = log.Println ··· 187 189 subRepoIndices []map[string]uint32 188 190 189 191 // language => language code 190 - languageMap map[string]byte 192 + languageMap map[string]uint16 191 193 192 - // languages codes 193 - languages []byte 194 + // language codes, uint16 encoded as little-endian 195 + languages []uint8 194 196 195 197 // IndexTime will be used as the time if non-zero. Otherwise 196 198 // time.Now(). This is useful for doing reproducible builds in tests. ··· 242 244 fileEndSymbol: []uint32{0}, 243 245 symIndex: make(map[string]uint32), 244 246 symKindIndex: make(map[string]uint32), 245 - languageMap: map[string]byte{}, 247 + languageMap: map[string]uint16{}, 246 248 } 247 249 } 248 250 ··· 425 427 } 426 428 } 427 429 430 + if doc.Language == "" { 431 + c := doc.Content 432 + // classifier is faster on small files without losing much accuracy 433 + if len(c) > 2048 { 434 + c = c[:2048] 435 + } 436 + doc.Language = enry.GetLanguage(doc.Name, c) 437 + } 438 + 428 439 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData}) 429 440 var last DocumentSection 430 441 for i, s := range doc.Symbols { ··· 492 503 493 504 langCode, ok := b.languageMap[doc.Language] 494 505 if !ok { 495 - if len(b.languageMap) >= 255 { 506 + if len(b.languageMap) >= 65535 { 496 507 return fmt.Errorf("too many languages") 497 508 } 498 - langCode = byte(len(b.languageMap)) 509 + langCode = uint16(len(b.languageMap)) 499 510 b.languageMap[doc.Language] = langCode 500 511 } 501 - b.languages = append(b.languages, langCode) 512 + b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8)) 502 513 503 514 return nil 504 515 }
+10 -1
indexdata.go
··· 87 87 languages []byte 88 88 89 89 // inverse of LanguageMap in metaData 90 - languageMap map[byte]string 90 + languageMap map[uint16]string 91 91 92 92 repoListEntry []RepoListEntry 93 93 ··· 164 164 func (d *indexData) getChecksum(idx uint32) []byte { 165 165 start := crc64.Size * idx 166 166 return d.checksums[start : start+crc64.Size] 167 + } 168 + 169 + func (d *indexData) getLanguage(idx uint32) uint16 { 170 + if d.metaData.IndexFeatureVersion < 12 { 171 + // older zoekt files had 8-bit language entries 172 + return uint16(d.languages[idx]) 173 + } 174 + // newer zoekt files have 16-bit language entries 175 + return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8 167 176 } 168 177 169 178 // calculates stats for files in the range [start, end).
+6 -2
matchtree.go
··· 413 413 } 414 414 415 415 func (t *regexpMatchTree) String() string { 416 - return fmt.Sprintf("re(%s)", t.regexp) 416 + f := "" 417 + if t.fileName { 418 + f = "f" 419 + } 420 + return fmt.Sprintf("%sre(%s)", f, t.regexp) 417 421 } 418 422 419 423 func (t *orMatchTree) String() string { ··· 874 878 reason: "language", 875 879 numDocs: d.numDocs(), 876 880 predicate: func(docID uint32) bool { 877 - return d.languages[docID] == code 881 + return d.getLanguage(docID) == code 878 882 }, 879 883 }, nil 880 884
+1 -1
merge.go
··· 124 124 // Content set below since it can return an error 125 125 // Branches set below since it requires lookups 126 126 SubRepositoryPath: d.subRepoPaths[repoID][d.subRepos[docID]], 127 - Language: d.languageMap[d.languages[docID]], 127 + Language: d.languageMap[d.getLanguage(docID)], 128 128 // SkipReason not set, will be part of content from original indexer. 129 129 } 130 130
+8 -1
query/parse.go
··· 19 19 "fmt" 20 20 "log" 21 21 "regexp/syntax" 22 + 23 + "github.com/go-enry/go-enry/v2" 22 24 ) 23 25 24 26 var _ = log.Printf ··· 138 140 } 139 141 expr = q 140 142 case tokLang: 141 - expr = &Language{Language: text} 143 + canonical, ok := enry.GetLanguageByAlias(text) 144 + if !ok { 145 + expr = &Const{false} 146 + } else { 147 + expr = &Language{Language: canonical} 148 + } 142 149 143 150 case tokSym: 144 151 if text == "" {
+2 -1
query/parse_test.go
··· 83 83 {"c:abc", &Substring{Pattern: "abc", Content: true}}, 84 84 {"content:abc", &Substring{Pattern: "abc", Content: true}}, 85 85 86 - {"lang:c++", &Language{"c++"}}, 86 + {"lang:c++", &Language{"C++"}}, 87 + {"lang:cpp", &Language{"C++"}}, 87 88 {"sym:pqr", &Symbol{&Substring{Pattern: "pqr"}}}, 88 89 {"sym:Pqr", &Symbol{&Substring{Pattern: "Pqr", CaseSensitive: true}}}, 89 90 {"sym:.*", &Symbol{&Regexp{Regexp: mustParseRE(".*")}}},
+1 -1
read.go
··· 370 370 d.subRepoPaths = append(d.subRepoPaths, keys) 371 371 } 372 372 373 - d.languageMap = map[byte]string{} 373 + d.languageMap = map[uint16]string{} 374 374 for k, v := range d.metaData.LanguageMap { 375 375 d.languageMap[v] = k 376 376 }
+4 -4
testdata/gen-shards.sh
··· 2 2 3 3 set -ex 4 4 5 - go build ../cmd/zoekt-index 6 - 7 5 cp -r repo repo17 8 6 9 - ./zoekt-index -disable_ctags repo17 7 + go run ../cmd/zoekt-index -disable_ctags repo17 8 + go run ../cmd/zoekt-merge-index repo17_v16.00000.zoekt 9 + mv compound*zoekt repo17_v17.00000.zoekt 10 10 11 - rm -rf repo17 11 + rm -rf repo17 repo17_v16.00000.zoekt zoekt-builder-shard-log.tsv 12 12 13 13 mv *.zoekt shards/
+3 -3
testdata/golden/TestReadSearch/repo17_v17.00000.golden
··· 1 1 { 2 2 "FormatVersion": 17, 3 - "FeatureVersion": 11, 3 + "FeatureVersion": 12, 4 4 "FileMatches": [ 5 5 [ 6 6 { ··· 29 29 ], 30 30 "Content": null, 31 31 "Checksum": "n9fUYqacPXg=", 32 - "Language": "", 32 + "Language": "Go", 33 33 "SubRepositoryName": "", 34 34 "SubRepositoryPath": "", 35 35 "Version": "" ··· 62 62 ], 63 63 "Content": null, 64 64 "Checksum": "n9fUYqacPXg=", 65 - "Language": "", 65 + "Language": "Go", 66 66 "SubRepositoryName": "", 67 67 "SubRepositoryPath": "", 68 68 "Version": ""
+3 -3
testdata/golden/TestReadSearch/repo_v16.00000.golden
··· 1 1 { 2 2 "FormatVersion": 16, 3 - "FeatureVersion": 11, 3 + "FeatureVersion": 12, 4 4 "FileMatches": [ 5 5 [ 6 6 { ··· 29 29 ], 30 30 "Content": null, 31 31 "Checksum": "n9fUYqacPXg=", 32 - "Language": "", 32 + "Language": "Go", 33 33 "SubRepositoryName": "", 34 34 "SubRepositoryPath": "", 35 35 "Version": "" ··· 62 62 ], 63 63 "Content": null, 64 64 "Checksum": "n9fUYqacPXg=", 65 - "Language": "", 65 + "Language": "Go", 66 66 "SubRepositoryName": "", 67 67 "SubRepositoryPath": "", 68 68 "Version": ""
testdata/shards/repo17_v17.00000.zoekt

This is a binary file and will not be displayed.

testdata/shards/repo_v16.00000.zoekt

This is a binary file and will not be displayed.

+2 -1
toc.go
··· 42 42 // 9: Store ctags metadata & bump default max file size 43 43 // 10: Compound shards; more flexible TOC format. 44 44 // 11: Bloom filters for file names & contents 45 - const FeatureVersion = 11 45 + // 12: go-enry for identifying file languages 46 + const FeatureVersion = 12 46 47 47 48 // WriteMinFeatureVersion and ReadMinFeatureVersion constrain forwards and backwards 48 49 // compatibility. For example, if a new way to encode filenameNgrams on disk is