fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Tackle the issue of XML files filtered as binaries in search results (#910)

When skipping a doc, we currently report the detected language as "binary" (if
its content looks binary) or "skipped" (if it's skipped for any other reason).
Skipped docs are still added to the index and can still be returned as search
results, for example if you only match on the filename. So file matches are
sometimes returned with "binary" or "skipped" as their language, even though the
file path clearly indicates some other language, like XML.

This PR updates the indexing logic to still detect the language even if the
document is skipped. However, for skipped documents we detect the language from
the file name only and do not pass the contents to the language detection
library, so detection never runs on huge files.

---------

Co-authored-by: Julie Tibshirani <julietibs@apache.org>

+56 -6
-1
index/builder.go
··· 610 610 doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax) 611 611 } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil { 612 612 doc.SkipReason = err.Error() 613 - doc.Language = "binary" 614 613 } 615 614 616 615 b.todo = append(b.todo, &doc)
+10 -5
index/shard_builder.go
··· 396 396 } 397 397 398 398 func DetermineLanguageIfUnknown(doc *Document) { 399 - if doc.Language == "" { 399 + if doc.Language != "" { 400 + return 401 + } 402 + 403 + if doc.SkipReason != "" { 404 + // If this document has been skipped, it's likely very large, or it's a non-code file like binary. 405 + // In this case, we just guess the language based on file name to avoid examining the contents. 406 + // Note: passing nil content is allowed by the go-enry contract (the underlying library we use here). 407 + doc.Language = languages.GetLanguage(doc.Name, nil) 408 + } else { 400 409 doc.Language = languages.GetLanguage(doc.Name, doc.Content) 401 410 } 402 411 } ··· 407 416 408 417 if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 { 409 418 doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx) 410 - doc.Language = "binary" 411 419 } 412 420 413 421 if doc.SkipReason != "" { 414 422 doc.Content = []byte(notIndexedMarker + doc.SkipReason) 415 423 doc.Symbols = nil 416 424 doc.SymbolsMetaData = nil 417 - if doc.Language == "" { 418 - doc.Language = "skipped" 419 - } 420 425 } 421 426 422 427 DetermineLanguageIfUnknown(&doc)
+46
index/shard_builder_test.go
··· 47 47 }) 48 48 } 49 49 } 50 + 51 + func TestDetermineLanguageIfUnknown(t *testing.T) { 52 + tests := []struct { 53 + name string 54 + doc Document 55 + wantLang string 56 + skipContent bool 57 + }{ 58 + { 59 + name: "already has language", 60 + doc: Document{ 61 + Name: "test.java", 62 + Language: "Go", 63 + Content: []byte("package main"), 64 + }, 65 + wantLang: "Go", 66 + }, 67 + { 68 + name: "skipped file", 69 + doc: Document{ 70 + Name: "large.js", 71 + SkipReason: "too large", 72 + Content: []byte(notIndexedMarker + "too large"), 73 + }, 74 + wantLang: "JavaScript", 75 + }, 76 + { 77 + name: "skipped file with unknown extension", 78 + doc: Document{ 79 + Name: "deadb33f", 80 + SkipReason: "binary", 81 + Content: []byte(notIndexedMarker + "binary"), 82 + }, 83 + wantLang: "", 84 + }, 85 + } 86 + 87 + for _, tt := range tests { 88 + t.Run(tt.name, func(t *testing.T) { 89 + DetermineLanguageIfUnknown(&tt.doc) 90 + if tt.doc.Language != tt.wantLang { 91 + t.Errorf("DetermineLanguageIfUnknown() got language = %v, want %v", tt.doc.Language, tt.wantLang) 92 + } 93 + }) 94 + } 95 + }