fork of https://github.com/sourcegraph/zoekt
1package index
2
3import (
4 "errors"
5
6 "github.com/go-enry/go-enry/v2"
7)
8
// FileCategory represents the category of a file, as determined by go-enry. It is non-exhaustive
// but tries to cover the major cases, like whether the file is a test, generated, etc.
//
// A file's category is used in search scoring to determine the weight of a file match.
type FileCategory byte
14
// The numeric values below come from iota and follow declaration order. They are NOT the
// bytes produced by encode: encode and decodeCategory use their own explicit mapping, in
// which FileCategoryDocumentation is 7 and FileCategoryBinary is 8.
const (
	// FileCategoryMissing is a sentinel value that indicates we never computed the file category during indexing
	// (which means we're reading from an old index version). This value can never be written to the index.
	FileCategoryMissing FileCategory = iota
	// FileCategoryDefault is what DetermineFileCategory assigns when no more specific category matches.
	FileCategoryDefault
	// FileCategoryTest is assigned when enry.IsTest reports true for the file name.
	FileCategoryTest
	// FileCategoryVendored is assigned when enry.IsVendor reports true for the file name.
	FileCategoryVendored
	// FileCategoryGenerated is assigned when enry.IsGenerated reports true for the file name and content.
	FileCategoryGenerated
	// FileCategoryConfig is assigned when enry.IsConfiguration reports true for the file name.
	FileCategoryConfig
	// FileCategoryDotFile is assigned when enry.IsDotFile reports true for the file name.
	FileCategoryDotFile
	// FileCategoryBinary is assigned to documents whose SkipReason is SkipReasonBinary.
	FileCategoryBinary
	// FileCategoryDocumentation is assigned when enry.IsDocumentation reports true for the file name.
	FileCategoryDocumentation
)
28
29func DetermineFileCategory(doc *Document) {
30 if doc.SkipReason == SkipReasonBinary {
31 doc.Category = FileCategoryBinary
32 return
33 }
34
35 name := doc.Name
36 content := doc.Content
37
38 // If this document was skipped (too large, binary, or missing from the repo),
39 // guess the category based on the filename to avoid examining the contents.
40 // Note: passing nil content is allowed by the go-enry contract.
41 if doc.SkipReason == SkipReasonTooLarge || doc.SkipReason == SkipReasonBinary || doc.SkipReason == SkipReasonMissing {
42 content = nil
43 }
44
45 category := FileCategoryDefault
46 if enry.IsTest(name) {
47 category = FileCategoryTest
48 } else if enry.IsDotFile(name) {
49 category = FileCategoryDotFile
50 } else if enry.IsVendor(name) {
51 category = FileCategoryVendored
52 } else if enry.IsGenerated(name, content) {
53 category = FileCategoryGenerated
54 } else if enry.IsConfiguration(name) {
55 category = FileCategoryConfig
56 } else if enry.IsDocumentation(name) {
57 category = FileCategoryDocumentation
58 }
59
60 doc.Category = category
61}
62
63// lowPriority returns true if this file category is considered 'low priority'. This is used
64// in search scoring to down-weight matches in these files.
65func (c FileCategory) lowPriority() bool {
66 return c == FileCategoryTest || c == FileCategoryVendored || c == FileCategoryGenerated || c == FileCategoryBinary
67}
68
69func (c FileCategory) encode() (byte, error) {
70 switch c {
71 case FileCategoryMissing:
72 return 0, errors.New("cannot encode missing file category")
73 case FileCategoryDefault:
74 return 1, nil
75 case FileCategoryTest:
76 return 2, nil
77 case FileCategoryVendored:
78 return 3, nil
79 case FileCategoryGenerated:
80 return 4, nil
81 case FileCategoryConfig:
82 return 5, nil
83 case FileCategoryDotFile:
84 return 6, nil
85 case FileCategoryDocumentation:
86 return 7, nil
87 case FileCategoryBinary:
88 return 8, nil
89 default:
90 return 0, errors.New("unrecognized file category")
91 }
92}
93
94func decodeCategory(c byte) (FileCategory, error) {
95 switch c {
96 case 1:
97 return FileCategoryDefault, nil
98 case 2:
99 return FileCategoryTest, nil
100 case 3:
101 return FileCategoryVendored, nil
102 case 4:
103 return FileCategoryGenerated, nil
104 case 5:
105 return FileCategoryConfig, nil
106 case 6:
107 return FileCategoryDotFile, nil
108 case 7:
109 return FileCategoryDocumentation, nil
110 case 8:
111 return FileCategoryBinary, nil
112 default:
113 return FileCategoryMissing, errors.New("unrecognized file category")
114 }
115}