fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

fix/index: preserve skipped-file category through shard paths (#1073)

ShardBuilder.Add still determined language before file category for callers
that bypass Builder.Add. In that path, skipped content can be replaced with
the not-indexed marker, so determining language first leaves category
detection operating on synthetic content instead of the original file, and
it misses the cheaper skip-aware language path.

This changes ShardBuilder.Add to determine the file category before rewriting
skipped content and to infer language afterward, so that direct ShardBuilder
callers follow the same behavior as Builder.Add and keep content-aware
categorization for skipped documents.

Shard merging also reconstructs documents through ShardBuilder, so this PR
carries the category already stored in the source shard into the rebuilt
document. That keeps merge metadata-preserving instead of forcing category
inference to rediscover information the original indexer already knew.

+112 -3
+1
index/merge.go
··· 275 275 // Branches set below since it requires lookups 276 276 SubRepositoryPath: d.subRepoPaths[repoID][d.subRepos[docID]], 277 277 Language: d.languageMap[d.getLanguage(docID)], 278 + Category: d.getCategory(docID), 278 279 // SkipReason not set, will be part of content from original indexer. 279 280 } 280 281
+65
index/merge_test.go
··· 10 10 "github.com/sourcegraph/zoekt" 11 11 ) 12 12 13 + func TestMergePreservesStoredCategoryForSkippedGeneratedFile(t *testing.T) { 14 + sb, err := NewShardBuilder(&zoekt.Repository{Name: "repo"}) 15 + if err != nil { 16 + t.Fatalf("NewShardBuilder: %v", err) 17 + } 18 + 19 + err = sb.Add(Document{ 20 + Name: "generated.go", 21 + Content: []byte("// Code generated by protoc-gen-go. DO NOT EDIT.\npackage generated\n"), 22 + SkipReason: SkipReasonTooManyTrigrams, 23 + }) 24 + if err != nil { 25 + t.Fatalf("Add: %v", err) 26 + } 27 + 28 + tmpDir := t.TempDir() 29 + inputPath := filepath.Join(tmpDir, "input.zoekt") 30 + if err := builderWriteAll(inputPath, sb); err != nil { 31 + t.Fatalf("builderWriteAll: %v", err) 32 + } 33 + 34 + inputFile, err := os.Open(inputPath) 35 + if err != nil { 36 + t.Fatalf("Open: %v", err) 37 + } 38 + defer inputFile.Close() 39 + 40 + indexFile, err := NewIndexFile(inputFile) 41 + if err != nil { 42 + t.Fatalf("NewIndexFile: %v", err) 43 + } 44 + defer indexFile.Close() 45 + 46 + tmpName, dstName, err := Merge(tmpDir, indexFile) 47 + if err != nil { 48 + t.Fatalf("Merge: %v", err) 49 + } 50 + if err := os.Rename(tmpName, dstName); err != nil { 51 + t.Fatalf("Rename: %v", err) 52 + } 53 + 54 + mergedFile, err := os.Open(dstName) 55 + if err != nil { 56 + t.Fatalf("Open merged shard: %v", err) 57 + } 58 + defer mergedFile.Close() 59 + 60 + mergedIndexFile, err := NewIndexFile(mergedFile) 61 + if err != nil { 62 + t.Fatalf("NewIndexFile merged shard: %v", err) 63 + } 64 + defer mergedIndexFile.Close() 65 + 66 + searcher, err := NewSearcher(mergedIndexFile) 67 + if err != nil { 68 + t.Fatalf("NewSearcher: %v", err) 69 + } 70 + defer searcher.Close() 71 + 72 + data := searcher.(*indexData) 73 + if got := data.getCategory(0); got != FileCategoryGenerated { 74 + t.Fatalf("got category %v, want %v", got, FileCategoryGenerated) 75 + } 76 + } 77 + 13 78 // We compare 2 simple shards before and after the transformation 14 79 // 
explode(merge(shard1, shard2)). We expect the input and output shards to be 15 80 // identical.
+3 -3
index/shard_builder.go
··· 549 549 if index := bytes.IndexByte(doc.Content, 0); index >= 0 { 550 550 doc.SkipReason = SkipReasonBinary 551 551 } 552 + // Preserve the original content for category detection in callers that 553 + // bypass Builder.Add and pass skipped documents directly. 554 + DetermineFileCategory(&doc) 552 555 } 553 556 554 557 if doc.SkipReason != SkipReasonNone { ··· 558 561 } 559 562 560 563 DetermineLanguageIfUnknown(&doc) 561 - if doc.Category == FileCategoryMissing { 562 - DetermineFileCategory(&doc) 563 - } 564 564 565 565 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData}) 566 566 var last DocumentSection
+43
index/shard_builder_test.go
··· 3 3 import ( 4 4 "strings" 5 5 "testing" 6 + 7 + "github.com/sourcegraph/zoekt" 6 8 ) 7 9 8 10 func TestShardName(t *testing.T) { ··· 93 95 }) 94 96 } 95 97 } 98 + 99 + func TestShardBuilderAddDeterminesCategoryBeforeReplacingSkippedContent(t *testing.T) { 100 + b, err := NewShardBuilder(&zoekt.Repository{}) 101 + if err != nil { 102 + t.Fatalf("NewShardBuilder: %v", err) 103 + } 104 + 105 + err = b.Add(Document{ 106 + Name: "generated.go", 107 + Content: []byte("// Code generated by protoc-gen-go. DO NOT EDIT.\npackage generated\n"), 108 + SkipReason: SkipReasonTooManyTrigrams, 109 + }) 110 + if err != nil { 111 + t.Fatalf("Add: %v", err) 112 + } 113 + 114 + if len(b.categories) != 1 { 115 + t.Fatalf("got %d categories, want 1", len(b.categories)) 116 + } 117 + 118 + category, err := decodeCategory(b.categories[0]) 119 + if err != nil { 120 + t.Fatalf("decodeCategory: %v", err) 121 + } 122 + if category != FileCategoryGenerated { 123 + t.Fatalf("got category %v, want %v", category, FileCategoryGenerated) 124 + } 125 + 126 + if len(b.languages) != 2 { 127 + t.Fatalf("got %d encoded language bytes, want 2", len(b.languages)) 128 + } 129 + 130 + langCode := uint16(b.languages[0]) | uint16(b.languages[1])<<8 131 + got, ok := b.languageMap["Go"] 132 + if !ok { 133 + t.Fatalf("languageMap missing Go entry: %#v", b.languageMap) 134 + } 135 + if got != langCode { 136 + t.Fatalf("got stored language code %d, want Go=%d", langCode, got) 137 + } 138 + }