fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

merging: support exploding compound shards (#271)

This change lets us split a compound shard into its constituent repos.

In the future this should happen instead of deleting compound shards that are too small. For now, the feature is behind a feature flag. To activate it, place a file named EXPLODE in the index dir.

+440 -47
+103 -4
cmd/zoekt-merge-index/main.go
··· 2 2 3 3 import ( 4 4 "bufio" 5 + "fmt" 5 6 "log" 6 7 "os" 7 8 "path/filepath" ··· 32 33 return err 33 34 } 34 35 35 - func main() { 36 - paths := os.Args[1:] 36 + func mergeCmd(paths []string) error { 37 37 if paths[0] == "-" { 38 38 paths = []string{} 39 39 scanner := bufio.NewScanner(os.Stdin) ··· 41 41 paths = append(paths, strings.TrimSpace(scanner.Text())) 42 42 } 43 43 if err := scanner.Err(); err != nil { 44 - log.Fatal(err) 44 + return err 45 45 } 46 46 log.Printf("merging %d paths from stdin", len(paths)) 47 47 } 48 48 err := merge(filepath.Dir(paths[0]), paths) 49 49 if err != nil { 50 - log.Fatal(err) 50 + return err 51 + } 52 + return nil 53 + } 54 + 55 + // explode splits a shard into indiviual shards and places them in dstDir. 56 + // If it returns without error, the input shard was deleted and the first 57 + // result contains the list of all new shards. 58 + // 59 + // explode cleans up tmp files created in the process on a best effort basis. 60 + func explode(dstDir string, inputShard string) error { 61 + f, err := os.Open(inputShard) 62 + if err != nil { 63 + return err 64 + } 65 + defer f.Close() 66 + 67 + indexFile, err := zoekt.NewIndexFile(f) 68 + if err != nil { 69 + return err 70 + } 71 + defer indexFile.Close() 72 + 73 + exploded, err := zoekt.Explode(dstDir, indexFile) 74 + defer func() { 75 + // best effort removal of tmp files. If os.Remove failes, indexserver will delete 76 + // the leftover tmp files during the next cleanup. 77 + for tmpFn := range exploded { 78 + os.Remove(tmpFn) 79 + } 80 + }() 81 + if err != nil { 82 + return fmt.Errorf("zoekt.Explode: %w", err) 83 + } 84 + var fns []string 85 + for tmpFn, dstFn := range exploded { 86 + err = os.Rename(tmpFn, dstFn) 87 + if err != nil { 88 + // clean up the shards we already renamed to avoid duplicate results. 
89 + for _, fn := range fns { 90 + os.Remove(fn) 91 + } 92 + return fmt.Errorf("explode: rename failed: %w", err) 93 + } 94 + fns = append(fns, dstFn) 95 + } 96 + 97 + // Don't remove the input shard if its name matches one of the destination 98 + // shards. This can happen, for example, if the input shard is a simple shard. 99 + for _, dstFn := range exploded { 100 + if dstFn == inputShard { 101 + return nil 102 + } 103 + } 104 + 105 + removeInputShard := func() (err error) { 106 + defer func() { 107 + if err != nil { 108 + // delete the new shards to avoid duplicate results. 109 + for _, fn := range fns { 110 + os.Remove(fn) 111 + } 112 + } 113 + }() 114 + 115 + paths, err := zoekt.IndexFilePaths(inputShard) 116 + if err != nil { 117 + return err 118 + } 119 + for _, path := range paths { 120 + err = os.Remove(path) 121 + if err != nil { 122 + return err 123 + } 124 + } 125 + return nil 126 + } 127 + 128 + if err = removeInputShard(); err != nil { 129 + return fmt.Errorf("explode: error removing input shard %s: %w", inputShard, err) 130 + } 131 + return nil 132 + } 133 + 134 + func explodeCmd(path string) error { 135 + return explode(filepath.Dir(path), path) 136 + } 137 + 138 + func main() { 139 + switch subCommand := os.Args[1]; subCommand { 140 + case "merge": 141 + if err := mergeCmd(os.Args[2:]); err != nil { 142 + log.Fatal(err) 143 + } 144 + case "explode": 145 + if err := explodeCmd(os.Args[2]); err != nil { 146 + log.Fatal(err) 147 + } 148 + default: 149 + log.Fatalf("unknown subcommand %s", subCommand) 51 150 } 52 151 }
+90
cmd/zoekt-merge-index/main_test.go
··· 49 49 t.Errorf("got %v, want 2 files.", result.Files) 50 50 } 51 51 } 52 + 53 + // TODO (stefan): make zoekt-git-index deterministic to compare the simple shards 54 + // byte by byte instead of by search results. 55 + 56 + // Merge 2 simple shards and then explode them. 57 + func TestExplode(t *testing.T) { 58 + dir := t.TempDir() 59 + 60 + v16Shards, err := filepath.Glob("../../testdata/shards/repo*_v16.*.zoekt") 61 + if err != nil { 62 + t.Fatal(err) 63 + } 64 + sort.Strings(v16Shards) 65 + t.Log(v16Shards) 66 + 67 + err = merge(dir, v16Shards) 68 + if err != nil { 69 + t.Fatal(err) 70 + } 71 + 72 + cs, err := filepath.Glob(filepath.Join(dir, "compound-*.zoekt")) 73 + if err != nil { 74 + t.Fatal(err) 75 + } 76 + err = explode(dir, cs[0]) 77 + if err != nil { 78 + t.Fatal(err) 79 + } 80 + 81 + cs, err = filepath.Glob(filepath.Join(dir, "compound-*.zoekt")) 82 + if err != nil { 83 + t.Fatal(err) 84 + } 85 + 86 + if len(cs) != 0 { 87 + t.Fatalf("explode should have deleted the compound shard if it returned without error") 88 + } 89 + 90 + exploded, err := filepath.Glob(filepath.Join(dir, "*.zoekt")) 91 + if err != nil { 92 + t.Fatal(err) 93 + } 94 + 95 + if len(exploded) != len(v16Shards) { 96 + t.Fatalf("the number of simple shards before %d and after %d should be the same", len(v16Shards), len(exploded)) 97 + } 98 + 99 + ss, err := shards.NewDirectorySearcher(dir) 100 + if err != nil { 101 + t.Fatalf("NewDirectorySearcher(%s): %v", dir, err) 102 + } 103 + defer ss.Close() 104 + 105 + var sOpts zoekt.SearchOptions 106 + ctx := context.Background() 107 + 108 + cases := []struct { 109 + searchLiteral string 110 + wantResults int 111 + }{ 112 + { 113 + searchLiteral: "apple", 114 + wantResults: 1, 115 + }, 116 + { 117 + searchLiteral: "hello", 118 + wantResults: 1, 119 + }, 120 + { 121 + searchLiteral: "main", 122 + wantResults: 2, 123 + }, 124 + } 125 + 126 + for _, c := range cases { 127 + t.Run(c.searchLiteral, func(t *testing.T) { 128 + q, err := 
query.Parse(c.searchLiteral) 129 + if err != nil { 130 + t.Fatalf("Parse(%s): %v", c.searchLiteral, err) 131 + } 132 + result, err := ss.Search(ctx, q, &sOpts) 133 + if err != nil { 134 + t.Fatalf("Search(%v): %v", q, err) 135 + } 136 + if got := len(result.Files); got != c.wantResults { 137 + t.Fatalf("wanted %d results, got %d", c.wantResults, got) 138 + } 139 + }) 140 + } 141 + }
+27 -11
cmd/zoekt-sourcegraph-indexserver/cleanup.go
··· 433 433 } 434 434 435 435 if info.Size() < s.minSizeBytes { 436 - paths, err := zoekt.IndexFilePaths(path) 437 - if err != nil { 438 - debug.Printf("failed getting all file paths for %s", path) 436 + // feature flag: place file EXPLODE in IndexDir 437 + if _, err := os.Stat(filepath.Join(s.IndexDir, "EXPLODE")); err == nil { 438 + cmd := exec.Command("zoekt-merge-index", "explode", path) 439 + 440 + s.muIndexDir.Lock() 441 + b, err := cmd.CombinedOutput() 442 + s.muIndexDir.Unlock() 443 + 444 + if err != nil { 445 + debug.Printf("failed to explode compound shard %s: %s", path, string(b)) 446 + } else { 447 + shardsLog(s.IndexDir, "explode", []shard{{Path: path}}) 448 + } 439 449 continue 440 - } 441 - s.muIndexDir.Lock() 442 - for _, p := range paths { 443 - os.Remove(p) 450 + } else { 451 + paths, err := zoekt.IndexFilePaths(path) 452 + if err != nil { 453 + debug.Printf("failed getting all file paths for %s", path) 454 + continue 455 + } 456 + s.muIndexDir.Lock() 457 + for _, p := range paths { 458 + os.Remove(p) 459 + } 460 + s.muIndexDir.Unlock() 461 + shardsLog(s.IndexDir, "delete", []shard{{Path: path}}) 462 + continue 444 463 } 445 - s.muIndexDir.Unlock() 446 - shardsLog(s.IndexDir, "delete", []shard{{Path: path}}) 447 - continue 448 464 } 449 465 450 466 s.muIndexDir.Lock() ··· 474 490 if mockMerger != nil { 475 491 runMerge = mockMerger 476 492 } else { 477 - runMerge = exec.Command("zoekt-merge-index", fn).Run 493 + runMerge = exec.Command("zoekt-merge-index", "merge", fn).Run 478 494 } 479 495 480 496 repos, _, err := zoekt.ReadMetadataPath(fn)
+1 -1
cmd/zoekt-sourcegraph-indexserver/merge.go
··· 213 213 return nil, nil, nil 214 214 } 215 215 216 - cmd := exec.Command("zoekt-merge-index", "-") 216 + cmd := exec.Command("zoekt-merge-index", "merge", "-") 217 217 218 218 outBuf := &bytes.Buffer{} 219 219 errBuf := &bytes.Buffer{}
+117 -30
merge.go
··· 3 3 import ( 4 4 "crypto/sha1" 5 5 "fmt" 6 + "io" 6 7 "io/ioutil" 7 8 "log" 9 + "net/url" 8 10 "os" 9 11 "path/filepath" 10 12 "runtime" ··· 119 121 } 120 122 } 121 123 122 - doc := Document{ 123 - Name: string(d.fileName(docID)), 124 - // Content set below since it can return an error 125 - // Branches set below since it requires lookups 126 - SubRepositoryPath: d.subRepoPaths[repoID][d.subRepos[docID]], 127 - Language: d.languageMap[d.getLanguage(docID)], 128 - // SkipReason not set, will be part of content from original indexer. 124 + if err := addDocument(d, ib, repoID, docID); err != nil { 125 + return nil, err 129 126 } 127 + } 128 + } 129 + 130 + return ib, nil 131 + } 130 132 131 - var err error 132 - if doc.Content, err = d.readContents(docID); err != nil { 133 - return nil, err 133 + // Explode takes an IndexFile f and creates 1 simple shard per repository 134 + // contained in f. Explode returns a map of tmpName -> dstName. It is the 135 + // responsibility of the caller to rename the temporary shard(s) and delete the 136 + // input shard. 
137 + func Explode(dstDir string, f IndexFile) (map[string]string, error) { 138 + searcher, err := NewSearcher(f) 139 + if err != nil { 140 + return nil, err 141 + } 142 + d := searcher.(*indexData) 143 + 144 + shardNames := make(map[string]string, len(d.repoMetaData)) 145 + 146 + writeShard := func(ib *IndexBuilder) error { 147 + if len(ib.repoList) != 1 { 148 + return fmt.Errorf("expected ib to contain exactly 1 repository") 149 + } 150 + fn := filepath.Join(dstDir, shardName(ib.repoList[0].Name, ib.indexFormatVersion, 0)) 151 + fnTmp := fn + ".tmp" 152 + shardNames[fnTmp] = fn 153 + return builderWriteAll(fnTmp, ib) 154 + } 155 + 156 + var ib *IndexBuilder 157 + lastRepoID := -1 158 + for docID := uint32(0); int(docID) < len(d.fileBranchMasks); docID++ { 159 + repoID := int(d.repos[docID]) 160 + 161 + if d.repoMetaData[repoID].Tombstone { 162 + continue 163 + } 164 + 165 + if repoID != lastRepoID { 166 + if lastRepoID > repoID { 167 + return shardNames, fmt.Errorf("non-contiguous repo ids in %s for document %d: old=%d current=%d", d.String(), docID, lastRepoID, repoID) 134 168 } 169 + lastRepoID = repoID 135 170 136 - if doc.Symbols, _, err = d.readDocSections(docID, nil); err != nil { 137 - return nil, err 171 + if ib != nil { 172 + if err := writeShard(ib); err != nil { 173 + return shardNames, err 174 + } 138 175 } 139 176 140 - doc.SymbolsMetaData = make([]*Symbol, len(doc.Symbols)) 141 - for i := range doc.SymbolsMetaData { 142 - doc.SymbolsMetaData[i] = d.symbols.data(d.fileEndSymbol[docID] + uint32(i)) 177 + ib = newIndexBuilder() 178 + ib.indexFormatVersion = IndexFormatVersion 179 + if err := ib.setRepository(&d.repoMetaData[repoID]); err != nil { 180 + return shardNames, err 143 181 } 182 + } 144 183 145 - // calculate branches 146 - { 147 - mask := d.fileBranchMasks[docID] 148 - id := uint32(1) 149 - for mask != 0 { 150 - if mask&0x1 != 0 { 151 - doc.Branches = append(doc.Branches, d.branchNames[repoID][uint(id)]) 152 - } 153 - id <<= 1 154 - mask >>= 
1 155 - } 156 - } 184 + err := addDocument(d, ib, repoID, docID) 185 + if err != nil { 186 + return shardNames, err 187 + } 188 + } 189 + 190 + if ib != nil { 191 + if err := writeShard(ib); err != nil { 192 + return shardNames, err 193 + } 194 + } 195 + 196 + return shardNames, nil 197 + } 198 + 199 + func addDocument(d *indexData, ib *IndexBuilder, repoID int, docID uint32) error { 200 + doc := Document{ 201 + Name: string(d.fileName(docID)), 202 + // Content set below since it can return an error 203 + // Branches set below since it requires lookups 204 + SubRepositoryPath: d.subRepoPaths[repoID][d.subRepos[docID]], 205 + Language: d.languageMap[d.getLanguage(docID)], 206 + // SkipReason not set, will be part of content from original indexer. 207 + } 208 + 209 + var err error 210 + if doc.Content, err = d.readContents(docID); err != nil { 211 + return err 212 + } 213 + 214 + if doc.Symbols, _, err = d.readDocSections(docID, nil); err != nil { 215 + return err 216 + } 217 + 218 + doc.SymbolsMetaData = make([]*Symbol, len(doc.Symbols)) 219 + for i := range doc.SymbolsMetaData { 220 + doc.SymbolsMetaData[i] = d.symbols.data(d.fileEndSymbol[docID] + uint32(i)) 221 + } 157 222 158 - if err := ib.Add(doc); err != nil { 159 - return nil, err 223 + // calculate branches 224 + { 225 + mask := d.fileBranchMasks[docID] 226 + id := uint32(1) 227 + for mask != 0 { 228 + if mask&0x1 != 0 { 229 + doc.Branches = append(doc.Branches, d.branchNames[repoID][uint(id)]) 160 230 } 231 + id <<= 1 232 + mask >>= 1 161 233 } 162 234 } 235 + return ib.Add(doc) 236 + } 163 237 164 - return ib, nil 238 + // copied from builder package to avoid circular imports. 239 + func hashString(s string) string { 240 + h := sha1.New() 241 + _, _ = io.WriteString(h, s) 242 + return fmt.Sprintf("%x", h.Sum(nil)) 243 + } 244 + 245 + // copied from builder package to avoid circular imports. 
246 + func shardName(name string, version, n int) string { 247 + abs := url.QueryEscape(name) 248 + if len(abs) > 200 { 249 + abs = abs[:200] + hashString(abs)[:8] 250 + } 251 + return fmt.Sprintf("%s_v%d.%05d.zoekt", abs, version, n) 165 252 }
+7 -1
testdata/gen-shards.sh
··· 2 2 3 3 set -ex 4 4 5 + # generate repo17.v17.0000.zoekt 5 6 cp -r repo repo17 6 7 7 8 go run ../cmd/zoekt-index -disable_ctags repo17 8 - go run ../cmd/zoekt-merge-index repo17_v16.00000.zoekt 9 + go run ../cmd/zoekt-merge-index merge repo17_v16.00000.zoekt 9 10 mv compound*zoekt repo17_v17.00000.zoekt 10 11 11 12 rm -rf repo17 repo17_v16.00000.zoekt zoekt-builder-shard-log.tsv 12 13 13 14 mv *.zoekt shards/ 15 + 16 + # generate repo2.v16.0000.zoekt 17 + go run ../cmd/zoekt-index repo2 18 + rm zoekt-builder-shard-log.tsv 19 + mv *.zoekt shards/
+82
testdata/golden/TestReadSearch/repo2_v16.00000.golden
··· 1 + { 2 + "FormatVersion": 16, 3 + "FeatureVersion": 12, 4 + "FileMatches": [ 5 + [ 6 + { 7 + "Score": 910, 8 + "Debug": "", 9 + "FileName": "main.go", 10 + "Repository": "repo2", 11 + "Branches": null, 12 + "LineMatches": [ 13 + { 14 + "Line": "ZnVuYyBtYWluKCkgew==", 15 + "LineStart": 33, 16 + "LineEnd": 46, 17 + "LineNumber": 7, 18 + "Before": null, 19 + "After": null, 20 + "FileName": false, 21 + "Score": 501, 22 + "LineFragments": [ 23 + { 24 + "LineOffset": 0, 25 + "Offset": 33, 26 + "MatchLength": 9, 27 + "SymbolInfo": null 28 + } 29 + ] 30 + } 31 + ], 32 + "RepositoryID": 0, 33 + "RepositoryPriority": 0, 34 + "Content": null, 35 + "Checksum": "Ju1TnQKZ6mE=", 36 + "Language": "Go", 37 + "SubRepositoryName": "", 38 + "SubRepositoryPath": "", 39 + "Version": "" 40 + } 41 + ], 42 + [ 43 + { 44 + "Score": 710, 45 + "Debug": "", 46 + "FileName": "main.go", 47 + "Repository": "repo2", 48 + "Branches": null, 49 + "LineMatches": [ 50 + { 51 + "Line": "cGFja2FnZSBtYWlu", 52 + "LineStart": 0, 53 + "LineEnd": 12, 54 + "LineNumber": 1, 55 + "Before": null, 56 + "After": null, 57 + "FileName": false, 58 + "Score": 501, 59 + "LineFragments": [ 60 + { 61 + "LineOffset": 0, 62 + "Offset": 0, 63 + "MatchLength": 7, 64 + "SymbolInfo": null 65 + } 66 + ] 67 + } 68 + ], 69 + "RepositoryID": 0, 70 + "RepositoryPriority": 0, 71 + "Content": null, 72 + "Checksum": "Ju1TnQKZ6mE=", 73 + "Language": "Go", 74 + "SubRepositoryName": "", 75 + "SubRepositoryPath": "", 76 + "Version": "" 77 + } 78 + ], 79 + null, 80 + null 81 + ] 82 + }
+13
testdata/repo2/main.go
··· 1 + package main 2 + 3 + import ( 4 + "fmt" 5 + ) 6 + 7 + func main() { 8 + var b, c int = 1, 2 9 + fmt.Println(b, c) 10 + 11 + fruit := "apple" 12 + fmt.Println(fruit) 13 + }
testdata/shards/repo2_v16.00000.zoekt

This is a binary file and will not be displayed.