fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1package main 2 3import ( 4 "bytes" 5 "context" 6 "crypto/sha1" 7 "encoding/json" 8 "errors" 9 "fmt" 10 "io" 11 "log" 12 "net/url" 13 "os" 14 "os/exec" 15 "path/filepath" 16 "sort" 17 "strconv" 18 "strings" 19 "time" 20 21 "github.com/sourcegraph/zoekt" 22 "github.com/sourcegraph/zoekt/build" 23 "github.com/sourcegraph/zoekt/ctags" 24 25 sglog "github.com/sourcegraph/log" 26) 27 28const defaultIndexingTimeout = 1*time.Hour + 30*time.Minute 29 30// IndexOptions are the options that Sourcegraph can set via it's search 31// configuration endpoint. 32type IndexOptions struct { 33 // LargeFiles is a slice of glob patterns where matching file paths should 34 // be indexed regardless of their size. The pattern syntax can be found 35 // here: https://golang.org/pkg/path/filepath/#Match. 36 LargeFiles []string 37 38 // Symbols if true will make zoekt index the output of ctags. 39 Symbols bool 40 41 // Branches is a slice of branches to index. 42 Branches []zoekt.RepositoryBranch 43 44 // RepoID is the Sourcegraph Repository ID. 45 RepoID uint32 46 47 // Name is the Repository Name. 48 Name string 49 50 // CloneURL is the internal clone URL for Name. 51 CloneURL string 52 53 // Priority indicates ranking in results, higher first. 54 Priority float64 55 56 // DocumentRanksVersion when non-empty will lead to indexing using offline 57 // ranking. When the string changes this will also cause us to re-index with 58 // new ranks. 59 DocumentRanksVersion string 60 61 // Public is true if the repository is public. 62 Public bool 63 64 // Fork is true if the repository is a fork. 65 Fork bool 66 67 // Archived is true if the repository is archived. 68 Archived bool 69 70 // Map from language to scip-ctags, universal-ctags, or neither 71 LanguageMap ctags.LanguageMap 72 73 // The number of threads to use for indexing shards. Defaults to the number of available 74 // CPUs. If the server flag -cpu_fraction is set, then this value overrides it. 75 ShardConcurrency int32 76} 77 78// indexArgs represents the arguments we pass to zoekt-git-index 79type indexArgs struct { 80 IndexOptions 81 82 // Incremental indicates to skip indexing if already indexed. 83 Incremental bool 84 85 // IndexDir is the index directory to store the shards. 86 IndexDir string 87 88 // Parallelism is the number of shards to compute in parallel. 89 Parallelism int 90 91 // FileLimit is the maximum size of a file 92 FileLimit int 93 94 // UseDelta is true if we want to use the new delta indexer. This should 95 // only be true for repositories we explicitly enable. 96 UseDelta bool 97 98 // DeltaShardNumberFallbackThreshold is an upper limit on the number of preexisting shards that can exist 99 // before attempting a delta build. 100 DeltaShardNumberFallbackThreshold uint64 101 102 // ShardMerging is true if we want zoekt-git-index to respect compound shards. 103 ShardMerging bool 104} 105 106// BuildOptions returns a build.Options represented by indexArgs. Note: it 107// doesn't set fields like repository/branch. 108func (o *indexArgs) BuildOptions() *build.Options { 109 return &build.Options{ 110 // It is important that this RepositoryDescription exactly matches what 111 // the indexer we call will produce. This is to ensure that 112 // IncrementalSkipIndexing and IndexState can correctly calculate if 113 // nothing needs to be done. 114 RepositoryDescription: zoekt.Repository{ 115 ID: uint32(o.IndexOptions.RepoID), 116 Name: o.Name, 117 Branches: o.Branches, 118 RawConfig: map[string]string{ 119 "repoid": strconv.Itoa(int(o.IndexOptions.RepoID)), 120 "priority": strconv.FormatFloat(o.Priority, 'g', -1, 64), 121 "public": marshalBool(o.Public), 122 "fork": marshalBool(o.Fork), 123 "archived": marshalBool(o.Archived), 124 }, 125 }, 126 IndexDir: o.IndexDir, 127 Parallelism: o.Parallelism, 128 SizeMax: o.FileLimit, 129 LargeFiles: o.LargeFiles, 130 CTagsMustSucceed: o.Symbols, 131 DisableCTags: !o.Symbols, 132 IsDelta: o.UseDelta, 133 134 DocumentRanksVersion: o.DocumentRanksVersion, 135 136 LanguageMap: o.LanguageMap, 137 138 ShardMerging: o.ShardMerging, 139 } 140} 141 142func marshalBool(b bool) string { 143 if b { 144 return "1" 145 } 146 return "0" 147} 148 149func (o *indexArgs) String() string { 150 s := fmt.Sprintf("%d %s", o.RepoID, o.Name) 151 for i, b := range o.Branches { 152 if i == 0 { 153 s = fmt.Sprintf("%s@%s=%s", s, b.Name, b.Version) 154 } else { 155 s = fmt.Sprintf("%s,%s=%s", s, b.Name, b.Version) 156 } 157 } 158 return s 159} 160 161type gitIndexConfig struct { 162 // runCmd is the function that's used to execute all external commands (such as calls to "git" or "zoekt-git-index") 163 // that gitIndex may construct. 164 runCmd func(*exec.Cmd) error 165 166 // findRepositoryMetadata is the function that returns the repository metadata for the 167 // repository specified in args. 'ok' is false if the repository's metadata 168 // couldn't be found or if an error occurred. 169 // 170 // The primary purpose of this configuration option is to be able to provide a stub 171 // implementation for this in our test suite. All other callers should use build.Options.FindRepositoryMetadata(). 172 findRepositoryMetadata func(args *indexArgs) (repository *zoekt.Repository, metadata *zoekt.IndexMetadata, ok bool, err error) 173 174 // timeout defines how long the index server waits before killing an indexing job. 175 timeout time.Duration 176} 177 178func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.Logger) error { 179 logger := l.Scoped("gitIndex") 180 181 if len(o.Branches) == 0 { 182 return errors.New("zoekt-git-index requires 1 or more branches") 183 } 184 185 if c.runCmd == nil { 186 return errors.New("runCmd in provided configuration was nil - a function must be provided") 187 } 188 runCmd := c.runCmd 189 190 if c.findRepositoryMetadata == nil { 191 return errors.New("findRepositoryMetadata in provided configuration was nil - a function must be provided") 192 } 193 194 buildOptions := o.BuildOptions() 195 ctx, cancel := context.WithTimeout(context.Background(), c.timeout) 196 defer cancel() 197 198 gitDir, err := tmpGitDir(o.Name) 199 if err != nil { 200 return err 201 } 202 defer os.RemoveAll(gitDir) // best-effort cleanup 203 204 // Create a repo to fetch into 205 cmd := exec.CommandContext(ctx, "git", 206 // use a random default branch. This is so that HEAD isn't a symref to a 207 // branch that is indexed. For example if you are indexing 208 // HEAD,master. Then HEAD would be pointing to master by default. 209 "-c", "init.defaultBranch=nonExistentBranchBB0FOFCH32", 210 "init", 211 // we don't need a working copy 212 "--bare", 213 gitDir) 214 cmd.Stdin = &bytes.Buffer{} 215 if err := runCmd(cmd); err != nil { 216 return err 217 } 218 219 var fetchDuration time.Duration 220 successfullyFetchedCommitsCount := 0 221 allFetchesSucceeded := true 222 223 defer func() { 224 success := strconv.FormatBool(allFetchesSucceeded) 225 name := repoNameForMetric(o.Name) 226 metricFetchDuration.WithLabelValues(success, name).Observe(fetchDuration.Seconds()) 227 }() 228 229 runFetch := func(branches []zoekt.RepositoryBranch) error { 230 // We shallow fetch each commit specified in zoekt.Branches. This requires 231 // the server to have configured both uploadpack.allowAnySHA1InWant and 232 // uploadpack.allowFilter. (See gitservice.go in the Sourcegraph repository) 233 fetchArgs := []string{ 234 "-C", gitDir, 235 "-c", "protocol.version=2", 236 "-c", "http.extraHeader=X-Sourcegraph-Actor-UID: internal", 237 "fetch", "--depth=1", o.CloneURL, 238 } 239 240 var commits []string 241 for _, b := range branches { 242 commits = append(commits, b.Version) 243 } 244 245 fetchArgs = append(fetchArgs, commits...) 246 247 cmd = exec.CommandContext(ctx, "git", fetchArgs...) 248 cmd.Stdin = &bytes.Buffer{} 249 250 start := time.Now() 251 err := runCmd(cmd) 252 fetchDuration += time.Since(start) 253 254 if err != nil { 255 allFetchesSucceeded = false 256 var bs []string 257 for _, b := range branches { 258 bs = append(bs, b.String()) 259 } 260 261 formattedBranches := strings.Join(bs, ", ") 262 return fmt.Errorf("fetching %s: %w", formattedBranches, err) 263 } 264 265 successfullyFetchedCommitsCount += len(commits) 266 return nil 267 } 268 269 fetchPriorAndLatestCommits := func() error { 270 prior, err := priorBranches(c, o) 271 if err != nil { 272 return err 273 } 274 275 var allBranches []zoekt.RepositoryBranch 276 allBranches = append(allBranches, o.Branches...) 277 allBranches = append(allBranches, prior...) 278 279 return runFetch(allBranches) 280 } 281 282 fetchOnlyLatestCommits := func() error { 283 return runFetch(o.Branches) 284 } 285 286 if o.UseDelta { 287 err := fetchPriorAndLatestCommits() 288 if err != nil { 289 name := buildOptions.RepositoryDescription.Name 290 id := buildOptions.RepositoryDescription.ID 291 292 log.Printf("delta build: failed to prepare delta build for %q (ID %d): failed to fetch both latest and prior commits: %s", name, id, err) 293 err = fetchOnlyLatestCommits() 294 if err != nil { 295 return err 296 } 297 } 298 } else { 299 err := fetchOnlyLatestCommits() 300 if err != nil { 301 return err 302 } 303 } 304 305 logger.Debug("successfully fetched git data", 306 sglog.String("repo", o.Name), 307 sglog.Uint32("id", o.RepoID), 308 sglog.Int("commits_count", successfullyFetchedCommitsCount), 309 sglog.Duration("duration", fetchDuration), 310 ) 311 312 // We then create the relevant refs for each fetched commit. 313 for _, b := range o.Branches { 314 ref := b.Name 315 if ref != "HEAD" { 316 ref = "refs/heads/" + ref 317 } 318 cmd = exec.CommandContext(ctx, "git", "-C", gitDir, "update-ref", ref, b.Version) 319 cmd.Stdin = &bytes.Buffer{} 320 if err := runCmd(cmd); err != nil { 321 return fmt.Errorf("failed update-ref %s to %s: %w", ref, b.Version, err) 322 } 323 } 324 325 // create git configuration with options 326 type configKV struct{ Key, Value string } 327 config := []configKV{{ 328 // zoekt.name is used by zoekt-git-index to set the repository name. 329 Key: "name", 330 Value: o.Name, 331 }} 332 for k, v := range buildOptions.RepositoryDescription.RawConfig { 333 config = append(config, configKV{Key: k, Value: v}) 334 } 335 sort.Slice(config, func(i, j int) bool { 336 return config[i].Key < config[j].Key 337 }) 338 339 // write git configuration to repo 340 for _, kv := range config { 341 cmd = exec.CommandContext(ctx, "git", "-C", gitDir, "config", "zoekt."+kv.Key, kv.Value) 342 cmd.Stdin = &bytes.Buffer{} 343 if err := runCmd(cmd); err != nil { 344 return err 345 } 346 } 347 348 args := []string{ 349 "-submodules=false", 350 } 351 352 if o.DocumentRanksVersion != "" { 353 // We store the document ranks as JSON in gitDir and tell zoekt-git-index where 354 // to find the file. 355 documentsRankFile := filepath.Join(gitDir, "documents.rank") 356 357 saveDocumentRanks := func() error { 358 r, err := sourcegraph.GetDocumentRanks(context.Background(), o.Name) 359 if err != nil { 360 return fmt.Errorf("GetDocumentRanks: %w", err) 361 } 362 363 b, err := json.Marshal(r) 364 if err != nil { 365 return err 366 } 367 368 if err := os.WriteFile(documentsRankFile, b, 0o600); err != nil { 369 return fmt.Errorf("failed to write %s to disk: %w", documentsRankFile, err) 370 } 371 372 return nil 373 } 374 375 if err := saveDocumentRanks(); err != nil { 376 // log and fall back to online ranking 377 logger.Warn( 378 "error saving document ranks. Falling back to online ranking", 379 sglog.Error(err), 380 sglog.String("repo", o.Name), 381 sglog.Uint32("id", o.RepoID), 382 ) 383 } else { 384 args = append(args, 385 "-offline_ranking", documentsRankFile, 386 "-offline_ranking_version", o.DocumentRanksVersion) 387 } 388 } 389 390 // Even though we check for incremental in this process, we still pass it 391 // in just in case we regress in how we check in process. We will still 392 // notice thanks to metrics and increased load on gitserver. 393 if o.Incremental { 394 args = append(args, "-incremental") 395 } 396 397 var branches []string 398 for _, b := range o.Branches { 399 branches = append(branches, b.Name) 400 } 401 args = append(args, "-branches", strings.Join(branches, ",")) 402 403 if o.UseDelta { 404 args = append(args, "-delta") 405 args = append(args, "-delta_threshold", strconv.FormatUint(o.DeltaShardNumberFallbackThreshold, 10)) 406 } 407 408 if len(o.LanguageMap) > 0 { 409 var languageMap []string 410 for language, parser := range o.LanguageMap { 411 languageMap = append(languageMap, language+":"+ctags.ParserToString(parser)) 412 } 413 args = append(args, "-language_map", strings.Join(languageMap, ",")) 414 } 415 416 args = append(args, buildOptions.Args()...) 417 args = append(args, gitDir) 418 419 cmd = exec.CommandContext(ctx, "zoekt-git-index", args...) 420 cmd.Stdin = &bytes.Buffer{} 421 if err := runCmd(cmd); err != nil { 422 return err 423 } 424 425 return nil 426} 427 428func priorBranches(c gitIndexConfig, o *indexArgs) ([]zoekt.RepositoryBranch, error) { 429 existingRepository, _, found, err := c.findRepositoryMetadata(o) 430 if err != nil { 431 return nil, fmt.Errorf("loading repository metadata: %w", err) 432 } 433 434 if !found || len(existingRepository.Branches) == 0 { 435 return nil, fmt.Errorf("no prior shards found") 436 } 437 438 return existingRepository.Branches, nil 439} 440 441func tmpGitDir(name string) (string, error) { 442 abs := url.QueryEscape(name) 443 if len(abs) > 200 { 444 h := sha1.New() 445 _, _ = io.WriteString(h, abs) 446 abs = abs[:200] + fmt.Sprintf("%x", h.Sum(nil))[:8] 447 } 448 dir := filepath.Join(os.TempDir(), abs+".git") 449 if _, err := os.Stat(dir); err == nil { 450 if err := os.RemoveAll(dir); err != nil { 451 return "", err 452 } 453 } 454 return dir, nil 455}