fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Add metric for indexing delay (#804)

Our dashboards currently report indexing delay using the
`index_queue_age_seconds` metric, which tracks the duration indexing jobs wait
in the queue before being "popped" for indexing. For dot com, we regularly see
the median for this metric at 2 days. However, this metric may be misleading
because it includes "noop" indexing jobs where the index is already up to date.
These jobs can stay in the queue for a long time since they are not high
priority.

This PR adds a new metric `index_indexing_delay_seconds`, which reports the
duration from when an indexing job enters the queue, to when it completes. It
uses 'state' as a label, so by setting `state='success'` we can (roughly) see
the delay from when Zoekt learns about a new version, to when that version is
available to search.

+62 -23
+1 -1
cmd/zoekt-sourcegraph-indexserver/backoff_test.go
··· 24 24 25 25 // item is disallowed from being pushed to heap during backoff period 26 26 if item, ok := queue.Pop(); ok { 27 - qi := queue.items[item.RepoID] 27 + qi := queue.items[item.Opts.RepoID] 28 28 if qi.backoff.backoffUntil.Before(bumpTime) { 29 29 t.Errorf("backoffDuration already passed before first attempt to push item to heap in Bump(). Increase backoffDuration for the Queue. backoffDuration: %s. maxBackoffDuration: %s.", 30 30 backoffDuration, maxBackoffDuration)
+30 -2
cmd/zoekt-sourcegraph-indexserver/main.go
··· 89 89 "name", // name of the repository that was indexed 90 90 }) 91 91 92 + metricIndexingDelay = promauto.NewHistogramVec(prometheus.HistogramOpts{ 93 + Name: "index_indexing_delay_seconds", 94 + Help: "A histogram of durations from when an index job is added to the queue, to the time it completes.", 95 + Buckets: []float64{ 96 + 60, // 1m 97 + 300, // 5m 98 + 1200, // 20m 99 + 2400, // 40m 100 + 3600, // 1h 101 + 10800, // 3h 102 + 18000, // 5h 103 + 36000, // 10h 104 + 43200, // 12h 105 + 54000, // 15h 106 + 72000, // 20h 107 + 86400, // 24h 108 + 108000, // 30h 109 + 126000, // 35h 110 + 172800, // 48h 111 + }}, []string{ 112 + "state", // state is an indexState 113 + "name", // the name of the repository that was indexed 114 + }) 115 + 92 116 metricFetchDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ 93 117 Name: "index_fetch_seconds", 94 118 Help: "A histogram of latencies for fetching a repository.", ··· 400 424 continue 401 425 } 402 426 403 - opts, ok := s.queue.Pop() 427 + item, ok := s.queue.Pop() 404 428 if !ok { 405 429 time.Sleep(time.Second) 406 430 continue 407 431 } 408 432 433 + opts := item.Opts 409 434 args := s.indexArgs(opts) 410 435 411 436 ran := s.muIndexDir.With(opts.Name, func() { ··· 416 441 state, err := s.Index(args) 417 442 418 443 elapsed := time.Since(start) 419 - 420 444 metricIndexDuration.WithLabelValues(string(state), repoNameForMetric(opts.Name)).Observe(elapsed.Seconds()) 421 445 446 + indexDelay := time.Since(item.DateAddedToQueue) 447 + metricIndexingDelay.WithLabelValues(string(state), repoNameForMetric(opts.Name)).Observe(indexDelay.Seconds()) 448 + 422 449 if err != nil { 423 450 log.Printf("error indexing %s: %s", args.String(), err) 424 451 } ··· 434 461 sglog.Uint32("id", args.RepoID), 435 462 sglog.Strings("branches", branches), 436 463 sglog.Duration("duration", elapsed), 464 + sglog.Duration("index_delay", indexDelay), 437 465 ) 438 466 case indexStateSuccessMeta: 439 467 log.Printf("updated meta %s in %v", args.String(), elapsed)
+12 -7
cmd/zoekt-sourcegraph-indexserver/queue.go
··· 82 82 return q 83 83 } 84 84 85 - // Pop returns the opts of the next repo to index. If the queue is empty ok is 86 - // false. 87 - func (q *Queue) Pop() (opts IndexOptions, ok bool) { 85 + type QueueItem struct { 86 + // Opts are the options to use when indexing the repo. 87 + Opts IndexOptions 88 + // DateAddedToQueue is the time when this indexing job was added to the queue, used for telemetry. 89 + DateAddedToQueue time.Time 90 + } 91 + 92 + // Pop returns options and metadata for the next repo to index. If the queue is empty ok is false. 93 + func (q *Queue) Pop() (result QueueItem, ok bool) { 88 94 q.mu.Lock() 89 95 if len(q.pq) == 0 { 90 96 q.mu.Unlock() 91 - return IndexOptions{}, false 97 + return QueueItem{}, false 92 98 } 93 99 94 100 item := heap.Pop(&q.pq).(*queueItem) 95 - opts = item.opts 96 101 97 102 metricQueueLen.Set(float64(len(q.pq))) 98 103 metricQueueCap.Set(float64(len(q.items))) ··· 102 107 103 108 q.mu.Unlock() 104 109 105 - name := repoNameForMetric(opts.Name) 110 + name := repoNameForMetric(item.opts.Name) 106 111 age := time.Since(dateAdded) 107 112 metricQueueAge.WithLabelValues(name).Observe(age.Seconds()) 108 113 109 - return opts, true 114 + return QueueItem{item.opts, dateAdded}, true 110 115 } 111 116 112 117 // Len returns the number of items in the queue.
+19 -13
cmd/zoekt-sourcegraph-indexserver/queue_test.go
··· 31 31 // Ensure we process all the even commits first, then odd. 32 32 want := 0 33 33 for { 34 - opts, ok := queue.Pop() 34 + item, ok := queue.Pop() 35 35 if !ok { 36 36 break 37 37 } 38 - got, _ := strconv.Atoi(opts.Branches[0].Version) 38 + got, _ := strconv.Atoi(item.Opts.Branches[0].Version) 39 39 if got != want { 40 - t.Fatalf("got %v, want %v", opts, want) 40 + t.Fatalf("got %v, want %v", got, want) 41 41 } 42 42 want += 2 43 43 if want == 100 { 44 44 // We now switch to processing the odd numbers 45 45 want = 1 46 46 } 47 + 48 + // sanity check the date added 49 + if item.DateAddedToQueue.Unix() <= 0 { 50 + t.Fatalf("invalid DateAddedToQueue %v", item.DateAddedToQueue) 51 + } 52 + 47 53 // update current, shouldn't put the job in the queue 48 - queue.SetIndexed(opts, indexStateSuccess) 54 + queue.SetIndexed(item.Opts, indexStateSuccess) 49 55 } 50 56 if want != 101 { 51 57 t.Fatalf("only popped %d items", want) ··· 64 70 65 71 want := 0 66 72 for { 67 - opts, ok := queue.Pop() 73 + item, ok := queue.Pop() 68 74 if !ok { 69 75 break 70 76 } 71 - got, _ := strconv.Atoi(opts.Branches[0].Version) 77 + got, _ := strconv.Atoi(item.Opts.Branches[0].Version) 72 78 if got != want { 73 - t.Fatalf("got %v, want %v", opts, want) 79 + t.Fatalf("got %v, want %v", item, want) 74 80 } 75 - queue.SetIndexed(opts, indexStateSuccess) 81 + queue.SetIndexed(item.Opts, indexStateSuccess) 76 82 want++ 77 83 } 78 84 if want != 100 { ··· 88 94 queue.AddOrUpdate(IndexOptions{RepoID: 2, Name: "bar"}) 89 95 queue.MaybeRemoveMissing([]uint32{2}) 90 96 91 - opts, _ := queue.Pop() 92 - if opts.Name != "bar" { 93 - t.Fatalf("queue should only contain bar, pop returned %v", opts.Name) 97 + item, _ := queue.Pop() 98 + if item.Opts.Name != "bar" { 99 + t.Fatalf("queue should only contain bar, pop returned %v", item.Opts.Name) 94 100 } 95 101 _, ok := queue.Pop() 96 102 if ok { ··· 116 122 want := []IndexOptions{{RepoID: 2, Name: "bar"}} 117 123 var got []IndexOptions 118 124 for { 119 - opts, ok := queue.Pop() 125 + item, ok := queue.Pop() 120 126 if !ok { 121 127 break 122 128 } 123 - got = append(got, opts) 129 + got = append(got, item.Opts) 124 130 } 125 131 126 132 if d := cmp.Diff(want, got); d != "" {