Skip to content

Commit cfb1844

Browse files
jcrossleyjenkins
authored and
jenkins
committed
finagle/finagle-core: Add request classification stats in ServerStatsFilter
Problem We'd like to be able to see on the server side what fraction of requests have been retried in some manner -- backups, requeues, or retries. This can, for example, be useful in diagnosing retry storms. Solution Add these stats under /request_classification scope. Differential Revision: https://phabricator.twitter.biz/D1229878
1 parent 019d429 commit cfb1844

File tree

6 files changed

+171
-5
lines changed

6 files changed

+171
-5
lines changed

CHANGELOG.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ New Features
3737
* finagle-netty4: Added support for custom event loop implementations. ``PHAB_ID=D1185136``
3838
* finagle-core: Add low priority offload executor. ``PHAB_ID=D1189064``
3939
* finagle-core: A new context, `c.t.f.Retry` is set for a request if it is a retry. ``PHAB_ID=D1228127``
40+
* finagle-core: Add new counters under <server_label>/request_classification for retries/requeues/backups.
41+
``PHAB_ID=D1229878``
4042

4143

4244
Bug Fixes

doc/src/sphinx/metrics/Public.rst

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,21 @@ ServerStatsFilter
8282
from client to server. Be aware that clock drift between hosts, stop the world
8383
pauses, and queue backups can contribute here. Not supported by all protocols.
8484

85+
**request_classification/total**
86+
Counter of the total number of requests, including those with responses flagged Ignorable.
87+
This can be used as a denominator when calculating the fraction of requests that are retries,
88+
requeues, or backups (see counters below).
89+
90+
**request_classification/retry**
91+
Counter of the number of requests received that are retries.
92+
93+
**request_classification/requeue**
94+
Counter of the number of requests received that are requeues (e.g., retried NACKs).
95+
96+
**request_classification/backup**
97+
Counter of the number of requests that are backups.
98+
99+
85100
RequestSemaphoreFilter
86101
<<<<<<<<<<<<<<<<<<<<<<
87102

finagle-core/src/main/scala/com/twitter/finagle/context/Requeues.scala

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,13 @@ object Requeues extends Contexts.broadcast.Key[Requeues]("com.twitter.finagle.Re
2525
def current: Option[Requeues] =
2626
Contexts.broadcast.get(Requeues)
2727

28+
def isRequeue: Boolean = {
29+
current match {
30+
case Some(requeues) if requeues.attempt > 0 => true
31+
case _ => false
32+
}
33+
}
34+
2835
override def marshal(requeues: Requeues): Buf = {
2936
val bw = BufByteWriter.fixed(4)
3037
bw.writeIntBE(requeues.attempt)

finagle-core/src/main/scala/com/twitter/finagle/filter/ServerStatsFilter.scala

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,19 @@ package com.twitter.finagle.filter
22

33
import com.twitter.finagle.context.Deadline
44
import com.twitter.finagle.stats.StatsReceiver
5-
import com.twitter.finagle.{param, Service, ServiceFactory, SimpleFilter, Stack, Stackable}
6-
import com.twitter.util.{Time, Duration, Future, Stopwatch}
5+
import com.twitter.finagle.param
6+
import com.twitter.finagle.Service
7+
import com.twitter.finagle.ServiceFactory
8+
import com.twitter.finagle.SimpleFilter
9+
import com.twitter.finagle.Stack
10+
import com.twitter.finagle.Stackable
11+
import com.twitter.finagle.context.BackupRequest
12+
import com.twitter.finagle.context.Requeues
13+
import com.twitter.finagle.context.RetryContext
14+
import com.twitter.util.Time
15+
import com.twitter.util.Duration
16+
import com.twitter.util.Future
17+
import com.twitter.util.Stopwatch
718
import java.util.concurrent.TimeUnit
819

920
private[finagle] object ServerStatsFilter {
@@ -27,7 +38,8 @@ private[finagle] object ServerStatsFilter {
2738

2839
/**
2940
* A [[com.twitter.finagle.Filter]] that records the elapsed execution
30-
* times of the underlying [[com.twitter.finagle.Service]].
41+
* times of the underlying [[com.twitter.finagle.Service]], in addition to classifying requests
42+
* as a retry, requeue, or backup.
3143
*
3244
* @note the stat does not include the time that it takes to satisfy
3345
* the returned `Future`, only how long it takes for the `Service`
@@ -42,7 +54,32 @@ private[finagle] class ServerStatsFilter[Req, Rep](
4254
private[this] val handletime = statsReceiver.stat("handletime_us")
4355
private[this] val transitTimeStat = statsReceiver.stat("transit_latency_ms")
4456

57+
// We use a new scope here for clarity. We record "total" for two reasons:
58+
// 1. This filter is separate from the StatsFilter, which records /requests after requests have
59+
// completed -- meaning /total and /requests can be different.
60+
// 2. /requests *excludes* requests that have been superceded by a backup or have their response
61+
// flagged with FailureFlags.Ignorable (for accounting with success/failures). Here, we want to
62+
// have an exact measure of requests so we can see the true proportion of retries/requeues/backups.
63+
private[this] val requestClassificationScope = statsReceiver.scope("request_classification")
64+
private[this] val backupCounter = requestClassificationScope.counter("backup")
65+
private[this] val requeueCounter = requestClassificationScope.counter("requeue")
66+
private[this] val retryCounter = requestClassificationScope.counter("retry")
67+
private[this] val totalCounter = requestClassificationScope.counter("total")
68+
4569
def apply(request: Req, service: Service[Req, Rep]): Future[Rep] = {
70+
totalCounter.incr()
71+
if (BackupRequest.wasInitiated) {
72+
backupCounter.incr()
73+
}
74+
75+
if (Requeues.isRequeue) {
76+
requeueCounter.incr()
77+
}
78+
79+
if (RetryContext.isRetry) {
80+
retryCounter.incr()
81+
}
82+
4683
val startAt = nowNanos()
4784

4885
Deadline.current match {

finagle-core/src/main/scala/com/twitter/finagle/server/StackServer.scala

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ import com.twitter.finagle.service.TimeoutFilter
1212
import com.twitter.finagle.stats.ServerStatsReceiver
1313
import com.twitter.finagle.tracing._
1414
import com.twitter.finagle.Stack
15-
import com.twitter.finagle._
1615
import com.twitter.jvm.Jvm
1716

1817
object StackServer {

finagle-core/src/test/scala/com/twitter/finagle/filter/ServerStatsFilterTest.scala

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,14 @@ package com.twitter.finagle.filter
22

33
import com.twitter.finagle.Service
44
import com.twitter.finagle.stats.InMemoryStatsReceiver
5-
import com.twitter.util.{Stopwatch, Time, Future}
5+
import com.twitter.util.Future
6+
import com.twitter.util.Stopwatch
7+
import com.twitter.util.Time
68
import com.twitter.conversions.DurationOps._
9+
import com.twitter.finagle.context
10+
import com.twitter.finagle.context.BackupRequest
11+
import com.twitter.finagle.context.Contexts
12+
import com.twitter.finagle.context.RetryContext
713
import org.scalatest.funsuite.AnyFunSuite
814

915
class ServerStatsFilterTest extends AnyFunSuite {
@@ -21,4 +27,104 @@ class ServerStatsFilterTest extends AnyFunSuite {
2127
assert(actual == expected)
2228
}
2329
}
30+
31+
test("Record retry stat when request is retry") {
32+
val stats = new InMemoryStatsReceiver
33+
val svc = Service.mk[Unit, Unit] { _ =>
34+
Future.Done
35+
}
36+
val filter = new ServerStatsFilter[Unit, Unit](stats)
37+
filter.andThen(svc)(())
38+
39+
assert(stats.counters(Seq("request_classification", "retry")) == 0)
40+
assert(stats.counters(Seq("request_classification", "requeue")) == 0)
41+
assert(stats.counters(Seq("request_classification", "backup")) == 0)
42+
43+
RetryContext.withRetry {
44+
filter.andThen(svc)(())
45+
}
46+
47+
assert(stats.counters(Seq("request_classification", "retry")) == 1)
48+
assert(stats.counters(Seq("request_classification", "requeue")) == 0)
49+
assert(stats.counters(Seq("request_classification", "backup")) == 0)
50+
assert(stats.counters(Seq("request_classification", "total")) == 2)
51+
}
52+
53+
test("Record requeue stat when request is requeue") {
54+
val stats = new InMemoryStatsReceiver
55+
val svc = Service.mk[Unit, Unit] { _ =>
56+
Future.Done
57+
}
58+
val filter = new ServerStatsFilter[Unit, Unit](stats)
59+
filter.andThen(svc)(())
60+
61+
assert(stats.counters(Seq("request_classification", "retry")) == 0)
62+
assert(stats.counters(Seq("request_classification", "requeue")) == 0)
63+
assert(stats.counters(Seq("request_classification", "backup")) == 0)
64+
65+
Contexts.broadcast.let(context.Requeues, context.Requeues(0)) {
66+
filter.andThen(svc)(())
67+
}
68+
69+
assert(stats.counters(Seq("request_classification", "retry")) == 0)
70+
assert(stats.counters(Seq("request_classification", "requeue")) == 0)
71+
assert(stats.counters(Seq("request_classification", "backup")) == 0)
72+
73+
Contexts.broadcast.let(context.Requeues, context.Requeues(2)) {
74+
filter.andThen(svc)(())
75+
}
76+
77+
assert(stats.counters(Seq("request_classification", "retry")) == 0)
78+
assert(stats.counters(Seq("request_classification", "requeue")) == 1)
79+
assert(stats.counters(Seq("request_classification", "backup")) == 0)
80+
assert(stats.counters(Seq("request_classification", "total")) == 3)
81+
}
82+
83+
test("Record retry stat when request is backup") {
84+
val stats = new InMemoryStatsReceiver
85+
val svc = Service.mk[Unit, Unit] { _ =>
86+
Future.Done
87+
}
88+
val filter = new ServerStatsFilter[Unit, Unit](stats)
89+
filter.andThen(svc)(())
90+
91+
assert(stats.counters(Seq("request_classification", "retry")) == 0)
92+
assert(stats.counters(Seq("request_classification", "requeue")) == 0)
93+
assert(stats.counters(Seq("request_classification", "backup")) == 0)
94+
95+
BackupRequest.let {
96+
filter.andThen(svc)(())
97+
}
98+
99+
assert(stats.counters(Seq("request_classification", "retry")) == 0)
100+
assert(stats.counters(Seq("request_classification", "requeue")) == 0)
101+
assert(stats.counters(Seq("request_classification", "backup")) == 1)
102+
assert(stats.counters(Seq("request_classification", "total")) == 2)
103+
}
104+
105+
test("Record retry+requeue+backup stat when request is retry+requeue+backup") {
106+
val stats = new InMemoryStatsReceiver
107+
val svc = Service.mk[Unit, Unit] { _ =>
108+
Future.Done
109+
}
110+
val filter = new ServerStatsFilter[Unit, Unit](stats)
111+
filter.andThen(svc)(())
112+
113+
assert(stats.counters(Seq("request_classification", "retry")) == 0)
114+
assert(stats.counters(Seq("request_classification", "requeue")) == 0)
115+
assert(stats.counters(Seq("request_classification", "backup")) == 0)
116+
117+
Contexts.broadcast.let(context.Requeues, context.Requeues(1)) {
118+
BackupRequest.let {
119+
RetryContext.withRetry {
120+
filter.andThen(svc)(())
121+
}
122+
}
123+
}
124+
125+
assert(stats.counters(Seq("request_classification", "retry")) == 1)
126+
assert(stats.counters(Seq("request_classification", "requeue")) == 1)
127+
assert(stats.counters(Seq("request_classification", "backup")) == 1)
128+
assert(stats.counters(Seq("request_classification", "total")) == 2)
129+
}
24130
}

0 commit comments

Comments
 (0)