k/quotas: throttling on in/out static shard-wide TP quotas & unit test

dlex · dlex · commit 7665c321e968 · 2023-01-15T17:05:53.000-05:00
Record in and out traffic in quota_manager
Store throttle-until time in connection_context (only for shard quotas)
Aggregate throttling delays (both requested and enforced)
from shard quotas and from per-client quotas
Test effective static throughput through one connection
diff --git a/src/v/kafka/server/connection_context.cc b/src/v/kafka/server/connection_context.cc
@@ -193,33 +193,70 @@ bool connection_context::is_finished_parsing() const {
     return conn->input().eof() || _server.abort_requested();
 }
 
+connection_context::delay_t
+connection_context::record_tp_and_calculate_throttle(
+  const request_header& hdr, const size_t request_size) {
+    using clock = quota_manager::clock;
+    static_assert(std::is_same_v<clock, delay_t::clock>);
+    const auto now = clock::now();
+
+    // Throttle on client based quotas
+    quota_manager::throttle_delay client_quota_delay{};
+    if (hdr.key == fetch_api::key) {
+        client_quota_delay = _server.quota_mgr().throttle_fetch_tp(
+          hdr.client_id, now);
+    } else if (hdr.key == produce_api::key) {
+        client_quota_delay = _server.quota_mgr().record_produce_tp_and_throttle(
+          hdr.client_id, request_size, now);
+    }
+
+    // Throttle on shard wide quotas
+    _server.quota_mgr().record_request_tp(request_size, now);
+    const quota_manager::shard_delays_t shard_delays
+      = _server.quota_mgr().get_shard_delays(_throttled_until, now);
+
+    // Sum up
+    const clock::duration delay_enforce = std::max(
+      shard_delays.enforce, client_quota_delay.enforce_duration());
+    const clock::duration delay_request = std::max(
+      {shard_delays.request,
+       client_quota_delay.duration,
+       clock::duration::zero()});
+    if (
+      delay_enforce != clock::duration::zero()
+      || delay_request != clock::duration::zero()) {
+        vlog(
+          klog.trace,
+          "[{}:{}] throttle request:{{shard:{}, client:{}}}, "
+          "enforce:{{shard:{}, client:{}}}",
+          _client_addr,
+          client_port(),
+          shard_delays.request,
+          client_quota_delay.duration,
+          shard_delays.enforce,
+          client_quota_delay.enforce_duration());
+    }
+    return delay_t{.request = delay_request, .enforce = delay_enforce};
+}
+
 ss::future<session_resources> connection_context::throttle_request(
   const request_header& hdr, size_t request_size) {
-    // update the throughput tracker for this client using the
-    // size of the current request and return any computed delay
-    // to apply for quota throttling.
-    //
     // note that when throttling is first determined, the request is
     // allowed to pass through, and only subsequent requests are
     // delayed. this is a similar strategy used by kafka 2.0: the
     // response is important because it allows clients to
     // distinguish throttling delays from real delays. delays
     // applied to subsequent messages allow backpressure to take
     // affect.
-    quota_manager::throttle_delay delay{};
-    if (hdr.key == fetch_api::key) {
-        delay = _server.quota_mgr().throttle_fetch_tp(hdr.client_id);
-    } else if (hdr.key == produce_api::key) {
-        delay = _server.quota_mgr().record_produce_tp_and_throttle(
-          hdr.client_id, request_size);
-    }
+
+    const delay_t delay = record_tp_and_calculate_throttle(hdr, request_size);
     request_data r_data = request_data{
       .request_key = hdr.key,
       .client_id = ss::sstring{hdr.client_id.value_or("")}};
     auto tracker = std::make_unique<request_tracker>(_server.probe());
     auto fut = ss::now();
-    if (delay.enforce && delay.duration > ss::lowres_clock::duration::zero()) {
-        fut = ss::sleep_abortable(delay.duration, _server.abort_source());
+    if (delay.enforce > delay_t::clock::duration::zero()) {
+        fut = ss::sleep_abortable(delay.enforce, _server.abort_source());
     }
     auto track = track_latency(hdr.key);
     return fut
@@ -228,7 +265,7 @@ ss::future<session_resources> connection_context::throttle_request(
       })
       .then([this,
              r_data = std::move(r_data),
-             delay,
+             delay = delay.request,
              track,
              tracker = std::move(tracker)](ssx::semaphore_units units) mutable {
           return server().get_request_unit().then(
@@ -240,7 +277,7 @@ ss::future<session_resources> connection_context::throttle_request(
              tracker = std::move(tracker)](
               ssx::semaphore_units qd_units) mutable {
                 session_resources r{
-                  .backpressure_delay = delay.duration,
+                  .backpressure_delay = delay,
                   .memlocks = std::move(mem_units),
                   .queue_units = std::move(qd_units),
                   .tracker = std::move(tracker),
@@ -466,6 +503,14 @@ ss::future<> connection_context::maybe_process_responses() {
             _server.quota_mgr().record_fetch_tp(
               resp_and_res.resources->request_data.client_id, msg.size());
         }
+        // Response sizes only take effect on throttling at the next request
+        // processing. A better way would be to measure the throttle delay
+        // right here and apply it to the immediate response, but that would
+        // require drastic changes to the kafka message processing framework,
+        // because throttle_ms has already been serialized long before this
+        // point. With the current approach, the egress token bucket level will
+        // always take an extra burst into the negative while under pressure.
+        _server.quota_mgr().record_response_tp(msg.size());
         try {
             return conn->write(std::move(msg))
               .then([] {
diff --git a/src/v/kafka/server/connection_context.h b/src/v/kafka/server/connection_context.h
@@ -207,6 +207,26 @@ class connection_context final
     ss::future<ssx::semaphore_units>
     reserve_request_units(api_key key, size_t size);
 
+    /// Calculated throttle delay pair.
+    /// \p request is the primary throttle delay that should be applied now.
+    /// In Kafka 2.0 compliant behaviour, it is only reported to the clients in
+    /// the throttle_ms field, so that they can do the throttling on client
+    /// side.
+    /// \p enforce is the delay value that has not been implemented by the
+    /// client on the last response, and has to be implemented here in the
+    /// broker.
+    struct delay_t {
+        using clock = ss::lowres_clock;
+        clock::duration request{};
+        clock::duration enforce{};
+    };
+
+    /// Update throughput trackers (per-client, per-shard, and any others that
+    /// may be added later) on ingress traffic and calculate aggregated
+    /// throttle delays from all of them.
+    delay_t record_tp_and_calculate_throttle(
+      const request_header& hdr, size_t request_size);
+
     // Apply backpressure sequence, where the request processing may be
     // delayed for various reasons, including throttling but also because
     // too few server resources are available to accomodate the request
@@ -309,6 +329,7 @@ class connection_context final
     ctx_log _authlog;
     std::optional<security::tls::mtls_state> _mtls_state;
     config::binding<uint32_t> _max_request_size;
+    ss::lowres_clock::time_point _throttled_until;
 };
 
 } // namespace kafka
diff --git a/src/v/kafka/server/tests/produce_consume_test.cc b/src/v/kafka/server/tests/produce_consume_test.cc
@@ -66,19 +66,24 @@ struct prod_consume_fixture : public redpanda_thread_fixture {
         return res;
     }
 
-    template<typename T>
-    ss::future<model::offset> produce(T&& batch_factory) {
+    ss::future<kafka::produce_response>
+    produce_raw(std::vector<kafka::produce_request::partition>&& partitions) {
         kafka::produce_request::topic tp;
-        size_t count = random_generators::get_int(1, 20);
-        tp.partitions = batch_factory(count);
+        tp.partitions = std::move(partitions);
         tp.name = test_topic;
         std::vector<kafka::produce_request::topic> topics;
         topics.push_back(std::move(tp));
         kafka::produce_request req(std::nullopt, 1, std::move(topics));
         req.data.timeout_ms = std::chrono::seconds(2);
         req.has_idempotent = false;
         req.has_transactional = false;
-        return producer->dispatch(std::move(req))
+        return producer->dispatch(std::move(req));
+    }
+
+    template<typename T>
+    ss::future<model::offset> produce(T&& batch_factory) {
+        const size_t count = random_generators::get_int(1, 20);
+        return produce_raw(batch_factory(count))
           .then([count](kafka::produce_response r) {
               return r.data.responses.begin()->partitions.begin()->base_offset
                      + model::offset(count - 1);
@@ -179,3 +184,134 @@ FIXTURE_TEST(test_version_handler, prod_consume_fixture) {
         .get(),
       kafka::client::kafka_request_disconnected_exception);
 }
+
+static std::vector<kafka::produce_request::partition>
+single_batch(const size_t volume) {
+    storage::record_batch_builder builder(
+      model::record_batch_type::raft_data, model::offset(0));
+    {
+        const ss::sstring data(volume, 's');
+        iobuf v{};
+        v.append(data.data(), data.size());
+        builder.add_raw_kv(iobuf{}, std::move(v));
+    }
+
+    kafka::produce_request::partition partition;
+    partition.partition_index = model::partition_id(0);
+    partition.records.emplace(std::move(builder).build());
+
+    std::vector<kafka::produce_request::partition> res;
+    res.push_back(std::move(partition));
+    return res;
+}
+
+FIXTURE_TEST(test_node_throughput_limits, prod_consume_fixture) {
+    namespace ch = std::chrono;
+
+    // configure
+    constexpr uint64_t pershard_rate_limit_in = 9_KiB;
+    constexpr uint64_t pershard_rate_limit_out = 7_KiB;
+    constexpr auto window_width = 200ms;
+    constexpr size_t batch_size = 256;
+    ss::smp::invoke_on_all([&] {
+        auto& config = config::shard_local_cfg();
+        config.get("kafka_throughput_limit_node_in_bps")
+          .set_value(
+            std::make_optional(pershard_rate_limit_in * ss::smp::count));
+        config.get("kafka_throughput_limit_node_out_bps")
+          .set_value(
+            std::make_optional(pershard_rate_limit_out * ss::smp::count));
+        config.get("kafka_quota_balancer_window_ms").set_value(window_width);
+        config.get("fetch_max_bytes").set_value(batch_size);
+        config.get("max_kafka_throttle_delay_ms").set_value(60'000ms);
+    }).get0();
+    wait_for_controller_leadership().get();
+    start();
+
+    // PRODUCE ~pershard_rate_limit_in bytes in smaller batches, sum up the
+    // reported throttle times without honouring them, check it takes ~1 s
+    size_t kafka_in_data_len = 0;
+    {
+        constexpr size_t kafka_packet_overhead = 127;
+        const auto batches_cnt = pershard_rate_limit_in
+                                 / (batch_size + kafka_packet_overhead);
+        ch::steady_clock::time_point start;
+        ch::milliseconds throttle_time{};
+        // warmup is the number of iterations enough to exhaust the token bucket
+        // at least twice
+        const int warmup
+          = 2 * pershard_rate_limit_in
+              * ch::duration_cast<ch::milliseconds>(window_width).count() / 1000
+              / (batch_size + kafka_packet_overhead)
+            + 1;
+        for (int k = -warmup; k != batches_cnt; ++k) {
+            if (k == 0) {
+                start = ch::steady_clock::now();
+                throttle_time = {};
+            }
+            throttle_time += produce_raw(single_batch(batch_size))
+                               .then([](const kafka::produce_response& r) {
+                                   return r.data.throttle_time_ms;
+                               })
+                               .get0();
+            kafka_in_data_len += batch_size;
+        }
+        const auto stop = ch::steady_clock::now();
+        const auto wire_data_length = (batch_size + kafka_packet_overhead)
+                                      * batches_cnt;
+        const auto time_estimated = ch::milliseconds(
+          wire_data_length * 1000 / pershard_rate_limit_in);
+        BOOST_TEST_CHECK(
+          abs(stop - start - time_estimated) < time_estimated / 25,
+          "stop-start[" << stop - start << "] == time_estimated["
+                        << time_estimated << "] ±4%");
+    }
+
+    // CONSUME
+    size_t kafka_out_data_len = 0;
+    {
+        constexpr size_t kafka_packet_overhead = 62;
+        ch::steady_clock::time_point start;
+        size_t total_size{};
+        ch::milliseconds throttle_time{};
+        const int warmup
+          = 2 * pershard_rate_limit_out
+              * ch::duration_cast<ch::milliseconds>(window_width).count() / 1000
+              / (batch_size + kafka_packet_overhead)
+            + 1;
+        // consume cannot be measured by the number of fetches because the size
+        // of fetch payload is up to redpanda, "fetch_max_bytes" is merely
+        // guidance. Therefore the consume test runs as long as there is data
+        // to fetch. We can only consume almost as much as has been produced:
+        const auto kafka_data_cap = kafka_in_data_len - batch_size * 2;
+        for (int k = -warmup; kafka_out_data_len < kafka_data_cap; ++k) {
+            if (k == 0) {
+                start = ch::steady_clock::now();
+                total_size = {};
+                throttle_time = {};
+            }
+            const auto fetch_resp = fetch_next().get0();
+            BOOST_REQUIRE_EQUAL(fetch_resp.data.topics.size(), 1);
+            BOOST_REQUIRE_EQUAL(fetch_resp.data.topics[0].partitions.size(), 1);
+            BOOST_TEST_REQUIRE(
+              fetch_resp.data.topics[0].partitions[0].records.has_value());
+            const auto kafka_data_len = fetch_resp.data.topics[0]
+                                          .partitions[0]
+                                          .records.value()
+                                          .size_bytes();
+            total_size += kafka_data_len + kafka_packet_overhead;
+            throttle_time += fetch_resp.data.throttle_time_ms;
+            kafka_out_data_len += kafka_data_len;
+        }
+        const auto stop = ch::steady_clock::now();
+        const auto time_estimated = ch::milliseconds(
+          total_size * 1000 / pershard_rate_limit_out);
+        BOOST_TEST_CHECK(
+          abs(stop - start - time_estimated) < time_estimated / 25,
+          "stop-start[" << stop - start << "] == time_estimated["
+                        << time_estimated << "] ±4%");
+    }
+
+    // otherwise test is not valid:
+    BOOST_REQUIRE_GT(kafka_in_data_len, kafka_out_data_len);
+}