
Commit 88d9e44

Merge pull request #26790 from WillemKauf/tx_pr_6_recovery_rpc
`raft`: add `remake_learner_state` to `consensus` protocol [SC&R #5]
2 parents 2987df0 + 343774c commit 88d9e44

26 files changed: +455 −24 lines

src/v/cluster/cluster_utils.cc

Lines changed: 1 addition & 0 deletions
@@ -145,6 +145,7 @@ cluster::errc map_update_interruption_error_code(std::error_code ec) {
     case raft::errc::group_not_exists:
     case raft::errc::replicate_first_stage_exception:
     case raft::errc::invalid_input_records:
+    case raft::errc::not_learner:
         return errc::replication_error;
     }
     __builtin_unreachable();

src/v/cluster/controller.cc

Lines changed: 24 additions & 0 deletions
@@ -655,6 +655,8 @@ ss::future<> controller::start(
       std::ref(_partition_manager),
       std::ref(_as));
 
+    co_await set_raft_manager_remake_cb();
+
     co_await _members_backend.invoke_on(
       members_manager::shard, &members_backend::start);
     co_await _config_manager.invoke_on(
@@ -919,6 +921,7 @@ ss::future<> controller::stop() {
     co_await _data_migration_frontend.stop();
     co_await _topic_mount_handler.stop();
     co_await _config_manager.stop();
+    co_await clear_raft_manager_remake_cb();
     co_await _api.stop();
     co_await _shard_balancer.stop();
     co_await _backend.stop();
@@ -1262,4 +1265,25 @@ controller::validate_configuration_invariants() {
     co_return invariants;
 }
 
+ss::future<std::error_code> controller::trigger_remake_cb(raft::group_id g) {
+    auto ec = co_await _api.local().remake_partition(g);
+    if (ec) {
+        vlog(clusterlog.warn, "Unable to remake group {}, {}", g, ec);
+    }
+    co_return ec;
+}
+
+ss::future<> controller::set_raft_manager_remake_cb() {
+    co_await _raft_manager.invoke_on_all([this](raft::group_manager& gm) {
+        gm.set_remake_cb(
+          [this](raft::group_id g) -> ss::future<std::error_code> {
+              return trigger_remake_cb(g);
+          });
+    });
+}
+
+ss::future<> controller::clear_raft_manager_remake_cb() {
+    co_await _raft_manager.invoke_on_all(&raft::group_manager::clear_remake_cb);
+}
+
 } // namespace cluster
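
This wiring inverts the usual dependency: the raft layer never calls into cluster code directly, it only fires an opaque callback that the controller installs on every shard at startup and clears during shutdown. Below is a minimal single-shard sketch of that pattern; the std::function, int error codes, and the group_manager/controller types here are simplified stand-ins for the real Seastar/Redpanda classes, not the actual API.

    #include <cstdio>
    #include <functional>
    #include <optional>
    #include <utility>

    using group_id = int;
    using remake_cb_t = std::function<int(group_id)>;

    // Stand-in for raft::group_manager: stores an optional callback that the
    // raft layer fires when a learner must be remade.
    struct group_manager {
        void set_remake_cb(remake_cb_t cb) { _remake_cb = std::move(cb); }
        void clear_remake_cb() { _remake_cb.reset(); }
        int trigger(group_id g) { return _remake_cb ? (*_remake_cb)(g) : -1; }

    private:
        std::optional<remake_cb_t> _remake_cb;
    };

    // Stand-in for cluster::controller: installs a callback that routes back
    // into the cluster layer (the real code calls
    // controller_api::remake_partition and logs a warning on failure).
    struct controller {
        group_manager gm;

        void start() {
            gm.set_remake_cb([this](group_id g) { return trigger_remake_cb(g); });
        }
        void stop() { gm.clear_remake_cb(); }

        int trigger_remake_cb(group_id g) {
            std::printf("remaking partition for group %d\n", g);
            return 0; // success; non-zero would be logged as a warning
        }
    };

    int main() {
        controller c;
        c.start();
        c.gm.trigger(42); // raft layer asks the cluster layer to remake group 42
        c.stop();         // after stop(), trigger() fails fast with -1
    }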

src/v/cluster/controller.h

Lines changed: 9 additions & 0 deletions
@@ -272,6 +272,15 @@ class controller {
 private:
     friend controller_probe;
 
+    using remake_cb_t
+      = ss::noncopyable_function<ss::future<std::error_code>(model::ntp)>;
+
+    ss::future<std::error_code> trigger_remake_cb(raft::group_id g);
+
+    ss::future<> set_raft_manager_remake_cb();
+
+    ss::future<> clear_raft_manager_remake_cb();
+
     /**
      * Create a \c bootstrap_cluster_cmd, replicate-and-wait it to the current
      * quorum, retry infinitely if replicate-and-wait fails.

src/v/cluster/controller_api.cc

Lines changed: 18 additions & 4 deletions
@@ -579,17 +579,31 @@ controller_api::get_global_reconciliation_state(
     co_return state;
 }
 
-ss::future<std::error_code>
-controller_api::remake_partition(const model::ntp& ntp) {
-    auto shard_for_opt = shard_for(ntp);
+ss::future<std::error_code> controller_api::remake_partition(raft::group_id g) {
+    auto shard_for_opt = shard_for(g);
     if (!shard_for_opt.has_value()) {
         co_return errc::partition_not_exists;
     }
 
     auto shard = shard_for_opt.value();
+    auto ntp_opt = co_await _partition_manager.invoke_on(
+      shard, [g](cluster::partition_manager& pm) -> std::optional<model::ntp> {
+          auto p = pm.partition_for(g);
+          if (!p) {
+              return std::nullopt;
+          }
+          return p->ntp();
+      });
+
+    if (!ntp_opt.has_value()) {
+        co_return errc::partition_not_exists;
+    }
+
+    auto ntp = std::move(ntp_opt).value();
+
     co_return co_await _backend.invoke_on(
       shard, [&ntp](cluster::controller_backend& b) {
-          return b.remake_partition(ntp);
+          return b.remake_partition(std::move(ntp));
       });
 }
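
remake_partition now keys off the raft group id rather than the ntp: it first resolves which shard hosts the group, then asks that shard's partition_manager for the group's ntp, and treats a miss at either step as partition_not_exists. A sketch of that two-step resolution, with plain maps standing in for the sharded lookup tables (controller_api_sketch and its members are illustrative names, not the real cluster:: API):

    #include <cassert>
    #include <map>
    #include <optional>
    #include <string>

    using group_id = int;
    using shard_id = unsigned;
    using ntp = std::string; // stand-in for model::ntp

    enum class errc { success, partition_not_exists };

    struct controller_api_sketch {
        std::map<group_id, shard_id> shard_table; // group -> owning shard
        std::map<group_id, ntp> partitions;       // per-shard group -> ntp table

        errc remake_partition(group_id g) {
            // Step 1: which shard hosts this group?
            auto shard_it = shard_table.find(g);
            if (shard_it == shard_table.end()) {
                return errc::partition_not_exists;
            }
            // Step 2: resolve the ntp on that shard (the real code hops
            // shards with invoke_on; here it is a plain lookup).
            auto ntp_it = partitions.find(g);
            if (ntp_it == partitions.end()) {
                return errc::partition_not_exists;
            }
            // ...hand ntp_it->second to the controller_backend on that shard...
            return errc::success;
        }
    };

    int main() {
        controller_api_sketch api;
        api.shard_table[7] = 0;
        api.partitions[7] = "kafka/topic/0";
        assert(api.remake_partition(7) == errc::success);
        assert(api.remake_partition(8) == errc::partition_not_exists);
    }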

src/v/cluster/controller_api.h

Lines changed: 2 additions & 1 deletion
@@ -88,7 +88,8 @@ class controller_api {
     std::optional<ss::shard_id> shard_for(const raft::group_id& group) const;
     std::optional<ss::shard_id> shard_for(const model::ntp& ntp) const;
 
-    ss::future<std::error_code> remake_partition(const model::ntp& ntp);
+    // Remakes the partition for the provided raft group.
+    ss::future<std::error_code> remake_partition(raft::group_id g);
 
 private:
     ss::future<std::optional<backend_operation>>

src/v/cluster/tests/remake_partition_tests.cc

Lines changed: 5 additions & 2 deletions
@@ -113,7 +113,9 @@ FIXTURE_TEST(remake_partition_test, remake_partition_fixture) {
     add_topic(model::topic_namespace_view{test_ntp}).get();
     wait_for_leader(test_ntp).get();
 
-    auto ec = controller->get_api().local().remake_partition(test_ntp).get();
+    auto group
+      = controller->get_partition_manager().local().get(test_ntp)->group();
+    auto ec = controller->get_api().local().remake_partition(group).get();
     BOOST_REQUIRE_EQUAL(ec, cluster::errc::success);
 
     // Wait till partition is recreated
@@ -229,7 +231,8 @@ FIXTURE_TEST(remake_partition_with_produce_test, remake_partition_fixture) {
         BOOST_REQUIRE_EQUAL(records.size(), total_num_records);
     }
 
-    auto ec = controller->get_api().local().remake_partition(test_ntp).get();
+    auto group = p->group();
+    auto ec = controller->get_api().local().remake_partition(group).get();
     BOOST_REQUIRE_EQUAL(ec, cluster::errc::success);
 
     // Wait till partition is recreated

src/v/kafka/server/group.cc

Lines changed: 1 addition & 0 deletions
@@ -2146,6 +2146,7 @@ kafka::error_code map_store_offset_error_code(std::error_code ec) {
     case raft::errc::group_not_exists:
     case raft::errc::replicate_first_stage_exception:
     case raft::errc::transfer_to_current_leader:
+    case raft::errc::not_learner:
         return error_code::unknown_server_error;
     }
 }

src/v/raft/buffered_protocol.cc

Lines changed: 14 additions & 0 deletions
@@ -185,6 +185,20 @@ buffered_protocol::transfer_leadership(
       &consensus_client_protocol::transfer_leadership);
 }
 
+ss::future<result<remake_learner_state_reply>>
+buffered_protocol::remake_learner_state(
+  model::node_id target_node,
+  remake_learner_state_request req,
+  rpc::client_opts opts) {
+    return apply_with_gate(
+      _gate,
+      _base_protocol,
+      target_node,
+      std::move(req),
+      std::move(opts),
+      &consensus_client_protocol::remake_learner_state);
+}
+
 ss::future<bool> buffered_protocol::ensure_disconnect(model::node_id node_id) {
     return _base_protocol.ensure_disconnect(node_id);
 }
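
apply_with_gate forwards the new RPC to the base protocol while holding the protocol's gate, so stop() can refuse new calls and drain in-flight ones before the object is torn down. A compressed, synchronous sketch of that gate idiom follows; the gate and holder types are simplified stand-ins for ss::gate, which additionally waits asynchronously for the pending count to reach zero:

    #include <cassert>
    #include <stdexcept>

    // Simplified gate: counts in-flight calls, rejects new ones once closed.
    struct gate {
        int pending = 0;
        bool closed = false;

        struct holder {
            gate& g;
            explicit holder(gate& gg) : g(gg) {
                if (g.closed) {
                    throw std::runtime_error("gate closed");
                }
                ++g.pending;
            }
            ~holder() { --g.pending; }
        };

        void close() { closed = true; /* real ss::gate: await pending == 0 */ }
    };

    // Forward a call while holding the gate for its duration.
    template <typename Fn>
    auto apply_with_gate(gate& g, Fn fn) {
        gate::holder h{g};
        return fn();
    }

    int main() {
        gate g;
        int reply = apply_with_gate(g, [] { return 1; }); // forwarded RPC
        assert(reply == 1);
        g.close();
        try {
            apply_with_gate(g, [] { return 2; }); // rejected after close
            assert(false);
        } catch (const std::runtime_error&) {
        }
    }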

src/v/raft/buffered_protocol.h

Lines changed: 3 additions & 0 deletions
@@ -152,6 +152,9 @@ class buffered_protocol : public consensus_client_protocol::impl {
 
     ss::future<> reset_backoff(model::node_id n) final;
 
+    ss::future<result<remake_learner_state_reply>> remake_learner_state(
+      model::node_id, remake_learner_state_request, rpc::client_opts) final;
+
     ss::future<> stop();
 
 private:

src/v/raft/consensus.cc

Lines changed: 98 additions & 14 deletions
@@ -106,7 +106,8 @@ consensus::consensus(
   config::binding<std::chrono::milliseconds> disk_timeout,
   config::binding<bool> enable_longest_log_detection,
   consensus_client_protocol client,
-  consensus::leader_cb_t cb,
+  remake_cb_t remake_cb,
+  consensus::leader_cb_t leader_cb,
   storage::api& storage,
   std::optional<std::reference_wrapper<coordinated_recovery_throttle>>
     recovery_throttle,
@@ -123,7 +124,8 @@ consensus::consensus(
   , _disk_timeout(std::move(disk_timeout))
   , _enable_longest_log_detection(std::move(enable_longest_log_detection))
   , _client_protocol(client)
-  , _leader_notification(std::move(cb))
+  , _remake_notification(std::move(remake_cb))
+  , _leader_notification(std::move(leader_cb))
   , _fstats(_self)
   , _batcher(this, config::shard_local_cfg().raft_replicate_batch_window_size())
   , _event_manager(this)
@@ -1709,6 +1711,18 @@ ss::future<> consensus::write_last_applied(model::offset o) {
       storage::kvstore::key_space::consensus, std::move(key), std::move(val));
 }
 
+ss::future<> consensus::truncate_state(model::offset truncate_at) {
+    co_await _log->truncate(storage::truncate_config(truncate_at));
+    _probe->log_truncated();
+    // update flushed offset
+    _flushed_offset = std::min(
+      model::prev_offset(truncate_at), _flushed_offset);
+
+    co_await _configuration_manager.truncate(truncate_at);
+    _probe->configuration_update();
+    update_follower_stats(_configuration_manager.get_latest());
+}
+
 model::offset consensus::read_last_applied() const {
     const auto key = last_applied_key();
     auto value = _storage.kvs().get(
@@ -2235,7 +2249,6 @@ consensus::do_append_entries(append_entries_request&& r) {
       last_visible_index(),
      _last_leader_visible_offset,
       truncate_at);
-    _probe->log_truncated();
 
     _majority_replicated_index = std::min(
       model::prev_offset(truncate_at), _majority_replicated_index);
@@ -2248,17 +2261,7 @@ consensus::do_append_entries(append_entries_request&& r) {
       model::prev_offset(truncate_at), _flushed_offset);
 
     try {
-        co_await _log->truncate(storage::truncate_config(truncate_at));
-        // update flushed offset once again after truncation as flush is
-        // executed concurrently to append entries and it may race with
-        // the truncation
-        _flushed_offset = std::min(
-          model::prev_offset(truncate_at), _flushed_offset);
-
-        co_await _configuration_manager.truncate(truncate_at);
-        _probe->configuration_update();
-        update_follower_stats(_configuration_manager.get_latest());
-
+        co_await truncate_state(truncate_at);
         auto lstats = _log->offsets();
         if (unlikely(lstats.dirty_offset != adjusted_prev_log_index)) {
             vlog(
@@ -4287,4 +4290,85 @@ size_t consensus::bytes_to_deliver_to_learners() const {
     return total;
 }
 
+ss::future<remake_learner_state_reply>
+consensus::remake_learner_state(vnode target) {
+    _probe->recovery_reset();
+    remake_learner_state_request req{
+      .node_id = _self,
+      .target_node_id = target,
+      .group = _group,
+      .term = _term};
+    vlog(_ctxlog.info, "Issuing remake group request {}", req);
+    static constexpr auto timeout = 10s;
+    result<remake_learner_state_reply> reply
+      = co_await _client_protocol.remake_learner_state(
+        target.id(), req, rpc::client_opts(timeout));
+    if (!reply) {
+        vlog(
+          _ctxlog.warn,
+          "Unable to issue remake group request {}, {}",
+          req,
+          reply.error());
+        co_return remake_learner_state_reply{};
+    }
+
+    co_return reply.value();
+}
+
+ss::future<remake_learner_state_reply>
+consensus::do_remake_learner_state(remake_learner_state_request req) {
+    remake_learner_state_reply reply{};
+    using is_success = remake_learner_state_reply::is_success;
+    try {
+        auto units = co_await _op_lock.get_units();
+
+        // Perform validation of request under _op_lock
+        auto maybe_err = [&]() -> std::optional<raft::errc> {
+            if (req.term != _term) {
+                return raft::errc::not_leader;
+            }
+            if (req.source_node() != _leader_id) {
+                return raft::errc::leadership_transfer_in_progress;
+            }
+            if (req.target_node() != _self) {
+                return raft::errc::invalid_target_node;
+            }
+            if (!is_learner()) {
+                return raft::errc::not_learner;
+            }
+            if (req.group != _group) {
+                return raft::errc::group_not_exists;
+            }
+
+            return std::nullopt;
+        }();
+
+        if (maybe_err.has_value()) {
+            reply.success = is_success::no;
+            vlog(
+              _ctxlog.warn,
+              "Unable to process remake group request {}, raft::errc {}",
+              req,
+              maybe_err.value());
+        } else {
+            auto cluster_err = co_await _remake_notification(req.group);
+            reply.success = cluster_err ? is_success::no : is_success::yes;
+            vlog(
+              _ctxlog.warn,
+              "Unable to process remake group request {}, cluster::errc {}",
+              req,
+              cluster_err);
+        }
+    } catch (...) {
+        vlog(
+          _ctxlog.warn,
+          "Unable to process remake group request {}, caught exception: {}",
+          req,
+          std::current_exception());
+        reply.success = is_success::no;
+    }
+
+    co_return reply;
+}
+
 } // namespace raft
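
On the receiving side, do_remake_learner_state validates the request under _op_lock in a fixed order before touching anything: the term must match, the sender must be the current leader, the target must be this node, the node must actually be a learner, and the group must match; only a fully valid request reaches the remake callback. A sketch of that short-circuiting validation chain, with plain ints standing in for vnode/term/group (consensus_sketch is an illustrative name; field names mirror the request above):

    #include <cstdio>
    #include <optional>

    enum class errc {
        not_leader,
        leadership_transfer_in_progress,
        invalid_target_node,
        not_learner,
        group_not_exists,
    };

    struct request {
        int term;
        int source_node;
        int target_node;
        int group;
    };

    struct consensus_sketch {
        int term = 7, leader_id = 1, self = 2, group = 99;
        bool learner = true;

        // Mirrors the checks performed under _op_lock: first mismatch wins.
        std::optional<errc> validate(const request& r) const {
            if (r.term != term) {
                return errc::not_leader;
            }
            if (r.source_node != leader_id) {
                return errc::leadership_transfer_in_progress;
            }
            if (r.target_node != self) {
                return errc::invalid_target_node;
            }
            if (!learner) {
                return errc::not_learner;
            }
            if (r.group != group) {
                return errc::group_not_exists;
            }
            return std::nullopt; // valid: safe to fire the remake callback
        }
    };

    int main() {
        consensus_sketch c;
        request ok{.term = 7, .source_node = 1, .target_node = 2, .group = 99};
        request stale{.term = 6, .source_node = 1, .target_node = 2, .group = 99};
        std::printf("ok accepted: %d\n", !c.validate(ok).has_value());       // 1
        std::printf("stale accepted: %d\n", !c.validate(stale).has_value()); // 0
    }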
