-
Notifications
You must be signed in to change notification settings - Fork 1.1k
feat(cluster): Migration cancellation support #2869
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
5823cf3
c95ecbd
c0de0e3
ae93e71
38c37f3
75be21e
b6d0c3d
def00bf
90e56df
64e637a
ee329e9
ccc9634
4700c15
06c75df
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -619,6 +619,8 @@ static string_view StateToStr(MigrationState state) { | |
return "SYNC"sv; | ||
case MigrationState::C_FINISHED: | ||
return "FINISHED"sv; | ||
case MigrationState::C_CANCELLED: | ||
return "CANCELLED"sv; | ||
case MigrationState::C_MAX_INVALID: | ||
break; | ||
} | ||
|
@@ -736,8 +738,18 @@ void ClusterFamily::RemoveOutgoingMigrations(const std::vector<MigrationInfo>& m | |
auto it = std::find_if(outgoing_migration_jobs_.begin(), outgoing_migration_jobs_.end(), | ||
[&m](const auto& om) { return m == om->GetMigrationInfo(); }); | ||
DCHECK(it != outgoing_migration_jobs_.end()); | ||
DCHECK(it->get() != nullptr); | ||
OutgoingMigration& migration = *it->get(); | ||
if (migration.GetState() != MigrationState::C_FINISHED) { | ||
|
||
LOG(INFO) << "Outgoing migration cancelled: slots " | ||
<< SlotRange::ToString(migration.GetSlots()) << " to " << migration.GetHostIp() | ||
<< ":" << migration.GetPort(); | ||
migration.CancelAll(); | ||
} | ||
outgoing_migration_jobs_.erase(it); | ||
} | ||
|
||
// Flushing of removed slots is done outside this function. | ||
} | ||
|
||
void ClusterFamily::RemoveIncomingMigrations(const std::vector<MigrationInfo>& migrations) { | ||
|
@@ -748,6 +760,30 @@ void ClusterFamily::RemoveIncomingMigrations(const std::vector<MigrationInfo>& m | |
return m.node_id == im->GetSourceID() && m.slot_ranges == im->GetSlots(); | ||
}); | ||
DCHECK(it != incoming_migrations_jobs_.end()); | ||
DCHECK(it->get() != nullptr); | ||
IncomingSlotMigration& migration = *it->get(); | ||
|
||
// Flush non-owned migrations | ||
SlotSet migration_slots(migration.GetSlots()); | ||
SlotSet removed = migration_slots.GetRemovedSlots(tl_cluster_config->GetOwnedSlots()); | ||
|
||
// First cancel socket, then flush slots, so that new entries won't arrive after we flush. | ||
if (migration.GetState() != MigrationState::C_FINISHED) { | ||
|
||
migration.Cancel(); | ||
} | ||
|
||
if (!removed.Empty()) { | ||
auto removed_ranges = make_shared<SlotRanges>(removed.ToSlotRanges()); | ||
LOG_IF(WARNING, migration.GetState() == MigrationState::C_FINISHED) | ||
<< "Flushing slots of removed FINISHED migration " << migration.GetSourceID() | ||
<< ", slots: " << SlotRange::ToString(*removed_ranges); | ||
shard_set->pool()->DispatchOnAll([removed_ranges](unsigned, ProactorBase*) { | ||
if (EngineShard* shard = EngineShard::tlocal(); shard) { | ||
shard->db_slice().FlushSlots(*removed_ranges); | ||
} | ||
}); | ||
} | ||
|
||
incoming_migrations_jobs_.erase(it); | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,7 +25,8 @@ class ClusterShardMigration { | |
executor_ = std::make_unique<JournalExecutor>(service); | ||
} | ||
|
||
void Start(Context* cntx, io::Source* source) { | ||
void Start(Context* cntx, util::FiberSocketBase* source) { | ||
socket_ = source; | ||
JournalReader reader{source, 0}; | ||
TransactionReader tx_reader{false}; | ||
|
||
|
@@ -50,6 +51,14 @@ class ClusterShardMigration { | |
} | ||
} | ||
|
||
void Cancel() { | ||
if (socket_ != nullptr) { | ||
socket_->proactor()->Dispatch([s = socket_, sid = source_shard_id_]() { | ||
s->Shutdown(SHUT_RDWR); // Does not Close(), only forbids further I/O. | ||
}); | ||
} | ||
} | ||
|
||
private: | ||
void ExecuteTxWithNoShardSync(TransactionData&& tx_data, Context* cntx) { | ||
if (cntx->IsCancelled()) { | ||
|
@@ -68,6 +77,7 @@ class ClusterShardMigration { | |
|
||
private: | ||
uint32_t source_shard_id_; | ||
util::FiberSocketBase* socket_ = nullptr; | ||
std::unique_ptr<JournalExecutor> executor_; | ||
}; | ||
|
||
|
@@ -85,15 +95,28 @@ IncomingSlotMigration::IncomingSlotMigration(string source_id, Service* se, Slot | |
} | ||
|
||
IncomingSlotMigration::~IncomingSlotMigration() { | ||
sync_fb_.JoinIfNeeded(); | ||
} | ||
|
||
void IncomingSlotMigration::Join() { | ||
bc_->Wait(); | ||
state_ = MigrationState::C_FINISHED; | ||
} | ||
|
||
void IncomingSlotMigration::StartFlow(uint32_t shard, io::Source* source) { | ||
void IncomingSlotMigration::Cancel() { | ||
LOG(INFO) << "Cancelling incoming migration of slots " << SlotRange::ToString(slots_); | ||
cntx_.Cancel(); | ||
|
||
auto cb = [this](util::ProactorBase* pb) { | ||
if (const auto* shard = EngineShard::tlocal(); shard) { | ||
if (auto& flow = shard_flows_[shard->shard_id()]; flow) { | ||
flow->Cancel(); | ||
} | ||
} | ||
}; | ||
|
||
shard_set->pool()->AwaitFiberOnAll(std::move(cb)); | ||
} | ||
|
||
void IncomingSlotMigration::StartFlow(uint32_t shard, util::FiberSocketBase* source) { | ||
VLOG(1) << "Start flow for shard: " << shard; | ||
|
||
shard_flows_[shard]->Start(&cntx_, source); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do you need it? If we cancel the migration, we should remove it altogether.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See the usage near the call to `FinishMigration()`, although I'm open to suggestions here :)