Skip to content

Commit dce9588

Browse files
authored
fix data race of LocalAdmissionController (#10332)
close #10331 Signed-off-by: guo-shaoge <[email protected]>
1 parent fc932c9 commit dce9588

File tree

2 files changed

+19
-13
lines changed

2 files changed

+19
-13
lines changed

dbms/src/Flash/ResourceControl/LocalAdmissionController.cpp

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,8 @@ double ResourceGroup::getAcquireRUNumWithoutLock(double speed, uint32_t n_sec, d
141141
if unlikely (acquire_num == 0.0 && remaining_ru == 0.0)
142142
acquire_num = DEFAULT_BUFFER_TOKENS;
143143

144+
// The purpose of subtracting remaining_ru is try to ensure that the number of local tokens
145+
// always stays same with the amount consumed.
144146
acquire_num -= remaining_ru;
145147
acquire_num = (acquire_num > 0.0 ? acquire_num : 0.0);
146148
return acquire_num;
@@ -364,11 +366,9 @@ void LocalAdmissionController::mainLoop()
364366
static_assert(
365367
tick_interval <= ResourceGroup::COMPUTE_RU_CONSUMPTION_SPEED_INTERVAL && tick_interval <= DEGRADE_MODE_DURATION
366368
&& tick_interval <= DEFAULT_TARGET_PERIOD);
367-
auto cur_tick_beg = current_tick;
368-
auto cur_tick_end = cur_tick_beg + tick_interval;
369+
auto cur_tick_end = current_tick + tick_interval;
369370
while (!stopped.load())
370371
{
371-
if (current_tick < cur_tick_end)
372372
{
373373
std::unique_lock<std::mutex> lock(mu);
374374
if (keyspace_low_token_resource_groups.empty())
@@ -388,19 +388,16 @@ void LocalAdmissionController::mainLoop()
388388
try
389389
{
390390
while (current_tick >= cur_tick_end)
391-
{
392-
updateRUConsumptionSpeed();
393-
cur_tick_beg = cur_tick_end;
394391
cur_tick_end += tick_interval;
395-
}
396392

393+
updateRUConsumptionSpeed();
397394
if (const auto gac_req_opt = buildGACRequest(/*is_final_report=*/false); gac_req_opt.has_value())
398395
{
399396
std::lock_guard lock(gac_requests_mu);
400397
gac_requests.push_back(gac_req_opt.value());
401398
gac_requests_cv.notify_all();
402399
}
403-
clearCPUTimeWithoutLock(current_tick);
400+
clearCPUTime(current_tick);
404401
checkDegradeMode();
405402
}
406403
catch (...)
@@ -437,13 +434,17 @@ std::optional<resource_manager::TokenBucketsRequest> LocalAdmissionController::b
437434
else
438435
{
439436
std::unordered_set<std::pair<KeyspaceID, std::string>, LACPairHash> local_keyspace_low_token_resource_groups;
437+
std::unordered_map<std::pair<KeyspaceID, std::string>, ResourceGroupPtr, LACPairHash>
438+
local_keyspace_resource_groups;
440439
{
441440
std::lock_guard lock(mu);
442441
local_keyspace_low_token_resource_groups = keyspace_low_token_resource_groups;
443442
keyspace_low_token_resource_groups.clear();
443+
444+
local_keyspace_resource_groups = keyspace_resource_groups;
444445
}
445446

446-
for (const auto & ele : keyspace_resource_groups)
447+
for (const auto & ele : local_keyspace_resource_groups)
447448
{
448449
const bool need_fetch_token = local_keyspace_low_token_resource_groups.contains(ele.first);
449450
const bool need_report = ele.second->shouldReportRUConsumption(current_tick);
@@ -644,7 +645,7 @@ std::vector<std::pair<KeyspaceID, std::string>> LocalAdmissionController::handle
644645

645646
const String err_msg = fmt::format("handle acquire token resp failed: rg: {}(keyspace={})", name, keyspace_id);
646647
// It's possible for one_resp.granted_r_u_tokens() to be empty
647-
// when the acquire_token_req is only for report RU consumption.
648+
// when the acquire_token_req is only for report RU consumption or GAC got error(like nan token).
648649
if (one_resp.granted_r_u_tokens().empty())
649650
{
650651
resource_group->endRequest();
@@ -653,6 +654,7 @@ std::vector<std::pair<KeyspaceID, std::string>> LocalAdmissionController::handle
653654

654655
if unlikely (one_resp.granted_r_u_tokens().size() != 1)
655656
{
657+
resource_group->endRequest();
656658
LOG_ERROR(
657659
log,
658660
"{} unexpected resp.granted_r_u_tokens().size(): {} one_resp: {}",
@@ -665,13 +667,15 @@ std::vector<std::pair<KeyspaceID, std::string>> LocalAdmissionController::handle
665667
const resource_manager::GrantedRUTokenBucket & granted_token_bucket = one_resp.granted_r_u_tokens()[0];
666668
if unlikely (granted_token_bucket.type() != resource_manager::RequestUnitType::RU)
667669
{
670+
resource_group->endRequest();
668671
LOG_ERROR(log, "{} unexpected request type, one_resp: {}", err_msg, one_resp.ShortDebugString());
669672
continue;
670673
}
671674

672675
const auto trickle_ms = granted_token_bucket.trickle_time_ms();
673676
if unlikely (trickle_ms < 0)
674677
{
678+
resource_group->endRequest();
675679
LOG_ERROR(
676680
log,
677681
"{} unexpected trickle_ms: {} one_resp: {}",
@@ -686,6 +690,7 @@ std::vector<std::pair<KeyspaceID, std::string>> LocalAdmissionController::handle
686690
double added_tokens = granted_token_bucket.granted_tokens().tokens();
687691
if unlikely (!std::isfinite(added_tokens) || added_tokens < 0.0)
688692
{
693+
resource_group->endRequest();
689694
LOG_ERROR(
690695
log,
691696
"{} invalid added_tokens: {} one_resp: {}",
@@ -858,7 +863,7 @@ bool LocalAdmissionController::handleDeleteEvent(
858863
std::lock_guard lock(mu);
859864
erase_num = deleteResourceGroupWithoutLock(keyspace_id, name);
860865
}
861-
LOG_DEBUG(log, "delete resource group {}(keyspace={}), erase_num: {}", name, keyspace_id, erase_num);
866+
LOG_INFO(log, "delete resource group {}(keyspace={}), erase_num: {}", name, keyspace_id, erase_num);
862867
return true;
863868
}
864869

@@ -896,7 +901,7 @@ bool LocalAdmissionController::handlePutEvent(
896901
updateMaxRUPerSecAfterDeleteWithoutLock(rg->user_ru_per_sec);
897902
}
898903
}
899-
LOG_DEBUG(log, "modify resource group {}(keyspace={}) to: {}", name, keyspace_id, group_pb.ShortDebugString());
904+
LOG_INFO(log, "modify resource group {}(keyspace={}) to: {}", name, keyspace_id, group_pb.ShortDebugString());
900905
return true;
901906
}
902907

dbms/src/Flash/ResourceControl/LocalAdmissionController.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -614,9 +614,10 @@ class LocalAdmissionController final : private boost::noncopyable
614614
std::string & err_msg);
615615
void updateMaxRUPerSecAfterDeleteWithoutLock(uint64_t deleted_user_ru_per_sec);
616616

617-
void clearCPUTimeWithoutLock(const SteadyClock::time_point & now)
617+
void clearCPUTime(const SteadyClock::time_point & now)
618618
{
619619
static_assert(CLEAR_CPU_TIME_DURATION > ResourceGroup::COMPUTE_RU_CONSUMPTION_SPEED_INTERVAL);
620+
std::lock_guard lock(mu);
620621
if (now - last_clear_cpu_time >= CLEAR_CPU_TIME_DURATION)
621622
{
622623
for (auto & ele : keyspace_resource_groups)

0 commit comments

Comments
 (0)