Skip to content

Commit b979744

Browse files
branch-3.0: [fix](cloud-mow) FE should release mow lock when calculate delete bitmap catch exception (#43088)
PR Body: Currently the MoW table lock is released on the meta service (MS) when the txn is committed. However, if calculating the delete bitmap fails before the txn is committed, the lock is never released, which causes other loading tasks to hang while getting the MoW lock until the lock held by the previous txn expires. Cherry-picked from #41759 Co-authored-by: huanghaibin <[email protected]>
1 parent c71cb2d commit b979744

File tree

13 files changed

+313
-2
lines changed

13 files changed

+313
-2
lines changed

be/src/cloud/cloud_meta_mgr.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,9 @@ static std::string debug_info(const Request& req) {
293293
return fmt::format(" tablet_id={}", req.rowset_meta().tablet_id());
294294
} else if constexpr (is_any_v<Request, RemoveDeleteBitmapRequest>) {
295295
return fmt::format(" tablet_id={}", req.tablet_id());
296+
} else if constexpr (is_any_v<Request, RemoveDeleteBitmapUpdateLockRequest>) {
297+
return fmt::format(" table_id={}, tablet_id={}, lock_id={}", req.table_id(),
298+
req.tablet_id(), req.lock_id());
296299
} else {
297300
static_assert(!sizeof(Request));
298301
}
@@ -1112,6 +1115,25 @@ Status CloudMetaMgr::get_delete_bitmap_update_lock(const CloudTablet& tablet, in
11121115
return st;
11131116
}
11141117

1118+
// Asks the meta service to remove the delete bitmap update lock for the table that `tablet`
// belongs to.
//
// tablet:    tablet issuing the release; supplies the table_id and tablet_id sent in the request.
// lock_id:   lock id sent to the meta service.
// initiator: initiator sent to the meta service.
//
// Returns the status of the RPC. A failure is logged at WARNING level and returned unchanged.
Status CloudMetaMgr::remove_delete_bitmap_update_lock(const CloudTablet& tablet, int64_t lock_id,
                                                      int64_t initiator) {
    VLOG_DEBUG << "remove_delete_bitmap_update_lock , tablet_id: " << tablet.tablet_id()
               << ",lock_id:" << lock_id;
    RemoveDeleteBitmapUpdateLockRequest req;
    RemoveDeleteBitmapUpdateLockResponse res;
    req.set_cloud_unique_id(config::cloud_unique_id);
    // The meta service checks the lock and builds the lock key from table_id, and debug_info()
    // logs it, so it has to be filled in; otherwise the proto default value is used instead.
    req.set_table_id(tablet.table_id());
    req.set_tablet_id(tablet.tablet_id());
    req.set_lock_id(lock_id);
    req.set_initiator(initiator);
    auto st = retry_rpc("remove delete bitmap update lock", req, &res,
                        &MetaService_Stub::remove_delete_bitmap_update_lock);
    if (!st.ok()) {
        LOG(WARNING) << "remove delete bitmap update lock fail,tablet_id=" << tablet.tablet_id()
                     << " lock_id=" << lock_id << " st=" << st.to_string();
    }
    return st;
}
1136+
11151137
Status CloudMetaMgr::remove_old_version_delete_bitmap(
11161138
int64_t tablet_id,
11171139
const std::vector<std::tuple<std::string, uint64_t, uint64_t>>& to_delete) {

be/src/cloud/cloud_meta_mgr.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,9 @@ class CloudMetaMgr {
101101
Status get_delete_bitmap_update_lock(const CloudTablet& tablet, int64_t lock_id,
102102
int64_t initiator);
103103

104+
// Sends a remove_delete_bitmap_update_lock RPC to the meta service for `tablet` with the given
// `lock_id` and `initiator`, and returns the resulting status.
Status remove_delete_bitmap_update_lock(const CloudTablet& tablet, int64_t lock_id,
105+
int64_t initiator);
106+
104107
Status remove_old_version_delete_bitmap(
105108
int64_t tablet_id,
106109
const std::vector<std::tuple<std::string, uint64_t, uint64_t>>& to_delete);

cloud/src/common/bvars.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ BvarLatencyRecorderWithTag g_bvar_ms_get_delete_bitmap("ms", "get_delete_bitmap"
7474
BvarLatencyRecorderWithTag g_bvar_ms_get_delete_bitmap_update_lock("ms",
7575
"get_delete_bitmap_update_lock");
7676
BvarLatencyRecorderWithTag g_bvar_ms_remove_delete_bitmap("ms", "remove_delete_bitmap");
77+
// Latency recorder tagged ("ms", "remove_delete_bitmap_update_lock").
// NOTE(review): presumably fed by RPC_PREPROCESS(remove_delete_bitmap_update_lock) - confirm.
BvarLatencyRecorderWithTag g_bvar_ms_remove_delete_bitmap_update_lock(
78+
"ms", "remove_delete_bitmap_update_lock");
7779
BvarLatencyRecorderWithTag g_bvar_ms_get_instance("ms", "get_instance");
7880
BvarLatencyRecorderWithTag g_bvar_ms_get_rl_task_commit_attach("ms", "get_rl_task_commit_attach");
7981
BvarLatencyRecorderWithTag g_bvar_ms_reset_rl_progress("ms", "reset_rl_progress");

cloud/src/common/bvars.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ extern BvarLatencyRecorderWithTag g_bvar_ms_update_delete_bitmap;
173173
extern BvarLatencyRecorderWithTag g_bvar_ms_get_delete_bitmap;
174174
extern BvarLatencyRecorderWithTag g_bvar_ms_get_delete_bitmap_update_lock;
175175
extern BvarLatencyRecorderWithTag g_bvar_ms_remove_delete_bitmap;
176+
// Latency recorder for the remove_delete_bitmap_update_lock RPC; defined in bvars.cpp.
extern BvarLatencyRecorderWithTag g_bvar_ms_remove_delete_bitmap_update_lock;
176177
extern BvarLatencyRecorderWithTag g_bvar_ms_get_cluster_status;
177178
extern BvarLatencyRecorderWithTag g_bvar_ms_set_cluster_status;
178179
extern BvarLatencyRecorderWithTag g_bvar_ms_get_instance;

cloud/src/meta-service/meta_service.cpp

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2095,6 +2095,58 @@ void MetaServiceImpl::get_delete_bitmap_update_lock(google::protobuf::RpcControl
20952095
}
20962096
}
20972097

2098+
// RPC handler that removes the delete bitmap update lock of a table.
//
// Steps: validate cloud_unique_id, resolve the instance id, open a KV txn, verify the lock with
// check_delete_bitmap_lock() using the request's table_id / lock_id / initiator, then remove the
// lock key and commit. On any failure `code` and `msg` are set and the handler returns early.
//
// NOTE(review): `code`, `msg`, `ss` and `instance_id` are not declared in this function; they
// are presumably introduced by RPC_PREPROCESS, which presumably also fills `response` and runs
// `done` on exit - confirm against the macro definition.
void MetaServiceImpl::remove_delete_bitmap_update_lock(
2099+
google::protobuf::RpcController* controller,
2100+
const RemoveDeleteBitmapUpdateLockRequest* request,
2101+
RemoveDeleteBitmapUpdateLockResponse* response, ::google::protobuf::Closure* done) {
2102+
RPC_PREPROCESS(remove_delete_bitmap_update_lock);
2103+
// An absent cloud_unique_id is treated the same as an empty one and rejected.
std::string cloud_unique_id = request->has_cloud_unique_id() ? request->cloud_unique_id() : "";
2104+
if (cloud_unique_id.empty()) {
2105+
code = MetaServiceCode::INVALID_ARGUMENT;
2106+
msg = "cloud unique id not set";
2107+
return;
2108+
}
2109+
2110+
// Map the caller's cloud_unique_id to an instance id; an empty result is an invalid argument.
instance_id = get_instance_id(resource_mgr_, cloud_unique_id);
2111+
if (instance_id.empty()) {
2112+
code = MetaServiceCode::INVALID_ARGUMENT;
2113+
msg = "empty instance_id";
2114+
LOG(INFO) << msg << ", cloud_unique_id=" << cloud_unique_id;
2115+
return;
2116+
}
2117+
2118+
RPC_RATE_LIMIT(remove_delete_bitmap_update_lock)
2119+
std::unique_ptr<Transaction> txn;
2120+
TxnErrorCode err = txn_kv_->create_txn(&txn);
2121+
if (err != TxnErrorCode::TXN_OK) {
2122+
code = cast_as<ErrCategory::CREATE>(err);
2123+
msg = "failed to init txn";
2124+
return;
2125+
}
2126+
// Only remove the lock when check_delete_bitmap_lock() accepts the request's table_id,
// lock_id and initiator; on rejection the helper is handed code/msg to report the reason.
if (!check_delete_bitmap_lock(code, msg, ss, txn, instance_id, request->table_id(),
2127+
request->lock_id(), request->initiator())) {
2128+
LOG(WARNING) << "failed to check delete bitmap tablet lock"
2129+
<< " table_id=" << request->table_id() << " tablet_id=" << request->tablet_id()
2130+
<< " request lock_id=" << request->lock_id()
2131+
<< " request initiator=" << request->initiator() << " msg " << msg;
2132+
return;
2133+
}
2134+
// The lock key is built from instance_id and table_id; the third component is always -1 here.
// The request's tablet_id is not part of the key and is used only in log messages.
std::string lock_key =
2135+
meta_delete_bitmap_update_lock_key({instance_id, request->table_id(), -1});
2136+
txn->remove(lock_key);
2137+
err = txn->commit();
2138+
if (err != TxnErrorCode::TXN_OK) {
2139+
code = cast_as<ErrCategory::COMMIT>(err);
2140+
ss << "failed to remove delete bitmap tablet lock , err=" << err;
2141+
msg = ss.str();
2142+
return;
2143+
}
2144+
2145+
LOG(INFO) << "remove delete bitmap table lock table_id=" << request->table_id()
2146+
<< " tablet_id=" << request->tablet_id() << " lock_id=" << request->lock_id()
2147+
<< ", key=" << hex(lock_key) << ", initiator=" << request->initiator();
2148+
}
2149+
20982150
void MetaServiceImpl::remove_delete_bitmap(google::protobuf::RpcController* controller,
20992151
const RemoveDeleteBitmapRequest* request,
21002152
RemoveDeleteBitmapResponse* response,

cloud/src/meta-service/meta_service.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,11 @@ class MetaServiceImpl : public cloud::MetaService {
274274
RemoveDeleteBitmapResponse* response,
275275
::google::protobuf::Closure* done) override;
276276

277+
// RPC handler that removes the delete bitmap update lock identified by the request's
// table_id, after checking the request's lock_id and initiator.
void remove_delete_bitmap_update_lock(google::protobuf::RpcController* controller,
278+
const RemoveDeleteBitmapUpdateLockRequest* request,
279+
RemoveDeleteBitmapUpdateLockResponse* response,
280+
::google::protobuf::Closure* done) override;
281+
277282
// cloud control get cluster's status by this api
278283
void get_cluster_status(google::protobuf::RpcController* controller,
279284
const GetClusterStatusRequest* request,
@@ -647,6 +652,14 @@ class MetaServiceProxy final : public MetaService {
647652
call_impl(&cloud::MetaService::remove_delete_bitmap, controller, request, response, done);
648653
}
649654

655+
// Forwards the remove_delete_bitmap_update_lock RPC to call_impl() with the matching
// cloud::MetaService member function; all arguments are passed through unchanged.
void remove_delete_bitmap_update_lock(google::protobuf::RpcController* controller,
656+
const RemoveDeleteBitmapUpdateLockRequest* request,
657+
RemoveDeleteBitmapUpdateLockResponse* response,
658+
::google::protobuf::Closure* done) override {
659+
call_impl(&cloud::MetaService::remove_delete_bitmap_update_lock, controller, request,
660+
response, done);
661+
}
662+
650663
// cloud control get cluster's status by this api
651664
void get_cluster_status(google::protobuf::RpcController* controller,
652665
const GetClusterStatusRequest* request,

fe/fe-core/src/main/java/org/apache/doris/cloud/rpc/MetaServiceClient.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,17 @@ public Cloud.GetDeleteBitmapUpdateLockResponse getDeleteBitmapUpdateLock(
345345
return blockingStub.getDeleteBitmapUpdateLock(request);
346346
}
347347

348+
/**
 * Sends a RemoveDeleteBitmapUpdateLock request through the blocking stub.
 *
 * <p>If the request has no cloud_unique_id, a copy of it is built with
 * {@code Config.cloud_unique_id} filled in and that copy is sent; otherwise the request is
 * sent as given.
 *
 * @param request the request to send
 * @return the response returned by the blocking stub
 */
public Cloud.RemoveDeleteBitmapUpdateLockResponse removeDeleteBitmapUpdateLock(
349+
Cloud.RemoveDeleteBitmapUpdateLockRequest request) {
350+
if (!request.hasCloudUniqueId()) {
351+
Cloud.RemoveDeleteBitmapUpdateLockRequest.Builder builder = Cloud.RemoveDeleteBitmapUpdateLockRequest
352+
.newBuilder();
353+
builder.mergeFrom(request);
354+
return blockingStub.removeDeleteBitmapUpdateLock(builder.setCloudUniqueId(Config.cloud_unique_id).build());
355+
}
356+
return blockingStub.removeDeleteBitmapUpdateLock(request);
357+
}
358+
348359
public Cloud.GetInstanceResponse getInstance(Cloud.GetInstanceRequest request) {
349360
if (!request.hasCloudUniqueId()) {
350361
Cloud.GetInstanceRequest.Builder builder = Cloud.GetInstanceRequest.newBuilder();

fe/fe-core/src/main/java/org/apache/doris/cloud/rpc/MetaServiceProxy.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,12 @@ public Cloud.GetDeleteBitmapUpdateLockResponse getDeleteBitmapUpdateLock(
335335
return w.executeRequest((client) -> client.getDeleteBitmapUpdateLock(request));
336336
}
337337

338+
/**
 * Runs the client's removeDeleteBitmapUpdateLock call for {@code request} via
 * {@code w.executeRequest}.
 *
 * @param request the request to send
 * @return the response produced by the client call
 * @throws RpcException declared by this method; presumably raised by executeRequest when the
 *         call fails - confirm against its implementation
 */
public Cloud.RemoveDeleteBitmapUpdateLockResponse removeDeleteBitmapUpdateLock(
339+
Cloud.RemoveDeleteBitmapUpdateLockRequest request)
340+
throws RpcException {
341+
return w.executeRequest((client) -> client.removeDeleteBitmapUpdateLock(request));
342+
}
343+
338344
public Cloud.AlterObjStoreInfoResponse alterObjStoreInfo(Cloud.AlterObjStoreInfoRequest request)
339345
throws RpcException {
340346
return w.executeRequest((client) -> client.alterObjStoreInfo(request));

fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@
5656
import org.apache.doris.cloud.proto.Cloud.MetaServiceCode;
5757
import org.apache.doris.cloud.proto.Cloud.PrecommitTxnRequest;
5858
import org.apache.doris.cloud.proto.Cloud.PrecommitTxnResponse;
59+
import org.apache.doris.cloud.proto.Cloud.RemoveDeleteBitmapUpdateLockRequest;
60+
import org.apache.doris.cloud.proto.Cloud.RemoveDeleteBitmapUpdateLockResponse;
5961
import org.apache.doris.cloud.proto.Cloud.SubTxnInfo;
6062
import org.apache.doris.cloud.proto.Cloud.TableStatsPB;
6163
import org.apache.doris.cloud.proto.Cloud.TabletIndexPB;
@@ -648,7 +650,13 @@ private void calcDeleteBitmapForMow(long dbId, List<OlapTable> tableList, long t
648650
Map<Long, List<TCalcDeleteBitmapPartitionInfo>> backendToPartitionInfos = getCalcDeleteBitmapInfo(
649651
backendToPartitionTablets, partitionVersions, baseCompactionCnts, cumulativeCompactionCnts,
650652
cumulativePoints);
651-
sendCalcDeleteBitmaptask(dbId, transactionId, backendToPartitionInfos);
653+
try {
654+
sendCalcDeleteBitmaptask(dbId, transactionId, backendToPartitionInfos);
655+
} catch (UserException e) {
656+
LOG.warn("failed to sendCalcDeleteBitmaptask for txn=" + transactionId + ",exception=" + e.getMessage());
657+
removeDeleteBitmapUpdateLock(tableToPartitions, transactionId);
658+
throw e;
659+
}
652660
}
653661

654662
private void getPartitionInfo(List<OlapTable> tableList,
@@ -869,6 +877,33 @@ private void getDeleteBitmapUpdateLock(Map<Long, Set<Long>> tableToParttions, lo
869877
}
870878
}
871879

880+
private void removeDeleteBitmapUpdateLock(Map<Long, Set<Long>> tableToParttions, long transactionId) {
881+
for (Map.Entry<Long, Set<Long>> entry : tableToParttions.entrySet()) {
882+
RemoveDeleteBitmapUpdateLockRequest.Builder builder = RemoveDeleteBitmapUpdateLockRequest.newBuilder();
883+
builder.setTableId(entry.getKey())
884+
.setLockId(transactionId)
885+
.setInitiator(-1);
886+
final RemoveDeleteBitmapUpdateLockRequest request = builder.build();
887+
RemoveDeleteBitmapUpdateLockResponse response = null;
888+
try {
889+
response = MetaServiceProxy.getInstance().removeDeleteBitmapUpdateLock(request);
890+
if (LOG.isDebugEnabled()) {
891+
LOG.debug("remove delete bitmap lock, transactionId={}, Request: {}, Response: {}",
892+
transactionId, request, response);
893+
}
894+
Preconditions.checkNotNull(response);
895+
Preconditions.checkNotNull(response.getStatus());
896+
if (response.getStatus().getCode() != MetaServiceCode.OK) {
897+
LOG.warn("remove delete bitmap lock failed, transactionId={}, response:{}",
898+
transactionId, response);
899+
}
900+
} catch (Exception e) {
901+
LOG.warn("ignore get delete bitmap lock exception, transactionId={}, exception={}",
902+
transactionId, e);
903+
}
904+
}
905+
}
906+
872907
private void sendCalcDeleteBitmaptask(long dbId, long transactionId,
873908
Map<Long, List<TCalcDeleteBitmapPartitionInfo>> backendToPartitionInfos)
874909
throws UserException {

gensrc/proto/cloud.proto

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1463,6 +1463,18 @@ message GetDeleteBitmapUpdateLockResponse {
14631463
repeated int64 cumulative_points = 4;
14641464
}
14651465

1466+
// Request to remove the delete bitmap update lock of a table.
message RemoveDeleteBitmapUpdateLockRequest {
1467+
optional string cloud_unique_id = 1; // For auth
1468+
// Table whose lock is removed; the meta service builds the lock key from it.
optional int64 table_id = 2;
1469+
// Only appears in meta service log messages in the handler added with this message.
optional int64 tablet_id = 3;
1470+
// lock_id and initiator are passed to the meta service's lock check before removal.
optional int64 lock_id = 4;
1471+
optional int64 initiator = 5;
1472+
}
1473+
1474+
// Response to RemoveDeleteBitmapUpdateLockRequest; carries only the result status.
message RemoveDeleteBitmapUpdateLockResponse {
1475+
optional MetaServiceResponseStatus status = 1;
1476+
}
1477+
14661478
message GetRLTaskCommitAttachRequest {
14671479
optional string cloud_unique_id = 1; // For auth
14681480
optional int64 db_id = 2;
@@ -1574,6 +1586,7 @@ service MetaService {
15741586
rpc update_delete_bitmap(UpdateDeleteBitmapRequest) returns(UpdateDeleteBitmapResponse);
15751587
rpc get_delete_bitmap(GetDeleteBitmapRequest) returns(GetDeleteBitmapResponse);
15761588
rpc get_delete_bitmap_update_lock(GetDeleteBitmapUpdateLockRequest) returns(GetDeleteBitmapUpdateLockResponse);
1589+
rpc remove_delete_bitmap_update_lock(RemoveDeleteBitmapUpdateLockRequest) returns(RemoveDeleteBitmapUpdateLockResponse);
15771590
rpc remove_delete_bitmap(RemoveDeleteBitmapRequest) returns(RemoveDeleteBitmapResponse);
15781591

15791592
// routine load progress

0 commit comments

Comments
 (0)