-
Notifications
You must be signed in to change notification settings - Fork 6k
br: enable parallel restore #58724
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
br: enable parallel restore #58724
Changes from all commits
2bd7ac0
2336b97
0e63777
7f460c6
57977bd
bd28d82
49feca5
ff610a7
188a45a
117c262
83c90c9
2061a56
ff73ec0
0a306e7
7fb2590
99abdd3
19ead02
292e622
89e5a05
145926b
14afb10
caee50f
b97beac
21e5ed5
72933a9
6e1d8d1
ab5cc78
c360107
90fa6fa
8ddcc12
7117a46
48bf9da
026aa8c
d283e4b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
load("@io_bazel_rules_go//go:def.bzl", "go_library") | ||
|
||
# Go library target for the BR registry package (heartbeat.go and registration.go), visible to all packages. | ||
go_library( | ||
name = "registry", | ||
srcs = [ | ||
"heartbeat.go", | ||
"registration.go", | ||
], | ||
importpath = "github.com/pingcap/tidb/br/pkg/registry", | ||
visibility = ["//visibility:public"], | ||
deps = [ | ||
"//br/pkg/errors", | ||
"//br/pkg/glue", | ||
"//br/pkg/metautil", | ||
"//br/pkg/utils", | ||
"//pkg/domain", | ||
"//pkg/kv", | ||
"//pkg/util/sqlexec", | ||
"//pkg/util/table-filter", | ||
"@com_github_pingcap_errors//:errors", | ||
"@com_github_pingcap_log//:log", | ||
"@org_uber_go_zap//:zap", | ||
], | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
// Copyright 2025 PingCAP, Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package registry | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"time" | ||
|
||
"github.com/pingcap/errors" | ||
"github.com/pingcap/log" | ||
"go.uber.org/zap" | ||
) | ||
|
||
const (
	// defaultHeartbeatIntervalSeconds is the number of seconds between two
	// consecutive heartbeat updates when no other interval is configured.
	defaultHeartbeatIntervalSeconds = 60

	// UpdateHeartbeatSQLTemplate is the statement used to refresh the
	// last_heartbeat_time column of a single row selected by id.
	// The two %s verbs are filled in with the database and table name via
	// fmt.Sprintf; each %%? collapses to a %? placeholder that is bound when
	// the statement is executed.
	UpdateHeartbeatSQLTemplate = `
		UPDATE %s.%s
		SET last_heartbeat_time = %%?
		WHERE id = %%?`
)
|
||
// UpdateHeartbeat updates the last_heartbeat_time timestamp for a task | ||
func (r *Registry) UpdateHeartbeat(ctx context.Context, restoreID uint64) error { | ||
currentTime := time.Now() | ||
updateSQL := fmt.Sprintf(UpdateHeartbeatSQLTemplate, RestoreRegistryDBName, RestoreRegistryTableName) | ||
|
||
if err := r.heartbeatSession.ExecuteInternal(ctx, updateSQL, currentTime, restoreID); err != nil { | ||
return errors.Annotatef(err, "failed to update heartbeat for task %d", restoreID) | ||
} | ||
|
||
log.Debug("updated task heartbeat", | ||
zap.Uint64("restore_id", restoreID), | ||
zap.Time("timestamp", currentTime)) | ||
|
||
return nil | ||
} | ||
|
||
// HeartbeatManager handles periodic heartbeat updates for a restore task | ||
// it only updates the restore task but will not remove any stalled tasks, the purpose of this logic is to provide | ||
// some insights to user of the task status | ||
type HeartbeatManager struct { | ||
registry *Registry | ||
restoreID uint64 | ||
interval time.Duration | ||
stopCh chan struct{} | ||
doneCh chan struct{} | ||
} | ||
|
||
// NewHeartbeatManager creates a new heartbeat manager for the given restore task | ||
func NewHeartbeatManager(registry *Registry, restoreID uint64) *HeartbeatManager { | ||
return &HeartbeatManager{ | ||
registry: registry, | ||
restoreID: restoreID, | ||
interval: time.Duration(defaultHeartbeatIntervalSeconds) * time.Second, | ||
stopCh: make(chan struct{}), | ||
doneCh: make(chan struct{}), | ||
} | ||
} | ||
|
||
// Start begins the heartbeat background process | ||
func (m *HeartbeatManager) Start(ctx context.Context) { | ||
go func() { | ||
defer close(m.doneCh) | ||
|
||
ticker := time.NewTicker(m.interval) | ||
defer ticker.Stop() | ||
|
||
// send an initial heartbeat | ||
if err := m.registry.UpdateHeartbeat(ctx, m.restoreID); err != nil { | ||
log.Warn("failed to send initial heartbeat", | ||
zap.Uint64("restore_id", m.restoreID), | ||
zap.Error(err)) | ||
} | ||
|
||
for { | ||
select { | ||
case <-ticker.C: | ||
if err := m.registry.UpdateHeartbeat(ctx, m.restoreID); err != nil { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If heartbeat is expired, will it override it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes this is just periodically publishing a new timestamp, it doesn't do anything else, just a hint to user of the status of this task and they can decide it's live or an orphan |
||
log.Warn("failed to update heartbeat", | ||
zap.Uint64("restore_id", m.restoreID), | ||
zap.Error(err)) | ||
} | ||
case <-m.stopCh: | ||
return | ||
case <-ctx.Done(): | ||
log.Warn("heartbeat manager context done", | ||
zap.Uint64("restore_id", m.restoreID), | ||
zap.Error(ctx.Err())) | ||
return | ||
} | ||
} | ||
}() | ||
} | ||
|
||
// Stop ends the heartbeat background process | ||
func (m *HeartbeatManager) Stop() { | ||
close(m.stopCh) | ||
<-m.doneCh // Wait for goroutine to exit | ||
log.Info("stopped heartbeat manager") | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's the purpose of introducing the heartbeat?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is to give users an idea of whether this restore task is orphaned or not. If a running BR pod is OOM-killed, it has no chance to clean up the table and the restore task, so the user might need to clean them up manually, but such cases should be very rare.