Skip to content

Commit cbafb78

Browse files
authored
ddl: refresh TiFlash PlacementRules periodically (#61865)
close #61864
1 parent c3acc96 commit cbafb78

File tree

4 files changed

+126
-0
lines changed

4 files changed

+126
-0
lines changed

pkg/ddl/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ go_library(
7676
deps = [
7777
"//br/pkg/storage",
7878
"//pkg/config",
79+
"//pkg/config/kerneltype",
7980
"//pkg/ddl/copr",
8081
"//pkg/ddl/ingest",
8182
"//pkg/ddl/label",

pkg/ddl/ddl_tiflash_api.go

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,14 @@ import (
3030

3131
"github.com/pingcap/errors"
3232
"github.com/pingcap/failpoint"
33+
"github.com/pingcap/tidb/pkg/config/kerneltype"
3334
"github.com/pingcap/tidb/pkg/ddl/logutil"
3435
ddlutil "github.com/pingcap/tidb/pkg/ddl/util"
3536
"github.com/pingcap/tidb/pkg/domain/infosync"
3637
"github.com/pingcap/tidb/pkg/infoschema"
3738
infoschemacontext "github.com/pingcap/tidb/pkg/infoschema/context"
3839
"github.com/pingcap/tidb/pkg/meta/model"
40+
"github.com/pingcap/tidb/pkg/parser/ast"
3941
"github.com/pingcap/tidb/pkg/sessionctx"
4042
"github.com/pingcap/tidb/pkg/table"
4143
"github.com/pingcap/tidb/pkg/util"
@@ -221,6 +223,8 @@ var (
221223
PullTiFlashPdTick = atomicutil.NewUint64(30 * 5)
222224
// UpdateTiFlashStoreTick indicates the number of intervals before we fully update TiFlash stores.
223225
UpdateTiFlashStoreTick = atomicutil.NewUint64(5)
226+
// RefreshRulesTick indicates the number of intervals before we refresh TiFlash rules.
227+
RefreshRulesTick = atomicutil.NewUint64(10)
224228
// PollTiFlashBackoffMaxTick is the max tick before we try to update TiFlash replica availability for one table.
225229
PollTiFlashBackoffMaxTick TiFlashTick = 10
226230
// PollTiFlashBackoffMinTick is the min tick before we try to update TiFlash replica availability for one table.
@@ -548,6 +552,99 @@ func (d *ddl) refreshTiFlashTicker(ctx sessionctx.Context, pollTiFlashContext *T
548552
return nil
549553
}
550554

555+
type pending struct {
556+
ID int64
557+
TableInfo *model.TableInfo
558+
DBInfo *model.DBInfo
559+
}
560+
561+
// refreshTiFlashPlacementRules will refresh the placement rules of TiFlash replicas if on tick.
562+
// 1. It will scan all the meta and check if there is any TiFlash replica.
563+
// 2. If there is, it will check if the placement rules are missing.
564+
// 3. If the placement rules are missing, it will add by submit a ActionSetTiFlashReplica job to repair the entire table.
565+
func (d *ddl) refreshTiFlashPlacementRules(sctx sessionctx.Context, tick uint64) error {
566+
if tick%RefreshRulesTick.Load() != 0 {
567+
return nil
568+
}
569+
schema := d.infoCache.GetLatest()
570+
if schema == nil {
571+
return errors.New("schema is nil")
572+
}
573+
574+
var pendings []pending
575+
576+
for _, dbResult := range schema.ListTablesWithSpecialAttribute(infoschemacontext.TiFlashAttribute) {
577+
db, ok := schema.SchemaByName(dbResult.DBName)
578+
if !ok {
579+
return infoschema.ErrDatabaseNotExists.GenWithStackByArgs(dbResult.DBName.O)
580+
}
581+
for _, tblInfo := range dbResult.TableInfos {
582+
if tblInfo.TiFlashReplica == nil {
583+
continue
584+
}
585+
586+
if ps := tblInfo.GetPartitionInfo(); ps != nil {
587+
collectPendings := func(ps []model.PartitionDefinition) {
588+
for _, p := range ps {
589+
pendings = append(pendings, pending{
590+
ID: p.ID,
591+
TableInfo: tblInfo,
592+
DBInfo: db,
593+
})
594+
}
595+
}
596+
collectPendings(ps.Definitions)
597+
collectPendings(ps.AddingDefinitions)
598+
} else {
599+
pendings = append(pendings, pending{
600+
ID: tblInfo.ID,
601+
TableInfo: tblInfo,
602+
DBInfo: db,
603+
})
604+
}
605+
}
606+
}
607+
608+
fixed := make(map[int64]struct{})
609+
for _, replica := range pendings {
610+
if _, ok := fixed[replica.TableInfo.ID]; ok {
611+
continue
612+
}
613+
rule, err := infosync.GetPlacementRule(d.ctx, replica.ID)
614+
if err != nil {
615+
logutil.DDLLogger().Warn("get placement rule err", zap.Error(err))
616+
continue
617+
}
618+
// pdhttp.GetPlacementRule returns the zero object instead of nil pointer when not found.
619+
ruleIsMissing := rule == nil || len(rule.ID) == 0
620+
if ruleIsMissing && replica.TableInfo.TiFlashReplica.Count > 0 {
621+
job := &model.Job{
622+
Version: model.GetJobVerInUse(),
623+
SchemaID: replica.DBInfo.ID,
624+
TableID: replica.TableInfo.ID,
625+
SchemaName: replica.DBInfo.Name.L,
626+
TableName: replica.TableInfo.Name.L,
627+
Type: model.ActionSetTiFlashReplica,
628+
BinlogInfo: &model.HistoryInfo{},
629+
CDCWriteSource: sctx.GetSessionVars().CDCWriteSource,
630+
SQLMode: sctx.GetSessionVars().SQLMode,
631+
}
632+
args := model.SetTiFlashReplicaArgs{TiflashReplica: ast.TiFlashReplicaSpec{
633+
Count: replica.TableInfo.TiFlashReplica.Count,
634+
Labels: replica.TableInfo.TiFlashReplica.LocationLabels,
635+
}}
636+
err = d.executor.doDDLJob2(sctx, job, &args)
637+
if err != nil {
638+
logutil.DDLLogger().Warn("fix placement rule err", zap.Error(err))
639+
} else {
640+
logutil.DDLLogger().Info("fix placement rule success", zap.Int64("tableID", replica.TableInfo.ID))
641+
fixed[replica.TableInfo.ID] = struct{}{}
642+
}
643+
}
644+
}
645+
return nil
646+
}
647+
551648
func (d *ddl) PollTiFlashRoutine() {
552649
pollTiflashContext, err := NewTiFlashManagementContext()
553650
if err != nil {
@@ -594,6 +691,11 @@ func (d *ddl) PollTiFlashRoutine() {
594691
logutil.DDLLogger().Warn("refreshTiFlashTicker returns error", zap.Error(err))
595692
}
596693
}
694+
if kerneltype.IsNextGen() {
695+
if err := d.refreshTiFlashPlacementRules(sctx, pollTiflashContext.PollCounter); err != nil {
696+
logutil.DDLLogger().Warn("refreshTiFlashPlacementRules returns error", zap.Error(err))
697+
}
698+
}
597699
} else {
598700
infosync.CleanTiFlashProgressCache()
599701
}

pkg/domain/infosync/info.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1305,6 +1305,16 @@ func GetTiFlashRegionCountFromPD(ctx context.Context, tableID int64, regionCount
13051305
return is.tiflashReplicaManager.GetRegionCountFromPD(ctx, tableID, regionCount)
13061306
}
13071307

1308+
// GetPlacementRule is a helper function to get placement rule by table id.
1309+
func GetPlacementRule(ctx context.Context, tableID int64) (*pdhttp.Rule, error) {
1310+
is, err := getGlobalInfoSyncer()
1311+
if err != nil {
1312+
return nil, err
1313+
}
1314+
1315+
return is.tiflashReplicaManager.GetPlacementRule(ctx, tableID)
1316+
}
1317+
13081318
// GetTiFlashStoresStat gets the TiKV store information by accessing PD's api.
13091319
func GetTiFlashStoresStat(ctx context.Context) (*pdhttp.StoresInfo, error) {
13101320
is, err := getGlobalInfoSyncer()

pkg/domain/infosync/tiflash_manager.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ type TiFlashReplicaManager interface {
5555
SetPlacementRule(ctx context.Context, rule *pd.Rule) error
5656
// SetPlacementRuleBatch is a helper function to set a batch of placement rules.
5757
SetPlacementRuleBatch(ctx context.Context, rules []*pd.Rule) error
58+
// GetPlacementRule is a helper function to get placement rule.
59+
GetPlacementRule(ctx context.Context, tableID int64) (*pd.Rule, error)
5860
// DeletePlacementRule is to delete placement rule for certain group.
5961
DeletePlacementRule(ctx context.Context, group string, ruleID string) error
6062
// GetGroupRules to get all placement rule in a certain group.
@@ -89,6 +91,13 @@ type TiFlashReplicaManagerCtx struct {
8991
codec tikv.Codec
9092
}
9193

94+
// GetPlacementRule is a helper function to get placement rule by table id.
95+
func (m *TiFlashReplicaManagerCtx) GetPlacementRule(ctx context.Context, tableID int64) (*pd.Rule, error) {
96+
ruleID := MakeRuleID(tableID)
97+
ruleID = encodeRuleID(m.codec, ruleID)
98+
return m.pdHTTPCli.GetPlacementRule(ctx, placement.TiFlashRuleGroupID, ruleID)
99+
}
100+
92101
// Close is called to close TiFlashReplicaManagerCtx.
93102
func (*TiFlashReplicaManagerCtx) Close(context.Context) {}
94103

@@ -349,6 +358,10 @@ func makeBaseRule() pd.Rule {
349358
}
350359
}
351360

361+
func (*mockTiFlashReplicaManagerCtx) GetPlacementRule(context.Context, int64) (*pd.Rule, error) {
362+
return nil, errors.New("not implemented")
363+
}
364+
352365
// MakeNewRule creates a pd rule for TiFlash.
353366
func MakeNewRule(id int64, count uint64, locationLabels []string) pd.Rule {
354367
ruleID := MakeRuleID(id)

0 commit comments

Comments
 (0)