@@ -27,6 +27,75 @@ import (
     "google.golang.org/grpc/backoff"
 )
 
+type RecoveryStage int
+
+const (
+    StageUnknown RecoveryStage = iota
+    StageCollectingMeta
+    StageMakingRecoveryPlan
+    StageResetPDAllocateID
+    StageRecovering
+    StageFlashback
+)
+
+func (s RecoveryStage) String() string {
+    switch s {
+    case StageCollectingMeta:
+        return "collecting meta"
+    case StageMakingRecoveryPlan:
+        return "making recovery plan"
+    case StageResetPDAllocateID:
+        return "resetting PD allocate ID"
+    case StageRecovering:
+        return "recovering"
+    case StageFlashback:
+        return "flashback"
+    default:
+        return "unknown"
+    }
+}
+
+type recoveryError struct {
+    error
+    atStage RecoveryStage
+}
+
+func FailedAt(err error) RecoveryStage {
+    if rerr, ok := err.(recoveryError); ok {
+        return rerr.atStage
+    }
+    return StageUnknown
+}
+
+type recoveryBackoffer struct {
+    state utils.RetryState
+}
+
+func newRecoveryBackoffer() *recoveryBackoffer {
+    return &recoveryBackoffer{
+        state: utils.InitialRetryState(16, 30*time.Second, 4*time.Minute),
+    }
+}
+
+func (bo *recoveryBackoffer) NextBackoff(err error) time.Duration {
+    s := FailedAt(err)
+    switch s {
+    case StageCollectingMeta, StageMakingRecoveryPlan, StageResetPDAllocateID, StageRecovering:
+        log.Info("Recovery data retrying.", zap.Error(err), zap.Stringer("stage", s))
+        return bo.state.ExponentialBackoff()
+    case StageFlashback:
+        log.Info("Giving up retry for flashback stage.", zap.Error(err), zap.Stringer("stage", s))
+        bo.state.GiveUp()
+        return 0
+    }
+    log.Warn("unknown stage of backing off.", zap.Int("val", int(s)))
+    return bo.state.ExponentialBackoff()
+}
+
+func (bo *recoveryBackoffer) Attempt() int {
+    return bo.state.Attempt()
+}
+
 // RecoverData recover the tikv cluster
 // 1. read all meta data from tikvs
 // 2. make recovery plan and then recovery max allocate ID firstly
@@ -35,39 +104,52 @@ import (
 // 5. prepare the flashback
 // 6. flashback to resolveTS
 func RecoverData(ctx context.Context, resolveTS uint64, allStores []*metapb.Store, mgr *conn.Mgr, progress glue.Progress, restoreTS uint64, concurrency uint32) (int, error) {
+    // Roughly handle the case that some TiKVs are rebooted during making plan.
+    // Generally, retry the whole procedure will be fine for most cases. But perhaps we can do finer-grained retry,
+    // say, we may reuse the recovery plan, and probably no need to rebase PD allocation ID once we have done it.
+    return utils.WithRetryV2(ctx, newRecoveryBackoffer(), func(ctx context.Context) (int, error) {
+        return doRecoveryData(ctx, resolveTS, allStores, mgr, progress, restoreTS, concurrency)
+    })
+}
+
+func doRecoveryData(ctx context.Context, resolveTS uint64, allStores []*metapb.Store, mgr *conn.Mgr, progress glue.Progress, restoreTS uint64, concurrency uint32) (int, error) {
+    var cancel context.CancelFunc
+    ctx, cancel = context.WithCancel(ctx)
+    defer cancel()
+
     var recovery = NewRecovery(allStores, mgr, progress, concurrency)
     if err := recovery.ReadRegionMeta(ctx); err != nil {
-        return 0, errors.Trace(err)
+        return 0, recoveryError{error: err, atStage: StageCollectingMeta}
     }
 
     totalRegions := recovery.GetTotalRegions()
 
     if err := recovery.MakeRecoveryPlan(); err != nil {
-        return totalRegions, errors.Trace(err)
+        return totalRegions, recoveryError{error: err, atStage: StageMakingRecoveryPlan}
     }
 
     log.Info("recover the alloc id to pd", zap.Uint64("max alloc id", recovery.MaxAllocID))
     if err := recovery.mgr.RecoverBaseAllocID(ctx, recovery.MaxAllocID); err != nil {
-        return totalRegions, errors.Trace(err)
+        return totalRegions, recoveryError{error: err, atStage: StageResetPDAllocateID}
     }
 
     // Once TiKV shuts down and reboot then, it may be left with no leader because of the recovery mode.
     // This wathcher will retrigger `RecoveryRegions` for those stores.
     recovery.SpawnTiKVShutDownWatchers(ctx)
     if err := recovery.RecoverRegions(ctx); err != nil {
-        return totalRegions, errors.Trace(err)
+        return totalRegions, recoveryError{error: err, atStage: StageRecovering}
     }
 
     if err := recovery.WaitApply(ctx); err != nil {
-        return totalRegions, errors.Trace(err)
+        return totalRegions, recoveryError{error: err, atStage: StageRecovering}
     }
 
     if err := recovery.PrepareFlashbackToVersion(ctx, resolveTS, restoreTS-1); err != nil {
-        return totalRegions, errors.Trace(err)
+        return totalRegions, recoveryError{error: err, atStage: StageFlashback}
     }
 
     if err := recovery.FlashbackToVersion(ctx, resolveTS, restoreTS); err != nil {
-        return totalRegions, errors.Trace(err)
+        return totalRegions, recoveryError{error: err, atStage: StageFlashback}
     }
 
     return totalRegions, nil
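
The pattern this diff introduces is: each step of the recovery wraps its error in a stage-tagged error, and the backoffer inspects the stage to decide whether to retry or give up. Below is a minimal, self-contained sketch of that pattern under assumed names: stage, stageError, failedAt, and retryWithPolicy are hypothetical stand-ins for RecoveryStage, recoveryError, FailedAt, and utils.WithRetryV2/utils.RetryState, whose actual signatures live in the BR codebase.

package main

import (
    "errors"
    "fmt"
    "time"
)

type stage int

const (
    stageUnknown stage = iota
    stagePlanning
    stageFlashback
)

// stageError tags an error with the stage at which it occurred,
// mirroring recoveryError in the diff above.
type stageError struct {
    error
    at stage
}

// failedAt recovers the stage from a tagged error, defaulting to unknown,
// just like FailedAt does with a type assertion.
func failedAt(err error) stage {
    if serr, ok := err.(stageError); ok {
        return serr.at
    }
    return stageUnknown
}

// retryWithPolicy retries f, asking nextPause for a delay after each failure.
// A zero (or negative) pause means "give up and surface the error".
func retryWithPolicy(f func() error, nextPause func(error) time.Duration) error {
    for {
        err := f()
        if err == nil {
            return nil
        }
        pause := nextPause(err)
        if pause <= 0 {
            return err
        }
        time.Sleep(pause)
    }
}

func main() {
    attempts := 0
    err := retryWithPolicy(
        func() error {
            attempts++
            if attempts < 3 {
                // Planning failures are treated as transient and retried.
                return stageError{error: errors.New("tikv rebooted"), at: stagePlanning}
            }
            // Flashback failures are not retried in this sketch.
            return stageError{error: errors.New("flashback failed"), at: stageFlashback}
        },
        func(err error) time.Duration {
            if failedAt(err) == stageFlashback {
                return 0 // give up, analogous to bo.state.GiveUp()
            }
            return 10 * time.Millisecond // stands in for ExponentialBackoff()
        },
    )
    fmt.Println("attempts:", attempts, "final error:", err)
}

Giving up once the flashback stage is reached matches NextBackoff above, presumably because blindly rerunning the whole procedure after flashback has begun is not considered worthwhile or safe; earlier stages are retried with exponential backoff instead.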