@@ -4,6 +4,7 @@ package restore
 import (
	"context"
	"io"
+	"time"

	"github.com/pingcap/errors"
	"github.com/pingcap/kvproto/pkg/metapb"
@@ -12,7 +13,9 @@ import (
	"github.com/pingcap/tidb/br/pkg/common"
	"github.com/pingcap/tidb/br/pkg/conn"
	"github.com/pingcap/tidb/br/pkg/glue"
+	"github.com/pingcap/tidb/br/pkg/logutil"
	"github.com/pingcap/tidb/br/pkg/utils"
+	"github.com/pingcap/tidb/br/pkg/utils/storewatch"
	"github.com/pingcap/tidb/ddl"
	"github.com/pingcap/tidb/util/mathutil"
	tikvstore "github.com/tikv/client-go/v2/kv"
@@ -48,6 +51,9 @@ func RecoverData(ctx context.Context, resolveTS uint64, allStores []*metapb.Stor
		return totalRegions, errors.Trace(err)
	}

+	// Once a TiKV shuts down and then reboots, it may be left with no leader because of the recovery mode.
+	// This watcher re-triggers `RecoverRegions` for those stores.
+	recovery.SpawnTiKVShutDownWatchers(ctx)
	if err := recovery.RecoverRegions(ctx); err != nil {
		return totalRegions, errors.Trace(err)
	}
@@ -213,6 +219,39 @@ func (recovery *Recovery) GetTotalRegions() int {
	return len(regions)
 }

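+// RecoverRegionOfStore opens a gRPC stream to one TiKV store and sends it the recovery
+// commands from `plan`, then waits for the store to acknowledge the result.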
+func (recovery *Recovery) RecoverRegionOfStore(ctx context.Context, storeID uint64, plan []*recovpb.RecoverRegionRequest) error {
+	storeAddr := getStoreAddress(recovery.allStores, storeID)
+	recoveryClient, conn, err := recovery.newRecoveryClient(ctx, storeAddr)
+	if err != nil {
+		log.Error("create tikv client failed", zap.Uint64("store id", storeID))
+		return errors.Trace(err)
+	}
+	defer conn.Close()
+	log.Info("send recover region to tikv", zap.String("tikv address", storeAddr), zap.Uint64("store id", storeID))
+	stream, err := recoveryClient.RecoverRegion(ctx)
+	if err != nil {
+		log.Error("create recover region failed", zap.Uint64("store id", storeID))
+		return errors.Trace(err)
+	}
+
+	// send this store's recovery commands over the stream
+	for _, s := range plan {
+		if err = stream.Send(s); err != nil {
+			log.Error("send recover region failed", zap.Error(err))
+			return errors.Trace(err)
+		}
+	}
+
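+	// close our side of the stream and wait for the store's reply to the whole plan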
+	reply, err := stream.CloseAndRecv()
+	if err != nil {
+		log.Error("close the stream failed")
+		return errors.Trace(err)
+	}
+	recovery.progress.Inc()
+	log.Info("recover region execution success", zap.Uint64("store id", reply.GetStoreId()))
+	return nil
+}
+
 // RecoverRegions sends the recovery plan to recover the regions (force leader, etc.);
 // it is sent only to the TiKVs that have regions to recover.
 func (recovery *Recovery) RecoverRegions(ctx context.Context) (err error) {
@@ -224,46 +263,60 @@ func (recovery *Recovery) RecoverRegions(ctx context.Context) (err error) {
		if err := ectx.Err(); err != nil {
			break
		}
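+		// capture the loop variables; each worker goroutine below needs its own copy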
+		storeId := storeId
+		plan := plan

-		storeAddr := getStoreAddress(recovery.allStores, storeId)
-		recoveryPlan := plan
-		recoveryStoreId := storeId
		workers.ApplyOnErrorGroup(eg, func() error {
-			recoveryClient, conn, err := recovery.newRecoveryClient(ectx, storeAddr)
-			if err != nil {
-				log.Error("create tikv client failed", zap.Uint64("store id", recoveryStoreId))
-				return errors.Trace(err)
-			}
-			defer conn.Close()
-			log.Info("send recover region to tikv", zap.String("tikv address", storeAddr), zap.Uint64("store id", recoveryStoreId))
-			stream, err := recoveryClient.RecoverRegion(ectx)
-			if err != nil {
-				log.Error("create recover region failed", zap.Uint64("store id", recoveryStoreId))
-				return errors.Trace(err)
-			}
-
-			// for a TiKV, send the stream
-			for _, s := range recoveryPlan {
-				if err = stream.Send(s); err != nil {
-					log.Error("send recover region failed", zap.Error(err))
-					return errors.Trace(err)
-				}
-			}
-
-			reply, err := stream.CloseAndRecv()
-			if err != nil {
-				log.Error("close the stream failed")
-				return errors.Trace(err)
-			}
-			recovery.progress.Inc()
-			log.Info("recover region execution success", zap.Uint64("store id", reply.GetStoreId()))
-			return nil
+			return recovery.RecoverRegionOfStore(ectx, storeId, plan)
		})
	}
	// Wait for all TiKV instances to force leader and to apply to the last log.
	return eg.Wait()
 }

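+// SpawnTiKVShutDownWatchers starts a background goroutine that watches the store states via PD.
+// When it observes that a store has rebooted, it re-sends that store's recovery plan so the
+// leaders lost by the restart can be regenerated.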
+func (recovery *Recovery) SpawnTiKVShutDownWatchers(ctx context.Context) {
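+	// stores that rebooted but whose recovery plan has not been replayed successfully yet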
+	rebootStores := map[uint64]struct{}{}
+	cb := storewatch.MakeCallback(storewatch.WithOnReboot(func(s *metapb.Store) {
+		log.Info("Store reboot detected, will regenerate leaders.", zap.Uint64("id", s.GetId()))
+		rebootStores[s.Id] = struct{}{}
+	}), storewatch.WithOnDisconnect(func(s *metapb.Store) {
+		log.Warn("A store disconnected.", zap.Uint64("id", s.GetId()), zap.String("addr", s.GetAddress()))
+	}), storewatch.WithOnNewStoreRegistered(func(s *metapb.Store) {
+		log.Info("Start to observe the state of the store.", zap.Uint64("id", s.GetId()))
+	}))
+	watcher := storewatch.New(recovery.mgr.PDClient(), cb)
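+	// poll the store states every 30 seconds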
+	tick := time.NewTicker(30 * time.Second)
+	mainLoop := func() {
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case <-tick.C:
+				err := watcher.Step(ctx)
+				if err != nil {
+					log.Warn("Failed to step watcher.", logutil.ShortError(err))
+				}
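+				// replay the recovery plan for every store that rebooted since the last successful attempt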
+				for id := range rebootStores {
+					plan, ok := recovery.RecoveryPlan[id]
+					if !ok {
+						log.Warn("Store reboot detected, but no recovery plan found.", zap.Uint64("id", id))
+						continue
+					}
+					err := recovery.RecoverRegionOfStore(ctx, id, plan)
+					if err != nil {
+						log.Warn("Store reboot detected, but failed to regenerate leader.", zap.Uint64("id", id), logutil.ShortError(err))
+						continue
+					}
+					log.Info("Succeeded to reload the leader in store.", zap.Uint64("id", id))
+					delete(rebootStores, id)
+				}
+			}
+		}
+	}
+
+	go mainLoop()
+}
+
 // WaitApply sends a wait-apply request to every TiKV to ensure that all region peers have applied up to the last log
 func (recovery *Recovery) WaitApply(ctx context.Context) (err error) {
	eg, ectx := errgroup.WithContext(ctx)