@@ -18,6 +18,7 @@ import (
 	"github.com/pingcap/kvproto/pkg/pdpb"
 	"github.com/pingcap/log"
 	berrors "github.com/pingcap/tidb/br/pkg/errors"
+	"github.com/pingcap/tidb/br/pkg/lightning/common"
 	"github.com/pingcap/tidb/br/pkg/logutil"
 	"github.com/pingcap/tidb/br/pkg/restore/split"
 	"github.com/pingcap/tidb/br/pkg/rtree"
@@ -144,21 +145,15 @@ SplitRegions:
 	}
 	log.Info("start to wait for scattering regions",
 		zap.Int("regions", len(scatterRegions)), zap.Duration("take", time.Since(startTime)))
-	startTime = time.Now()
-	scatterCount := 0
-	for _, region := range scatterRegions {
-		rs.waitForScatterRegion(ctx, region)
-		if time.Since(startTime) > split.ScatterWaitUpperInterval {
-			break
-		}
-		scatterCount++
-	}
-	if scatterCount == len(scatterRegions) {
+
+	leftCnt := rs.WaitForScatterRegions(ctx, scatterRegions, split.ScatterWaitUpperInterval)
+	if leftCnt == 0 {
 		log.Info("waiting for scattering regions done",
 			zap.Int("regions", len(scatterRegions)), zap.Duration("take", time.Since(startTime)))
 	} else {
 		log.Warn("waiting for scattering regions timeout",
-			zap.Int("scatterCount", scatterCount),
+			zap.Int("NotScatterCount", leftCnt),
+			zap.Int("TotalScatterCount", len(scatterRegions)),
 			zap.Int("regions", len(scatterRegions)),
 			zap.Duration("take", time.Since(startTime)))
 	}
@@ -188,26 +183,48 @@ func (rs *RegionSplitter) hasHealthyRegion(ctx context.Context, regionID uint64)
 	return len(regionInfo.PendingPeers) == 0, nil
 }
 
-func (rs *RegionSplitter) isScatterRegionFinished(ctx context.Context, regionID uint64) (bool, error) {
+// isScatterRegionFinished checks the latest operator on the region and returns the following status:
+//
+//	return (finished, needRescatter, error)
+//
+// if the latest operator is not `scatter-region`, or its status is SUCCESS, it's likely that the
+// scatter region operator is finished.
+//
+// if the latest operator is `scatter-region` and its status is TIMEOUT or CANCEL, needRescatter
+// is true and the caller needs to scatter this region again.
+func (rs *RegionSplitter) isScatterRegionFinished(ctx context.Context, regionID uint64) (bool, bool, error) {
 	resp, err := rs.client.GetOperator(ctx, regionID)
 	if err != nil {
-		return false, errors.Trace(err)
+		if common.IsRetryableError(err) {
+			// retry in the next cycle
+			return false, false, nil
+		}
+		return false, false, errors.Trace(err)
 	}
 	// Heartbeat may not be sent to PD
 	if respErr := resp.GetHeader().GetError(); respErr != nil {
 		if respErr.GetType() == pdpb.ErrorType_REGION_NOT_FOUND {
-			return true, nil
+			return true, false, nil
 		}
-		return false, errors.Annotatef(berrors.ErrPDInvalidResponse, "get operator error: %s", respErr.GetType())
+		return false, false, errors.Annotatef(berrors.ErrPDInvalidResponse, "get operator error: %s", respErr.GetType())
 	}
 	retryTimes := ctx.Value(retryTimes).(int)
 	if retryTimes > 3 {
 		log.Info("get operator", zap.Uint64("regionID", regionID), zap.Stringer("resp", resp))
 	}
 	// If the current operator of the region is not 'scatter-region', we could assume
-	// that 'scatter-operator' has finished or timeout
-	ok := string(resp.GetDesc()) != "scatter-region" || resp.GetStatus() != pdpb.OperatorStatus_RUNNING
-	return ok, nil
+	// that 'scatter-operator' has finished
+	if string(resp.GetDesc()) != "scatter-region" {
+		return true, false, nil
+	}
+	switch resp.GetStatus() {
+	case pdpb.OperatorStatus_SUCCESS:
+		return true, false, nil
+	case pdpb.OperatorStatus_RUNNING:
+		return false, false, nil
+	default:
+		return false, true, nil
+	}
 }
 
 func (rs *RegionSplitter) waitForSplit(ctx context.Context, regionID uint64) {
@@ -233,26 +250,66 @@ type retryTimeKey struct{}
 
 var retryTimes = new(retryTimeKey)
 
-func (rs *RegionSplitter) waitForScatterRegion(ctx context.Context, regionInfo *split.RegionInfo) {
-	interval := split.ScatterWaitInterval
-	regionID := regionInfo.Region.GetId()
-	for i := 0; i < split.ScatterWaitMaxRetryTimes; i++ {
-		ctx1 := context.WithValue(ctx, retryTimes, i)
-		ok, err := rs.isScatterRegionFinished(ctx1, regionID)
-		if err != nil {
-			log.Warn("scatter region failed: do not have the region",
-				logutil.Region(regionInfo.Region))
-			return
+func mapRegionInfoSlice(regionInfos []*split.RegionInfo) map[uint64]*split.RegionInfo {
+	regionInfoMap := make(map[uint64]*split.RegionInfo)
+	for _, info := range regionInfos {
+		regionID := info.Region.GetId()
+		regionInfoMap[regionID] = info
+	}
+	return regionInfoMap
+}
+
+func (rs *RegionSplitter) WaitForScatterRegions(ctx context.Context, regionInfos []*split.RegionInfo, timeout time.Duration) int {
+	var (
+		startTime   = time.Now()
+		interval    = split.ScatterWaitInterval
+		leftRegions = mapRegionInfoSlice(regionInfos)
+		retryCnt    = 0
+
+		reScatterRegions = make([]*split.RegionInfo, 0, len(regionInfos))
+	)
+	for {
+		ctx1 := context.WithValue(ctx, retryTimes, retryCnt)
+		reScatterRegions = reScatterRegions[:0]
+		for regionID, regionInfo := range leftRegions {
+			ok, rescatter, err := rs.isScatterRegionFinished(ctx1, regionID)
+			if err != nil {
+				log.Warn("scatter region failed: do not have the region",
+					logutil.Region(regionInfo.Region), zap.Error(err))
+				delete(leftRegions, regionID)
+				continue
+			}
+			if ok {
+				delete(leftRegions, regionID)
+				continue
+			}
+			if rescatter {
+				reScatterRegions = append(reScatterRegions, regionInfo)
+			}
+			// RUNNING_STATUS, just wait and check it in the next loop
 		}
-		if ok {
+
+		if len(leftRegions) == 0 {
+			return 0
+		}
+
+		if len(reScatterRegions) > 0 {
+			rs.ScatterRegions(ctx1, reScatterRegions)
+		}
+
+		if time.Since(startTime) > timeout {
 			break
 		}
+
+		retryCnt += 1
 		interval = 2 * interval
 		if interval > split.ScatterMaxWaitInterval {
 			interval = split.ScatterMaxWaitInterval
 		}
 		time.Sleep(interval)
 	}
+
+	return len(leftRegions)
 }
 
 func (rs *RegionSplitter) splitAndScatterRegions(
@@ -780,16 +837,10 @@ func (helper *LogSplitHelper) Split(ctx context.Context) error {
 			}
 		}
 
-		startTime := time.Now()
 		regionSplitter := NewRegionSplitter(helper.client)
-		for _, region := range scatterRegions {
-			regionSplitter.waitForScatterRegion(ctx, region)
-			// It is too expensive to stop recovery and wait for a small number of regions
-			// to complete scatter, so the maximum waiting time is reduced to 1 minute.
-			if time.Since(startTime) > time.Minute {
-				break
-			}
-		}
+		// It is too expensive to stop recovery and wait for a small number of regions
+		// to complete scatter, so the maximum waiting time is reduced to 1 minute.
+		_ = regionSplitter.WaitForScatterRegions(ctx, scatterRegions, time.Minute)
 	}()
 
 	iter := helper.iterator()
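
Not part of the patch, but as a reading aid: a minimal, self-contained sketch of the decision table that the reworked isScatterRegionFinished encodes. The decideScatterState helper and the package main wrapper are illustrative only; the pdpb operator-status values are the ones the patch already uses. An operator other than scatter-region, or one that ended in SUCCESS, counts as finished; RUNNING means keep waiting; any other terminal status (TIMEOUT, CANCEL, ...) flags the region for re-scatter.

```go
package main

import (
	"fmt"

	"github.com/pingcap/kvproto/pkg/pdpb"
)

// decideScatterState mirrors the status handling added to isScatterRegionFinished:
// it maps the latest operator on a region to (finished, needRescatter).
func decideScatterState(desc string, status pdpb.OperatorStatus) (finished, needRescatter bool) {
	// An operator other than `scatter-region` means scattering is no longer pending.
	if desc != "scatter-region" {
		return true, false
	}
	switch status {
	case pdpb.OperatorStatus_SUCCESS:
		return true, false // scatter finished
	case pdpb.OperatorStatus_RUNNING:
		return false, false // still in progress, check again in the next cycle
	default:
		// TIMEOUT, CANCEL, ...: ask the caller to scatter this region again.
		return false, true
	}
}

func main() {
	finished, rescatter := decideScatterState("scatter-region", pdpb.OperatorStatus_TIMEOUT)
	fmt.Println(finished, rescatter) // false true -> region goes back into the re-scatter batch
}
```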