@@ -16,6 +16,8 @@ package checker
16
16
17
17
import (
18
18
"fmt"
19
+ "math/rand"
20
+ "time"
19
21
20
22
"github.com/pingcap/kvproto/pkg/metapb"
21
23
"github.com/pingcap/log"
@@ -45,6 +47,7 @@ type ReplicaChecker struct {
45
47
cluster sche.CheckerCluster
46
48
conf config.CheckerConfigProvider
47
49
pendingProcessedRegions * cache.TTLUint64
50
+ r * rand.Rand
48
51
}
49
52
50
53
// NewReplicaChecker creates a replica checker.
@@ -53,6 +56,7 @@ func NewReplicaChecker(cluster sche.CheckerCluster, conf config.CheckerConfigPro
53
56
cluster : cluster ,
54
57
conf : conf ,
55
58
pendingProcessedRegions : pendingProcessedRegions ,
59
+ r : rand .New (rand .NewSource (time .Now ().UnixNano ())),
56
60
}
57
61
}
58
62
@@ -67,40 +71,40 @@ func (*ReplicaChecker) GetType() types.CheckerSchedulerType {
67
71
}
68
72
69
73
// Check verifies a region's replicas, creating an operator.Operator if needed.
//
// The individual checks run in a fixed order: down peer, offline peer,
// make-up replica, remove extra replica, location replacement. The first
// check that yields a non-nil operator wins and that operator is returned.
// Operators from the first three checks are raised to constant.High priority;
// the last two keep whatever priority they were created with.
// Returns nil when the checker is paused or when no check produces an operator.
70
- func (r * ReplicaChecker ) Check (region * core.RegionInfo ) * operator.Operator {
74
+ func (c * ReplicaChecker ) Check (region * core.RegionInfo ) * operator.Operator {
// Every invocation is counted, including ones rejected below because the
// checker is paused.
71
75
replicaCheckerCounter .Inc ()
72
- if r .IsPaused () {
76
+ if c .IsPaused () {
73
77
replicaCheckerPausedCounter .Inc ()
74
78
return nil
75
79
}
// Down peer, offline peer and missing replica: high-priority fixes.
76
- if op := r .checkDownPeer (region ); op != nil {
80
+ if op := c .checkDownPeer (region ); op != nil {
77
81
replicaCheckerNewOpCounter .Inc ()
78
82
op .SetPriorityLevel (constant .High )
79
83
return op
80
84
}
81
- if op := r .checkOfflinePeer (region ); op != nil {
85
+ if op := c .checkOfflinePeer (region ); op != nil {
82
86
replicaCheckerNewOpCounter .Inc ()
83
87
op .SetPriorityLevel (constant .High )
84
88
return op
85
89
}
86
- if op := r .checkMakeUpReplica (region ); op != nil {
90
+ if op := c .checkMakeUpReplica (region ); op != nil {
87
91
replicaCheckerNewOpCounter .Inc ()
88
92
op .SetPriorityLevel (constant .High )
89
93
return op
90
94
}
// Extra replica removal and location replacement: priority is left unchanged.
91
- if op := r .checkRemoveExtraReplica (region ); op != nil {
95
+ if op := c .checkRemoveExtraReplica (region ); op != nil {
92
96
replicaCheckerNewOpCounter .Inc ()
93
97
return op
94
98
}
95
- if op := r .checkLocationReplacement (region ); op != nil {
99
+ if op := c .checkLocationReplacement (region ); op != nil {
96
100
replicaCheckerNewOpCounter .Inc ()
97
101
return op
98
102
}
99
103
return nil
100
104
}
101
105
102
- func (r * ReplicaChecker ) checkDownPeer (region * core.RegionInfo ) * operator.Operator {
103
- if ! r .conf .IsRemoveDownReplicaEnabled () {
106
+ func (c * ReplicaChecker ) checkDownPeer (region * core.RegionInfo ) * operator.Operator {
107
+ if ! c .conf .IsRemoveDownReplicaEnabled () {
104
108
return nil
105
109
}
106
110
@@ -110,22 +114,22 @@ func (r *ReplicaChecker) checkDownPeer(region *core.RegionInfo) *operator.Operat
110
114
continue
111
115
}
112
116
storeID := peer .GetStoreId ()
113
- store := r .cluster .GetStore (storeID )
117
+ store := c .cluster .GetStore (storeID )
114
118
if store == nil {
115
119
log .Warn ("lost the store, maybe you are recovering the PD cluster" , zap .Uint64 ("store-id" , storeID ))
116
120
return nil
117
121
}
118
122
// Only consider the state of the Store, not `stats.DownSeconds`.
119
- if store .DownTime () < r .conf .GetMaxStoreDownTime () {
123
+ if store .DownTime () < c .conf .GetMaxStoreDownTime () {
120
124
continue
121
125
}
122
- return r .fixPeer (region , storeID , downStatus )
126
+ return c .fixPeer (region , storeID , downStatus )
123
127
}
124
128
return nil
125
129
}
126
130
127
- func (r * ReplicaChecker ) checkOfflinePeer (region * core.RegionInfo ) * operator.Operator {
128
- if ! r .conf .IsReplaceOfflineReplicaEnabled () {
131
+ func (c * ReplicaChecker ) checkOfflinePeer (region * core.RegionInfo ) * operator.Operator {
132
+ if ! c .conf .IsReplaceOfflineReplicaEnabled () {
129
133
return nil
130
134
}
131
135
@@ -136,7 +140,7 @@ func (r *ReplicaChecker) checkOfflinePeer(region *core.RegionInfo) *operator.Ope
136
140
137
141
for _ , peer := range region .GetPeers () {
138
142
storeID := peer .GetStoreId ()
139
- store := r .cluster .GetStore (storeID )
143
+ store := c .cluster .GetStore (storeID )
140
144
if store == nil {
141
145
log .Warn ("lost the store, maybe you are recovering the PD cluster" , zap .Uint64 ("store-id" , storeID ))
142
146
return nil
@@ -145,71 +149,71 @@ func (r *ReplicaChecker) checkOfflinePeer(region *core.RegionInfo) *operator.Ope
145
149
continue
146
150
}
147
151
148
- return r .fixPeer (region , storeID , offlineStatus )
152
+ return c .fixPeer (region , storeID , offlineStatus )
149
153
}
150
154
151
155
return nil
152
156
}
153
157
// checkMakeUpReplica returns an add-peer operator ("make-up-replica") when the
// region has fewer peers than the configured max replicas.
// It returns nil when make-up-replica is disabled in the config, when the
// region already has at least max replicas peers, when no target store is
// selected, or when the operator cannot be created.
154
- func (r * ReplicaChecker ) checkMakeUpReplica (region * core.RegionInfo ) * operator.Operator {
155
- if ! r .conf .IsMakeUpReplicaEnabled () {
158
+ func (c * ReplicaChecker ) checkMakeUpReplica (region * core.RegionInfo ) * operator.Operator {
159
+ if ! c .conf .IsMakeUpReplicaEnabled () {
156
160
return nil
157
161
}
158
- if len (region .GetPeers ()) >= r .conf .GetMaxReplicas () {
162
+ if len (region .GetPeers ()) >= c .conf .GetMaxReplicas () {
159
163
return nil
160
164
}
161
165
log .Debug ("region has fewer than max replicas" , zap .Uint64 ("region-id" , region .GetID ()), zap .Int ("peers" , len (region .GetPeers ())))
162
- regionStores := r .cluster .GetRegionStores (region )
163
- target , filterByTempState := r .strategy (region ).SelectStoreToAdd (regionStores )
166
+ regionStores := c .cluster .GetRegionStores (region )
167
+ target , filterByTempState := c .strategy (c . r , region ).SelectStoreToAdd (regionStores )
// A target of 0 is treated as "no store selected".
164
168
if target == 0 {
165
169
log .Debug ("no store to add replica" , zap .Uint64 ("region-id" , region .GetID ()))
166
170
replicaCheckerNoTargetStoreCounter .Inc ()
// NOTE(review): presumably filterByTempState means candidates were rejected
// only because of a temporary store state, so the region is recorded in
// pendingProcessedRegions for a later attempt -- confirm against
// SelectStoreToAdd.
167
171
if filterByTempState {
168
- r .pendingProcessedRegions .Put (region .GetID (), nil )
172
+ c .pendingProcessedRegions .Put (region .GetID (), nil )
169
173
}
170
174
return nil
171
175
}
172
176
newPeer := & metapb.Peer {StoreId : target }
173
- op , err := operator .CreateAddPeerOperator ("make-up-replica" , r .cluster , region , newPeer , operator .OpReplica )
177
+ op , err := operator .CreateAddPeerOperator ("make-up-replica" , c .cluster , region , newPeer , operator .OpReplica )
// A creation failure is only logged at debug level here; no failure counter
// is incremented on this path.
174
178
if err != nil {
175
179
log .Debug ("create make-up-replica operator fail" , errs .ZapError (err ))
176
180
return nil
177
181
}
178
182
return op
179
183
}
180
184
// checkRemoveExtraReplica returns a remove-peer operator ("remove-extra-replica")
// when the region has more voters than the configured max replicas.
// It returns nil when the feature is disabled in the config, when the voter
// count does not exceed max replicas, when no store is selected for removal,
// or when the operator cannot be created.
181
- func (r * ReplicaChecker ) checkRemoveExtraReplica (region * core.RegionInfo ) * operator.Operator {
182
- if ! r .conf .IsRemoveExtraReplicaEnabled () {
185
+ func (c * ReplicaChecker ) checkRemoveExtraReplica (region * core.RegionInfo ) * operator.Operator {
186
+ if ! c .conf .IsRemoveExtraReplicaEnabled () {
183
187
return nil
184
188
}
185
189
// While a learner peer is being added, the number of peers exceeds max replicas for a while,
186
190
// so only the number of voters is compared, to avoid too many "cancel add operator" logs.
187
- if len (region .GetVoters ()) <= r .conf .GetMaxReplicas () {
191
+ if len (region .GetVoters ()) <= c .conf .GetMaxReplicas () {
188
192
return nil
189
193
}
// Note that the debug log reports the total peer count, not the voter count
// used in the comparison above.
190
194
log .Debug ("region has more than max replicas" , zap .Uint64 ("region-id" , region .GetID ()), zap .Int ("peers" , len (region .GetPeers ())))
191
- regionStores := r .cluster .GetRegionStores (region )
192
- old := r .strategy (region ).SelectStoreToRemove (regionStores )
195
+ regionStores := c .cluster .GetRegionStores (region )
196
+ old := c .strategy (c . r , region ).SelectStoreToRemove (regionStores )
// A store ID of 0 is treated as "no store selected"; the region is then
// recorded in pendingProcessedRegions.
193
197
if old == 0 {
194
198
replicaCheckerNoWorstPeerCounter .Inc ()
195
- r .pendingProcessedRegions .Put (region .GetID (), nil )
199
+ c .pendingProcessedRegions .Put (region .GetID (), nil )
196
200
return nil
197
201
}
198
- op , err := operator .CreateRemovePeerOperator ("remove-extra-replica" , r .cluster , operator .OpReplica , region , old )
202
+ op , err := operator .CreateRemovePeerOperator ("remove-extra-replica" , c .cluster , operator .OpReplica , region , old )
199
203
if err != nil {
200
204
replicaCheckerCreateOpFailedCounter .Inc ()
201
205
return nil
202
206
}
203
207
return op
204
208
}
205
209
206
- func (r * ReplicaChecker ) checkLocationReplacement (region * core.RegionInfo ) * operator.Operator {
207
- if ! r .conf .IsLocationReplacementEnabled () {
210
+ func (c * ReplicaChecker ) checkLocationReplacement (region * core.RegionInfo ) * operator.Operator {
211
+ if ! c .conf .IsLocationReplacementEnabled () {
208
212
return nil
209
213
}
210
214
211
- strategy := r .strategy (region )
212
- regionStores := r .cluster .GetRegionStores (region )
215
+ strategy := c .strategy (c . r , region )
216
+ regionStores := c .cluster .GetRegionStores (region )
213
217
oldStore := strategy .SelectStoreToRemove (regionStores )
214
218
if oldStore == 0 {
215
219
replicaCheckerAllRightCounter .Inc ()
@@ -223,19 +227,19 @@ func (r *ReplicaChecker) checkLocationReplacement(region *core.RegionInfo) *oper
223
227
}
224
228
225
229
newPeer := & metapb.Peer {StoreId : newStore }
226
- op , err := operator .CreateMovePeerOperator ("move-to-better-location" , r .cluster , region , operator .OpReplica , oldStore , newPeer )
230
+ op , err := operator .CreateMovePeerOperator ("move-to-better-location" , c .cluster , region , operator .OpReplica , oldStore , newPeer )
227
231
if err != nil {
228
232
replicaCheckerCreateOpFailedCounter .Inc ()
229
233
return nil
230
234
}
231
235
return op
232
236
}
233
237
234
- func (r * ReplicaChecker ) fixPeer (region * core.RegionInfo , storeID uint64 , status string ) * operator.Operator {
238
+ func (c * ReplicaChecker ) fixPeer (region * core.RegionInfo , storeID uint64 , status string ) * operator.Operator {
235
239
// Check the number of replicas first.
236
- if len (region .GetVoters ()) > r .conf .GetMaxReplicas () {
240
+ if len (region .GetVoters ()) > c .conf .GetMaxReplicas () {
237
241
removeExtra := fmt .Sprintf ("remove-extra-%s-replica" , status )
238
- op , err := operator .CreateRemovePeerOperator (removeExtra , r .cluster , operator .OpReplica , region , storeID )
242
+ op , err := operator .CreateRemovePeerOperator (removeExtra , c .cluster , operator .OpReplica , region , storeID )
239
243
if err != nil {
240
244
if status == offlineStatus {
241
245
replicaCheckerRemoveExtraOfflineFailedCounter .Inc ()
@@ -247,8 +251,8 @@ func (r *ReplicaChecker) fixPeer(region *core.RegionInfo, storeID uint64, status
247
251
return op
248
252
}
249
253
250
- regionStores := r .cluster .GetRegionStores (region )
251
- target , filterByTempState := r .strategy (region ).SelectStoreToFix (regionStores , storeID )
254
+ regionStores := c .cluster .GetRegionStores (region )
255
+ target , filterByTempState := c .strategy (c . r , region ).SelectStoreToFix (regionStores , storeID )
252
256
if target == 0 {
253
257
if status == offlineStatus {
254
258
replicaCheckerNoStoreOfflineCounter .Inc ()
@@ -257,13 +261,13 @@ func (r *ReplicaChecker) fixPeer(region *core.RegionInfo, storeID uint64, status
257
261
}
258
262
log .Debug ("no best store to add replica" , zap .Uint64 ("region-id" , region .GetID ()))
259
263
if filterByTempState {
260
- r .pendingProcessedRegions .Put (region .GetID (), nil )
264
+ c .pendingProcessedRegions .Put (region .GetID (), nil )
261
265
}
262
266
return nil
263
267
}
264
268
newPeer := & metapb.Peer {StoreId : target }
265
269
replace := fmt .Sprintf ("replace-%s-replica" , status )
266
- op , err := operator .CreateMovePeerOperator (replace , r .cluster , region , operator .OpReplica , storeID , newPeer )
270
+ op , err := operator .CreateMovePeerOperator (replace , c .cluster , region , operator .OpReplica , storeID , newPeer )
267
271
if err != nil {
268
272
if status == offlineStatus {
269
273
replicaCheckerReplaceOfflineFailedCounter .Inc ()
@@ -275,12 +279,13 @@ func (r *ReplicaChecker) fixPeer(region *core.RegionInfo, storeID uint64, status
275
279
return op
276
280
}
277
281
// strategy builds a new ReplicaStrategy for the given region on every call,
// filling in the checker name, the cluster, and the location labels and
// isolation level read from the checker's config.
// In the post-change (+) version the caller also passes the random source r,
// which is stored in the strategy; the call sites visible in this file pass
// the checker's own c.r.
// NOTE(review): c.r is created with rand.New(rand.NewSource(...)); a *rand.Rand
// built that way is not safe for concurrent use. Confirm that strategies
// sharing it are never used from multiple goroutines at once, or guard it.
278
- func (r * ReplicaChecker ) strategy (region * core.RegionInfo ) * ReplicaStrategy {
282
+ func (c * ReplicaChecker ) strategy (r * rand. Rand , region * core.RegionInfo ) * ReplicaStrategy {
279
283
return & ReplicaStrategy {
280
- checkerName : r .Name (),
281
- cluster : r .cluster ,
282
- locationLabels : r .conf .GetLocationLabels (),
283
- isolationLevel : r .conf .GetIsolationLevel (),
284
+ checkerName : c .Name (),
285
+ cluster : c .cluster ,
286
+ locationLabels : c .conf .GetLocationLabels (),
287
+ isolationLevel : c .conf .GetIsolationLevel (),
284
288
region : region ,
289
+ r : r ,
285
290
}
286
291
}
0 commit comments