Commit 4a6219e

rleungx authored and ti-chi-bot committed

This is an automated cherry-pick of tikv#8675

close tikv#8674

Signed-off-by: ti-chi-bot <[email protected]>
1 parent e9bcf4f commit 4a6219e

17 files changed (+225, -48 lines)

pkg/schedule/checker/replica_checker.go

Lines changed: 76 additions & 40 deletions
@@ -16,6 +16,8 @@ package checker

 import (
     "fmt"
+    "math/rand"
+    "time"

     "github.com/pingcap/kvproto/pkg/metapb"
     "github.com/pingcap/log"
@@ -61,17 +63,31 @@ var (
 // Location management, mainly used for cross data center deployment.
 type ReplicaChecker struct {
     PauseController
+<<<<<<< HEAD
     cluster           schedule.Cluster
     conf              config.Config
     regionWaitingList cache.Cache
+=======
+    cluster                 sche.CheckerCluster
+    conf                    config.CheckerConfigProvider
+    pendingProcessedRegions *cache.TTLUint64
+    r                       *rand.Rand
+>>>>>>> 25dedabf5 (*: reduce rand NewSource (#8675))
 }

 // NewReplicaChecker creates a replica checker.
 func NewReplicaChecker(cluster schedule.Cluster, conf config.Config, regionWaitingList cache.Cache) *ReplicaChecker {
     return &ReplicaChecker{
+<<<<<<< HEAD
         cluster:           cluster,
         conf:              conf,
         regionWaitingList: regionWaitingList,
+=======
+        cluster:                 cluster,
+        conf:                    conf,
+        pendingProcessedRegions: pendingProcessedRegions,
+        r:                       rand.New(rand.NewSource(time.Now().UnixNano())),
+>>>>>>> 25dedabf5 (*: reduce rand NewSource (#8675))
     }
 }

@@ -81,40 +97,40 @@ func (r *ReplicaChecker) GetType() string {
 }

 // Check verifies a region's replicas, creating an operator.Operator if need.
-func (r *ReplicaChecker) Check(region *core.RegionInfo) *operator.Operator {
+func (c *ReplicaChecker) Check(region *core.RegionInfo) *operator.Operator {
     replicaCheckerCounter.Inc()
-    if r.IsPaused() {
+    if c.IsPaused() {
         replicaCheckerPausedCounter.Inc()
         return nil
     }
-    if op := r.checkDownPeer(region); op != nil {
+    if op := c.checkDownPeer(region); op != nil {
         replicaCheckerNewOpCounter.Inc()
         op.SetPriorityLevel(constant.High)
         return op
     }
-    if op := r.checkOfflinePeer(region); op != nil {
+    if op := c.checkOfflinePeer(region); op != nil {
         replicaCheckerNewOpCounter.Inc()
         op.SetPriorityLevel(constant.High)
         return op
     }
-    if op := r.checkMakeUpReplica(region); op != nil {
+    if op := c.checkMakeUpReplica(region); op != nil {
         replicaCheckerNewOpCounter.Inc()
         op.SetPriorityLevel(constant.High)
         return op
     }
-    if op := r.checkRemoveExtraReplica(region); op != nil {
+    if op := c.checkRemoveExtraReplica(region); op != nil {
         replicaCheckerNewOpCounter.Inc()
         return op
     }
-    if op := r.checkLocationReplacement(region); op != nil {
+    if op := c.checkLocationReplacement(region); op != nil {
         replicaCheckerNewOpCounter.Inc()
         return op
     }
     return nil
 }

-func (r *ReplicaChecker) checkDownPeer(region *core.RegionInfo) *operator.Operator {
-    if !r.conf.IsRemoveDownReplicaEnabled() {
+func (c *ReplicaChecker) checkDownPeer(region *core.RegionInfo) *operator.Operator {
+    if !c.conf.IsRemoveDownReplicaEnabled() {
         return nil
     }

@@ -124,22 +140,22 @@ func (r *ReplicaChecker) checkDownPeer(region *core.RegionInfo) *operator.Operat
             continue
         }
         storeID := peer.GetStoreId()
-        store := r.cluster.GetStore(storeID)
+        store := c.cluster.GetStore(storeID)
         if store == nil {
             log.Warn("lost the store, maybe you are recovering the PD cluster", zap.Uint64("store-id", storeID))
             return nil
         }
         // Only consider the state of the Store, not `stats.DownSeconds`.
-        if store.DownTime() < r.conf.GetMaxStoreDownTime() {
+        if store.DownTime() < c.conf.GetMaxStoreDownTime() {
             continue
         }
-        return r.fixPeer(region, storeID, downStatus)
+        return c.fixPeer(region, storeID, downStatus)
     }
     return nil
 }

-func (r *ReplicaChecker) checkOfflinePeer(region *core.RegionInfo) *operator.Operator {
-    if !r.conf.IsReplaceOfflineReplicaEnabled() {
+func (c *ReplicaChecker) checkOfflinePeer(region *core.RegionInfo) *operator.Operator {
+    if !c.conf.IsReplaceOfflineReplicaEnabled() {
         return nil
     }

@@ -150,7 +166,7 @@ func (r *ReplicaChecker) checkOfflinePeer(region *core.RegionInfo) *operator.Ope

     for _, peer := range region.GetPeers() {
         storeID := peer.GetStoreId()
-        store := r.cluster.GetStore(storeID)
+        store := c.cluster.GetStore(storeID)
         if store == nil {
             log.Warn("lost the store, maybe you are recovering the PD cluster", zap.Uint64("store-id", storeID))
             return nil
@@ -159,71 +175,79 @@ func (r *ReplicaChecker) checkOfflinePeer(region *core.RegionInfo) *operator.Ope
             continue
         }

-        return r.fixPeer(region, storeID, offlineStatus)
+        return c.fixPeer(region, storeID, offlineStatus)
     }

     return nil
 }

-func (r *ReplicaChecker) checkMakeUpReplica(region *core.RegionInfo) *operator.Operator {
-    if !r.conf.IsMakeUpReplicaEnabled() {
+func (c *ReplicaChecker) checkMakeUpReplica(region *core.RegionInfo) *operator.Operator {
+    if !c.conf.IsMakeUpReplicaEnabled() {
         return nil
     }
-    if len(region.GetPeers()) >= r.conf.GetMaxReplicas() {
+    if len(region.GetPeers()) >= c.conf.GetMaxReplicas() {
         return nil
     }
     log.Debug("region has fewer than max replicas", zap.Uint64("region-id", region.GetID()), zap.Int("peers", len(region.GetPeers())))
-    regionStores := r.cluster.GetRegionStores(region)
-    target, filterByTempState := r.strategy(region).SelectStoreToAdd(regionStores)
+    regionStores := c.cluster.GetRegionStores(region)
+    target, filterByTempState := c.strategy(c.r, region).SelectStoreToAdd(regionStores)
     if target == 0 {
         log.Debug("no store to add replica", zap.Uint64("region-id", region.GetID()))
         replicaCheckerNoTargetStoreCounter.Inc()
         if filterByTempState {
+<<<<<<< HEAD
             r.regionWaitingList.Put(region.GetID(), nil)
+=======
+            c.pendingProcessedRegions.Put(region.GetID(), nil)
+>>>>>>> 25dedabf5 (*: reduce rand NewSource (#8675))
         }
         return nil
     }
     newPeer := &metapb.Peer{StoreId: target}
-    op, err := operator.CreateAddPeerOperator("make-up-replica", r.cluster, region, newPeer, operator.OpReplica)
+    op, err := operator.CreateAddPeerOperator("make-up-replica", c.cluster, region, newPeer, operator.OpReplica)
     if err != nil {
         log.Debug("create make-up-replica operator fail", errs.ZapError(err))
         return nil
     }
     return op
 }

-func (r *ReplicaChecker) checkRemoveExtraReplica(region *core.RegionInfo) *operator.Operator {
-    if !r.conf.IsRemoveExtraReplicaEnabled() {
+func (c *ReplicaChecker) checkRemoveExtraReplica(region *core.RegionInfo) *operator.Operator {
+    if !c.conf.IsRemoveExtraReplicaEnabled() {
         return nil
     }
     // when add learner peer, the number of peer will exceed max replicas for a while,
     // just comparing the the number of voters to avoid too many cancel add operator log.
-    if len(region.GetVoters()) <= r.conf.GetMaxReplicas() {
+    if len(region.GetVoters()) <= c.conf.GetMaxReplicas() {
         return nil
     }
     log.Debug("region has more than max replicas", zap.Uint64("region-id", region.GetID()), zap.Int("peers", len(region.GetPeers())))
-    regionStores := r.cluster.GetRegionStores(region)
-    old := r.strategy(region).SelectStoreToRemove(regionStores)
+    regionStores := c.cluster.GetRegionStores(region)
+    old := c.strategy(c.r, region).SelectStoreToRemove(regionStores)
     if old == 0 {
         replicaCheckerNoWorstPeerCounter.Inc()
+<<<<<<< HEAD
         r.regionWaitingList.Put(region.GetID(), nil)
+=======
+        c.pendingProcessedRegions.Put(region.GetID(), nil)
+>>>>>>> 25dedabf5 (*: reduce rand NewSource (#8675))
         return nil
     }
-    op, err := operator.CreateRemovePeerOperator("remove-extra-replica", r.cluster, operator.OpReplica, region, old)
+    op, err := operator.CreateRemovePeerOperator("remove-extra-replica", c.cluster, operator.OpReplica, region, old)
     if err != nil {
         replicaCheckerCreateOpFailedCounter.Inc()
         return nil
     }
     return op
 }

-func (r *ReplicaChecker) checkLocationReplacement(region *core.RegionInfo) *operator.Operator {
-    if !r.conf.IsLocationReplacementEnabled() {
+func (c *ReplicaChecker) checkLocationReplacement(region *core.RegionInfo) *operator.Operator {
+    if !c.conf.IsLocationReplacementEnabled() {
         return nil
     }

-    strategy := r.strategy(region)
-    regionStores := r.cluster.GetRegionStores(region)
+    strategy := c.strategy(c.r, region)
+    regionStores := c.cluster.GetRegionStores(region)
     oldStore := strategy.SelectStoreToRemove(regionStores)
     if oldStore == 0 {
         replicaCheckerAllRightCounter.Inc()
@@ -237,19 +261,19 @@ func (r *ReplicaChecker) checkLocationReplacement(region *core.RegionInfo) *oper
     }

     newPeer := &metapb.Peer{StoreId: newStore}
-    op, err := operator.CreateMovePeerOperator("move-to-better-location", r.cluster, region, operator.OpReplica, oldStore, newPeer)
+    op, err := operator.CreateMovePeerOperator("move-to-better-location", c.cluster, region, operator.OpReplica, oldStore, newPeer)
     if err != nil {
         replicaCheckerCreateOpFailedCounter.Inc()
         return nil
     }
     return op
 }

-func (r *ReplicaChecker) fixPeer(region *core.RegionInfo, storeID uint64, status string) *operator.Operator {
+func (c *ReplicaChecker) fixPeer(region *core.RegionInfo, storeID uint64, status string) *operator.Operator {
     // Check the number of replicas first.
-    if len(region.GetVoters()) > r.conf.GetMaxReplicas() {
+    if len(region.GetVoters()) > c.conf.GetMaxReplicas() {
         removeExtra := fmt.Sprintf("remove-extra-%s-replica", status)
-        op, err := operator.CreateRemovePeerOperator(removeExtra, r.cluster, operator.OpReplica, region, storeID)
+        op, err := operator.CreateRemovePeerOperator(removeExtra, c.cluster, operator.OpReplica, region, storeID)
         if err != nil {
             if status == offlineStatus {
                 replicaCheckerRemoveExtraOfflineFailedCounter.Inc()
@@ -261,8 +285,8 @@ func (r *ReplicaChecker) fixPeer(region *core.RegionInfo, storeID uint64, status
         return op
     }

-    regionStores := r.cluster.GetRegionStores(region)
-    target, filterByTempState := r.strategy(region).SelectStoreToFix(regionStores, storeID)
+    regionStores := c.cluster.GetRegionStores(region)
+    target, filterByTempState := c.strategy(c.r, region).SelectStoreToFix(regionStores, storeID)
     if target == 0 {
         if status == offlineStatus {
             replicaCheckerNoStoreOfflineCounter.Inc()
@@ -271,13 +295,17 @@ func (r *ReplicaChecker) fixPeer(region *core.RegionInfo, storeID uint64, status
         }
         log.Debug("no best store to add replica", zap.Uint64("region-id", region.GetID()))
         if filterByTempState {
+<<<<<<< HEAD
             r.regionWaitingList.Put(region.GetID(), nil)
+=======
            c.pendingProcessedRegions.Put(region.GetID(), nil)
+>>>>>>> 25dedabf5 (*: reduce rand NewSource (#8675))
         }
         return nil
     }
     newPeer := &metapb.Peer{StoreId: target}
     replace := fmt.Sprintf("replace-%s-replica", status)
-    op, err := operator.CreateMovePeerOperator(replace, r.cluster, region, operator.OpReplica, storeID, newPeer)
+    op, err := operator.CreateMovePeerOperator(replace, c.cluster, region, operator.OpReplica, storeID, newPeer)
     if err != nil {
         if status == offlineStatus {
             replicaCheckerReplaceOfflineFailedCounter.Inc()
@@ -289,12 +317,20 @@ func (r *ReplicaChecker) fixPeer(region *core.RegionInfo, storeID uint64, status
     return op
 }

-func (r *ReplicaChecker) strategy(region *core.RegionInfo) *ReplicaStrategy {
+func (c *ReplicaChecker) strategy(r *rand.Rand, region *core.RegionInfo) *ReplicaStrategy {
     return &ReplicaStrategy{
+<<<<<<< HEAD
         checkerName:    replicaCheckerName,
         cluster:        r.cluster,
         locationLabels: r.conf.GetLocationLabels(),
         isolationLevel: r.conf.GetIsolationLevel(),
+=======
+        checkerName:    c.Name(),
+        cluster:        c.cluster,
+        locationLabels: c.conf.GetLocationLabels(),
+        isolationLevel: c.conf.GetIsolationLevel(),
+>>>>>>> 25dedabf5 (*: reduce rand NewSource (#8675))
         region:         region,
+        r:              r,
     }
 }
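
The incoming (`>>>>>>> 25dedabf5`) side of this diff has ReplicaChecker own a single `*rand.Rand`, seeded once in the constructor and handed to `strategy(c.r, region)`, instead of building a new rand source on each selection. Below is a minimal, self-contained sketch of that pattern; the `Checker` and `pick` names are illustrative stand-ins, not PD's API.

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// Checker keeps one seeded *rand.Rand for its whole lifetime, mirroring the
// r field added to ReplicaChecker, so hot paths avoid repeated rand.NewSource calls.
// Note: *rand.Rand is not safe for concurrent use, so each checker owns its own.
type Checker struct {
	r *rand.Rand
}

func NewChecker() *Checker {
	return &Checker{
		// Seed once at construction time, as the cherry-picked constructor does.
		r: rand.New(rand.NewSource(time.Now().UnixNano())),
	}
}

// pick receives the shared *rand.Rand as a parameter, the same shape as
// ReplicaChecker.strategy(c.r, region) passing the generator down to the strategy.
func pick(r *rand.Rand, candidates []uint64) uint64 {
	if len(candidates) == 0 {
		return 0
	}
	return candidates[r.Intn(len(candidates))]
}

func main() {
	c := NewChecker()
	fmt.Println(pick(c.r, []uint64{1, 2, 3}))
}
```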

pkg/schedule/checker/replica_strategy.go

Lines changed: 13 additions & 0 deletions
@@ -15,6 +15,8 @@
 package checker

 import (
+    "math/rand"
+
     "github.com/pingcap/log"
     "github.com/tikv/pd/pkg/core"
     "github.com/tikv/pd/pkg/core/constant"
@@ -26,6 +28,7 @@ import (
 // ReplicaStrategy collects some utilities to manipulate region peers. It
 // exists to allow replica_checker and rule_checker to reuse common logics.
 type ReplicaStrategy struct {
+    r              *rand.Rand
     checkerName    string // replica-checker / rule-checker
     cluster        schedule.Cluster
     locationLabels []string
@@ -76,8 +79,13 @@ func (s *ReplicaStrategy) SelectStoreToAdd(coLocationStores []*core.StoreInfo, e

     isolationComparer := filter.IsolationComparer(s.locationLabels, coLocationStores)
     strictStateFilter := &filter.StoreStateFilter{ActionScope: s.checkerName, MoveRegion: true, AllowFastFailover: s.fastFailover, OperatorLevel: level}
+<<<<<<< HEAD
     targetCandidate := filter.NewCandidates(s.cluster.GetStores()).
         FilterTarget(s.cluster.GetOpts(), nil, nil, filters...).
+=======
+    targetCandidate := filter.NewCandidates(s.r, s.cluster.GetStores()).
+        FilterTarget(s.cluster.GetCheckerConfig(), nil, nil, filters...).
+>>>>>>> 25dedabf5 (*: reduce rand NewSource (#8675))
         KeepTheTopStores(isolationComparer, false) // greater isolation score is better
     if targetCandidate.Len() == 0 {
         return 0, false
@@ -137,8 +145,13 @@ func (s *ReplicaStrategy) SelectStoreToRemove(coLocationStores []*core.StoreInfo
     if s.fastFailover {
         level = constant.Urgent
     }
+<<<<<<< HEAD
     source := filter.NewCandidates(coLocationStores).
         FilterSource(s.cluster.GetOpts(), nil, nil, &filter.StoreStateFilter{ActionScope: s.checkerName, MoveRegion: true, OperatorLevel: level}).
+=======
+    source := filter.NewCandidates(s.r, coLocationStores).
+        FilterSource(s.cluster.GetCheckerConfig(), nil, nil, &filter.StoreStateFilter{ActionScope: s.checkerName, MoveRegion: true, OperatorLevel: level}).
+>>>>>>> 25dedabf5 (*: reduce rand NewSource (#8675))
         KeepTheTopStores(isolationComparer, true).
         PickTheTopStore(filter.RegionScoreComparer(s.cluster.GetOpts()), false)
     if source == nil {
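
On the strategy side, `ReplicaStrategy` now carries the injected `r *rand.Rand` and passes it to `filter.NewCandidates(s.r, ...)`, so candidate selection reuses the checker's generator rather than seeding a new one per call. A rough sketch of that shape, using hypothetical `StoreInfo`/`Candidates` types rather than PD's real `filter` package:

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// StoreInfo is an illustrative stand-in for core.StoreInfo.
type StoreInfo struct {
	ID uint64
}

// Candidates mirrors the idea behind filter.NewCandidates after this change:
// the *rand.Rand is injected by the caller instead of being created inside.
type Candidates struct {
	r      *rand.Rand
	stores []StoreInfo
}

func NewCandidates(r *rand.Rand, stores []StoreInfo) *Candidates {
	return &Candidates{r: r, stores: stores}
}

// RandomPick shuffles with the shared generator and returns one store;
// repeated calls never pay for constructing a fresh rand.Source.
func (c *Candidates) RandomPick() (StoreInfo, bool) {
	if len(c.stores) == 0 {
		return StoreInfo{}, false
	}
	c.r.Shuffle(len(c.stores), func(i, j int) {
		c.stores[i], c.stores[j] = c.stores[j], c.stores[i]
	})
	return c.stores[0], true
}

func main() {
	// One generator per checker/strategy, reused across selections.
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	cands := NewCandidates(r, []StoreInfo{{ID: 1}, {ID: 2}, {ID: 3}})
	if s, ok := cands.RandomPick(); ok {
		fmt.Println("picked store", s.ID)
	}
}
```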
