@@ -304,7 +304,7 @@ func balanceBatchCopTaskWithContinuity(storeTaskMap map[uint64]*batchCopTask, ca
 //
 // The second balance strategy: Not only consider the region count between TiFlash stores, but also try to make the regions' range continuous (stored in TiFlash closely).
 // If balanceWithContinuity is true, the second balance strategy is enabled.
-func balanceBatchCopTask(ctx context.Context, aliveStores []*tikv.Store, originalTasks []*batchCopTask, balanceWithContinuity bool, balanceContinuousRegionCount int64) []*batchCopTask {
+func balanceBatchCopTask(aliveStores []*tikv.Store, originalTasks []*batchCopTask, balanceWithContinuity bool, balanceContinuousRegionCount int64) []*batchCopTask {
 	if len(originalTasks) == 0 {
 		log.Info("Batch cop task balancer got an empty task set.")
 		return originalTasks
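
The doc comment above only sketches the second balance strategy in prose. As a rough illustration of "balanced counts, contiguous ranges", here is a self-contained toy; every name in it (balanceWithContinuity, maxRun, the string store IDs) is hypothetical, and it is far simpler than the real balancer, which works on batchCopTask values and scores candidate stores. Regions are handed out in order, and the toy moves to the next store only after a fixed-length run, so each store receives contiguous slices of roughly equal size.

package main

import "fmt"

// Toy sketch of the "second balance strategy" described above: spread regions
// across stores by count, but prefer keeping runs of adjacent regions on the
// same store so their key ranges stay clustered. Not the actual implementation.
func balanceWithContinuity(regions []int, stores []string, maxRun int) map[string][]int {
	assignment := make(map[string][]int, len(stores))
	cur := 0 // index of the store currently receiving regions
	for i, region := range regions {
		// Stay on the current store while its run is shorter than maxRun;
		// otherwise rotate so the region counts stay balanced.
		if i > 0 && len(assignment[stores[cur]])%maxRun == 0 {
			cur = (cur + 1) % len(stores)
		}
		assignment[stores[cur]] = append(assignment[stores[cur]], region)
	}
	return assignment
}

func main() {
	// Regions 0..9 over 2 stores with runs of 3 contiguous regions:
	// store-a gets [0 1 2 6 7 8], store-b gets [3 4 5 9].
	fmt.Println(balanceWithContinuity([]int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, []string{"store-a", "store-b"}, 3))
}
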
@@ -819,12 +819,24 @@ func filterAllStoresAccordingToTiFlashReplicaRead(allStores []uint64, aliveStore
 	return
 }
 
+func getAllUsedTiFlashStores(allTiFlashStores []*tikv.Store, allUsedTiFlashStoresMap map[uint64]struct{}) []*tikv.Store {
+	allUsedTiFlashStores := make([]*tikv.Store, 0, len(allUsedTiFlashStoresMap))
+	for _, store := range allTiFlashStores {
+		_, ok := allUsedTiFlashStoresMap[store.StoreID()]
+		if ok {
+			allUsedTiFlashStores = append(allUsedTiFlashStores, store)
+		}
+	}
+	return allUsedTiFlashStores
+}
+
 // getAliveStoresAndStoreIDs gets alive TiFlash stores and their IDs.
 // If tiflashReplicaReadPolicy is not all_replicas, it will also return the IDs of the alive TiFlash stores in TiDB zone.
-func getAliveStoresAndStoreIDs(ctx context.Context, cache *RegionCache, ttl time.Duration, store *kvStore, tiflashReplicaReadPolicy tiflash.ReplicaRead, tidbZone string) (aliveStores *aliveStoresBundle) {
+func getAliveStoresAndStoreIDs(ctx context.Context, cache *RegionCache, allUsedTiFlashStoresMap map[uint64]struct{}, ttl time.Duration, store *kvStore, tiflashReplicaReadPolicy tiflash.ReplicaRead, tidbZone string) (aliveStores *aliveStoresBundle) {
 	aliveStores = new(aliveStoresBundle)
 	allTiFlashStores := cache.RegionCache.GetTiFlashStores(tikv.LabelFilterNoTiFlashWriteNode)
-	aliveStores.storesInAllZones = filterAliveStores(ctx, allTiFlashStores, ttl, store)
+	allUsedTiFlashStores := getAllUsedTiFlashStores(allTiFlashStores, allUsedTiFlashStoresMap)
+	aliveStores.storesInAllZones = filterAliveStores(ctx, allUsedTiFlashStores, ttl, store)
 
 	if !tiflashReplicaReadPolicy.IsAllReplicas() {
 		aliveStores.storeIDsInTiDBZone = make(map[uint64]struct{}, len(aliveStores.storesInAllZones))
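
The new getAllUsedTiFlashStores helper is a plain set filter; its visible effect in this hunk is that filterAliveStores now probes only the stores actually referenced by this query's regions, rather than every TiFlash store in the cluster. A minimal sketch of the same shape, with a stand-in store type (the real code uses *tikv.Store) and a hypothetical filterUsed name:

package main

import "fmt"

// Stand-in for tikv.Store; only the ID matters for the filtering shown above.
type store struct{ id uint64 }

func (s store) StoreID() uint64 { return s.id }

// Same shape as getAllUsedTiFlashStores: keep only the stores whose ID appears
// in the "used" set collected from the tasks' region replicas.
func filterUsed(all []store, used map[uint64]struct{}) []store {
	out := make([]store, 0, len(used))
	for _, s := range all {
		if _, ok := used[s.StoreID()]; ok {
			out = append(out, s)
		}
	}
	return out
}

func main() {
	all := []store{{1}, {2}, {3}, {4}}
	used := map[uint64]struct{}{2: {}, 4: {}}
	fmt.Println(filterUsed(all, used)) // [{2} {4}]
}
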
@@ -849,11 +861,28 @@ func getAliveStoresAndStoreIDs(ctx context.Context, cache *RegionCache, ttl time
 // 1. tiflash_replica_read policy
 // 2. whether the store is alive
 // After filtering, it will build the RegionInfo.
-func filterAccessibleStoresAndBuildRegionInfo(cache *RegionCache, bo *Backoffer, task *copTask, rpcCtx *tikv.RPCContext, aliveStores *aliveStoresBundle, isTiDBLabelZoneSet bool, tiflashReplicaReadPolicy tiflash.ReplicaRead, regionInfoNeedsReloadOnSendFail []RegionInfo, regionsInOtherZones []uint64, maxRemoteReadCountAllowed int, tidbZone string) (regionInfo RegionInfo, _ []RegionInfo, _ []uint64, err error) {
+func filterAccessibleStoresAndBuildRegionInfo(
+	cache *RegionCache,
+	allStores []uint64,
+	bo *Backoffer,
+	task *copTask,
+	rpcCtx *tikv.RPCContext,
+	aliveStores *aliveStoresBundle,
+	tiflashReplicaReadPolicy tiflash.ReplicaRead,
+	regionInfoNeedsReloadOnSendFail []RegionInfo,
+	regionsInOtherZones []uint64,
+	maxRemoteReadCountAllowed int,
+	tidbZone string) (regionInfo RegionInfo, _ []RegionInfo, _ []uint64, err error) {
 	needCrossZoneAccess := false
-	allStores, _ := cache.GetAllValidTiFlashStores(task.region, rpcCtx.Store, tikv.LabelFilterNoTiFlashWriteNode)
 	allStores, needCrossZoneAccess = filterAllStoresAccordingToTiFlashReplicaRead(allStores, aliveStores, tiflashReplicaReadPolicy)
-	regionInfo = RegionInfo{Region: task.region, Meta: rpcCtx.Meta, Ranges: task.ranges, AllStores: allStores, PartitionIndex: task.partitionIndex}
+
+	regionInfo = RegionInfo{
+		Region:         task.region,
+		Meta:           rpcCtx.Meta,
+		Ranges:         task.ranges,
+		AllStores:      allStores,
+		PartitionIndex: task.partitionIndex}
+
 	if needCrossZoneAccess {
 		regionsInOtherZones = append(regionsInOtherZones, task.region.GetID())
 		regionInfoNeedsReloadOnSendFail = append(regionInfoNeedsReloadOnSendFail, regionInfo)
@@ -862,7 +891,9 @@ func filterAccessibleStoresAndBuildRegionInfo(cache *RegionCache, bo *Backoffer,
 			for i := 0; i < 3 && i < len(regionsInOtherZones); i++ {
 				regionIDErrMsg += fmt.Sprintf("%d, ", regionsInOtherZones[i])
 			}
-			err = errors.Errorf("no less than %d region(s) can not be accessed by TiFlash in the zone [%s]: %setc", len(regionsInOtherZones), tidbZone, regionIDErrMsg)
+			err = errors.Errorf(
+				"no less than %d region(s) can not be accessed by TiFlash in the zone [%s]: %setc",
+				len(regionsInOtherZones), tidbZone, regionIDErrMsg)
 			// We need to reload the region cache here to avoid the failure throughout the region cache refresh TTL.
 			cache.OnSendFailForBatchRegions(bo, rpcCtx.Store, regionInfoNeedsReloadOnSendFail, true, err)
 			return regionInfo, nil, nil, err
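
The error path above caps the listed region IDs at three and appends "etc", so the message stays bounded no matter how many regions are inaccessible. The same pattern in isolation, with fmt.Errorf standing in for the errors.Errorf used in the diff and a hypothetical regionAccessError name:

package main

import "fmt"

// Minimal reproduction of the error-formatting pattern above: print at most
// three offending region IDs, then append "etc" so long lists stay readable.
func regionAccessError(regionIDs []uint64, zone string) error {
	msg := ""
	for i := 0; i < 3 && i < len(regionIDs); i++ {
		msg += fmt.Sprintf("%d, ", regionIDs[i])
	}
	return fmt.Errorf(
		"no less than %d region(s) can not be accessed by TiFlash in the zone [%s]: %setc",
		len(regionIDs), zone, msg)
}

func main() {
	fmt.Println(regionAccessError([]uint64{7, 11, 13, 17, 19}, "us-west-1a"))
	// no less than 5 region(s) can not be accessed by TiFlash in the zone [us-west-1a]: 7, 11, 13, etc
}
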
@@ -895,10 +926,7 @@ func buildBatchCopTasksCore(bo *backoff.Backoffer, store *kvStore, rangesForEach
 	if !isTiDBLabelZoneSet {
 		tiflashReplicaReadPolicy = tiflash.AllReplicas
 	}
-	aliveStores = getAliveStoresAndStoreIDs(bo.GetCtx(), cache, ttl, store, tiflashReplicaReadPolicy, tidbZone)
-	if tiflashReplicaReadPolicy.IsClosestReplicas() {
-		maxRemoteReadCountAllowed = len(aliveStores.storeIDsInTiDBZone) * tiflash.MaxRemoteReadCountPerNodeForClosestReplicas
-	}
+
 	for {
 		var tasks []*copTask
 		rangesLen = 0
@@ -919,17 +947,16 @@ func buildBatchCopTasksCore(bo *backoff.Backoffer, store *kvStore, rangesForEach
 			}
 		}
 
-		var batchTasks []*batchCopTask
-		var regionIDsInOtherZones []uint64
-		var regionInfosNeedReloadOnSendFail []RegionInfo
-		storeTaskMap := make(map[string]*batchCopTask)
+		rpcCtxs := make([]*tikv.RPCContext, 0, len(tasks))
+		usedTiFlashStores := make([][]uint64, 0, len(tasks))
+		usedTiFlashStoresMap := make(map[uint64]struct{}, 0)
 		needRetry := false
-		storeIDsUnionSetForAllTasks := make(map[uint64]struct{})
 		for _, task := range tasks {
 			rpcCtx, err := cache.GetTiFlashRPCContext(bo.TiKVBackoffer(), task.region, isMPP, tikv.LabelFilterNoTiFlashWriteNode)
 			if err != nil {
 				return nil, errors.Trace(err)
 			}
+
 			// When rpcCtx is nil, it's not only attributed to the miss region, but also
 			// some TiFlash stores crash and can't be recovered.
 			// That is not an error that can be easily recovered, so we regard this error
@@ -941,36 +968,62 @@ func buildBatchCopTasksCore(bo *backoff.Backoffer, store *kvStore, rangesForEach
 				// Then `splitRegion` will reload these regions.
 				continue
 			}
+
+			allStores, _ := cache.GetAllValidTiFlashStores(task.region, rpcCtx.Store, tikv.LabelFilterNoTiFlashWriteNode)
+			for _, storeID := range allStores {
+				usedTiFlashStoresMap[storeID] = struct{}{}
+			}
+			rpcCtxs = append(rpcCtxs, rpcCtx)
+			usedTiFlashStores = append(usedTiFlashStores, allStores)
+		}
+
+		if needRetry {
+			// As mentioned above, nil rpcCtx is always attributed to failed stores.
+			// It's equal to long poll the store but get no response. Here we'd better use
+			// TiFlash error to trigger the TiKV fallback mechanism.
+			err := bo.Backoff(tikv.BoTiFlashRPC(), errors.New("Cannot find region with TiFlash peer"))
+			if err != nil {
+				return nil, errors.Trace(err)
+			}
+			continue
+		}
+
+		aliveStores = getAliveStoresAndStoreIDs(bo.GetCtx(), cache, usedTiFlashStoresMap, ttl, store, tiflashReplicaReadPolicy, tidbZone)
+		if tiflashReplicaReadPolicy.IsClosestReplicas() {
+			if len(aliveStores.storeIDsInTiDBZone) == 0 {
+				return nil, errors.Errorf("There is no region in tidb zone(%s)", tidbZone)
+			}
+			maxRemoteReadCountAllowed = len(aliveStores.storeIDsInTiDBZone) * tiflash.MaxRemoteReadCountPerNodeForClosestReplicas
+		}
+
+		var batchTasks []*batchCopTask
+		var regionIDsInOtherZones []uint64
+		var regionInfosNeedReloadOnSendFail []RegionInfo
+		storeTaskMap := make(map[string]*batchCopTask)
+		storeIDsUnionSetForAllTasks := make(map[uint64]struct{})
+		for idx, task := range tasks {
+			var err error
 			var regionInfo RegionInfo
-			regionInfo, regionInfosNeedReloadOnSendFail, regionIDsInOtherZones, err = filterAccessibleStoresAndBuildRegionInfo(cache, bo, task, rpcCtx, aliveStores, isTiDBLabelZoneSet, tiflashReplicaReadPolicy, regionInfosNeedReloadOnSendFail, regionIDsInOtherZones, maxRemoteReadCountAllowed, tidbZone)
+			regionInfo, regionInfosNeedReloadOnSendFail, regionIDsInOtherZones, err = filterAccessibleStoresAndBuildRegionInfo(cache, usedTiFlashStores[idx], bo, task, rpcCtxs[idx], aliveStores, tiflashReplicaReadPolicy, regionInfosNeedReloadOnSendFail, regionIDsInOtherZones, maxRemoteReadCountAllowed, tidbZone)
 			if err != nil {
 				return nil, err
 			}
-			if batchCop, ok := storeTaskMap[rpcCtx.Addr]; ok {
+			if batchCop, ok := storeTaskMap[rpcCtxs[idx].Addr]; ok {
 				batchCop.regionInfos = append(batchCop.regionInfos, regionInfo)
 			} else {
 				batchTask := &batchCopTask{
-					storeAddr:   rpcCtx.Addr,
+					storeAddr:   rpcCtxs[idx].Addr,
 					cmdType:     cmdType,
-					ctx:         rpcCtx,
+					ctx:         rpcCtxs[idx],
 					regionInfos: []RegionInfo{regionInfo},
 				}
-				storeTaskMap[rpcCtx.Addr] = batchTask
+				storeTaskMap[rpcCtxs[idx].Addr] = batchTask
 			}
 			for _, storeID := range regionInfo.AllStores {
 				storeIDsUnionSetForAllTasks[storeID] = struct{}{}
 			}
 		}
-		if needRetry {
-			// As mentioned above, nil rpcCtx is always attributed to failed stores.
-			// It's equal to long poll the store but get no response. Here we'd better use
-			// TiFlash error to trigger the TiKV fallback mechanism.
-			err := bo.Backoff(tikv.BoTiFlashRPC(), errors.New("Cannot find region with TiFlash peer"))
-			if err != nil {
-				return nil, errors.Trace(err)
-			}
-			continue
-		}
+
 		if len(regionIDsInOtherZones) != 0 {
 			warningMsg := fmt.Sprintf("total %d region(s) can not be accessed by TiFlash in the zone [%s]:", len(regionIDsInOtherZones), tidbZone)
 			regionIDErrMsg := ""
regionIDErrMsg := ""
@@ -998,7 +1051,7 @@ func buildBatchCopTasksCore(bo *backoff.Backoffer, store *kvStore, rangesForEach
998
1051
storesUnionSetForAllTasks = append (storesUnionSetForAllTasks , store )
999
1052
}
1000
1053
}
1001
- batchTasks = balanceBatchCopTask (bo . GetCtx (), storesUnionSetForAllTasks , batchTasks , balanceWithContinuity , balanceContinuousRegionCount )
1054
+ batchTasks = balanceBatchCopTask (storesUnionSetForAllTasks , batchTasks , balanceWithContinuity , balanceContinuousRegionCount )
1002
1055
balanceElapsed := time .Since (balanceStart )
1003
1056
if log .GetLevel () <= zap .DebugLevel {
1004
1057
msg := "After region balance:"