Skip to content

Commit 173c2e1

Browse files
nolouchti-chi-bot
andauthored
resource_control: allow configuration of the maximum retry time for the local bucket (#8352) (#8365)
* client/controller: record context error and add slowlog about token bucket (#8344) (#8355) close #8343, ref #8349 client/controller: record context error and add slowlog about token bucket - record low process start time, and log it if it's too slow - record the context error Signed-off-by: Shuning Chen <[email protected]> * This is an automated cherry-pick of #8352 close #8349 Signed-off-by: nolouch <[email protected]> Signed-off-by: Shuning Chen <[email protected]> --------- Signed-off-by: Shuning Chen <[email protected]> Signed-off-by: nolouch <[email protected]> Co-authored-by: Ti Chi Robot <[email protected]>
1 parent 358de10 commit 173c2e1

File tree

9 files changed

+207
-77
lines changed

9 files changed

+207
-77
lines changed

client/resource_group/controller/config.go

Lines changed: 65 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,12 @@ const (
5252
defaultTargetPeriod = 5 * time.Second
5353
// defaultMaxWaitDuration is the max duration to wait for the token before throwing error.
5454
defaultMaxWaitDuration = 30 * time.Second
55+
// defaultLTBTokenRPCMaxDelay is the upper bound of backoff delay for local token bucket RPC.
56+
defaultLTBTokenRPCMaxDelay = 1 * time.Second
57+
// defaultWaitRetryTimes is the number of times to retry when waiting for the token.
58+
defaultWaitRetryTimes = 20
59+
// defaultWaitRetryInterval is the interval to retry when waiting for the token.
60+
defaultWaitRetryInterval = 50 * time.Millisecond
5561
)
5662

5763
const (
@@ -73,18 +79,36 @@ const (
7379

7480
// Because the resource manager has not been deployed in microservice mode,
7581
// do not enable this function.
76-
defaultDegradedModeWaitDuration = 0
82+
defaultDegradedModeWaitDuration = time.Duration(0)
7783
defaultAvgBatchProportion = 0.7
7884
)
7985

80-
// Config is the configuration of the resource manager controller which includes some option for client needed.
81-
type Config struct {
86+
// TokenRPCParams is the parameters for local bucket RPC.
87+
type TokenRPCParams struct {
88+
// WaitRetryInterval is the interval to retry when waiting for the token.
89+
WaitRetryInterval Duration `toml:"wait-retry-interval" json:"wait-retry-interval"`
90+
91+
// WaitRetryTimes is the number of times to retry when waiting for the token.
92+
WaitRetryTimes int `toml:"wait-retry-times" json:"wait-retry-times"`
93+
}
94+
95+
// LocalBucketConfig is the configuration for the local bucket. It is not exported to the server side.
96+
type LocalBucketConfig struct {
97+
TokenRPCParams `toml:"token-rpc-params" json:"token-rpc-params"`
98+
}
99+
100+
// BaseConfig is the configuration of the resource manager controller, which includes the options the client needs.
101+
// TODO: unify the configuration for client and server; the server side is in pkg/mcs/resourcemanager/config.go.
102+
type BaseConfig struct {
82103
// DegradedModeWaitDuration controls whether the resource control client enables degraded mode when the server is disconnected.
83104
DegradedModeWaitDuration Duration `toml:"degraded-mode-wait-duration" json:"degraded-mode-wait-duration"`
84105

85106
// LTBMaxWaitDuration is the max wait time duration for local token bucket.
86107
LTBMaxWaitDuration Duration `toml:"ltb-max-wait-duration" json:"ltb-max-wait-duration"`
87108

109+
// LTBTokenRPCMaxDelay is the upper bound of backoff delay for local token bucket RPC.
110+
LTBTokenRPCMaxDelay Duration `toml:"ltb-token-rpc-max-delay" json:"ltb-token-rpc-max-delay"`
111+
88112
// RequestUnit is the configuration determines the coefficients of the RRU and WRU cost.
89113
// This configuration should be modified carefully.
90114
RequestUnit RequestUnitConfig `toml:"request-unit" json:"request-unit"`
@@ -93,13 +117,43 @@ type Config struct {
93117
EnableControllerTraceLog bool `toml:"enable-controller-trace-log" json:"enable-controller-trace-log,string"`
94118
}
95119

120+
// Config is the configuration of the resource manager controller.
121+
type Config struct {
122+
BaseConfig
123+
LocalBucketConfig
124+
}
125+
126+
// Adjust adjusts the configuration.
127+
func (c *Config) Adjust() {
128+
// Validate the configuration. TODO: add a separate validation function.
129+
if c.BaseConfig.LTBMaxWaitDuration.Duration == 0 {
130+
c.BaseConfig.LTBMaxWaitDuration = NewDuration(defaultMaxWaitDuration)
131+
}
132+
if c.LocalBucketConfig.WaitRetryInterval.Duration == 0 {
133+
c.LocalBucketConfig.WaitRetryInterval = NewDuration(defaultWaitRetryInterval)
134+
}
135+
// Adjust the client settings: derive the retry times from LTBTokenRPCMaxDelay and WaitRetryInterval.
136+
if int(c.BaseConfig.LTBTokenRPCMaxDelay.Duration) != int(c.LocalBucketConfig.WaitRetryInterval.Duration)*c.LocalBucketConfig.WaitRetryTimes {
137+
c.LocalBucketConfig.WaitRetryTimes = int(c.BaseConfig.LTBTokenRPCMaxDelay.Duration / c.LocalBucketConfig.WaitRetryInterval.Duration)
138+
}
139+
}
140+
96141
// DefaultConfig returns the default resource manager controller configuration.
97142
func DefaultConfig() *Config {
98143
return &Config{
99-
DegradedModeWaitDuration: NewDuration(defaultDegradedModeWaitDuration),
100-
LTBMaxWaitDuration: NewDuration(defaultMaxWaitDuration),
101-
RequestUnit: DefaultRequestUnitConfig(),
102-
EnableControllerTraceLog: false,
144+
BaseConfig: BaseConfig{
145+
DegradedModeWaitDuration: NewDuration(defaultDegradedModeWaitDuration),
146+
RequestUnit: DefaultRequestUnitConfig(),
147+
EnableControllerTraceLog: false,
148+
LTBMaxWaitDuration: NewDuration(defaultMaxWaitDuration),
149+
LTBTokenRPCMaxDelay: NewDuration(defaultLTBTokenRPCMaxDelay),
150+
},
151+
LocalBucketConfig: LocalBucketConfig{
152+
TokenRPCParams: TokenRPCParams{
153+
WaitRetryInterval: NewDuration(defaultWaitRetryInterval),
154+
WaitRetryTimes: defaultWaitRetryTimes,
155+
},
156+
},
103157
}
104158
}
105159

@@ -155,6 +209,8 @@ type RUConfig struct {
155209

156210
// some config for client
157211
LTBMaxWaitDuration time.Duration
212+
WaitRetryInterval time.Duration
213+
WaitRetryTimes int
158214
DegradedModeWaitDuration time.Duration
159215
}
160216

@@ -176,6 +232,8 @@ func GenerateRUConfig(config *Config) *RUConfig {
176232
WriteBytesCost: RequestUnit(config.RequestUnit.WriteCostPerByte),
177233
CPUMsCost: RequestUnit(config.RequestUnit.CPUMsCost),
178234
LTBMaxWaitDuration: config.LTBMaxWaitDuration.Duration,
235+
WaitRetryInterval: config.WaitRetryInterval.Duration,
236+
WaitRetryTimes: config.WaitRetryTimes,
179237
DegradedModeWaitDuration: config.DegradedModeWaitDuration.Duration,
180238
}
181239
}

client/resource_group/controller/controller.go

Lines changed: 49 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,12 @@ import (
3838
)
3939

4040
const (
41-
controllerConfigPath = "resource_group/controller"
42-
maxRetry = 10
43-
retryInterval = 50 * time.Millisecond
44-
maxNotificationChanLen = 200
45-
needTokensAmplification = 1.1
46-
trickleReserveDuration = 1250 * time.Millisecond
47-
48-
watchRetryInterval = 30 * time.Second
41+
controllerConfigPath = "resource_group/controller"
42+
maxNotificationChanLen = 200
43+
needTokensAmplification = 1.1
44+
trickleReserveDuration = 1250 * time.Millisecond
45+
slowNotifyFilterDuration = 10 * time.Millisecond
46+
watchRetryInterval = 30 * time.Second
4947
)
5048

5149
type selectType int
@@ -104,6 +102,20 @@ func WithMaxWaitDuration(d time.Duration) ResourceControlCreateOption {
104102
}
105103
}
106104

105+
// WithWaitRetryInterval is the option to set the retry interval when waiting for the token.
106+
func WithWaitRetryInterval(d time.Duration) ResourceControlCreateOption {
107+
return func(controller *ResourceGroupsController) {
108+
controller.ruConfig.WaitRetryInterval = d
109+
}
110+
}
111+
112+
// WithWaitRetryTimes is the option to set the number of times to retry when waiting for the token.
113+
func WithWaitRetryTimes(times int) ResourceControlCreateOption {
114+
return func(controller *ResourceGroupsController) {
115+
controller.ruConfig.WaitRetryTimes = times
116+
}
117+
}
118+
107119
var _ ResourceGroupKVInterceptor = (*ResourceGroupsController)(nil)
108120

109121
// ResourceGroupsController implements ResourceGroupKVInterceptor.
@@ -119,7 +131,7 @@ type ResourceGroupsController struct {
119131
calculators []ResourceCalculator
120132

121133
// When a signal is received, it means the number of available token is low.
122-
lowTokenNotifyChan chan struct{}
134+
lowTokenNotifyChan chan notifyMsg
123135
// When a token bucket response received from server, it will be sent to the channel.
124136
tokenResponseChan chan []*rmpb.TokenBucketResponse
125137
// When the token bucket of a resource group is updated, it will be sent to the channel.
@@ -161,7 +173,7 @@ func NewResourceGroupController(
161173
clientUniqueID: clientUniqueID,
162174
provider: provider,
163175
ruConfig: ruConfig,
164-
lowTokenNotifyChan: make(chan struct{}, 1),
176+
lowTokenNotifyChan: make(chan notifyMsg, 1),
165177
tokenResponseChan: make(chan []*rmpb.TokenBucketResponse, 1),
166178
tokenBucketUpdateChan: make(chan *groupCostController, maxNotificationChanLen),
167179
opts: opts,
@@ -172,6 +184,7 @@ func NewResourceGroupController(
172184
log.Info("load resource controller config", zap.Reflect("config", config), zap.Reflect("ru-config", controller.ruConfig))
173185
controller.calculators = []ResourceCalculator{newKVCalculator(controller.ruConfig), newSQLCalculator(controller.ruConfig)}
174186
controller.safeRuConfig.Store(controller.ruConfig)
187+
enableControllerTraceLog.Store(config.EnableControllerTraceLog)
175188
return controller, nil
176189
}
177190

@@ -180,12 +193,13 @@ func loadServerConfig(ctx context.Context, provider ResourceGroupProvider) (*Con
180193
if err != nil {
181194
return nil, err
182195
}
196+
config := DefaultConfig()
197+
defer config.Adjust()
183198
kvs := resp.GetKvs()
184199
if len(kvs) == 0 {
185200
log.Warn("[resource group controller] server does not save config, load config failed")
186-
return DefaultConfig(), nil
201+
return config, nil
187202
}
188-
config := &Config{}
189203
err = json.Unmarshal(kvs[0].GetValue(), config)
190204
if err != nil {
191205
return nil, err
@@ -267,7 +281,7 @@ func (c *ResourceGroupsController) Start(ctx context.Context) {
267281
c.executeOnAllGroups((*groupCostController).updateRunState)
268282
c.executeOnAllGroups((*groupCostController).updateAvgRequestResourcePerSec)
269283
if len(c.run.currentRequests) == 0 {
270-
c.collectTokenBucketRequests(c.loopCtx, FromPeriodReport, periodicReport /* select resource groups which should be reported periodically */)
284+
c.collectTokenBucketRequests(c.loopCtx, FromPeriodReport, periodicReport /* select resource groups which should be reported periodically */, notifyMsg{})
271285
}
272286
case <-watchRetryTimer.C:
273287
if !c.ruConfig.isSingleGroupByKeyspace && watchMetaChannel == nil {
@@ -288,7 +302,6 @@ func (c *ResourceGroupsController) Start(ctx context.Context) {
288302
watchRetryTimer.Reset(watchRetryInterval)
289303
}
290304
}
291-
292305
case <-emergencyTokenAcquisitionTicker.C:
293306
c.executeOnAllGroups((*groupCostController).resetEmergencyTokenAcquisition)
294307
/* channels */
@@ -305,11 +318,11 @@ func (c *ResourceGroupsController) Start(ctx context.Context) {
305318
c.handleTokenBucketResponse(resp)
306319
}
307320
c.run.currentRequests = nil
308-
case <-c.lowTokenNotifyChan:
321+
case notifyMsg := <-c.lowTokenNotifyChan:
309322
c.executeOnAllGroups((*groupCostController).updateRunState)
310323
c.executeOnAllGroups((*groupCostController).updateAvgRequestResourcePerSec)
311324
if len(c.run.currentRequests) == 0 {
312-
c.collectTokenBucketRequests(c.loopCtx, FromLowRU, lowToken /* select low tokens resource group */)
325+
c.collectTokenBucketRequests(c.loopCtx, FromLowRU, lowToken /* select low tokens resource group */, notifyMsg)
313326
}
314327
if c.run.inDegradedMode {
315328
c.executeOnAllGroups((*groupCostController).applyDegradedMode)
@@ -366,10 +379,11 @@ func (c *ResourceGroupsController) Start(ctx context.Context) {
366379
}
367380
for _, item := range resp {
368381
cfgRevision = item.Kv.ModRevision
369-
config := &Config{}
382+
config := DefaultConfig()
370383
if err := json.Unmarshal(item.Kv.Value, config); err != nil {
371384
continue
372385
}
386+
config.Adjust()
373387
c.ruConfig = GenerateRUConfig(config)
374388

375389
// Stay compatible with serverless
@@ -383,7 +397,6 @@ func (c *ResourceGroupsController) Start(ctx context.Context) {
383397
}
384398
log.Info("load resource controller config after config changed", zap.Reflect("config", config), zap.Reflect("ruConfig", c.ruConfig))
385399
}
386-
387400
case gc := <-c.tokenBucketUpdateChan:
388401
now := gc.run.now
389402
go gc.handleTokenBucketUpdateEvent(c.loopCtx, now)
@@ -489,7 +502,7 @@ func (c *ResourceGroupsController) handleTokenBucketResponse(resp []*rmpb.TokenB
489502
}
490503
}
491504

492-
func (c *ResourceGroupsController) collectTokenBucketRequests(ctx context.Context, source string, typ selectType) {
505+
func (c *ResourceGroupsController) collectTokenBucketRequests(ctx context.Context, source string, typ selectType, notifyMsg notifyMsg) {
493506
c.run.currentRequests = make([]*rmpb.TokenBucketRequest, 0)
494507
c.groupsController.Range(func(name, value any) bool {
495508
gc := value.(*groupCostController)
@@ -501,11 +514,11 @@ func (c *ResourceGroupsController) collectTokenBucketRequests(ctx context.Contex
501514
return true
502515
})
503516
if len(c.run.currentRequests) > 0 {
504-
c.sendTokenBucketRequests(ctx, c.run.currentRequests, source)
517+
c.sendTokenBucketRequests(ctx, c.run.currentRequests, source, notifyMsg)
505518
}
506519
}
507520

508-
func (c *ResourceGroupsController) sendTokenBucketRequests(ctx context.Context, requests []*rmpb.TokenBucketRequest, source string) {
521+
func (c *ResourceGroupsController) sendTokenBucketRequests(ctx context.Context, requests []*rmpb.TokenBucketRequest, source string, notifyMsg notifyMsg) {
509522
now := time.Now()
510523
req := &rmpb.TokenBucketsRequest{
511524
Requests: requests,
@@ -523,13 +536,16 @@ func (c *ResourceGroupsController) sendTokenBucketRequests(ctx context.Context,
523536
if err != nil {
524537
// Don't log any errors caused by the stopper canceling the context.
525538
if !errors.ErrorEqual(err, context.Canceled) {
526-
log.L().Sugar().Infof("[resource group controller] token bucket rpc error: %v", err)
539+
log.Error("[resource group controller] token bucket rpc error", zap.Error(err))
527540
}
528541
resp = nil
529542
failedTokenRequestDuration.Observe(latency.Seconds())
530543
} else {
531544
successfulTokenRequestDuration.Observe(latency.Seconds())
532545
}
546+
if !notifyMsg.startTime.IsZero() && time.Since(notifyMsg.startTime) > slowNotifyFilterDuration {
547+
log.Warn("[resource group controller] slow token bucket request", zap.String("source", source), zap.Duration("cost", time.Since(notifyMsg.startTime)))
548+
}
533549
logControllerTrace("[resource group controller] token bucket response", zap.Time("now", time.Now()), zap.Any("resp", resp), zap.String("source", source), zap.Duration("latency", latency))
534550
c.tokenResponseChan <- resp
535551
}()
@@ -625,7 +641,7 @@ type groupCostController struct {
625641
// fast path to make once token limit with un-limit burst.
626642
burstable *atomic.Bool
627643

628-
lowRUNotifyChan chan<- struct{}
644+
lowRUNotifyChan chan<- notifyMsg
629645
tokenBucketUpdateChan chan<- *groupCostController
630646

631647
// run contains the state that is updated by the main loop.
@@ -715,7 +731,7 @@ type tokenCounter struct {
715731
func newGroupCostController(
716732
group *rmpb.ResourceGroup,
717733
mainCfg *RUConfig,
718-
lowRUNotifyChan chan struct{},
734+
lowRUNotifyChan chan notifyMsg,
719735
tokenBucketUpdateChan chan *groupCostController,
720736
) (*groupCostController, error) {
721737
switch group.Mode {
@@ -834,7 +850,7 @@ func (gc *groupCostController) updateRunState() {
834850
}
835851
*gc.run.consumption = *gc.mu.consumption
836852
gc.mu.Unlock()
837-
logControllerTrace("[resource group controller] update run state", zap.Any("request-unit-consumption", gc.run.consumption))
853+
logControllerTrace("[resource group controller] update run state", zap.String("name", gc.name), zap.Any("request-unit-consumption", gc.run.consumption))
838854
gc.run.now = newTime
839855
}
840856

@@ -1034,7 +1050,7 @@ func (gc *groupCostController) applyBasicConfigForRUTokenCounters() {
10341050
cfg.NewRate = 99999999
10351051
})
10361052
counter.limiter.Reconfigure(gc.run.now, cfg, resetLowProcess())
1037-
log.Info("[resource group controller] resource token bucket enter degraded mode", zap.String("resource-group", gc.name), zap.String("type", rmpb.RequestUnitType_name[int32(typ)]))
1053+
log.Info("[resource group controller] resource token bucket enter degraded mode", zap.String("name", gc.name), zap.String("type", rmpb.RequestUnitType_name[int32(typ)]))
10381054
}
10391055
}
10401056

@@ -1088,6 +1104,9 @@ func (gc *groupCostController) modifyTokenCounter(counter *tokenCounter, bucket
10881104
timerDuration = (trickleDuration + trickleReserveDuration) / 2
10891105
}
10901106
counter.notify.mu.Lock()
1107+
if counter.notify.setupNotificationTimer != nil {
1108+
counter.notify.setupNotificationTimer.Stop()
1109+
}
10911110
counter.notify.setupNotificationTimer = time.NewTimer(timerDuration)
10921111
counter.notify.setupNotificationCh = counter.notify.setupNotificationTimer.C
10931112
counter.notify.setupNotificationThreshold = 1
@@ -1222,7 +1241,7 @@ func (gc *groupCostController) onRequestWait(
12221241
var i int
12231242
var d time.Duration
12241243
retryLoop:
1225-
for i = 0; i < maxRetry; i++ {
1244+
for i = 0; i < gc.mainCfg.WaitRetryTimes; i++ {
12261245
switch gc.mode {
12271246
case rmpb.GroupMode_RawMode:
12281247
res := make([]*Reservation, 0, len(requestResourceLimitTypeList))
@@ -1246,8 +1265,8 @@ func (gc *groupCostController) onRequestWait(
12461265
}
12471266
}
12481267
gc.metrics.requestRetryCounter.Inc()
1249-
time.Sleep(retryInterval)
1250-
waitDuration += retryInterval
1268+
time.Sleep(gc.mainCfg.WaitRetryInterval)
1269+
waitDuration += gc.mainCfg.WaitRetryInterval
12511270
}
12521271
if err != nil {
12531272
if errs.ErrClientResourceGroupThrottled.Equal(err) {
@@ -1260,7 +1279,7 @@ func (gc *groupCostController) onRequestWait(
12601279
sub(gc.mu.consumption, delta)
12611280
gc.mu.Unlock()
12621281
failpoint.Inject("triggerUpdate", func() {
1263-
gc.lowRUNotifyChan <- struct{}{}
1282+
gc.lowRUNotifyChan <- notifyMsg{}
12641283
})
12651284
return nil, nil, waitDuration, 0, err
12661285
}

client/resource_group/controller/controller_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ func createTestGroupCostController(re *require.Assertions) *groupCostController
4545
JobTypes: []string{"lightning", "br"},
4646
},
4747
}
48-
ch1 := make(chan struct{})
48+
ch1 := make(chan notifyMsg)
4949
ch2 := make(chan *groupCostController)
5050
gc, err := newGroupCostController(group, DefaultRUConfig(), ch1, ch2)
5151
re.NoError(err)

0 commit comments

Comments
 (0)