Skip to content

Commit c870a52

Browse files
authored
ebs br: make sure fsr credit is full filled (#48627) (#48743)
close #48629
1 parent 3a55235 commit c870a52

File tree

4 files changed

+122
-9
lines changed

4 files changed

+122
-9
lines changed

br/pkg/aws/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ go_library(
1212
"@com_github_aws_aws_sdk_go//aws",
1313
"@com_github_aws_aws_sdk_go//aws/awserr",
1414
"@com_github_aws_aws_sdk_go//aws/session",
15+
"@com_github_aws_aws_sdk_go//service/cloudwatch",
1516
"@com_github_aws_aws_sdk_go//service/ec2",
1617
"@com_github_aws_aws_sdk_go//service/ec2/ec2iface",
1718
"@com_github_pingcap_errors//:errors",

br/pkg/aws/ebs.go

Lines changed: 119 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,15 @@ package aws
55
import (
66
"context"
77
"fmt"
8+
"math"
89
"strings"
910
"sync"
1011
"time"
1112

1213
"github.com/aws/aws-sdk-go/aws"
1314
"github.com/aws/aws-sdk-go/aws/awserr"
1415
"github.com/aws/aws-sdk-go/aws/session"
16+
"github.com/aws/aws-sdk-go/service/cloudwatch"
1517
"github.com/aws/aws-sdk-go/service/ec2"
1618
"github.com/aws/aws-sdk-go/service/ec2/ec2iface"
1719
"github.com/pingcap/errors"
@@ -31,7 +33,8 @@ const (
3133
)
3234

3335
type EC2Session struct {
34-
ec2 ec2iface.EC2API
36+
ec2 ec2iface.EC2API
37+
cloudwatchClient *cloudwatch.CloudWatch
3538
// aws operation concurrency
3639
concurrency uint
3740
}
@@ -51,7 +54,8 @@ func NewEC2Session(concurrency uint, region string) (*EC2Session, error) {
5154
return nil, errors.Trace(err)
5255
}
5356
ec2Session := ec2.New(sess)
54-
return &EC2Session{ec2: ec2Session, concurrency: concurrency}, nil
57+
cloudwatchClient := cloudwatch.New(sess)
58+
return &EC2Session{ec2: ec2Session, cloudwatchClient: cloudwatchClient, concurrency: concurrency}, nil
5559
}
5660

5761
// CreateSnapshots is the main step to control the data volume snapshots.
@@ -325,8 +329,63 @@ func (e *EC2Session) EnableDataFSR(meta *config.EBSBasedBRMeta, targetAZ string)
325329
return snapshotsIDsMap, eg.Wait()
326330
}
327331

328-
// waitDataFSREnabled waits FSR for data volume snapshots are all enabled
332+
// waitDataFSREnabled waits until FSR is enabled for all data volume snapshots and they also have enough credit balance
329333
func (e *EC2Session) waitDataFSREnabled(snapShotIDs []*string, targetAZ string) error {
334+
// Record current time
335+
start := time.Now()
336+
337+
// get the maximum size of volumes, in GiB
338+
var maxVolumeSize int64 = 0
339+
resp, err := e.ec2.DescribeSnapshots(&ec2.DescribeSnapshotsInput{SnapshotIds: snapShotIDs})
340+
if err != nil {
341+
return errors.Trace(err)
342+
}
343+
if len(resp.Snapshots) <= 0 {
344+
return errors.Errorf("specified snapshot [%s] is not found", *snapShotIDs[0])
345+
}
346+
347+
for _, s := range resp.Snapshots {
348+
if *s.VolumeSize > maxVolumeSize {
349+
maxVolumeSize = *s.VolumeSize
350+
}
351+
}
352+
353+
// Calculate the time in minutes to fill 1.0 credit according to
354+
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-fast-snapshot-restore.html#volume-creation-credits
355+
// 5 minutes more is just for safe
356+
fillElapsedTime := 60.0/(math.Min(10, 1024.0/(float64)(maxVolumeSize))) + 5
357+
358+
// We have to sleep for at least fillElapsedTime minutes in order to make credits are filled to 1.0
359+
// Let's heartbeat every 5 minutes
360+
for time.Since(start) <= time.Duration(fillElapsedTime)*time.Minute {
361+
log.Info("FSR enablement is ongoing, going to sleep for 5 minutes...")
362+
time.Sleep(5 * time.Minute)
363+
}
364+
365+
// Wait that all snapshot has enough fsr credit balance, it's very likely true since we have wait for long enough
366+
log.Info("Start check and wait all snapshots have enough fsr credit balance")
367+
368+
startIdx := 0
369+
retryCount := 0
370+
for startIdx < len(snapShotIDs) {
371+
creditBalance, _ := e.getFSRCreditBalance(snapShotIDs[startIdx], targetAZ)
372+
if creditBalance != nil && *creditBalance >= 1.0 {
373+
startIdx++
374+
retryCount = 0
375+
} else {
376+
if creditBalance == nil {
377+
// For invalid calling, retry 3 times
378+
if retryCount >= 3 {
379+
return errors.Errorf("cloudwatch metrics for %s operation failed after retrying", *snapShotIDs[startIdx])
380+
}
381+
retryCount++
382+
}
383+
// Retry for both invalid calling and not enough fsr credit
384+
// Cloudwatch by default flushes every 5 seconds. So, 20 seconds wait should be enough
385+
time.Sleep(20 * time.Second)
386+
}
387+
}
388+
330389
// Create a map to store the strings as keys
331390
pendingSnapshots := make(map[string]struct{})
332391

@@ -379,6 +438,51 @@ func (e *EC2Session) waitDataFSREnabled(snapShotIDs []*string, targetAZ string)
379438
}
380439
}
381440

441+
// getFSRCreditBalance is used to get maximum fsr credit balance of snapshot for last 5 minutes
442+
func (e *EC2Session) getFSRCreditBalance(snapshotID *string, targetAZ string) (*float64, error) {
443+
// Set the time range to query for metrics
444+
startTime := time.Now().Add(-5 * time.Minute)
445+
endTime := time.Now()
446+
447+
// Prepare the input for the GetMetricStatisticsWithContext API call
448+
input := &cloudwatch.GetMetricStatisticsInput{
449+
StartTime: aws.Time(startTime),
450+
EndTime: aws.Time(endTime),
451+
Namespace: aws.String("AWS/EBS"),
452+
MetricName: aws.String("FastSnapshotRestoreCreditsBalance"),
453+
Dimensions: []*cloudwatch.Dimension{
454+
{
455+
Name: aws.String("SnapshotId"),
456+
Value: snapshotID,
457+
},
458+
{
459+
Name: aws.String("AvailabilityZone"),
460+
Value: aws.String(targetAZ),
461+
},
462+
},
463+
Period: aws.Int64(300),
464+
Statistics: []*string{aws.String("Maximum")},
465+
}
466+
467+
log.Info("metrics input", zap.Any("input", input))
468+
469+
// Call cloudwatchClient API to retrieve the FastSnapshotRestoreCreditsBalance metric data
470+
resp, err := e.cloudwatchClient.GetMetricStatisticsWithContext(context.Background(), input)
471+
if err != nil {
472+
log.Error("GetMetricStatisticsWithContext failed", zap.Error(err))
473+
return nil, errors.Trace(err)
474+
}
475+
476+
// parse the response
477+
if len(resp.Datapoints) == 0 {
478+
log.Warn("No result for metric FastSnapshotRestoreCreditsBalance returned", zap.Stringp("snapshot", snapshotID))
479+
return nil, nil
480+
}
481+
result := resp.Datapoints[0]
482+
log.Info("credit balance", zap.Stringp("snapshot", snapshotID), zap.Float64p("credit", result.Maximum))
483+
return result.Maximum, nil
484+
}
485+
382486
// DisableDataFSR disables FSR for data volume snapshots
383487
func (e *EC2Session) DisableDataFSR(snapshotsIDsMap map[string][]*string) error {
384488
if len(snapshotsIDsMap) == 0 {
@@ -530,7 +634,7 @@ func (e *EC2Session) CreateVolumes(meta *config.EBSBasedBRMeta, volumeType strin
530634
return newVolumeIDMap, eg.Wait()
531635
}
532636

533-
func (e *EC2Session) WaitVolumesCreated(volumeIDMap map[string]string, progress glue.Progress) (int64, error) {
637+
func (e *EC2Session) WaitVolumesCreated(volumeIDMap map[string]string, progress glue.Progress, fsrEnabledRequired bool) (int64, error) {
534638
pendingVolumes := make([]*string, 0, len(volumeIDMap))
535639
for oldVolID := range volumeIDMap {
536640
newVolumeID := volumeIDMap[oldVolID]
@@ -550,7 +654,11 @@ func (e *EC2Session) WaitVolumesCreated(volumeIDMap map[string]string, progress
550654
return 0, errors.Trace(err)
551655
}
552656

553-
createdVolumeSize, unfinishedVolumes := e.HandleDescribeVolumesResponse(resp)
657+
createdVolumeSize, unfinishedVolumes, err := e.HandleDescribeVolumesResponse(resp, fsrEnabledRequired)
658+
if err != nil {
659+
return 0, errors.Trace(err)
660+
}
661+
554662
progress.IncBy(int64(len(pendingVolumes) - len(unfinishedVolumes)))
555663
totalVolumeSize += createdVolumeSize
556664
pendingVolumes = unfinishedVolumes
@@ -593,12 +701,16 @@ func ec2Tag(key, val string) *ec2.Tag {
593701
return &ec2.Tag{Key: &key, Value: &val}
594702
}
595703

596-
func (e *EC2Session) HandleDescribeVolumesResponse(resp *ec2.DescribeVolumesOutput) (int64, []*string) {
704+
func (e *EC2Session) HandleDescribeVolumesResponse(resp *ec2.DescribeVolumesOutput, fsrEnabledRequired bool) (int64, []*string, error) {
597705
totalVolumeSize := int64(0)
598706

599707
var unfinishedVolumes []*string
600708
for _, volume := range resp.Volumes {
601709
if *volume.State == ec2.VolumeStateAvailable {
710+
if fsrEnabledRequired && volume.FastRestored != nil && !*volume.FastRestored {
711+
log.Error("snapshot fsr is not enabled for the volume", zap.String("volume", *volume.SnapshotId))
712+
return 0, nil, errors.Errorf("Snapshot [%s] of volume [%s] is not fsr enabled", *volume.SnapshotId, *volume.VolumeId)
713+
}
602714
log.Info("volume is available", zap.String("id", *volume.VolumeId))
603715
totalVolumeSize += *volume.Size
604716
} else {
@@ -607,5 +719,5 @@ func (e *EC2Session) HandleDescribeVolumesResponse(resp *ec2.DescribeVolumesOutp
607719
}
608720
}
609721

610-
return totalVolumeSize, unfinishedVolumes
722+
return totalVolumeSize, unfinishedVolumes, nil
611723
}

br/pkg/aws/ebs_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ func TestHandleDescribeVolumesResponse(t *testing.T) {
7272
}
7373

7474
e := &EC2Session{}
75-
createdVolumeSize, unfinishedVolumes := e.HandleDescribeVolumesResponse(curentVolumesStates)
75+
createdVolumeSize, unfinishedVolumes, _ := e.HandleDescribeVolumesResponse(curentVolumesStates, false)
7676
require.Equal(t, int64(4), createdVolumeSize)
7777
require.Equal(t, 1, len(unfinishedVolumes))
7878
}

br/pkg/task/restore_ebs_meta.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,7 @@ func (h *restoreEBSMetaHelper) restoreVolumes(progress glue.Progress) (map[strin
262262
if err != nil {
263263
return nil, 0, errors.Trace(err)
264264
}
265-
totalSize, err = ec2Session.WaitVolumesCreated(volumeIDMap, progress)
265+
totalSize, err = ec2Session.WaitVolumesCreated(volumeIDMap, progress, h.cfg.UseFSR)
266266
if err != nil {
267267
return nil, 0, errors.Trace(err)
268268
}

0 commit comments

Comments
 (0)