@@ -5,13 +5,15 @@ package aws
5
5
import (
6
6
"context"
7
7
"fmt"
8
+ "math"
8
9
"strings"
9
10
"sync"
10
11
"time"
11
12
12
13
"github.com/aws/aws-sdk-go/aws"
13
14
"github.com/aws/aws-sdk-go/aws/awserr"
14
15
"github.com/aws/aws-sdk-go/aws/session"
16
+ "github.com/aws/aws-sdk-go/service/cloudwatch"
15
17
"github.com/aws/aws-sdk-go/service/ec2"
16
18
"github.com/aws/aws-sdk-go/service/ec2/ec2iface"
17
19
"github.com/pingcap/errors"
@@ -31,7 +33,8 @@ const (
31
33
)
32
34
33
35
// EC2Session groups the AWS service clients used by the methods on this type:
// an EC2 API client and a CloudWatch client (NewEC2Session creates both from
// the same AWS session), together with the configured concurrency for AWS
// operations.
type EC2Session struct {
34
- ec2 ec2iface.EC2API
36
+ ec2 ec2iface.EC2API
37
+ cloudwatchClient * cloudwatch.CloudWatch
35
38
// aws operation concurrency
36
39
concurrency uint
37
40
}
@@ -51,7 +54,8 @@ func NewEC2Session(concurrency uint, region string) (*EC2Session, error) {
51
54
return nil , errors .Trace (err )
52
55
}
53
56
ec2Session := ec2 .New (sess )
54
- return & EC2Session {ec2 : ec2Session , concurrency : concurrency }, nil
57
+ cloudwatchClient := cloudwatch .New (sess )
58
+ return & EC2Session {ec2 : ec2Session , cloudwatchClient : cloudwatchClient , concurrency : concurrency }, nil
55
59
}
56
60
57
61
// CreateSnapshots performs the main steps to control the data volume snapshots.
@@ -325,8 +329,63 @@ func (e *EC2Session) EnableDataFSR(meta *config.EBSBasedBRMeta, targetAZ string)
325
329
return snapshotsIDsMap , eg .Wait ()
326
330
}
327
331
328
- // waitDataFSREnabled waits FSR for data volume snapshots are all enabled
332
+ // waitDataFSREnabled waits until FSR is enabled for all data volume snapshots and they have enough credit balance
329
333
func (e * EC2Session ) waitDataFSREnabled (snapShotIDs []* string , targetAZ string ) error {
334
+ // Record current time
335
+ start := time .Now ()
336
+
337
+ // get the maximum size of volumes, in GiB
338
+ var maxVolumeSize int64 = 0
339
+ resp , err := e .ec2 .DescribeSnapshots (& ec2.DescribeSnapshotsInput {SnapshotIds : snapShotIDs })
340
+ if err != nil {
341
+ return errors .Trace (err )
342
+ }
343
+ if len (resp .Snapshots ) <= 0 {
344
+ return errors .Errorf ("specified snapshot [%s] is not found" , * snapShotIDs [0 ])
345
+ }
346
+
347
+ for _ , s := range resp .Snapshots {
348
+ if * s .VolumeSize > maxVolumeSize {
349
+ maxVolumeSize = * s .VolumeSize
350
+ }
351
+ }
352
+
353
+ // Calculate the time in minutes to fill 1.0 credit according to
354
+ // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-fast-snapshot-restore.html#volume-creation-credits
355
+ // 5 minutes more is just for safe
356
+ fillElapsedTime := 60.0 / (math .Min (10 , 1024.0 / (float64 )(maxVolumeSize ))) + 5
357
+
358
+ // We have to sleep for at least fillElapsedTime minutes in order to make credits are filled to 1.0
359
+ // Let's heartbeat every 5 minutes
360
+ for time .Since (start ) <= time .Duration (fillElapsedTime )* time .Minute {
361
+ log .Info ("FSR enablement is ongoing, going to sleep for 5 minutes..." )
362
+ time .Sleep (5 * time .Minute )
363
+ }
364
+
365
+ // Wait that all snapshot has enough fsr credit balance, it's very likely true since we have wait for long enough
366
+ log .Info ("Start check and wait all snapshots have enough fsr credit balance" )
367
+
368
+ startIdx := 0
369
+ retryCount := 0
370
+ for startIdx < len (snapShotIDs ) {
371
+ creditBalance , _ := e .getFSRCreditBalance (snapShotIDs [startIdx ], targetAZ )
372
+ if creditBalance != nil && * creditBalance >= 1.0 {
373
+ startIdx ++
374
+ retryCount = 0
375
+ } else {
376
+ if creditBalance == nil {
377
+ // For invalid calling, retry 3 times
378
+ if retryCount >= 3 {
379
+ return errors .Errorf ("cloudwatch metrics for %s operation failed after retrying" , * snapShotIDs [startIdx ])
380
+ }
381
+ retryCount ++
382
+ }
383
+ // Retry for both invalid calling and not enough fsr credit
384
+ // Cloudwatch by default flushes every 5 seconds. So, 20 seconds wait should be enough
385
+ time .Sleep (20 * time .Second )
386
+ }
387
+ }
388
+
330
389
// Create a map to store the strings as keys
331
390
pendingSnapshots := make (map [string ]struct {})
332
391
@@ -379,6 +438,51 @@ func (e *EC2Session) waitDataFSREnabled(snapShotIDs []*string, targetAZ string)
379
438
}
380
439
}
381
440
441
+ // getFSRCreditBalance is used to get maximum fsr credit balance of snapshot for last 5 minutes
442
+ func (e * EC2Session ) getFSRCreditBalance (snapshotID * string , targetAZ string ) (* float64 , error ) {
443
+ // Set the time range to query for metrics
444
+ startTime := time .Now ().Add (- 5 * time .Minute )
445
+ endTime := time .Now ()
446
+
447
+ // Prepare the input for the GetMetricStatisticsWithContext API call
448
+ input := & cloudwatch.GetMetricStatisticsInput {
449
+ StartTime : aws .Time (startTime ),
450
+ EndTime : aws .Time (endTime ),
451
+ Namespace : aws .String ("AWS/EBS" ),
452
+ MetricName : aws .String ("FastSnapshotRestoreCreditsBalance" ),
453
+ Dimensions : []* cloudwatch.Dimension {
454
+ {
455
+ Name : aws .String ("SnapshotId" ),
456
+ Value : snapshotID ,
457
+ },
458
+ {
459
+ Name : aws .String ("AvailabilityZone" ),
460
+ Value : aws .String (targetAZ ),
461
+ },
462
+ },
463
+ Period : aws .Int64 (300 ),
464
+ Statistics : []* string {aws .String ("Maximum" )},
465
+ }
466
+
467
+ log .Info ("metrics input" , zap .Any ("input" , input ))
468
+
469
+ // Call cloudwatchClient API to retrieve the FastSnapshotRestoreCreditsBalance metric data
470
+ resp , err := e .cloudwatchClient .GetMetricStatisticsWithContext (context .Background (), input )
471
+ if err != nil {
472
+ log .Error ("GetMetricStatisticsWithContext failed" , zap .Error (err ))
473
+ return nil , errors .Trace (err )
474
+ }
475
+
476
+ // parse the response
477
+ if len (resp .Datapoints ) == 0 {
478
+ log .Warn ("No result for metric FastSnapshotRestoreCreditsBalance returned" , zap .Stringp ("snapshot" , snapshotID ))
479
+ return nil , nil
480
+ }
481
+ result := resp .Datapoints [0 ]
482
+ log .Info ("credit balance" , zap .Stringp ("snapshot" , snapshotID ), zap .Float64p ("credit" , result .Maximum ))
483
+ return result .Maximum , nil
484
+ }
485
+
382
486
// DisableDataFSR disables FSR for data volume snapshots
383
487
func (e * EC2Session ) DisableDataFSR (snapshotsIDsMap map [string ][]* string ) error {
384
488
if len (snapshotsIDsMap ) == 0 {
@@ -530,7 +634,7 @@ func (e *EC2Session) CreateVolumes(meta *config.EBSBasedBRMeta, volumeType strin
530
634
return newVolumeIDMap , eg .Wait ()
531
635
}
532
636
533
- func (e * EC2Session ) WaitVolumesCreated (volumeIDMap map [string ]string , progress glue.Progress ) (int64 , error ) {
637
+ func (e * EC2Session ) WaitVolumesCreated (volumeIDMap map [string ]string , progress glue.Progress , fsrEnabledRequired bool ) (int64 , error ) {
534
638
pendingVolumes := make ([]* string , 0 , len (volumeIDMap ))
535
639
for oldVolID := range volumeIDMap {
536
640
newVolumeID := volumeIDMap [oldVolID ]
@@ -550,7 +654,11 @@ func (e *EC2Session) WaitVolumesCreated(volumeIDMap map[string]string, progress
550
654
return 0 , errors .Trace (err )
551
655
}
552
656
553
- createdVolumeSize , unfinishedVolumes := e .HandleDescribeVolumesResponse (resp )
657
+ createdVolumeSize , unfinishedVolumes , err := e .HandleDescribeVolumesResponse (resp , fsrEnabledRequired )
658
+ if err != nil {
659
+ return 0 , errors .Trace (err )
660
+ }
661
+
554
662
progress .IncBy (int64 (len (pendingVolumes ) - len (unfinishedVolumes )))
555
663
totalVolumeSize += createdVolumeSize
556
664
pendingVolumes = unfinishedVolumes
@@ -593,12 +701,16 @@ func ec2Tag(key, val string) *ec2.Tag {
593
701
return & ec2.Tag {Key : & key , Value : & val }
594
702
}
595
703
596
- func (e * EC2Session ) HandleDescribeVolumesResponse (resp * ec2.DescribeVolumesOutput ) (int64 , []* string ) {
704
+ func (e * EC2Session ) HandleDescribeVolumesResponse (resp * ec2.DescribeVolumesOutput , fsrEnabledRequired bool ) (int64 , []* string , error ) {
597
705
totalVolumeSize := int64 (0 )
598
706
599
707
var unfinishedVolumes []* string
600
708
for _ , volume := range resp .Volumes {
601
709
if * volume .State == ec2 .VolumeStateAvailable {
710
+ if fsrEnabledRequired && volume .FastRestored != nil && ! * volume .FastRestored {
711
+ log .Error ("snapshot fsr is not enabled for the volume" , zap .String ("volume" , * volume .SnapshotId ))
712
+ return 0 , nil , errors .Errorf ("Snapshot [%s] of volume [%s] is not fsr enabled" , * volume .SnapshotId , * volume .VolumeId )
713
+ }
602
714
log .Info ("volume is available" , zap .String ("id" , * volume .VolumeId ))
603
715
totalVolumeSize += * volume .Size
604
716
} else {
@@ -607,5 +719,5 @@ func (e *EC2Session) HandleDescribeVolumesResponse(resp *ec2.DescribeVolumesOutp
607
719
}
608
720
}
609
721
610
- return totalVolumeSize , unfinishedVolumes
722
+ return totalVolumeSize , unfinishedVolumes , nil
611
723
}
0 commit comments