6
6
"bytes"
7
7
"context"
8
8
"fmt"
9
- "strings"
10
9
"sync"
11
10
"testing"
12
11
"time"
@@ -518,6 +517,85 @@ func TestEnableCheckPointLimit(t *testing.T) {
518
517
}
519
518
}
520
519
520
+ func TestOwnerChangeCheckPointLagged (t * testing.T ) {
521
+ c := createFakeCluster (t , 4 , false )
522
+ defer func () {
523
+ fmt .Println (c )
524
+ }()
525
+ c .splitAndScatter ("01" , "02" , "022" , "023" , "033" , "04" , "043" )
526
+ ctx , cancel := context .WithCancel (context .Background ())
527
+ defer cancel ()
528
+
529
+ env := newTestEnv (c , t )
530
+ rngs := env .ranges
531
+ if len (rngs ) == 0 {
532
+ rngs = []kv.KeyRange {{}}
533
+ }
534
+ env .task = streamhelper.TaskEvent {
535
+ Type : streamhelper .EventAdd ,
536
+ Name : "whole" ,
537
+ Info : & backup.StreamBackupTaskInfo {
538
+ Name : "whole" ,
539
+ StartTs : oracle .GoTimeToTS (oracle .GetTimeFromTS (0 ).Add (1 * time .Minute )),
540
+ },
541
+ Ranges : rngs ,
542
+ }
543
+
544
+ adv := streamhelper .NewCheckpointAdvancer (env )
545
+ adv .UpdateConfigWith (func (c * config.Config ) {
546
+ c .CheckPointLagLimit = 1 * time .Minute
547
+ })
548
+ ctx1 , cancel1 := context .WithCancel (context .Background ())
549
+ adv .OnStart (ctx1 )
550
+ adv .OnBecomeOwner (ctx1 )
551
+ log .Info ("advancer1 become owner" )
552
+ require .NoError (t , adv .OnTick (ctx1 ))
553
+
554
+ // another advancer but never advance checkpoint before
555
+ adv2 := streamhelper .NewCheckpointAdvancer (env )
556
+ adv2 .UpdateConfigWith (func (c * config.Config ) {
557
+ c .CheckPointLagLimit = 1 * time .Minute
558
+ })
559
+ ctx2 , cancel2 := context .WithCancel (context .Background ())
560
+ adv2 .OnStart (ctx2 )
561
+
562
+ for i := 0 ; i < 5 ; i ++ {
563
+ c .advanceClusterTimeBy (2 * time .Minute )
564
+ c .advanceCheckpointBy (2 * time .Minute )
565
+ require .NoError (t , adv .OnTick (ctx1 ))
566
+ }
567
+ c .advanceClusterTimeBy (2 * time .Minute )
568
+ require .ErrorContains (t , adv .OnTick (ctx1 ), "lagged too large" )
569
+
570
+ // resume task to make next tick normally
571
+ c .advanceCheckpointBy (2 * time .Minute )
572
+ env .ResumeTask (ctx )
573
+
574
+ // stop advancer1, and advancer2 should take over
575
+ cancel1 ()
576
+ log .Info ("advancer1 owner canceled, and advancer2 become owner" )
577
+ adv2 .OnBecomeOwner (ctx2 )
578
+ require .NoError (t , adv2 .OnTick (ctx2 ))
579
+
580
+ // advancer2 should take over and tick normally
581
+ for i := 0 ; i < 10 ; i ++ {
582
+ c .advanceClusterTimeBy (2 * time .Minute )
583
+ c .advanceCheckpointBy (2 * time .Minute )
584
+ require .NoError (t , adv2 .OnTick (ctx2 ))
585
+ }
586
+ c .advanceClusterTimeBy (2 * time .Minute )
587
+ require .ErrorContains (t , adv2 .OnTick (ctx2 ), "lagged too large" )
588
+ // stop advancer2, and advancer1 should take over
589
+ c .advanceCheckpointBy (2 * time .Minute )
590
+ env .ResumeTask (ctx )
591
+ cancel2 ()
592
+ log .Info ("advancer2 owner canceled, and advancer1 become owner" )
593
+
594
+ adv .OnBecomeOwner (ctx )
595
+ // advancer1 should take over and tick normally when come back
596
+ require .NoError (t , adv .OnTick (ctx ))
597
+ }
598
+
521
599
func TestCheckPointLagged (t * testing.T ) {
522
600
c := createFakeCluster (t , 4 , false )
523
601
defer func () {
@@ -548,8 +626,10 @@ func TestCheckPointLagged(t *testing.T) {
548
626
})
549
627
adv .StartTaskListener (ctx )
550
628
c .advanceClusterTimeBy (2 * time .Minute )
629
+ // if global ts is not advanced, the checkpoint will not be lagged
630
+ c .advanceCheckpointBy (2 * time .Minute )
551
631
require .NoError (t , adv .OnTick (ctx ))
552
- c .advanceClusterTimeBy (1 * time .Minute )
632
+ c .advanceClusterTimeBy (3 * time .Minute )
553
633
require .ErrorContains (t , adv .OnTick (ctx ), "lagged too large" )
554
634
// after some time, isPaused will be set and ticks are skipped
555
635
require .Eventually (t , func () bool {
@@ -573,8 +653,10 @@ func TestCheckPointResume(t *testing.T) {
573
653
})
574
654
adv .StartTaskListener (ctx )
575
655
c .advanceClusterTimeBy (1 * time .Minute )
656
+ // if global ts is not advanced, the checkpoint will not be lagged
657
+ c .advanceCheckpointBy (1 * time .Minute )
576
658
require .NoError (t , adv .OnTick (ctx ))
577
- c .advanceClusterTimeBy (1 * time .Minute )
659
+ c .advanceClusterTimeBy (2 * time .Minute )
578
660
require .ErrorContains (t , adv .OnTick (ctx ), "lagged too large" )
579
661
require .Eventually (t , func () bool {
580
662
return assert .NoError (t , adv .OnTick (ctx ))
@@ -604,18 +686,48 @@ func TestUnregisterAfterPause(t *testing.T) {
604
686
c .CheckPointLagLimit = 1 * time .Minute
605
687
})
606
688
adv .StartTaskListener (ctx )
689
+
690
+ // wait for the task to be added
691
+ require .Eventually (t , func () bool {
692
+ return adv .HasTask ()
693
+ }, 5 * time .Second , 100 * time .Millisecond )
694
+
695
+ // the task should be paused when the global checkpoint is lagged,
696
+ // even if the global checkpoint still equals the task start ts (i.e. it has never advanced)
607
697
c .advanceClusterTimeBy (1 * time .Minute )
608
698
require .NoError (t , adv .OnTick (ctx ))
609
699
env .PauseTask (ctx , "whole" )
610
- time .Sleep (1 * time .Second )
611
700
c .advanceClusterTimeBy (1 * time .Minute )
701
+ require .Error (t , adv .OnTick (ctx ), "checkpoint is lagged" )
702
+ env .unregisterTask ()
703
+ env .putTask ()
704
+
705
+ // wait for the task to be added
706
+ require .Eventually (t , func () bool {
707
+ return adv .HasTask ()
708
+ }, 5 * time .Second , 100 * time .Millisecond )
709
+
710
+ require .Error (t , adv .OnTick (ctx ), "checkpoint is lagged" )
711
+
712
+ env .unregisterTask ()
713
+ // wait for the task to be deleted
714
+ require .Eventually (t , func () bool {
715
+ return ! adv .HasTask ()
716
+ }, 5 * time .Second , 100 * time .Millisecond )
717
+
718
+ // reset
719
+ c .advanceClusterTimeBy (- 1 * time .Minute )
612
720
require .NoError (t , adv .OnTick (ctx ))
721
+ env .PauseTask (ctx , "whole" )
722
+ c .advanceClusterTimeBy (1 * time .Minute )
613
723
env .unregisterTask ()
614
724
env .putTask ()
725
+ // wait for the task to be added
615
726
require .Eventually (t , func () bool {
616
- err := adv .OnTick (ctx )
617
- return err != nil && strings .Contains (err .Error (), "check point lagged too large" )
618
- }, 5 * time .Second , 300 * time .Millisecond )
727
+ return adv .HasTask ()
728
+ }, 5 * time .Second , 100 * time .Millisecond )
729
+
730
+ require .Error (t , adv .OnTick (ctx ), "checkpoint is lagged" )
619
731
}
620
732
621
733
// If the start ts is *NOT* lagged, even both the cluster and pd are lagged, the task should run normally.
@@ -727,13 +839,18 @@ func TestAddTaskWithLongRunTask2(t *testing.T) {
727
839
adv .UpdateConfigWith (func (c * config.Config ) {
728
840
c .CheckPointLagLimit = 1 * time .Minute
729
841
})
842
+ adv .StartTaskListener (ctx )
730
843
c .advanceClusterTimeBy (3 * time .Minute )
731
844
c .advanceCheckpointBy (1 * time .Minute )
732
845
env .advanceCheckpointBy (2 * time .Minute )
733
846
env .mockPDConnectionError ()
734
- adv .StartTaskListener (ctx )
735
- // Try update checkpoint
736
- require .NoError (t , adv .OnTick (ctx ))
847
+ // if we cannot connect to PD, the checkpoint will be rolled back,
848
+ // because at this point the global ts is 2 minutes
849
+ // and the local checkpoint ts is 1 minute
850
+ require .Error (t , adv .OnTick (ctx ), "checkpoint rollback" )
851
+
852
+ // only when local checkpoint > global ts, the next tick will be normal
853
+ c .advanceCheckpointBy (12 * time .Minute )
737
854
// Verify no err raised
738
855
require .NoError (t , adv .OnTick (ctx ))
739
856
}
@@ -767,11 +884,17 @@ func TestAddTaskWithLongRunTask3(t *testing.T) {
767
884
adv .UpdateConfigWith (func (c * config.Config ) {
768
885
c .CheckPointLagLimit = 1 * time .Minute
769
886
})
770
- c .advanceClusterTimeBy (3 * time .Minute )
887
+ // advance cluster time to 4 minutes, and checkpoint to 1 minute
888
+ // if the start ts equals the checkpoint, the task will not be paused
889
+ adv .StartTaskListener (ctx )
890
+ c .advanceClusterTimeBy (2 * time .Minute )
891
+ c .advanceCheckpointBy (1 * time .Minute )
892
+ env .advanceCheckpointBy (1 * time .Minute )
893
+ require .NoError (t , adv .OnTick (ctx ))
894
+
895
+ c .advanceClusterTimeBy (2 * time .Minute )
771
896
c .advanceCheckpointBy (1 * time .Minute )
772
897
env .advanceCheckpointBy (1 * time .Minute )
773
- env .mockPDConnectionError ()
774
- adv .StartTaskListener (ctx )
775
898
// Try update checkpoint
776
899
require .ErrorContains (t , adv .OnTick (ctx ), "lagged too large" )
777
900
// Verify no err raised after paused
0 commit comments