6
6
"bytes"
7
7
"context"
8
8
"fmt"
9
- "strings"
10
9
"sync"
11
10
"testing"
12
11
"time"
@@ -519,6 +518,85 @@ func TestEnableCheckPointLimit(t *testing.T) {
519
518
}
520
519
}
521
520
521
+ func TestOwnerChangeCheckPointLagged (t * testing.T ) {
522
+ c := createFakeCluster (t , 4 , false )
523
+ defer func () {
524
+ fmt .Println (c )
525
+ }()
526
+ c .splitAndScatter ("01" , "02" , "022" , "023" , "033" , "04" , "043" )
527
+ ctx , cancel := context .WithCancel (context .Background ())
528
+ defer cancel ()
529
+
530
+ env := newTestEnv (c , t )
531
+ rngs := env .ranges
532
+ if len (rngs ) == 0 {
533
+ rngs = []kv.KeyRange {{}}
534
+ }
535
+ env .task = streamhelper.TaskEvent {
536
+ Type : streamhelper .EventAdd ,
537
+ Name : "whole" ,
538
+ Info : & backup.StreamBackupTaskInfo {
539
+ Name : "whole" ,
540
+ StartTs : oracle .GoTimeToTS (oracle .GetTimeFromTS (0 ).Add (1 * time .Minute )),
541
+ },
542
+ Ranges : rngs ,
543
+ }
544
+
545
+ adv := streamhelper .NewCheckpointAdvancer (env )
546
+ adv .UpdateConfigWith (func (c * config.Config ) {
547
+ c .CheckPointLagLimit = 1 * time .Minute
548
+ })
549
+ ctx1 , cancel1 := context .WithCancel (context .Background ())
550
+ adv .OnStart (ctx1 )
551
+ adv .OnBecomeOwner (ctx1 )
552
+ log .Info ("advancer1 become owner" )
553
+ require .NoError (t , adv .OnTick (ctx1 ))
554
+
555
+ // another advancer but never advance checkpoint before
556
+ adv2 := streamhelper .NewCheckpointAdvancer (env )
557
+ adv2 .UpdateConfigWith (func (c * config.Config ) {
558
+ c .CheckPointLagLimit = 1 * time .Minute
559
+ })
560
+ ctx2 , cancel2 := context .WithCancel (context .Background ())
561
+ adv2 .OnStart (ctx2 )
562
+
563
+ for i := 0 ; i < 5 ; i ++ {
564
+ c .advanceClusterTimeBy (2 * time .Minute )
565
+ c .advanceCheckpointBy (2 * time .Minute )
566
+ require .NoError (t , adv .OnTick (ctx1 ))
567
+ }
568
+ c .advanceClusterTimeBy (2 * time .Minute )
569
+ require .ErrorContains (t , adv .OnTick (ctx1 ), "lagged too large" )
570
+
571
+ // resume task to make next tick normally
572
+ c .advanceCheckpointBy (2 * time .Minute )
573
+ env .ResumeTask (ctx )
574
+
575
+ // stop advancer1, and advancer2 should take over
576
+ cancel1 ()
577
+ log .Info ("advancer1 owner canceled, and advancer2 become owner" )
578
+ adv2 .OnBecomeOwner (ctx2 )
579
+ require .NoError (t , adv2 .OnTick (ctx2 ))
580
+
581
+ // advancer2 should take over and tick normally
582
+ for i := 0 ; i < 10 ; i ++ {
583
+ c .advanceClusterTimeBy (2 * time .Minute )
584
+ c .advanceCheckpointBy (2 * time .Minute )
585
+ require .NoError (t , adv2 .OnTick (ctx2 ))
586
+ }
587
+ c .advanceClusterTimeBy (2 * time .Minute )
588
+ require .ErrorContains (t , adv2 .OnTick (ctx2 ), "lagged too large" )
589
+ // stop advancer2, and advancer1 should take over
590
+ c .advanceCheckpointBy (2 * time .Minute )
591
+ env .ResumeTask (ctx )
592
+ cancel2 ()
593
+ log .Info ("advancer2 owner canceled, and advancer1 become owner" )
594
+
595
+ adv .OnBecomeOwner (ctx )
596
+ // advancer1 should take over and tick normally when come back
597
+ require .NoError (t , adv .OnTick (ctx ))
598
+ }
599
+
522
600
func TestCheckPointLagged (t * testing.T ) {
523
601
c := createFakeCluster (t , 4 , false )
524
602
defer func () {
@@ -548,8 +626,10 @@ func TestCheckPointLagged(t *testing.T) {
548
626
})
549
627
adv .StartTaskListener (ctx )
550
628
c .advanceClusterTimeBy (2 * time .Minute )
629
+ // if global ts is not advanced, the checkpoint will not be lagged
630
+ c .advanceCheckpointBy (2 * time .Minute )
551
631
require .NoError (t , adv .OnTick (ctx ))
552
- c .advanceClusterTimeBy (1 * time .Minute )
632
+ c .advanceClusterTimeBy (3 * time .Minute )
553
633
require .ErrorContains (t , adv .OnTick (ctx ), "lagged too large" )
554
634
// after some times, the isPaused will be set and ticks are skipped
555
635
require .Eventually (t , func () bool {
@@ -573,8 +653,10 @@ func TestCheckPointResume(t *testing.T) {
573
653
})
574
654
adv .StartTaskListener (ctx )
575
655
c .advanceClusterTimeBy (1 * time .Minute )
656
+ // if global ts is not advanced, the checkpoint will not be lagged
657
+ c .advanceCheckpointBy (1 * time .Minute )
576
658
require .NoError (t , adv .OnTick (ctx ))
577
- c .advanceClusterTimeBy (1 * time .Minute )
659
+ c .advanceClusterTimeBy (2 * time .Minute )
578
660
require .ErrorContains (t , adv .OnTick (ctx ), "lagged too large" )
579
661
require .Eventually (t , func () bool {
580
662
return assert .NoError (t , adv .OnTick (ctx ))
@@ -604,18 +686,48 @@ func TestUnregisterAfterPause(t *testing.T) {
604
686
c .CheckPointLagLimit = 1 * time .Minute
605
687
})
606
688
adv .StartTaskListener (ctx )
689
+
690
+ // wait for the task to be added
691
+ require .Eventually (t , func () bool {
692
+ return adv .HasTask ()
693
+ }, 5 * time .Second , 100 * time .Millisecond )
694
+
695
+ // task should be paused when global checkpoint is lagged
696
+ // even the global checkpoint is equal to task start ts(not advanced all the time)
607
697
c .advanceClusterTimeBy (1 * time .Minute )
608
698
require .NoError (t , adv .OnTick (ctx ))
609
699
env .PauseTask (ctx , "whole" )
610
- time .Sleep (1 * time .Second )
611
700
c .advanceClusterTimeBy (1 * time .Minute )
701
+ require .Error (t , adv .OnTick (ctx ), "checkpoint is lagged" )
702
+ env .unregisterTask ()
703
+ env .putTask ()
704
+
705
+ // wait for the task to be added
706
+ require .Eventually (t , func () bool {
707
+ return adv .HasTask ()
708
+ }, 5 * time .Second , 100 * time .Millisecond )
709
+
710
+ require .Error (t , adv .OnTick (ctx ), "checkpoint is lagged" )
711
+
712
+ env .unregisterTask ()
713
+ // wait for the task to be deleted
714
+ require .Eventually (t , func () bool {
715
+ return ! adv .HasTask ()
716
+ }, 5 * time .Second , 100 * time .Millisecond )
717
+
718
+ // reset
719
+ c .advanceClusterTimeBy (- 1 * time .Minute )
612
720
require .NoError (t , adv .OnTick (ctx ))
721
+ env .PauseTask (ctx , "whole" )
722
+ c .advanceClusterTimeBy (1 * time .Minute )
613
723
env .unregisterTask ()
614
724
env .putTask ()
725
+ // wait for the task to be added
615
726
require .Eventually (t , func () bool {
616
- err := adv .OnTick (ctx )
617
- return err != nil && strings .Contains (err .Error (), "check point lagged too large" )
618
- }, 5 * time .Second , 300 * time .Millisecond )
727
+ return adv .HasTask ()
728
+ }, 5 * time .Second , 100 * time .Millisecond )
729
+
730
+ require .Error (t , adv .OnTick (ctx ), "checkpoint is lagged" )
619
731
}
620
732
621
733
// If the start ts is *NOT* lagged, even both the cluster and pd are lagged, the task should run normally.
@@ -727,13 +839,18 @@ func TestAddTaskWithLongRunTask2(t *testing.T) {
727
839
adv .UpdateConfigWith (func (c * config.Config ) {
728
840
c .CheckPointLagLimit = 1 * time .Minute
729
841
})
842
+ adv .StartTaskListener (ctx )
730
843
c .advanceClusterTimeBy (3 * time .Minute )
731
844
c .advanceCheckpointBy (1 * time .Minute )
732
845
env .advanceCheckpointBy (2 * time .Minute )
733
846
env .mockPDConnectionError ()
734
- adv .StartTaskListener (ctx )
735
- // Try update checkpoint
736
- require .NoError (t , adv .OnTick (ctx ))
847
+ // if cannot connect to pd, the checkpoint will be rolled back
848
+ // because at this point the global ts is 2 minutes
849
+ // and the local checkpoint ts is 1 minute
850
+ require .Error (t , adv .OnTick (ctx ), "checkpoint rollback" )
851
+
852
+ // only when local checkpoint > global ts, the next tick will be normal
853
+ c .advanceCheckpointBy (12 * time .Minute )
737
854
// Verify no err raised
738
855
require .NoError (t , adv .OnTick (ctx ))
739
856
}
@@ -767,11 +884,17 @@ func TestAddTaskWithLongRunTask3(t *testing.T) {
767
884
adv .UpdateConfigWith (func (c * config.Config ) {
768
885
c .CheckPointLagLimit = 1 * time .Minute
769
886
})
770
- c .advanceClusterTimeBy (3 * time .Minute )
887
+ // advance cluster time to 4 minutes, and checkpoint to 1 minute
888
+ // if start ts equals to checkpoint, the task will not be paused
889
+ adv .StartTaskListener (ctx )
890
+ c .advanceClusterTimeBy (2 * time .Minute )
891
+ c .advanceCheckpointBy (1 * time .Minute )
892
+ env .advanceCheckpointBy (1 * time .Minute )
893
+ require .NoError (t , adv .OnTick (ctx ))
894
+
895
+ c .advanceClusterTimeBy (2 * time .Minute )
771
896
c .advanceCheckpointBy (1 * time .Minute )
772
897
env .advanceCheckpointBy (1 * time .Minute )
773
- env .mockPDConnectionError ()
774
- adv .StartTaskListener (ctx )
775
898
// Try update checkpoint
776
899
require .ErrorContains (t , adv .OnTick (ctx ), "lagged too large" )
777
900
// Verify no err raised after paused
0 commit comments