@@ -96,8 +96,13 @@ type jobContext struct {
96
96
logger * zap.Logger
97
97
98
98
// per job step fields, they will be changed on each call of transitOneJobStep.
99
+ // stepCtx is initialized and destroyed for each job step except reorg job,
100
+ // which returns timeout error periodically.
101
+ stepCtx context.Context
102
+ stepCtxCancel context.CancelCauseFunc
103
+ reorgTimeoutOccurred bool
104
+ inInnerRunOneJobStep bool // Only used for multi-schema change DDL job.
99
105
100
- stepCtx context.Context
101
106
metaMut * meta.Mutator
102
107
// decoded JobArgs, we store it here to avoid decoding it multiple times and
103
108
// pass some runtime info specific to some job type.
@@ -107,6 +112,32 @@ type jobContext struct {
107
112
oldDDLCtx * ddlCtx
108
113
}
109
114
115
+ func (c * jobContext ) shouldPollDDLJob () bool {
116
+ // If we are in multi-schema change DDL and this is not the outermost
117
+ // runOneJobStep, we should not start a goroutine to poll the ddl job.
118
+ return ! c .inInnerRunOneJobStep
119
+ }
120
+
121
+ func (c * jobContext ) initStepCtx () {
122
+ if c .stepCtx == nil {
123
+ stepCtx , cancel := context .WithCancelCause (c .ctx )
124
+ c .stepCtx = stepCtx
125
+ c .stepCtxCancel = cancel
126
+ }
127
+ }
128
+
129
+ func (c * jobContext ) cleanStepCtx () {
130
+ // reorgTimeoutOccurred indicates whether the current reorg process
131
+ // was temporarily exit due to a timeout condition. When set to true,
132
+ // it prevents premature cleanup of step context.
133
+ if c .reorgTimeoutOccurred {
134
+ c .reorgTimeoutOccurred = false // reset flag
135
+ return
136
+ }
137
+ c .stepCtxCancel (context .Canceled )
138
+ c .stepCtx = nil // unset stepCtx for the next step initialization
139
+ }
140
+
110
141
func (c * jobContext ) getAutoIDRequirement () autoid.Requirement {
111
142
return & asAutoIDRequirement {
112
143
store : c .store ,
@@ -581,7 +612,11 @@ func (w *worker) transitOneJobStep(
581
612
582
613
// If running job meets error, we will save this error in job Error and retry
583
614
// later if the job is not cancelled.
615
+ << << << < HEAD
584
616
schemaVer , updateRawArgs , runJobErr := w .runOneJobStep (jobCtx , job , sysTblMgr )
617
+ == == == =
618
+ schemaVer , updateRawArgs , runJobErr := w .runOneJobStep (jobCtx , job )
619
+ >> >> >> > e8fb24a20d4 (ddl : update row count periodically when running reorg job (#60828 ))
585
620
586
621
failpoint .InjectCall ("onJobRunAfter" , job )
587
622
@@ -784,7 +819,6 @@ func (*worker) processJobPausingRequest(jobCtx *jobContext, job *model.Job) (isR
784
819
func (w * worker ) runOneJobStep (
785
820
jobCtx * jobContext ,
786
821
job * model.Job ,
787
- sysTblMgr systable.Manager ,
788
822
) (ver int64 , updateRawArgs bool , err error ) {
789
823
defer tidbutil .Recover (metrics .LabelDDLWorker , fmt .Sprintf ("%s runOneJobStep" , w ),
790
824
func () {
@@ -794,8 +828,14 @@ func (w *worker) runOneJobStep(
794
828
// Mock for run ddl job panic.
795
829
failpoint .Inject ("mockPanicInRunDDLJob" , func (failpoint.Value ) {})
796
830
831
+ failpoint .InjectCall ("onRunOneJobStep" )
797
832
if job .Type != model.ActionMultiSchemaChange {
833
+ << << << < HEAD
798
834
jobCtx .logger .Info ("run DDL job" , zap .String ("job" , job .String ()))
835
+ == == == =
836
+ jobCtx .logger .Info ("run one job step" , zap .String ("job" , job .String ()))
837
+ failpoint .InjectCall ("onRunOneJobStep" )
838
+ >> >> >> > e8fb24a20d4 (ddl : update row count periodically when running reorg job (#60828 ))
799
839
}
800
840
timeStart := time .Now ()
801
841
if job .RealStartTS == 0 {
@@ -819,27 +859,20 @@ func (w *worker) runOneJobStep(
819
859
return ver , false , err
820
860
}
821
861
822
- // when sysTblMgr is nil, clean up the job step context just for clearness.
823
- // Otherwise, we are in multi-schema change DDL and this is not the outermost
824
- // runOneJobStep, we should keep the job step context.
825
- if sysTblMgr != nil {
826
- jobCtx .stepCtx = nil
827
- }
828
-
829
862
// It would be better to do the positive check, but no idea to list all valid states here now.
830
863
if job .IsRollingback () {
831
864
// when rolling back, we use worker context to process.
832
865
jobCtx .stepCtx = w .workCtx
833
866
} else {
834
867
job .State = model .JobStateRunning
835
868
836
- if sysTblMgr != nil {
869
+ if jobCtx .shouldPollDDLJob () {
870
+ failpoint .InjectCall ("beforePollDDLJob" )
837
871
stopCheckingJobCancelled := make (chan struct {})
838
872
defer close (stopCheckingJobCancelled )
839
873
840
- var cancelStep context.CancelCauseFunc
841
- jobCtx .stepCtx , cancelStep = context .WithCancelCause (jobCtx .ctx )
842
- defer cancelStep (context .Canceled )
874
+ jobCtx .initStepCtx ()
875
+ defer jobCtx .cleanStepCtx ()
843
876
w .wg .Run (func () {
844
877
ticker := time .NewTicker (2 * time .Second )
845
878
defer ticker .Stop ()
@@ -849,8 +882,14 @@ func (w *worker) runOneJobStep(
849
882
case <- stopCheckingJobCancelled :
850
883
return
851
884
case <- ticker .C :
885
+ << << << < HEAD
852
886
latestJob , err := sysTblMgr .GetJobByID (w .workCtx , job .ID )
853
887
if err == systable.ErrNotFound {
888
+ == == == =
889
+ failpoint .InjectCall ("checkJobCancelled" , job )
890
+ latestJob , err := jobCtx .sysTblMgr .GetJobByID (w .workCtx , job .ID )
891
+ if goerrors .Is (err , systable .ErrNotFound ) {
892
+ >> >> >> > e8fb24a20d4 (ddl : update row count periodically when running reorg job (#60828 ))
854
893
logutil .DDLLogger ().Info (
855
894
"job not found, might already finished" ,
856
895
zap .Int64 ("job_id" , job .ID ))
@@ -867,13 +906,13 @@ func (w *worker) runOneJobStep(
867
906
logutil .DDLLogger ().Info ("job is cancelled" ,
868
907
zap .Int64 ("job_id" , job .ID ),
869
908
zap .Stringer ("state" , latestJob .State ))
870
- cancelStep (dbterror .ErrCancelledDDLJob )
909
+ jobCtx . stepCtxCancel (dbterror .ErrCancelledDDLJob )
871
910
return
872
911
case model .JobStatePausing , model .JobStatePaused :
873
912
logutil .DDLLogger ().Info ("job is paused" ,
874
913
zap .Int64 ("job_id" , job .ID ),
875
914
zap .Stringer ("state" , latestJob .State ))
876
- cancelStep (dbterror .ErrPausedDDLJob .FastGenByArgs (job .ID ))
915
+ jobCtx . stepCtxCancel (dbterror .ErrPausedDDLJob .FastGenByArgs (job .ID ))
877
916
return
878
917
case model .JobStateDone , model .JobStateSynced :
879
918
return
@@ -1089,6 +1128,11 @@ func updateGlobalVersionAndWaitSynced(
1089
1128
var err error
1090
1129
1091
1130
if latestSchemaVersion == 0 {
1131
+ // If the DDL step is still in progress (e.g., during reorg timeout),
1132
+ // skip logging to avoid generating redundant entries.
1133
+ if jobCtx .stepCtx != nil {
1134
+ return nil
1135
+ }
1092
1136
logutil .DDLLogger ().Info ("schema version doesn't change" , zap .Int64 ("jobID" , job .ID ))
1093
1137
return nil
1094
1138
}
0 commit comments