@@ -60,7 +60,7 @@ func setTTLTaskFinishedSQL(jobID string, scanID int64, state *cache.TTLTaskState
     return setTTLTaskFinishedTemplate, []any{now.Format(timeFormat), string(stateStr), jobID, scanID}, nil
 }
 
-const updateTTLTaskHeartBeatTempalte = `UPDATE mysql.tidb_ttl_task
+const updateTTLTaskHeartBeatTemplate = `UPDATE mysql.tidb_ttl_task
     SET state = %?,
         owner_hb_time = %?
     WHERE job_id = %? AND scan_id = %?`
@@ -70,7 +70,18 @@ func updateTTLTaskHeartBeatSQL(jobID string, scanID int64, now time.Time, state
     if err != nil {
         return "", nil, err
     }
-    return updateTTLTaskHeartBeatTempalte, []any{string(stateStr), now.Format(timeFormat), jobID, scanID}, nil
+    return updateTTLTaskHeartBeatTemplate, []any{string(stateStr), now.Format(timeFormat), jobID, scanID}, nil
+}
+
+const updateTTLTaskStateTemplate = `UPDATE mysql.tidb_ttl_task
+    SET state = %? WHERE job_id = %? AND scan_id = %?`
+
+func updateTTLTaskStateSQL(jobID string, scanID int64, state *cache.TTLTaskState) (string, []any, error) {
+    stateStr, err := json.Marshal(state)
+    if err != nil {
+        return "", nil, err
+    }
+    return updateTTLTaskStateTemplate, []any{string(stateStr), jobID, scanID}, nil
 }
 
 const countRunningTasks = "SELECT count(1) FROM mysql.tidb_ttl_task WHERE status = 'running'"
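Note: the new statement writes only the `state` column and leaves `owner_hb_time` untouched, which lines up with the comment later in this patch that another instance should take the task over once its heartbeat times out. A minimal standalone sketch of how the three `%?` placeholders bind; the trimmed struct and its JSON tags are illustrative stand-ins for `cache.TTLTaskState`, not the real definition:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// ttlTaskState is a trimmed stand-in for cache.TTLTaskState; only the fields
// touched by this patch are mirrored, and the JSON tags are illustrative.
type ttlTaskState struct {
	TotalRows   uint64 `json:"total_rows"`
	SuccessRows uint64 `json:"success_rows"`
	ErrorRows   uint64 `json:"error_rows"`
	ScanTaskErr string `json:"scan_task_err,omitempty"`
}

const updateTTLTaskStateTemplate = `UPDATE mysql.tidb_ttl_task
    SET state = %? WHERE job_id = %? AND scan_id = %?`

// updateTTLTaskStateSQL mirrors the new helper above: the args line up with
// the placeholders as (state JSON, job_id, scan_id).
func updateTTLTaskStateSQL(jobID string, scanID int64, state *ttlTaskState) (string, []any, error) {
	stateStr, err := json.Marshal(state)
	if err != nil {
		return "", nil, err
	}
	return updateTTLTaskStateTemplate, []any{string(stateStr), jobID, scanID}, nil
}

func main() {
	sql, args, err := updateTTLTaskStateSQL("job-1", 42, &ttlTaskState{
		TotalRows:   100,
		SuccessRows: 90,
		ErrorRows:   10,
		ScanTaskErr: "timeout to cancel scan task",
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(sql)
	fmt.Println(args)
}
```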
@@ -160,6 +171,8 @@ func (m *taskManager) resizeScanWorkers(count int) error {
             jobID = curTask.JobID
             scanID = curTask.ScanID
             scanErr = errors.New("timeout to cancel scan task")
+
+            result = curTask.result(scanErr)
         }
 
         task := findTaskWithID(m.runningTasks, jobID, scanID)
@@ -169,6 +182,7 @@ func (m *taskManager) resizeScanWorkers(count int) error {
         }
         logutil.Logger(m.ctx).Debug("scan task finished", zap.String("jobID", task.JobID), zap.Int64("taskID", task.ScanID), zap.Error(scanErr))
 
+        result.interruptByWorkerStop = true
         task.result = result
     }
     return err
@@ -439,40 +453,83 @@ func (m *taskManager) syncTaskFromTable(se session.Session, jobID string, scanID
 // updateHeartBeat updates the heartbeat for all tasks with current instance as owner
 func (m *taskManager) updateHeartBeat(ctx context.Context, se session.Session, now time.Time) error {
     for _, task := range m.runningTasks {
-        state := &cache.TTLTaskState{
-            TotalRows:   task.statistics.TotalRows.Load(),
-            SuccessRows: task.statistics.SuccessRows.Load(),
-            ErrorRows:   task.statistics.ErrorRows.Load(),
-        }
-        if task.result != nil && task.result.err != nil {
-            state.ScanTaskErr = task.result.err.Error()
+        if err := m.updateTaskState(ctx, se, task, true, now); err != nil {
+            task.taskLogger(logutil.Logger(m.ctx)).Warn("fail to heartbeat task", zap.Error(err))
         }
+    }
+    return nil
+}
 
-        intest.Assert(se.GetSessionVars().Location().String() == now.Location().String())
-        sql, args, err := updateTTLTaskHeartBeatSQL(task.JobID, task.ScanID, now, state)
-        if err != nil {
-            return err
-        }
-        _, err = se.ExecuteSQL(ctx, sql, args...)
-        if err != nil {
-            return errors.Wrapf(err, "execute sql: %s", sql)
+func (m *taskManager) updateTaskState(ctx context.Context, se session.Session, task *runningScanTask, heartbeat bool, now time.Time) error {
+    state := &cache.TTLTaskState{
+        TotalRows:   task.statistics.TotalRows.Load(),
+        SuccessRows: task.statistics.SuccessRows.Load(),
+        ErrorRows:   task.statistics.ErrorRows.Load(),
+    }
+
+    if prevState := task.TTLTask.State; prevState != nil {
+        // If a task was timeout and taken over by the current instance,
+        // adding the previous state to the current state to make the statistics more accurate.
+        state.TotalRows += prevState.SuccessRows + prevState.ErrorRows
+        state.SuccessRows += prevState.SuccessRows
+        state.ErrorRows += prevState.ErrorRows
+    }
+
+    if task.result != nil && task.result.err != nil {
+        state.ScanTaskErr = task.result.err.Error()
+    }
+
+    intest.Assert(se.GetSessionVars().Location().String() == now.Location().String())
+    var sql string
+    var args []any
+    var err error
+    if heartbeat {
+        sql, args, err = updateTTLTaskHeartBeatSQL(task.JobID, task.ScanID, now, state)
+    } else {
+        sql, args, err = updateTTLTaskStateSQL(task.JobID, task.ScanID, state)
+    }
+    if err != nil {
+        return err
+    }
+    _, err = se.ExecuteSQL(ctx, sql, args...)
+    if err != nil {
+        return errors.Wrapf(err, "execute sql: %s", sql)
     }
     return nil
 }
 
 func (m *taskManager) checkFinishedTask(se session.Session, now time.Time) {
     stillRunningTasks := make([]*runningScanTask, 0, len(m.runningTasks))
     for _, task := range m.runningTasks {
-        if !task.finished(logutil.Logger(m.ctx)) {
+        interruptByWorkerStop := task.result != nil && task.result.interruptByWorkerStop
+        // If the task is interrupted or finished, we should remove it from memory.
+        // Otherwise, we should keep it in memory and wait for the next round to check.
+        if !interruptByWorkerStop && !task.finished(logutil.Logger(m.ctx)) {
             stillRunningTasks = append(stillRunningTasks, task)
             continue
         }
         // we should cancel task to release inner context and avoid memory leak
         task.cancel()
+
+        logger := task.taskLogger(logutil.Logger(m.ctx))
+        // When a task cannot continue running because there is no worker to run it,
+        // we should only remove it from memory without reporting its final status.
+        // Then some other instance will take over this task when its heartbeat timeout.
+        if interruptByWorkerStop {
+            logger.Info(
+                "remove a task from memory without mark it as finished because it is from a stopped worker",
+            )
+            if err := m.updateTaskState(m.ctx, se, task, false, now); err != nil {
+                logger.Warn("fail to report a interrupted task", zap.Error(err))
+            }
+            continue
+        }
+
+        // Update the meta of a task to mark it as finished.
         err := m.reportTaskFinished(se, now, task)
         if err != nil {
-            logutil.Logger(m.ctx).Error("fail to report finished task", zap.Error(err))
+            logger.Error(
+                "fail to report finished task", zap.Error(err))
             stillRunningTasks = append(stillRunningTasks, task)
             continue
         }
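For the statistics merge in `updateTaskState` above (the `prevState` branch that folds in rows already processed by a previous, timed-out owner), a small standalone sketch with made-up numbers; the struct is a trimmed stand-in for `cache.TTLTaskState`, used only to show the arithmetic:

```go
package main

import "fmt"

// taskState is a trimmed stand-in for cache.TTLTaskState, holding only the
// counters that updateTaskState merges.
type taskState struct {
	TotalRows, SuccessRows, ErrorRows uint64
}

// mergePrevState applies the same accumulation as updateTaskState: rows already
// processed by the previous owner are counted into the current totals.
func mergePrevState(cur, prev taskState) taskState {
	cur.TotalRows += prev.SuccessRows + prev.ErrorRows
	cur.SuccessRows += prev.SuccessRows
	cur.ErrorRows += prev.ErrorRows
	return cur
}

func main() {
	// Previous owner processed 100 rows (90 succeeded, 10 failed) before timing out.
	prev := taskState{TotalRows: 120, SuccessRows: 90, ErrorRows: 10}
	// Current owner has scanned 50 rows so far: 40 succeeded, 5 failed.
	cur := taskState{TotalRows: 50, SuccessRows: 40, ErrorRows: 5}

	merged := mergePrevState(cur, prev)
	fmt.Printf("%+v\n", merged) // {TotalRows:150 SuccessRows:130 ErrorRows:15}
}
```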