
Commit 695c2c7

ttl: reschedule task to other instances when shrinking worker
1 parent d0de86b commit 695c2c7

File tree

3 files changed: +131 -32 lines changed

pkg/ttl/ttlworker/del.go
pkg/ttl/ttlworker/scan.go
pkg/ttl/ttlworker/task_manager.go

pkg/ttl/ttlworker/del.go

Lines changed: 39 additions & 5 deletions
@@ -22,6 +22,7 @@ import (
 	"sync/atomic"
 	"time"
 
+	"github.com/pingcap/log"
 	"github.com/pingcap/tidb/pkg/sessionctx/variable"
 	"github.com/pingcap/tidb/pkg/ttl/cache"
 	"github.com/pingcap/tidb/pkg/ttl/metrics"
@@ -94,12 +95,12 @@ func (t *ttlDeleteTask) doDelete(ctx context.Context, rawSe session.Session) (re
 	leftRows := t.rows
 	defer func() {
 		if len(leftRows) > 0 {
-			t.statistics.IncErrorRows(len(leftRows))
+			retryRows = append(retryRows, leftRows...)
 		}
 	}()
 
 	se := newTableSession(rawSe, t.tbl, t.expire)
-	for len(leftRows) > 0 {
+	for len(leftRows) > 0 && ctx.Err() == nil {
 		maxBatch := variable.TTLDeleteBatchSize.Load()
 		var delBatch [][]types.Datum
 		if int64(len(leftRows)) < maxBatch {
@@ -133,7 +134,6 @@ func (t *ttlDeleteTask) doDelete(ctx context.Context, rawSe session.Session) (re
 		sqlInterval := time.Since(sqlStart)
 		if err != nil {
 			metrics.DeleteErrorDuration.Observe(sqlInterval.Seconds())
-			needRetry = needRetry && ctx.Err() == nil
 			logutil.BgLogger().Warn(
 				"delete SQL in TTL failed",
 				zap.Error(err),
@@ -214,6 +214,11 @@ func (b *ttlDelRetryBuffer) DoRetry(do func(*ttlDeleteTask) [][]types.Datum) tim
 	return b.retryInterval
 }
 
+// SetRetryInterval sets the retry interval of the buffer.
+func (b *ttlDelRetryBuffer) SetRetryInterval(interval time.Duration) {
+	b.retryInterval = interval
+}
+
 // Drain drains a retry buffer.
 func (b *ttlDelRetryBuffer) Drain() {
 	for ele := b.list.Front(); ele != nil; ele = ele.Next() {
@@ -296,8 +301,37 @@ func (w *ttlDeleteWorker) loop() error {
 	timer := time.NewTimer(w.retryBuffer.retryInterval)
 	defer timer.Stop()
 
-	// drain retry buffer to make sure the statistics are correct
-	defer w.retryBuffer.Drain()
+	defer func() {
+		// Have a final try to delete all rows in retry buffer while the worker stops
+		// to avoid leaving any TTL rows undeleted when shrinking the delete worker.
+		if w.retryBuffer.Len() > 0 {
+			start := time.Now()
+			log.Info(
+				"try to delete TTL rows in del worker buffer immediately because the worker is going to stop",
+				zap.Int("bufferLen", w.retryBuffer.Len()),
+			)
+			retryCtx, cancel := context.WithTimeout(context.Background(), time.Second*10)
+			defer cancel()
+			w.retryBuffer.SetRetryInterval(0)
+			w.retryBuffer.DoRetry(func(task *ttlDeleteTask) [][]types.Datum {
+				return task.doDelete(retryCtx, se)
+			})
+			log.Info(
+				"delete TTL rows in del worker buffer finished",
+				zap.Duration("duration", time.Since(start)),
+			)
+		}
+
+		// drain retry buffer to make sure the statistics are correct
+		if w.retryBuffer.Len() > 0 {
+			log.Warn(
+				"some TTL rows are still in the buffer while the worker is going to stop, mark them as error",
+				zap.Int("bufferLen", w.retryBuffer.Len()),
+			)
+			w.retryBuffer.Drain()
+		}
+	}()
+
 	for w.Status() == workerStatusRunning {
 		tracer.EnterPhase(metrics.PhaseIdle)
 		select {

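Note on the new deferred block in ttlDeleteWorker.loop: before the worker exits it retries everything still sitting in the retry buffer once, with the retry interval forced to zero and a bounded context, and only drains (counting the leftovers as errors) whatever still fails. The stand-alone Go sketch below illustrates that shutdown pattern under simplified assumptions; the retryBuffer type, the deleteRows callback, and the 10-second budget are hypothetical stand-ins for ttlDelRetryBuffer and doDelete, not the real TiDB implementation.

package main

import (
	"container/list"
	"context"
	"fmt"
	"time"
)

// retryBuffer is a simplified stand-in for ttlDelRetryBuffer: it queues
// batches of rows whose DELETE failed and retries them after retryInterval.
type retryBuffer struct {
	list          *list.List
	retryInterval time.Duration
}

type bufferedBatch struct {
	rows     []int // stand-in for [][]types.Datum
	queuedAt time.Time
}

func (b *retryBuffer) Add(rows []int) {
	b.list.PushBack(&bufferedBatch{rows: rows, queuedAt: time.Now()})
}
func (b *retryBuffer) Len() int                         { return b.list.Len() }
func (b *retryBuffer) SetRetryInterval(d time.Duration) { b.retryInterval = d }

// DoRetry retries every batch that has waited at least retryInterval.
// Rows returned by do are re-queued for a later retry.
func (b *retryBuffer) DoRetry(do func(rows []int) (failed []int)) {
	for e := b.list.Front(); e != nil; {
		next := e.Next()
		batch := e.Value.(*bufferedBatch)
		if time.Since(batch.queuedAt) >= b.retryInterval {
			b.list.Remove(e)
			if failed := do(batch.rows); len(failed) > 0 {
				b.Add(failed)
			}
		}
		e = next
	}
}

// finalFlush mirrors the shape of the deferred block added to the worker loop:
// retry everything immediately under a 10s budget, then give up on leftovers.
func finalFlush(b *retryBuffer, deleteRows func(ctx context.Context, rows []int) (failed []int)) {
	if b.Len() == 0 {
		return
	}
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	b.SetRetryInterval(0) // make every queued batch eligible right now
	b.DoRetry(func(rows []int) []int { return deleteRows(ctx, rows) })
	if b.Len() > 0 {
		fmt.Printf("%d batches still pending, counted as errors\n", b.Len())
		b.list.Init() // drain: discard the rest, as Drain does in the real code
	}
}

func main() {
	b := &retryBuffer{list: list.New(), retryInterval: time.Minute}
	b.Add([]int{1, 2, 3})
	finalFlush(b, func(ctx context.Context, rows []int) []int {
		if ctx.Err() != nil {
			return rows // out of time: report the rows back as failed
		}
		return nil // pretend the DELETE succeeded
	})
}
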
pkg/ttl/ttlworker/scan.go

Lines changed: 15 additions & 7 deletions
@@ -85,6 +85,9 @@ type ttlScanTaskExecResult struct {
 	time time.Time
 	task *ttlScanTask
 	err  error
+	// interruptByWorkerStop indicates whether the task has to stop for the worker stops.
+	// when it is true, we should reschedule this task in another worker or TiDB again.
+	interruptByWorkerStop bool
 }
 
 func (t *ttlScanTask) result(err error) *ttlScanTaskExecResult {
@@ -99,6 +102,17 @@ func (t *ttlScanTask) getDatumRows(rows []chunk.Row) [][]types.Datum {
 	return datums
 }
 
+func (t *ttlScanTask) taskLogger(l *zap.Logger) *zap.Logger {
+	return l.With(
+		zap.String("jobID", t.JobID),
+		zap.Int64("scanID", t.ScanID),
+		zap.Int64("tableID", t.TableID),
+		zap.String("db", t.tbl.Schema.O),
+		zap.String("table", t.tbl.Name.O),
+		zap.String("partition", t.tbl.Partition.O),
+	)
+}
+
 func (t *ttlScanTask) doScan(ctx context.Context, delCh chan<- *ttlDeleteTask, sessPool util.SessionPool) *ttlScanTaskExecResult {
 	// TODO: merge the ctx and the taskCtx in ttl scan task, to allow both "cancel" and gracefully stop workers
 	// now, the taskCtx is only check at the beginning of every loop
@@ -121,13 +135,7 @@ func (t *ttlScanTask) doScan(ctx context.Context, delCh chan<- *ttlDeleteTask, s
 		case <-doScanFinished.Done():
 			return
 		}
-		logger := logutil.BgLogger().With(
-			zap.Int64("tableID", t.TableID),
-			zap.String("table", t.tbl.Name.O),
-			zap.String("partition", t.tbl.Partition.O),
-			zap.String("jobID", t.JobID),
-			zap.Int64("scanID", t.ScanID),
-		)
+		logger := t.taskLogger(logutil.BgLogger())
 		logger.Info("kill the running statement in scan task because the task or worker cancelled")
 		rawSess.KillStmt()
 		ticker := time.NewTicker(time.Minute)

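Note on taskLogger: the helper only centralizes the zap fields that identify a scan task, so the scan worker and the task manager log with the same keys. A minimal sketch of the same pattern, using a hypothetical scanTask struct instead of the real ttlScanTask:

package main

import "go.uber.org/zap"

// scanTask is a hypothetical stand-in for ttlScanTask, holding only the
// identifiers that the real taskLogger attaches to every log entry.
type scanTask struct {
	JobID     string
	ScanID    int64
	TableID   int64
	DB        string
	Table     string
	Partition string
}

// taskLogger mirrors the helper added in scan.go: derive a child logger that
// carries the task identity instead of repeating the fields at every call site.
func (t *scanTask) taskLogger(l *zap.Logger) *zap.Logger {
	return l.With(
		zap.String("jobID", t.JobID),
		zap.Int64("scanID", t.ScanID),
		zap.Int64("tableID", t.TableID),
		zap.String("db", t.DB),
		zap.String("table", t.Table),
		zap.String("partition", t.Partition),
	)
}

func main() {
	task := &scanTask{JobID: "job-1", ScanID: 7, TableID: 42, DB: "test", Table: "t", Partition: "p0"}
	logger := task.taskLogger(zap.NewExample())
	// Every log line now carries the same identity fields.
	logger.Info("kill the running statement in scan task because the task or worker cancelled")
	logger.Warn("fail to heartbeat task")
}
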
pkg/ttl/ttlworker/task_manager.go

Lines changed: 77 additions & 20 deletions
@@ -60,7 +60,7 @@ func setTTLTaskFinishedSQL(jobID string, scanID int64, state *cache.TTLTaskState
 	return setTTLTaskFinishedTemplate, []any{now.Format(timeFormat), string(stateStr), jobID, scanID}, nil
 }
 
-const updateTTLTaskHeartBeatTempalte = `UPDATE mysql.tidb_ttl_task
+const updateTTLTaskHeartBeatTemplate = `UPDATE mysql.tidb_ttl_task
 	SET state = %?,
 		owner_hb_time = %?
 	WHERE job_id = %? AND scan_id = %?`
@@ -70,7 +70,18 @@ func updateTTLTaskHeartBeatSQL(jobID string, scanID int64, now time.Time, state
 	if err != nil {
 		return "", nil, err
 	}
-	return updateTTLTaskHeartBeatTempalte, []any{string(stateStr), now.Format(timeFormat), jobID, scanID}, nil
+	return updateTTLTaskHeartBeatTemplate, []any{string(stateStr), now.Format(timeFormat), jobID, scanID}, nil
+}
+
+const updateTTLTaskStateTemplate = `UPDATE mysql.tidb_ttl_task
+	SET state = %? WHERE job_id = %? AND scan_id = %?`
+
+func updateTTLTaskStateSQL(jobID string, scanID int64, state *cache.TTLTaskState) (string, []any, error) {
+	stateStr, err := json.Marshal(state)
+	if err != nil {
+		return "", nil, err
+	}
+	return updateTTLTaskStateTemplate, []any{string(stateStr), jobID, scanID}, nil
 }
 
 const countRunningTasks = "SELECT count(1) FROM mysql.tidb_ttl_task WHERE status = 'running'"
@@ -160,6 +171,8 @@ func (m *taskManager) resizeScanWorkers(count int) error {
 			jobID = curTask.JobID
 			scanID = curTask.ScanID
 			scanErr = errors.New("timeout to cancel scan task")
+
+			result = curTask.result(scanErr)
 		}
 
 		task := findTaskWithID(m.runningTasks, jobID, scanID)
@@ -169,6 +182,7 @@ func (m *taskManager) resizeScanWorkers(count int) error {
 		}
 		logutil.Logger(m.ctx).Debug("scan task finished", zap.String("jobID", task.JobID), zap.Int64("taskID", task.ScanID), zap.Error(scanErr))
 
+		result.interruptByWorkerStop = true
 		task.result = result
 	}
 	return err
@@ -439,40 +453,83 @@ func (m *taskManager) syncTaskFromTable(se session.Session, jobID string, scanID
 // updateHeartBeat updates the heartbeat for all tasks with current instance as owner
 func (m *taskManager) updateHeartBeat(ctx context.Context, se session.Session, now time.Time) error {
 	for _, task := range m.runningTasks {
-		state := &cache.TTLTaskState{
-			TotalRows:   task.statistics.TotalRows.Load(),
-			SuccessRows: task.statistics.SuccessRows.Load(),
-			ErrorRows:   task.statistics.ErrorRows.Load(),
-		}
-		if task.result != nil && task.result.err != nil {
-			state.ScanTaskErr = task.result.err.Error()
+		if err := m.updateTaskState(ctx, se, task, true, now); err != nil {
+			task.taskLogger(logutil.Logger(m.ctx)).Warn("fail to heartbeat task", zap.Error(err))
 		}
+	}
+	return nil
+}
 
-		intest.Assert(se.GetSessionVars().Location().String() == now.Location().String())
-		sql, args, err := updateTTLTaskHeartBeatSQL(task.JobID, task.ScanID, now, state)
-		if err != nil {
-			return err
-		}
-		_, err = se.ExecuteSQL(ctx, sql, args...)
-		if err != nil {
-			return errors.Wrapf(err, "execute sql: %s", sql)
-		}
+func (m *taskManager) updateTaskState(ctx context.Context, se session.Session, task *runningScanTask, heartbeat bool, now time.Time) error {
+	state := &cache.TTLTaskState{
+		TotalRows:   task.statistics.TotalRows.Load(),
+		SuccessRows: task.statistics.SuccessRows.Load(),
+		ErrorRows:   task.statistics.ErrorRows.Load(),
+	}
+
+	if prevState := task.TTLTask.State; prevState != nil {
+		// If a task was timeout and taken over by the current instance,
+		// adding the previous state to the current state to make the statistics more accurate.
+		state.TotalRows += prevState.SuccessRows + prevState.ErrorRows
+		state.SuccessRows += prevState.SuccessRows
+		state.ErrorRows += prevState.ErrorRows
+	}
+
+	if task.result != nil && task.result.err != nil {
+		state.ScanTaskErr = task.result.err.Error()
+	}
+
+	intest.Assert(se.GetSessionVars().Location().String() == now.Location().String())
+	var sql string
+	var args []any
+	var err error
+	if heartbeat {
+		sql, args, err = updateTTLTaskHeartBeatSQL(task.JobID, task.ScanID, now, state)
+	} else {
+		sql, args, err = updateTTLTaskStateSQL(task.JobID, task.ScanID, state)
+	}
+	if err != nil {
+		return err
+	}
+	_, err = se.ExecuteSQL(ctx, sql, args...)
+	if err != nil {
+		return errors.Wrapf(err, "execute sql: %s", sql)
 	}
 	return nil
 }
 
 func (m *taskManager) checkFinishedTask(se session.Session, now time.Time) {
 	stillRunningTasks := make([]*runningScanTask, 0, len(m.runningTasks))
 	for _, task := range m.runningTasks {
-		if !task.finished(logutil.Logger(m.ctx)) {
+		interruptByWorkerStop := task.result != nil && task.result.interruptByWorkerStop
+		// If the task is interrupted or finished, we should remove it from memory.
+		// Otherwise, we should keep it in memory and wait for the next round to check.
+		if !interruptByWorkerStop && !task.finished(logutil.Logger(m.ctx)) {
			stillRunningTasks = append(stillRunningTasks, task)
 			continue
 		}
 		// we should cancel task to release inner context and avoid memory leak
 		task.cancel()
+
+		logger := task.taskLogger(logutil.Logger(m.ctx))
+		// When a task cannot continue running because there is no worker to run it,
+		// we should only remove it from memory without reporting its final status.
+		// Then some other instance will take over this task when its heartbeat timeout.
+		if interruptByWorkerStop {
+			logger.Info(
+				"remove a task from memory without mark it as finished because it is from a stopped worker",
+			)
+			if err := m.updateTaskState(m.ctx, se, task, false, now); err != nil {
+				logger.Warn("fail to report a interrupted task", zap.Error(err))
+			}
+			continue
+		}
+
+		// Update the meta of a task to mark it as finished.
 		err := m.reportTaskFinished(se, now, task)
 		if err != nil {
-			logutil.Logger(m.ctx).Error("fail to report finished task", zap.Error(err))
+			logger.Error(
+				"fail to report finished task", zap.Error(err))
 			stillRunningTasks = append(stillRunningTasks, task)
 			continue
 		}

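Note on the task_manager.go changes as a whole: a task whose worker stopped is persisted with its partial counters (merged with any counters left by a previous owner) but is not marked finished, so another instance can claim it once its heartbeat times out. The sketch below mirrors that decision and the counter merge; taskState and runningTask are hypothetical stand-ins for cache.TTLTaskState and runningScanTask, not the real types.

package main

import "fmt"

// taskState is a hypothetical stand-in for cache.TTLTaskState.
type taskState struct {
	TotalRows, SuccessRows, ErrorRows uint64
	ScanTaskErr                       string
}

// runningTask is a hypothetical stand-in for runningScanTask.
type runningTask struct {
	prevState             *taskState // state written by a previous owner, if the task was taken over
	total, success, errs  uint64     // counters collected by this instance
	finished              bool
	interruptByWorkerStop bool
	scanErr               error
}

// buildState mirrors updateTaskState's counter handling: start from this
// instance's statistics and fold in whatever a previous owner already reported.
func buildState(t *runningTask) *taskState {
	s := &taskState{TotalRows: t.total, SuccessRows: t.success, ErrorRows: t.errs}
	if prev := t.prevState; prev != nil {
		s.TotalRows += prev.SuccessRows + prev.ErrorRows
		s.SuccessRows += prev.SuccessRows
		s.ErrorRows += prev.ErrorRows
	}
	if t.scanErr != nil {
		s.ScanTaskErr = t.scanErr.Error()
	}
	return s
}

// checkTask mirrors the decision added to checkFinishedTask: an interrupted task
// only persists its partial state and is dropped from memory, so another instance
// can claim it after the heartbeat timeout; a finished task is reported as
// finished; anything else stays in memory for the next round.
func checkTask(t *runningTask) string {
	switch {
	case t.interruptByWorkerStop:
		_ = buildState(t) // would be written via updateTTLTaskStateSQL (state only, no owner_hb_time)
		return "released: another instance takes over after heartbeat timeout"
	case t.finished:
		_ = buildState(t) // would be written by reportTaskFinished
		return "reported as finished"
	default:
		return "still running"
	}
}

func main() {
	t := &runningTask{
		prevState:             &taskState{SuccessRows: 100, ErrorRows: 2},
		total:                 50,
		success:               40,
		errs:                  1,
		interruptByWorkerStop: true,
	}
	fmt.Println(checkTask(t))
	fmt.Printf("%+v\n", *buildState(t)) // TotalRows:152 SuccessRows:140 ErrorRows:3
}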