@@ -22,6 +22,7 @@ import (
     "sync/atomic"
     "time"
 
+    "github.com/pingcap/log"
     "github.com/pingcap/tidb/pkg/sessionctx/variable"
     "github.com/pingcap/tidb/pkg/ttl/cache"
     "github.com/pingcap/tidb/pkg/ttl/metrics"
@@ -94,12 +95,12 @@ func (t *ttlDeleteTask) doDelete(ctx context.Context, rawSe session.Session) (re
     leftRows := t.rows
     defer func() {
         if len(leftRows) > 0 {
-            t.statistics.IncErrorRows(len(leftRows))
+            retryRows = append(retryRows, leftRows...)
         }
     }()
 
     se := newTableSession(rawSe, t.tbl, t.expire)
-    for len(leftRows) > 0 {
+    for len(leftRows) > 0 && ctx.Err() == nil {
         maxBatch := variable.TTLDeleteBatchSize.Load()
         var delBatch [][]types.Datum
         if int64(len(leftRows)) < maxBatch {
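The hunk above changes doDelete's contract on cancellation: the batch loop stops as soon as the context is done, and anything still in leftRows is handed back through the named retryRows return value instead of being counted as error rows on the spot. A minimal, self-contained sketch of that shape (hypothetical names, not the actual TiDB code; the real doDelete also keeps per-batch error handling):

package ttlsketch

import "context"

// deleteLoop deletes rows in batches, stops as soon as ctx is cancelled or a
// batch fails, and hands everything unfinished back to the caller for retry.
func deleteLoop(ctx context.Context, rows []int, batch int, del func([]int) error) (retryRows []int) {
    left := rows
    defer func() {
        if len(left) > 0 {
            retryRows = append(retryRows, left...) // leftovers are retried, not counted as errors
        }
    }()
    for len(left) > 0 && ctx.Err() == nil {
        n := batch
        if n > len(left) {
            n = len(left)
        }
        if err := del(left[:n]); err != nil {
            return // the failed batch is still in left, so the defer hands it back
        }
        left = left[n:]
    }
    return
}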
@@ -133,7 +134,6 @@ func (t *ttlDeleteTask) doDelete(ctx context.Context, rawSe session.Session) (re
         sqlInterval := time.Since(sqlStart)
         if err != nil {
             metrics.DeleteErrorDuration.Observe(sqlInterval.Seconds())
-            needRetry = needRetry && ctx.Err() == nil
             logutil.BgLogger().Warn(
                 "delete SQL in TTL failed",
                 zap.Error(err),
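The removal above is the behavioral core of this change: previously a cancelled context forced needRetry to false, so in-flight rows were written off as errors; now the loop condition stops the work and the deferred block hands those rows back via retryRows, letting the retry buffer (and the final flush below) deal with them.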
@@ -214,6 +214,11 @@ func (b *ttlDelRetryBuffer) DoRetry(do func(*ttlDeleteTask) [][]types.Datum) tim
     return b.retryInterval
 }
 
+// SetRetryInterval sets the retry interval of the buffer.
+func (b *ttlDelRetryBuffer) SetRetryInterval(interval time.Duration) {
+    b.retryInterval = interval
+}
+
 // Drain drains a retry buffer.
 func (b *ttlDelRetryBuffer) Drain() {
     for ele := b.list.Front(); ele != nil; ele = ele.Next() {
@@ -296,8 +301,36 @@ func (w *ttlDeleteWorker) loop() error {
     timer := time.NewTimer(w.retryBuffer.retryInterval)
     defer timer.Stop()
 
-    // drain retry buffer to make sure the statistics are correct
-    defer w.retryBuffer.Drain()
+    defer func() {
+        // have a final try for all rows in retry buffer before the worker stops
+        if w.retryBuffer.Len() > 0 {
+            start := time.Now()
+            log.Info(
+                "try to delete TTL rows in del worker buffer immediately because the worker is going to stop",
+                zap.Int("bufferLen", w.retryBuffer.Len()),
+            )
+            retryCtx, cancel := context.WithTimeout(context.Background(), time.Second*10)
+            defer cancel()
+            w.retryBuffer.SetRetryInterval(0)
+            w.retryBuffer.DoRetry(func(task *ttlDeleteTask) [][]types.Datum {
+                return task.doDelete(retryCtx, se)
+            })
+            log.Info(
+                "delete TTL rows in del worker buffer finished",
+                zap.Duration("duration", time.Since(start)),
+            )
+        }
+
+        // drain retry buffer to make sure the statistics are correct
+        if w.retryBuffer.Len() > 0 {
+            log.Warn(
+                "some TTL rows are still in the buffer while the worker is going to stop, mark them as error",
+                zap.Int("bufferLen", w.retryBuffer.Len()),
+            )
+            w.retryBuffer.Drain()
+        }
+    }()
+
     for w.Status() == workerStatusRunning {
         tracer.EnterPhase(metrics.PhaseIdle)
         select {
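Taken together with the SetRetryInterval hunk, the new deferred block gives buffered rows one last immediate retry under a ten-second budget before Drain marks the remainder as errors. A self-contained sketch of this stop sequence, using a toy buffer with the same Len/SetRetryInterval/DoRetry/Drain surface (hypothetical types; the real ttlDelRetryBuffer also reports to statistics and differs in detail):

package ttlsketch

import (
    "container/list"
    "context"
    "time"
)

const maxRetry = 3

type task struct {
    rows     []int
    inTime   time.Time // when the task entered the buffer
    retryCnt int
}

// retryBuffer is a toy stand-in for ttlDelRetryBuffer.
type retryBuffer struct {
    list          *list.List
    retryInterval time.Duration
    errorRows     int // rows given up on; the real buffer reports these to statistics
}

func newRetryBuffer(interval time.Duration) *retryBuffer {
    return &retryBuffer{list: list.New(), retryInterval: interval}
}

func (b *retryBuffer) Len() int { return b.list.Len() }

// SetRetryInterval mirrors the setter added in this diff.
func (b *retryBuffer) SetRetryInterval(d time.Duration) { b.retryInterval = d }

func (b *retryBuffer) Push(rows []int) {
    b.list.PushBack(&task{rows: rows, inTime: time.Now()})
}

// DoRetry retries tasks from the front until it reaches one that has not
// waited retryInterval yet. Rows that fail again are re-buffered until
// maxRetry and then counted as error rows, so an interval of zero drains
// the whole buffer in one call.
func (b *retryBuffer) DoRetry(do func([]int) []int) {
    for b.list.Len() > 0 {
        front := b.list.Front()
        t := front.Value.(*task)
        if time.Since(t.inTime) < b.retryInterval {
            return // the front is the oldest task, so the rest are not due either
        }
        b.list.Remove(front)
        if left := do(t.rows); len(left) > 0 {
            if t.retryCnt+1 >= maxRetry {
                b.errorRows += len(left)
            } else {
                b.list.PushBack(&task{rows: left, inTime: time.Now(), retryCnt: t.retryCnt + 1})
            }
        }
    }
}

// Drain counts every remaining row as an error and empties the buffer.
func (b *retryBuffer) Drain() {
    for e := b.list.Front(); e != nil; e = e.Next() {
        b.errorRows += len(e.Value.(*task).rows)
    }
    b.list.Init()
}

// stopFlush mirrors the deferred block added to loop(): one immediate retry
// pass under a ten-second budget, then drain whatever still could not be deleted.
func stopFlush(b *retryBuffer, del func(context.Context, []int) []int) {
    if b.Len() > 0 {
        ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
        defer cancel()
        b.SetRetryInterval(0) // every buffered task becomes due immediately
        b.DoRetry(func(rows []int) []int { return del(ctx, rows) })
    }
    if b.Len() > 0 { // defensive, matching the diff: mark anything left as error
        b.Drain()
    }
}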