Skip to content

Commit e68c26a

Browse files
authored
ttl: force to kill SQL in scan task when canceling TTL job/task (#56518)
close #56511
1 parent f4e820d commit e68c26a

File tree

6 files changed

+134
-3
lines changed

6 files changed

+134
-3
lines changed

pkg/ttl/session/BUILD.bazel

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ go_library(
1515
"//pkg/ttl/metrics",
1616
"//pkg/util/chunk",
1717
"//pkg/util/sqlexec",
18+
"//pkg/util/sqlkiller",
1819
"//pkg/util/timeutil",
1920
"@com_github_pingcap_errors//:errors",
2021
],
@@ -29,12 +30,13 @@ go_test(
2930
"sysvar_test.go",
3031
],
3132
flaky = True,
32-
shard_count = 6,
33+
shard_count = 7,
3334
deps = [
3435
":session",
3536
"//pkg/sessionctx/variable",
3637
"//pkg/testkit",
3738
"//pkg/testkit/testsetup",
39+
"//pkg/util",
3840
"@com_github_pingcap_errors//:errors",
3941
"@com_github_stretchr_testify//require",
4042
"@org_uber_go_goleak//:goleak",

pkg/ttl/session/session.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"github.com/pingcap/tidb/pkg/ttl/metrics"
2929
"github.com/pingcap/tidb/pkg/util/chunk"
3030
"github.com/pingcap/tidb/pkg/util/sqlexec"
31+
"github.com/pingcap/tidb/pkg/util/sqlkiller"
3132
"github.com/pingcap/tidb/pkg/util/timeutil"
3233
)
3334

@@ -54,6 +55,8 @@ type Session interface {
5455
ResetWithGlobalTimeZone(ctx context.Context) error
5556
// GlobalTimeZone returns the global timezone. It is used to compute expire time for TTL
5657
GlobalTimeZone(ctx context.Context) (*time.Location, error)
58+
// KillStmt kills the current statement execution
59+
KillStmt()
5760
// Close closes the session
5861
Close()
5962
// Now returns the current time in location specified by session var
@@ -180,6 +183,11 @@ func (s *session) GlobalTimeZone(ctx context.Context) (*time.Location, error) {
180183
return timeutil.ParseTimeZone(str)
181184
}
182185

186+
// KillStmt kills the current statement execution
187+
func (s *session) KillStmt() {
188+
s.GetSessionVars().SQLKiller.SendKillSignal(sqlkiller.QueryInterrupted)
189+
}
190+
183191
// Close closes the session
184192
func (s *session) Close() {
185193
if s.closeFn != nil {

pkg/ttl/session/session_test.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,12 @@ package session_test
1717
import (
1818
"context"
1919
"testing"
20+
"time"
2021

2122
"github.com/pingcap/errors"
2223
"github.com/pingcap/tidb/pkg/testkit"
2324
"github.com/pingcap/tidb/pkg/ttl/session"
25+
"github.com/pingcap/tidb/pkg/util"
2426
"github.com/stretchr/testify/require"
2527
)
2628

@@ -64,3 +66,28 @@ func TestSessionResetTimeZone(t *testing.T) {
6466
require.NoError(t, se.ResetWithGlobalTimeZone(context.TODO()))
6567
tk.MustQuery("select @@time_zone").Check(testkit.Rows("UTC"))
6668
}
69+
70+
func TestSessionKill(t *testing.T) {
71+
store, do := testkit.CreateMockStoreAndDomain(t)
72+
tk := testkit.NewTestKit(t, store)
73+
se := session.NewSession(tk.Session(), tk.Session(), nil)
74+
sleepStmt := "select sleep(123)"
75+
wg := util.WaitGroupWrapper{}
76+
wg.Run(func() {
77+
start := time.Now()
78+
for time.Since(start) < 10*time.Second {
79+
time.Sleep(10 * time.Millisecond)
80+
processes := do.InfoSyncer().GetSessionManager().ShowProcessList()
81+
for _, proc := range processes {
82+
if proc.Info == sleepStmt {
83+
se.KillStmt()
84+
return
85+
}
86+
}
87+
}
88+
require.FailNow(t, "wait sleep stmt timeout")
89+
})
90+
// the killed sleep stmt will return "1"
91+
tk.MustQuery(sleepStmt).Check(testkit.Rows("1"))
92+
wg.Wait()
93+
}

pkg/ttl/ttlworker/scan.go

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,45 @@ func (t *ttlScanTask) doScan(ctx context.Context, delCh chan<- *ttlDeleteTask, s
111111
if err != nil {
112112
return t.result(err)
113113
}
114-
defer rawSess.Close()
114+
115+
doScanFinished, setDoScanFinished := context.WithCancel(context.Background())
116+
wg := util.WaitGroupWrapper{}
117+
wg.Run(func() {
118+
select {
119+
case <-taskCtx.Done():
120+
case <-ctx.Done():
121+
case <-doScanFinished.Done():
122+
return
123+
}
124+
logger := logutil.BgLogger().With(
125+
zap.Int64("tableID", t.TableID),
126+
zap.String("table", t.tbl.Name.O),
127+
zap.String("partition", t.tbl.Partition.O),
128+
zap.String("jobID", t.JobID),
129+
zap.Int64("scanID", t.ScanID),
130+
)
131+
logger.Info("kill the running statement in scan task because the task or worker cancelled")
132+
rawSess.KillStmt()
133+
ticker := time.NewTicker(time.Minute)
134+
defer ticker.Stop()
135+
for {
136+
// Have a small probability that the kill signal will be lost when the session is idle.
137+
// So wait for a while and send the kill signal again if the scan is still running.
138+
select {
139+
case <-doScanFinished.Done():
140+
return
141+
case <-ticker.C:
142+
logger.Warn("scan task is still running after the kill signal sent, kill it again")
143+
rawSess.KillStmt()
144+
}
145+
}
146+
})
147+
148+
defer func() {
149+
setDoScanFinished()
150+
wg.Wait()
151+
rawSess.Close()
152+
}()
115153

116154
safeExpire, err := t.tbl.EvalExpireTime(taskCtx, rawSess, rawSess.Now())
117155
if err != nil {
@@ -182,7 +220,7 @@ func (t *ttlScanTask) doScan(ctx context.Context, delCh chan<- *ttlDeleteTask, s
182220
selectInterval := time.Since(sqlStart)
183221
if sqlErr != nil {
184222
metrics.SelectErrorDuration.Observe(selectInterval.Seconds())
185-
needRetry := retryable && retryTimes < scanTaskExecuteSQLMaxRetry && ctx.Err() == nil
223+
needRetry := retryable && retryTimes < scanTaskExecuteSQLMaxRetry && ctx.Err() == nil && t.ctx.Err() == nil
186224
logutil.BgLogger().Error("execute query for ttl scan task failed",
187225
zap.String("SQL", sql),
188226
zap.Int("retryTimes", retryTimes),

pkg/ttl/ttlworker/scan_test.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"github.com/pingcap/tidb/pkg/sessionctx/variable"
2626
"github.com/pingcap/tidb/pkg/ttl/cache"
2727
"github.com/pingcap/tidb/pkg/types"
28+
"github.com/pingcap/tidb/pkg/util"
2829
"github.com/pingcap/tidb/pkg/util/chunk"
2930
"github.com/stretchr/testify/require"
3031
)
@@ -446,3 +447,51 @@ func TestScanTaskCheck(t *testing.T) {
446447
require.Equal(t, 1, len(ch))
447448
require.Equal(t, "Total Rows: 1, Success Rows: 0, Error Rows: 0", task.statistics.String())
448449
}
450+
451+
func TestScanTaskCancelStmt(t *testing.T) {
452+
task := &ttlScanTask{
453+
ctx: context.Background(),
454+
tbl: newMockTTLTbl(t, "t1"),
455+
TTLTask: &cache.TTLTask{
456+
ExpireTime: time.UnixMilli(0),
457+
ScanRangeStart: []types.Datum{types.NewIntDatum(0)},
458+
},
459+
statistics: &ttlStatistics{},
460+
}
461+
462+
testCancel := func(ctx context.Context, doCancel func()) {
463+
mockPool := newMockSessionPool(t)
464+
startExec := make(chan struct{})
465+
mockPool.se.sessionInfoSchema = newMockInfoSchema(task.tbl.TableInfo)
466+
mockPool.se.executeSQL = func(_ context.Context, _ string, _ ...any) ([]chunk.Row, error) {
467+
close(startExec)
468+
select {
469+
case <-mockPool.se.killed:
470+
return nil, errors.New("killed")
471+
case <-time.After(10 * time.Second):
472+
return nil, errors.New("timeout")
473+
}
474+
}
475+
wg := util.WaitGroupWrapper{}
476+
wg.Run(func() {
477+
select {
478+
case <-startExec:
479+
case <-time.After(10 * time.Second):
480+
require.FailNow(t, "timeout")
481+
}
482+
doCancel()
483+
})
484+
r := task.doScan(ctx, nil, mockPool)
485+
require.NotNil(t, r)
486+
require.EqualError(t, r.err, "killed")
487+
wg.Wait()
488+
}
489+
490+
// test cancel with input context
491+
ctx, cancel := context.WithCancel(context.Background())
492+
testCancel(ctx, cancel)
493+
494+
// test cancel with task context
495+
task.ctx, cancel = context.WithCancel(context.Background())
496+
testCancel(context.Background(), cancel)
497+
}

pkg/ttl/ttlworker/session_test.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ type mockSession struct {
155155
resetTimeZoneCalls int
156156
closed bool
157157
commitErr error
158+
killed chan struct{}
158159
}
159160

160161
func newMockSession(t *testing.T, tbl ...*cache.PhysicalTable) *mockSession {
@@ -168,6 +169,7 @@ func newMockSession(t *testing.T, tbl ...*cache.PhysicalTable) *mockSession {
168169
t: t,
169170
sessionInfoSchema: newMockInfoSchema(tbls...),
170171
sessionVars: sessVars,
172+
killed: make(chan struct{}),
171173
}
172174
}
173175

@@ -224,6 +226,11 @@ func (s *mockSession) GlobalTimeZone(_ context.Context) (*time.Location, error)
224226
return time.Local, nil
225227
}
226228

229+
// KillStmt kills the current statement execution
230+
func (s *mockSession) KillStmt() {
231+
close(s.killed)
232+
}
233+
227234
func (s *mockSession) Close() {
228235
s.closed = true
229236
}

0 commit comments

Comments
 (0)