Skip to content

Commit 1352db6

Browse files
authored
ddl: add lease not found and deadline exceeded to retryable errors (#56630)
close #56550
1 parent 3df0f2e commit 1352db6

File tree

7 files changed

+44
-24
lines changed

7 files changed

+44
-24
lines changed

pkg/ddl/backfilling_dist_executor.go

Lines changed: 0 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -27,10 +27,8 @@ import (
2727
"github.com/pingcap/tidb/pkg/lightning/backend/external"
2828
"github.com/pingcap/tidb/pkg/lightning/common"
2929
"github.com/pingcap/tidb/pkg/meta/model"
30-
"github.com/pingcap/tidb/pkg/parser/terror"
3130
"github.com/pingcap/tidb/pkg/sessionctx/variable"
3231
"github.com/pingcap/tidb/pkg/table"
33-
"github.com/pingcap/tidb/pkg/util/dbterror"
3432
"github.com/tikv/client-go/v2/tikv"
3533
"go.uber.org/zap"
3634
)
@@ -223,17 +221,6 @@ func (*backfillDistExecutor) IsIdempotent(*proto.Subtask) bool {
223221
return true
224222
}
225223

226-
func isRetryableError(err error) bool {
227-
originErr := errors.Cause(err)
228-
if tErr, ok := originErr.(*terror.Error); ok {
229-
sqlErr := terror.ToSQLError(tErr)
230-
_, ok := dbterror.ReorgRetryableErrCodes[sqlErr.Code]
231-
return ok
232-
}
233-
// can't retry Unknown err.
234-
return false
235-
}
236-
237224
func (*backfillDistExecutor) IsRetryableError(err error) bool {
238225
return common.IsRetryableError(err) || isRetryableError(err)
239226
}

pkg/ddl/index.go

Lines changed: 16 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -840,7 +840,7 @@ func (w *worker) checkVectorIndexProcessOnTiFlash(jobCtx *jobContext, job *model
840840
if dbterror.ErrWaitReorgTimeout.Equal(err) {
841841
return false, ver, nil
842842
}
843-
if !errorIsRetryable(err, job) {
843+
if !isRetryableJobError(err, job.ErrorCount) {
844844
logutil.DDLLogger().Warn("run add vector index job failed, convert job to rollback", zap.Stringer("job", job), zap.Error(err))
845845
ver, err = convertAddIdxJob2RollbackJob(jobCtx, job, tbl.Meta(), []*model.IndexInfo{indexInfo}, err)
846846
}
@@ -981,7 +981,7 @@ SwitchIndexState:
981981
var reorgTp model.ReorgType
982982
reorgTp, err = pickBackfillType(job)
983983
if err != nil {
984-
if !errorIsRetryable(err, job) {
984+
if !isRetryableJobError(err, job.ErrorCount) {
985985
job.State = model.JobStateCancelled
986986
}
987987
return ver, err
@@ -1261,7 +1261,7 @@ func runIngestReorgJob(w *worker, jobCtx *jobContext, job *model.Job,
12611261
if kv.ErrKeyExists.Equal(err) {
12621262
logutil.DDLLogger().Warn("import index duplicate key, convert job to rollback", zap.Stringer("job", job), zap.Error(err))
12631263
ver, err = convertAddIdxJob2RollbackJob(jobCtx, job, tbl.Meta(), allIndexInfos, err)
1264-
} else if !errorIsRetryable(err, job) {
1264+
} else if !isRetryableJobError(err, job.ErrorCount) {
12651265
logutil.DDLLogger().Warn("run reorg job failed, convert job to rollback",
12661266
zap.String("job", job.String()), zap.Error(err))
12671267
ver, err = convertAddIdxJob2RollbackJob(jobCtx, job, tbl.Meta(), allIndexInfos, err)
@@ -1274,10 +1274,20 @@ func runIngestReorgJob(w *worker, jobCtx *jobContext, job *model.Job,
12741274
return done, ver, nil
12751275
}
12761276

1277-
func errorIsRetryable(err error, job *model.Job) bool {
1278-
if job.ErrorCount+1 >= variable.GetDDLErrorCountLimit() {
1277+
func isRetryableJobError(err error, jobErrCnt int64) bool {
1278+
if jobErrCnt+1 >= variable.GetDDLErrorCountLimit() {
12791279
return false
12801280
}
1281+
return isRetryableError(err)
1282+
}
1283+
1284+
func isRetryableError(err error) bool {
1285+
errMsg := err.Error()
1286+
for _, m := range dbterror.ReorgRetryableErrMsgs {
1287+
if strings.Contains(errMsg, m) {
1288+
return true
1289+
}
1290+
}
12811291
originErr := errors.Cause(err)
12821292
if tErr, ok := originErr.(*terror.Error); ok {
12831293
sqlErr := terror.ToSQLError(tErr)
@@ -1347,7 +1357,7 @@ func runReorgJobAndHandleErr(
13471357
}
13481358
// TODO(tangenta): get duplicate column and match index.
13491359
err = ingest.TryConvertToKeyExistsErr(err, allIndexInfos[0], tbl.Meta())
1350-
if !errorIsRetryable(err, job) {
1360+
if !isRetryableJobError(err, job.ErrorCount) {
13511361
logutil.DDLLogger().Warn("run add index job failed, convert job to rollback", zap.Stringer("job", job), zap.Error(err))
13521362
ver, err = convertAddIdxJob2RollbackJob(jobCtx, job, tbl.Meta(), allIndexInfos, err)
13531363
if err1 := rh.RemoveDDLReorgHandle(job, reorgInfo.elements); err1 != nil {

pkg/ddl/job_worker.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -615,7 +615,7 @@ func (w *worker) transitOneJobStep(
615615
jobCtx.addUnSynced(job.ID)
616616

617617
// If error is non-retryable, we can ignore the sleep.
618-
if runJobErr != nil && errorIsRetryable(runJobErr, job) {
618+
if runJobErr != nil && isRetryableJobError(runJobErr, job.ErrorCount) {
619619
jobCtx.logger.Info("run DDL job failed, sleeps a while then retries it.",
620620
zap.Duration("waitTime", GetWaitTimeWhenErrorOccurred()), zap.Error(runJobErr))
621621
// wait a while to retry again. If we don't wait here, DDL will retry this job immediately,

pkg/owner/manager.go

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -530,6 +530,9 @@ func AcquireDistributedLock(
530530
}
531531
return false, nil
532532
})
533+
failpoint.Inject("mockAcquireDistLockFailed", func() {
534+
err = errors.Errorf("requested lease not found")
535+
})
533536
if err != nil {
534537
err1 := se.Close()
535538
if err1 != nil {

pkg/util/dbterror/ddl_terror.go

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -503,7 +503,7 @@ var (
503503
ErrWarnGlobalIndexNeedManuallyAnalyze = ClassDDL.NewStd(mysql.ErrWarnGlobalIndexNeedManuallyAnalyze)
504504
)
505505

506-
// ReorgRetryableErrCodes is the error codes that are retryable for reorganization.
506+
// ReorgRetryableErrCodes are the error codes that are retryable for reorganization.
507507
var ReorgRetryableErrCodes = map[uint16]struct{}{
508508
mysql.ErrPDServerTimeout: {},
509509
mysql.ErrTiKVServerTimeout: {},
@@ -526,3 +526,9 @@ var ReorgRetryableErrCodes = map[uint16]struct{}{
526526
// Temporary network partitioning may cause pk commit failure.
527527
uint16(terror.CodeResultUndetermined): {},
528528
}
529+
530+
// ReorgRetryableErrMsgs are the error messages that are retryable for reorganization.
531+
var ReorgRetryableErrMsgs = []string{
532+
"context deadline exceeded",
533+
"requested lease not found",
534+
}

tests/realtikvtest/addindextest1/disttask_test.go

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -286,3 +286,17 @@ func TestAddUKErrorMessage(t *testing.T) {
286286
err := tk.ExecToErr("alter table t add unique index uk(b);")
287287
require.ErrorContains(t, err, "Duplicate entry '1' for key 't.uk'")
288288
}
289+
290+
func TestAddIndexDistLockAcquireFailed(t *testing.T) {
291+
store := realtikvtest.CreateMockStoreAndSetup(t)
292+
tk := testkit.NewTestKit(t, store)
293+
tk.MustExec("use test")
294+
tk.MustExec("set global tidb_enable_dist_task = on;")
295+
t.Cleanup(func() {
296+
tk.MustExec("set global tidb_enable_dist_task = off;")
297+
})
298+
tk.MustExec("create table t (a int, b int);")
299+
tk.MustExec("insert into t values (1, 1);")
300+
testfailpoint.Enable(t, "github.com/pingcap/tidb/pkg/owner/mockAcquireDistLockFailed", "1*return(true)")
301+
tk.MustExec("alter table t add index idx(b);")
302+
}

tests/realtikvtest/addindextest3/ingest_test.go

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -535,14 +535,14 @@ func TestAddIndexIngestFailures(t *testing.T) {
535535
tk.MustExec("insert into t values (1, 1, 1);")
536536

537537
// Test precheck failed.
538-
require.NoError(t, failpoint.Enable("github.com/pingcap/tidb/pkg/ddl/ingest/mockIngestCheckEnvFailed", "return"))
538+
require.NoError(t, failpoint.Enable("github.com/pingcap/tidb/pkg/ddl/ingest/mockIngestCheckEnvFailed", "1*return"))
539539
tk.MustGetErrMsg("alter table t add index idx(b);", "[ddl:8256]Check ingest environment failed: mock error")
540540
require.NoError(t, failpoint.Disable("github.com/pingcap/tidb/pkg/ddl/ingest/mockIngestCheckEnvFailed"))
541541

542542
tk.MustExec(`set global tidb_enable_dist_task=on;`)
543543
// Test reset engine failed.
544-
require.NoError(t, failpoint.Enable("github.com/pingcap/tidb/pkg/ddl/ingest/mockResetEngineFailed", "return"))
545-
tk.MustGetErrMsg("alter table t add index idx(b);", "[0]mock reset engine failed")
544+
require.NoError(t, failpoint.Enable("github.com/pingcap/tidb/pkg/ddl/ingest/mockResetEngineFailed", "1*return"))
545+
tk.MustExec("alter table t add index idx(b);")
546546
require.NoError(t, failpoint.Disable("github.com/pingcap/tidb/pkg/ddl/ingest/mockResetEngineFailed"))
547547
tk.MustExec(`set global tidb_enable_dist_task=off;`)
548548
}

0 commit comments

Comments
 (0)