Skip to content

Commit f97de48

Browse files
committed
ttl: only gc in leader to save performance
1 parent d7a8b67 commit f97de48

File tree

5 files changed

+72
-5
lines changed

5 files changed

+72
-5
lines changed

pkg/metrics/grafana/tidb.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22885,7 +22885,7 @@
2288522885
"targets": [
2288622886
{
2288722887
"exemplar": true,
22888-
"expr": "avg(tidb_server_ttl_watermark_delay{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\", type=\"schedule\"}) by (type, name)",
22888+
"expr": "max(tidb_server_ttl_watermark_delay{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\", type=\"schedule\"}) by (type, name)",
2288922889
"interval": "",
2289022890
"legendFormat": "{{ name }}",
2289122891
"queryType": "randomWalk",

pkg/ttl/metrics/metrics.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,3 +255,8 @@ func UpdateDelayMetrics(records map[int64]*DelayMetricsRecord) {
255255
metrics.TTLWatermarkDelay.With(prometheus.Labels{metrics.LblType: "schedule", metrics.LblName: delay}).Set(v)
256256
}
257257
}
258+
259+
// ClearDelayMetrics clears the metrics of TTL delay
260+
func ClearDelayMetrics() {
261+
metrics.TTLWatermarkDelay.Reset()
262+
}

pkg/ttl/ttlworker/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ go_test(
109109
"@com_github_ngaut_pools//:pools",
110110
"@com_github_pingcap_errors//:errors",
111111
"@com_github_pingcap_failpoint//:failpoint",
112+
"@com_github_prometheus_client_golang//prometheus",
112113
"@com_github_prometheus_client_model//go",
113114
"@com_github_stretchr_testify//assert",
114115
"@com_github_stretchr_testify//mock",

pkg/ttl/ttlworker/job_manager.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -504,6 +504,12 @@ func (m *JobManager) reportMetrics(se session.Session) {
504504
metrics.RunningJobsCnt.Set(runningJobs)
505505
metrics.CancellingJobsCnt.Set(cancellingJobs)
506506

507+
if !m.isLeader() {
508+
// only the leader can collect delay metrics to reduce the performance overhead
509+
metrics.ClearDelayMetrics()
510+
return
511+
}
512+
507513
if time.Since(m.lastReportDelayMetricsTime) > 10*time.Minute {
508514
m.lastReportDelayMetricsTime = time.Now()
509515
records, err := GetDelayMetricRecords(m.ctx, se, time.Now())
@@ -1093,6 +1099,11 @@ func summarizeTaskResult(tasks []*cache.TTLTask) (*TTLSummary, error) {
10931099

10941100
// DoGC deletes some old TTL job histories and redundant scan tasks
10951101
func (m *JobManager) DoGC(ctx context.Context, se session.Session, now time.Time) {
1102+
if !m.isLeader() {
1103+
// only the leader can do the GC to reduce the performance impact
1104+
return
1105+
}
1106+
10961107
// Remove the table not exist in info schema cache.
10971108
// Delete the table status before deleting the tasks. Therefore the related tasks
10981109
if err := m.updateInfoSchemaCache(se); err == nil {

pkg/ttl/ttlworker/job_manager_integration_test.go

Lines changed: 54 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
"github.com/pingcap/tidb/pkg/domain"
3434
"github.com/pingcap/tidb/pkg/kv"
3535
"github.com/pingcap/tidb/pkg/meta/model"
36+
metrics2 "github.com/pingcap/tidb/pkg/metrics"
3637
"github.com/pingcap/tidb/pkg/parser/ast"
3738
"github.com/pingcap/tidb/pkg/sessionctx/vardef"
3839
"github.com/pingcap/tidb/pkg/statistics"
@@ -49,6 +50,7 @@ import (
4950
"github.com/pingcap/tidb/pkg/util"
5051
"github.com/pingcap/tidb/pkg/util/logutil"
5152
"github.com/pingcap/tidb/pkg/util/skip"
53+
"github.com/prometheus/client_golang/prometheus"
5254
dto "github.com/prometheus/client_model/go"
5355
"github.com/stretchr/testify/require"
5456
"go.uber.org/atomic"
@@ -797,10 +799,15 @@ func TestGCScanTasks(t *testing.T) {
797799
addScanTaskRecord(3, 2, 1)
798800
addScanTaskRecord(3, 2, 2)
799801

802+
isLeader := false
800803
m := ttlworker.NewJobManager("manager-1", nil, store, nil, func() bool {
801-
return true
804+
return isLeader
802805
})
803806
se := session.NewSession(tk.Session(), tk.Session(), func(_ session.Session) {})
807+
// only leader can do GC
808+
m.DoGC(context.TODO(), se, se.Now())
809+
tk.MustQuery("select count(1) from mysql.tidb_ttl_task").Check(testkit.Rows("6"))
810+
isLeader = true
804811
m.DoGC(context.TODO(), se, se.Now())
805812
tk.MustQuery("select job_id, scan_id from mysql.tidb_ttl_task order by job_id, scan_id asc").Check(testkit.Rows("1 1", "1 2"))
806813
}
@@ -816,10 +823,15 @@ func TestGCTableStatus(t *testing.T) {
816823
// insert table status without corresponding table
817824
tk.MustExec("INSERT INTO mysql.tidb_ttl_table_status (table_id,parent_table_id) VALUES (?, ?)", 2024, 2024)
818825

826+
isLeader := false
819827
m := ttlworker.NewJobManager("manager-1", nil, store, nil, func() bool {
820-
return true
828+
return isLeader
821829
})
822830
se := session.NewSession(tk.Session(), tk.Session(), func(_ session.Session) {})
831+
// only leader can do GC
832+
m.DoGC(context.TODO(), se, se.Now())
833+
tk.MustQuery("select count(1) from mysql.tidb_ttl_table_status").Check(testkit.Rows("1"))
834+
isLeader = true
823835
m.DoGC(context.TODO(), se, se.Now())
824836
tk.MustQuery("select * from mysql.tidb_ttl_table_status").Check(nil)
825837

@@ -877,11 +889,16 @@ func TestGCTTLHistory(t *testing.T) {
877889
addHistory(6, 91)
878890
addHistory(7, 100)
879891

892+
isLeader := false
880893
m := ttlworker.NewJobManager("manager-1", nil, store, nil, func() bool {
881-
return true
894+
return isLeader
882895
})
883896
se := session.NewSession(tk.Session(), tk.Session(), func(_ session.Session) {})
884897
m.DoGC(context.TODO(), se, se.Now())
898+
// only leader can do GC
899+
tk.MustQuery("select count(1) from mysql.tidb_ttl_job_history").Check(testkit.Rows("7"))
900+
isLeader = true
901+
m.DoGC(context.TODO(), se, se.Now())
885902
tk.MustQuery("select job_id from mysql.tidb_ttl_job_history order by job_id asc").Check(testkit.Rows("1", "2", "3", "4", "5"))
886903
}
887904

@@ -1047,6 +1064,37 @@ func TestDelayMetrics(t *testing.T) {
10471064
checkRecord(records, "t3", now.Add(-3*time.Hour))
10481065
checkRecord(records, "t4", now.Add(-3*time.Hour))
10491066
checkRecord(records, "t5", emptyTime)
1067+
1068+
metrics.ClearDelayMetrics()
1069+
getMetricCnt := func() int {
1070+
ch := make(chan prometheus.Metric)
1071+
go func() {
1072+
metrics2.TTLWatermarkDelay.Collect(ch)
1073+
close(ch)
1074+
}()
1075+
1076+
cnt := 0
1077+
for range ch {
1078+
cnt++
1079+
}
1080+
return cnt
1081+
}
1082+
1083+
isLeader := false
1084+
m := ttlworker.NewJobManager("test-ttl-job-manager", nil, store, nil, func() bool {
1085+
return isLeader
1086+
})
1087+
// If the manager is not leader, the metrics will be empty.
1088+
m.ReportMetrics(se)
1089+
require.Zero(t, getMetricCnt())
1090+
// leader will collect metrics
1091+
isLeader = true
1092+
m.ReportMetrics(se)
1093+
require.Equal(t, len(metrics.WaterMarkScheduleDelayNames), getMetricCnt())
1094+
// when back to non-leader, the metrics will be empty.
1095+
isLeader = false
1096+
m.ReportMetrics(se)
1097+
require.Zero(t, getMetricCnt())
10501098
}
10511099

10521100
type poolTestWrapper struct {
@@ -1503,7 +1551,9 @@ func TestDisableTTLAfterLoseHeartbeat(t *testing.T) {
15031551

15041552
ctx := context.Background()
15051553
m1 := ttlworker.NewJobManager("test-ttl-job-manager-1", nil, store, nil, nil)
1506-
m2 := ttlworker.NewJobManager("test-ttl-job-manager-2", nil, store, nil, nil)
1554+
m2 := ttlworker.NewJobManager("test-ttl-job-manager-2", nil, store, nil, func() bool {
1555+
return true
1556+
})
15071557

15081558
now := se.Now()
15091559

0 commit comments

Comments
 (0)