Commit c526e52

ttl: only gc in leader to save performance (#59358) (#59471)
close #59357
1 parent 4d157f6 commit c526e52

6 files changed, +97 -4 lines changed

pkg/metrics/grafana/tidb.json

Lines changed: 1 addition & 1 deletion

@@ -20236,7 +20236,7 @@
       "targets": [
         {
           "exemplar": true,
-          "expr": "avg(tidb_server_ttl_watermark_delay{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\", type=\"schedule\"}) by (type, name)",
+          "expr": "max(tidb_server_ttl_watermark_delay{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\", type=\"schedule\"}) by (type, name)",
           "interval": "",
           "legendFormat": "{{ name }}",
           "queryType": "randomWalk",

pkg/ttl/metrics/metrics.go

Lines changed: 5 additions & 0 deletions

@@ -255,3 +255,8 @@ func UpdateDelayMetrics(records map[int64]*DelayMetricsRecord) {
 		metrics.TTLWatermarkDelay.With(prometheus.Labels{metrics.LblType: "schedule", metrics.LblName: delay}).Set(v)
 	}
 }
+
+// ClearDelayMetrics clears the metrics of TTL delay
+func ClearDelayMetrics() {
+	metrics.TTLWatermarkDelay.Reset()
+}
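
For context on what the new function does: TTLWatermarkDelay is set via With(...).Set and supports Reset, so it appears to be a prometheus GaugeVec, and GaugeVec.Reset() deletes every labeled child series. A node that is no longer the leader therefore stops exporting stale delay values on the next scrape. A minimal standalone sketch of that behavior (the metric name demo_ttl_watermark_delay and the use of testutil.CollectAndCount are illustrative, not part of this commit):

    package main

    import (
        "fmt"

        "github.com/prometheus/client_golang/prometheus"
        "github.com/prometheus/client_golang/prometheus/testutil"
    )

    func main() {
        // A gauge vector shaped like tidb_server_ttl_watermark_delay
        // (name and labels here are illustrative only).
        delay := prometheus.NewGaugeVec(
            prometheus.GaugeOpts{Name: "demo_ttl_watermark_delay", Help: "demo gauge"},
            []string{"type", "name"},
        )

        // While a node is the leader it keeps these series up to date.
        delay.With(prometheus.Labels{"type": "schedule", "name": "1h"}).Set(3)
        delay.With(prometheus.Labels{"type": "schedule", "name": "6h"}).Set(1)
        fmt.Println(testutil.CollectAndCount(delay)) // 2 series exposed

        // Reset deletes every labeled child, so a node that lost leadership
        // exposes nothing instead of stale delay values on the next scrape.
        delay.Reset()
        fmt.Println(testutil.CollectAndCount(delay)) // 0 series exposed
    }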

pkg/ttl/ttlworker/BUILD.bazel

Lines changed: 1 addition & 0 deletions

@@ -101,6 +101,7 @@ go_test(
         "@com_github_ngaut_pools//:pools",
         "@com_github_pingcap_errors//:errors",
         "@com_github_pingcap_failpoint//:failpoint",
+        "@com_github_prometheus_client_golang//prometheus",
         "@com_github_prometheus_client_model//go",
         "@com_github_stretchr_testify//assert",
         "@com_github_stretchr_testify//mock",

pkg/ttl/ttlworker/job_manager.go

Lines changed: 13 additions & 0 deletions

@@ -490,8 +490,15 @@ func (m *JobManager) reportMetrics(se session.Session) {
 	metrics.RunningJobsCnt.Set(runningJobs)
 	metrics.CancellingJobsCnt.Set(cancellingJobs)

+	if !m.isLeader() {
+		// only the leader can collect delay metrics, to reduce the performance overhead
+		metrics.ClearDelayMetrics()
+		return
+	}
+
 	if time.Since(m.lastReportDelayMetricsTime) > 10*time.Minute {
 		m.lastReportDelayMetricsTime = time.Now()
+		logutil.Logger(m.ctx).Info("TTL leader to collect delay metrics")
 		records, err := GetDelayMetricRecords(m.ctx, se, time.Now())
 		if err != nil {
 			logutil.Logger(m.ctx).Info("failed to get TTL delay metrics", zap.Error(err))

@@ -998,6 +1005,12 @@ func summarizeTaskResult(tasks []*cache.TTLTask) (*TTLSummary, error) {

 // DoGC deletes some old TTL job histories and redundant scan tasks
 func (m *JobManager) DoGC(ctx context.Context, se session.Session) {
+	if !m.isLeader() {
+		// only the leader can do the GC to reduce the performance impact
+		return
+	}
+
+	logutil.Logger(m.ctx).Info("TTL leader to DoGC")
 	// Remove the table not exist in info schema cache.
 	// Delete the table status before deleting the tasks. Therefore the related tasks
 	if err := m.updateInfoSchemaCache(se); err == nil {
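
Both hunks apply the same gating: a follower returns early from the periodic work, and in reportMetrics it additionally clears any delay gauges it exported while it was still the leader. A minimal sketch of that pattern under hypothetical names (leaderGatedWorker, clearGauge); it is not the JobManager implementation itself:

    package main

    import (
        "fmt"
        "time"
    )

    // leaderGatedWorker is a hypothetical stand-in for JobManager: periodic
    // bookkeeping (metric reporting, GC) only runs on the elected leader.
    type leaderGatedWorker struct {
        isLeader   func() bool
        lastReport time.Time
        clearGauge func() // stands in for metrics.ClearDelayMetrics
    }

    func (w *leaderGatedWorker) report() {
        if !w.isLeader() {
            // Followers skip the expensive collection and wipe anything they
            // exported back when they were still the leader.
            w.clearGauge()
            return
        }
        if time.Since(w.lastReport) > 10*time.Minute {
            w.lastReport = time.Now()
            fmt.Println("leader collecting delay metrics")
        }
    }

    func (w *leaderGatedWorker) doGC() {
        if !w.isLeader() {
            // Only the leader pays the GC cost; followers return immediately.
            return
        }
        fmt.Println("leader running GC")
    }

    func main() {
        leader := false
        w := &leaderGatedWorker{
            isLeader:   func() bool { return leader },
            clearGauge: func() {},
        }
        w.doGC() // no-op on a follower
        leader = true
        w.doGC()   // runs on the leader
        w.report() // collects: lastReport is the zero time, well over 10 minutes ago
    }

Clearing on the follower path matters because a Prometheus gauge keeps its last value until overwritten or reset, so without the Reset a dashboard could keep seeing a stale delay from a node that lost leadership; that is also why the Grafana panel above switches from avg to max across instances.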

pkg/ttl/ttlworker/job_manager_integration_test.go

Lines changed: 67 additions & 3 deletions

@@ -30,6 +30,7 @@ import (
 	"github.com/pingcap/failpoint"
 	"github.com/pingcap/tidb/pkg/domain"
 	"github.com/pingcap/tidb/pkg/kv"
+	metrics2 "github.com/pingcap/tidb/pkg/metrics"
 	"github.com/pingcap/tidb/pkg/parser/ast"
 	"github.com/pingcap/tidb/pkg/parser/model"
 	dbsession "github.com/pingcap/tidb/pkg/session"
@@ -44,6 +45,7 @@ import (
 	"github.com/pingcap/tidb/pkg/ttl/session"
 	"github.com/pingcap/tidb/pkg/ttl/ttlworker"
 	"github.com/pingcap/tidb/pkg/util/logutil"
+	"github.com/prometheus/client_golang/prometheus"
 	dto "github.com/prometheus/client_model/go"
 	"github.com/stretchr/testify/require"
 	"go.uber.org/atomic"
@@ -807,10 +809,15 @@ func TestGCScanTasks(t *testing.T) {
 	addScanTaskRecord(3, 2, 1)
 	addScanTaskRecord(3, 2, 2)

+	isLeader := false
 	m := ttlworker.NewJobManager("manager-1", nil, store, nil, func() bool {
-		return true
+		return isLeader
 	})
 	se := session.NewSession(tk.Session(), tk.Session(), func(_ session.Session) {})
+	// only the leader can do GC
+	m.DoGC(context.TODO(), se)
+	tk.MustQuery("select count(1) from mysql.tidb_ttl_task").Check(testkit.Rows("6"))
+	isLeader = true
 	m.DoGC(context.TODO(), se)
 	tk.MustQuery("select job_id, scan_id from mysql.tidb_ttl_task order by job_id, scan_id asc").Check(testkit.Rows("1 1", "1 2"))
 }
@@ -826,10 +833,15 @@ func TestGCTableStatus(t *testing.T) {
 	// insert table status without corresponding table
 	tk.MustExec("INSERT INTO mysql.tidb_ttl_table_status (table_id,parent_table_id) VALUES (?, ?)", 2024, 2024)

+	isLeader := false
 	m := ttlworker.NewJobManager("manager-1", nil, store, nil, func() bool {
-		return true
+		return isLeader
 	})
 	se := session.NewSession(tk.Session(), tk.Session(), func(_ session.Session) {})
+	// only the leader can do GC
+	m.DoGC(context.TODO(), se)
+	tk.MustQuery("select count(1) from mysql.tidb_ttl_table_status").Check(testkit.Rows("1"))
+	isLeader = true
 	m.DoGC(context.TODO(), se)
 	tk.MustQuery("select * from mysql.tidb_ttl_table_status").Check(nil)

@@ -887,11 +899,16 @@ func TestGCTTLHistory(t *testing.T) {
 	addHistory(6, 91)
 	addHistory(7, 100)

+	isLeader := false
 	m := ttlworker.NewJobManager("manager-1", nil, store, nil, func() bool {
-		return true
+		return isLeader
 	})
 	se := session.NewSession(tk.Session(), tk.Session(), func(_ session.Session) {})
 	m.DoGC(context.TODO(), se)
+	// only the leader can do GC
+	tk.MustQuery("select count(1) from mysql.tidb_ttl_job_history").Check(testkit.Rows("7"))
+	isLeader = true
+	m.DoGC(context.TODO(), se)
 	tk.MustQuery("select job_id from mysql.tidb_ttl_job_history order by job_id asc").Check(testkit.Rows("1", "2", "3", "4", "5"))
 }

@@ -1057,6 +1074,53 @@ func TestDelayMetrics(t *testing.T) {
 	checkRecord(records, "t3", now.Add(-3*time.Hour))
 	checkRecord(records, "t4", now.Add(-3*time.Hour))
 	checkRecord(records, "t5", emptyTime)
+
+	metrics.ClearDelayMetrics()
+	getMetricCnt := func() int {
+		ch := make(chan prometheus.Metric)
+		go func() {
+			metrics2.TTLWatermarkDelay.Collect(ch)
+			close(ch)
+		}()
+
+		cnt := 0
+		for range ch {
+			cnt++
+		}
+		return cnt
+	}
+
+	isLeader := false
+	m := ttlworker.NewJobManager("test-ttl-job-manager", nil, store, nil, func() bool {
+		return isLeader
+	})
+	// If the manager is not the leader, the metrics will be empty.
+	m.ReportMetrics(se)
+	require.Zero(t, getMetricCnt())
+	// the leader will collect metrics
+	isLeader = true
+	m.SetLastReportDelayMetricsTime(time.Now().Add(-11 * time.Minute))
+	m.ReportMetrics(se)
+	require.Equal(t, len(metrics.WaterMarkScheduleDelayNames), getMetricCnt())
+	require.InDelta(t, time.Now().Unix(), m.GetLastReportDelayMetricsTime().Unix(), 5)
+	// will not collect metrics again within 10 minutes
+	lastReportTime := time.Now().Add(-9 * time.Minute)
+	m.SetLastReportDelayMetricsTime(lastReportTime)
+	m.ReportMetrics(se)
+	require.Equal(t, len(metrics.WaterMarkScheduleDelayNames), getMetricCnt())
+	require.Equal(t, lastReportTime.Unix(), m.GetLastReportDelayMetricsTime().Unix(), 5)
+	// when back to non-leader, the metrics will be empty and the last report time will not be updated.
+	isLeader = false
+	lastReportTime = time.Now().Add(-11 * time.Minute)
+	m.SetLastReportDelayMetricsTime(lastReportTime)
+	m.ReportMetrics(se)
+	require.Zero(t, getMetricCnt())
+	require.Equal(t, lastReportTime.Unix(), m.GetLastReportDelayMetricsTime().Unix())
+	// when back to leader again, the metrics will be collected.
+	isLeader = true
+	m.ReportMetrics(se)
+	require.Equal(t, len(metrics.WaterMarkScheduleDelayNames), getMetricCnt())
+	require.InDelta(t, time.Now().Unix(), m.GetLastReportDelayMetricsTime().Unix(), 5)
 }

 func TestManagerJobAdapterCanSubmitJob(t *testing.T) {
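
The getMetricCnt closure above counts how many series TTLWatermarkDelay currently exposes by draining the collector's Collect channel, which works directly on the shared metric without registering it on a second registry or scraping an HTTP endpoint. A small standalone sketch of the same trick (collectorLen and demo_gauge are illustrative names, not from TiDB):

    package main

    import (
        "fmt"

        "github.com/prometheus/client_golang/prometheus"
    )

    // collectorLen counts the series a Collector currently exposes by draining
    // its Collect channel, mirroring the getMetricCnt closure in the test above.
    func collectorLen(c prometheus.Collector) int {
        ch := make(chan prometheus.Metric)
        go func() {
            c.Collect(ch)
            close(ch)
        }()
        n := 0
        for range ch {
            n++
        }
        return n
    }

    func main() {
        g := prometheus.NewGaugeVec(
            prometheus.GaugeOpts{Name: "demo_gauge", Help: "demo"},
            []string{"name"},
        )
        g.WithLabelValues("a").Set(1)
        g.WithLabelValues("b").Set(2)
        fmt.Println(collectorLen(g)) // 2
        g.Reset()
        fmt.Println(collectorLen(g)) // 0
    }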

pkg/ttl/ttlworker/job_manager_test.go

Lines changed: 10 additions & 0 deletions

@@ -190,6 +190,16 @@ func (m *JobManager) UpdateHeartBeat(ctx context.Context, se session.Session, no
 	return m.updateHeartBeat(ctx, se, now)
 }

+// SetLastReportDelayMetricsTime sets the lastReportDelayMetricsTime for test
+func (m *JobManager) SetLastReportDelayMetricsTime(t time.Time) {
+	m.lastReportDelayMetricsTime = t
+}
+
+// GetLastReportDelayMetricsTime returns the lastReportDelayMetricsTime for test
+func (m *JobManager) GetLastReportDelayMetricsTime() time.Time {
+	return m.lastReportDelayMetricsTime
+}
+
 // ReportMetrics is an exported version of reportMetrics
 func (m *JobManager) ReportMetrics(se session.Session) {
 	m.reportMetrics(se)
