Skip to content

Commit 688f06a

Browse files
committed
br: mem dump when about to OOM (pingcap#59234)
close pingcap#56971
1 parent 8103236 commit 688f06a

File tree

8 files changed

+306
-55
lines changed

8 files changed

+306
-55
lines changed

br/cmd/br/cmd.go

Lines changed: 56 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,17 @@ const (
7979

8080
flagVersion = "version"
8181
flagVersionShort = "V"
82+
83+
// Memory management related constants
84+
quarterGiB uint64 = 256 * size.MB
85+
halfGiB uint64 = 512 * size.MB
86+
fourGiB uint64 = 4 * size.GB
87+
88+
// Environment variables
89+
envBRHeapDumpDir = "BR_HEAP_DUMP_DIR"
90+
91+
// Default heap dump paths
92+
defaultHeapDumpDir = "/tmp/br_heap_dumps"
8293
)
8394

8495
func timestampLogFileName() string {
@@ -113,10 +124,6 @@ func DefineCommonFlags(cmd *cobra.Command) {
113124
_ = cmd.PersistentFlags().MarkHidden(FlagRedactLog)
114125
}
115126

116-
const quarterGiB uint64 = 256 * size.MB
117-
const halfGiB uint64 = 512 * size.MB
118-
const fourGiB uint64 = 4 * size.GB
119-
120127
func calculateMemoryLimit(memleft uint64) uint64 {
121128
// memreserved = f(memleft) = 512MB * memleft / (memleft + 4GB)
122129
// * f(0) = 0
@@ -132,6 +139,47 @@ func calculateMemoryLimit(memleft uint64) uint64 {
132139
return memlimit
133140
}
134141

142+
// setupMemoryMonitoring configures memory limits and starts the memory monitor.
143+
// It returns an error if the setup fails.
144+
func setupMemoryMonitoring(ctx context.Context, memTotal, memUsed uint64) error {
145+
if memUsed >= memTotal {
146+
log.Warn("failed to obtain memory size, skip setting memory limit",
147+
zap.Uint64("memused", memUsed), zap.Uint64("memtotal", memTotal))
148+
return nil
149+
}
150+
151+
memleft := memTotal - memUsed
152+
memlimit := calculateMemoryLimit(memleft)
153+
// BR command needs 256 MiB at least, if the left memory is less than 256 MiB,
154+
// the memory limit cannot limit anyway and then finally OOM.
155+
memlimit = max(memlimit, quarterGiB)
156+
157+
log.Info("calculate the rest memory",
158+
zap.Uint64("memtotal", memTotal),
159+
zap.Uint64("memused", memUsed),
160+
zap.Uint64("memlimit", memlimit))
161+
162+
// No need to set memory limit because the left memory is sufficient.
163+
if memlimit >= uint64(math.MaxInt64) {
164+
return nil
165+
}
166+
167+
debug.SetMemoryLimit(int64(memlimit))
168+
169+
// Configure and start memory monitoring
170+
dumpDir := os.Getenv(envBRHeapDumpDir)
171+
if dumpDir == "" {
172+
dumpDir = defaultHeapDumpDir
173+
}
174+
175+
if err := utils.RunMemoryMonitor(ctx, dumpDir, memlimit); err != nil {
176+
log.Warn("Failed to start memory monitor", zap.Error(err))
177+
return err
178+
}
179+
180+
return nil
181+
}
182+
135183
// Init initializes BR cli.
136184
func Init(cmd *cobra.Command) (err error) {
137185
initOnce.Do(func() {
@@ -198,21 +246,10 @@ func Init(cmd *cobra.Command) (err error) {
198246
err = e
199247
return
200248
}
201-
if memused >= memtotal {
202-
log.Warn("failed to obtain memory size, skip setting memory limit",
203-
zap.Uint64("memused", memused), zap.Uint64("memtotal", memtotal))
204-
} else {
205-
memleft := memtotal - memused
206-
memlimit := calculateMemoryLimit(memleft)
207-
// BR command needs 256 MiB at least, if the left memory is less than 256 MiB,
208-
// the memory limit cannot limit anyway and then finally OOM.
209-
memlimit = max(memlimit, quarterGiB)
210-
log.Info("calculate the rest memory",
211-
zap.Uint64("memtotal", memtotal), zap.Uint64("memused", memused), zap.Uint64("memlimit", memlimit))
212-
// No need to set memory limit because the left memory is sufficient.
213-
if memlimit < uint64(math.MaxInt64) {
214-
debug.SetMemoryLimit(int64(memlimit))
215-
}
249+
250+
if e := setupMemoryMonitoring(GetDefaultContext(), memtotal, memused); e != nil {
251+
// only log the error, don't fail initialization
252+
log.Error("Failed to setup memory monitoring", zap.Error(e))
216253
}
217254
}
218255

br/cmd/br/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
func main() {
1414
gCtx := context.Background()
1515
ctx, cancel := utils.StartExitSingleListener(gCtx)
16+
defer cancel()
1617

1718
rootCmd := &cobra.Command{
1819
Use: "br",
@@ -34,7 +35,6 @@ func main() {
3435

3536
rootCmd.SetArgs(os.Args[1:])
3637
if err := rootCmd.Execute(); err != nil {
37-
cancel()
3838
log.Error("br failed", zap.Error(err))
3939
os.Exit(1) // nolint:gocritic
4040
}

br/pkg/utils/BUILD.bazel

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ go_library(
1313
"filter.go",
1414
"json.go",
1515
"key.go",
16+
"memory_monitor.go",
1617
"misc.go",
1718
"pointer.go",
1819
"pprof.go",
@@ -45,6 +46,8 @@ go_library(
4546
"//pkg/util/codec",
4647
"//pkg/util/encrypt",
4748
"//pkg/util/logutil",
49+
"//pkg/util/memory",
50+
"//pkg/util/memoryusagealarm",
4851
"//pkg/util/sqlexec",
4952
"//pkg/util/table-filter",
5053
"@com_github_cheggaaa_pb_v3//:pb",
@@ -86,6 +89,7 @@ go_test(
8689
"json_test.go",
8790
"key_test.go",
8891
"main_test.go",
92+
"memory_monitor_test.go",
8993
"misc_test.go",
9094
"progress_test.go",
9195
"register_test.go",
@@ -95,7 +99,7 @@ go_test(
9599
],
96100
embed = [":utils"],
97101
flaky = True,
98-
shard_count = 37,
102+
shard_count = 38,
99103
deps = [
100104
"//br/pkg/errors",
101105
"//pkg/kv",
@@ -119,6 +123,7 @@ go_test(
119123
"@io_etcd_go_etcd_tests_v3//integration",
120124
"@org_golang_google_grpc//codes",
121125
"@org_golang_google_grpc//status",
126+
"@org_uber_go_atomic//:atomic",
122127
"@org_uber_go_goleak//:goleak",
123128
"@org_uber_go_multierr//:multierr",
124129
],

br/pkg/utils/memory_monitor.go

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
// Copyright 2024 PingCAP, Inc. Licensed under Apache-2.0.
2+
3+
package utils
4+
5+
import (
6+
"context"
7+
"os"
8+
9+
"github.com/pingcap/log"
10+
"github.com/pingcap/tidb/pkg/util/memory"
11+
"github.com/pingcap/tidb/pkg/util/memoryusagealarm"
12+
"go.uber.org/atomic"
13+
"go.uber.org/zap"
14+
)
15+
16+
// DefaultProfilesDir is the directory used for memory profiles when no dump
// directory has been supplied.
const DefaultProfilesDir = "/tmp/profiles"

const (
	// defaultMemoryUsageAlarmRatio is the default memory usage alarm ratio (80%).
	defaultMemoryUsageAlarmRatio = 0.8

	// defaultMemoryUsageAlarmKeepRecordNum is the default number of alarm
	// records to keep.
	defaultMemoryUsageAlarmKeepRecordNum = 3
)
23+
24+
// BRConfigProvider implements memoryusagealarm.ConfigProvider for BR
25+
type BRConfigProvider struct {
26+
ratio *atomic.Float64
27+
keepNum *atomic.Int64
28+
logDir string
29+
}
30+
31+
func (p *BRConfigProvider) GetMemoryUsageAlarmRatio() float64 {
32+
return p.ratio.Load()
33+
}
34+
35+
func (p *BRConfigProvider) GetMemoryUsageAlarmKeepRecordNum() int64 {
36+
return p.keepNum.Load()
37+
}
38+
39+
func (p *BRConfigProvider) GetLogDir() string {
40+
if p.logDir == "" {
41+
return DefaultProfilesDir
42+
}
43+
return p.logDir
44+
}
45+
46+
func (p *BRConfigProvider) GetComponentName() string {
47+
return "br"
48+
}
49+
50+
// RunMemoryMonitor starts monitoring memory usage and dumps profiles when thresholds are exceeded
51+
func RunMemoryMonitor(ctx context.Context, dumpDir string, memoryLimit uint64) error {
52+
// just in case
53+
if dumpDir == "" {
54+
dumpDir = DefaultProfilesDir
55+
}
56+
57+
// Set memory limit if specified
58+
if memoryLimit > 0 {
59+
memory.ServerMemoryLimit.Store(memoryLimit)
60+
}
61+
62+
log.Info("Memory monitor starting",
63+
zap.String("dump_dir", dumpDir),
64+
zap.Bool("using_temp_dir", dumpDir == os.TempDir()),
65+
zap.Float64("memory_usage_alarm_ratio", defaultMemoryUsageAlarmRatio),
66+
zap.Uint64("memory_limit_mb", memoryLimit/1024/1024))
67+
68+
// Initialize BR config provider with default values
69+
provider := &BRConfigProvider{
70+
ratio: atomic.NewFloat64(defaultMemoryUsageAlarmRatio),
71+
keepNum: atomic.NewInt64(defaultMemoryUsageAlarmKeepRecordNum),
72+
logDir: dumpDir,
73+
}
74+
75+
exitCh := make(chan struct{})
76+
handle := memoryusagealarm.NewMemoryUsageAlarmHandle(exitCh, provider)
77+
// BR doesn't need session manager so setting to nil
78+
handle.SetSessionManager(nil)
79+
80+
go func() {
81+
go handle.Run()
82+
<-ctx.Done()
83+
close(exitCh)
84+
}()
85+
86+
return nil
87+
}

br/pkg/utils/memory_monitor_test.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
// Copyright 2024 PingCAP, Inc. Licensed under Apache-2.0.
2+
3+
package utils
4+
5+
import (
6+
"testing"
7+
8+
"github.com/stretchr/testify/require"
9+
"go.uber.org/atomic"
10+
)
11+
12+
func TestBRConfigProvider(t *testing.T) {
13+
provider := &BRConfigProvider{
14+
ratio: atomic.NewFloat64(0.8),
15+
keepNum: atomic.NewInt64(3),
16+
logDir: "/custom/dir",
17+
}
18+
19+
// Test GetMemoryUsageAlarmRatio
20+
require.Equal(t, 0.8, provider.GetMemoryUsageAlarmRatio())
21+
22+
// Test GetMemoryUsageAlarmKeepRecordNum
23+
require.Equal(t, int64(3), provider.GetMemoryUsageAlarmKeepRecordNum())
24+
25+
// Test GetLogDir
26+
require.Equal(t, "/custom/dir", provider.GetLogDir())
27+
28+
// Test GetLogDir with default
29+
provider.logDir = ""
30+
require.Equal(t, DefaultProfilesDir, provider.GetLogDir())
31+
32+
// Test GetComponentName
33+
require.Equal(t, "br", provider.GetComponentName())
34+
}

pkg/domain/domain.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1327,7 +1327,8 @@ func NewDomain(store kv.Storage, schemaLease time.Duration, statsLease time.Dura
13271327
do.wg = util.NewWaitGroupEnhancedWrapper("domain", do.exit, config.GetGlobalConfig().TiDBEnableExitCheck)
13281328
do.SchemaValidator = NewSchemaValidator(schemaLease, do)
13291329
do.expensiveQueryHandle = expensivequery.NewExpensiveQueryHandle(do.exit)
1330-
do.memoryUsageAlarmHandle = memoryusagealarm.NewMemoryUsageAlarmHandle(do.exit)
1330+
do.memoryUsageAlarmHandle = memoryusagealarm.NewMemoryUsageAlarmHandle(do.exit,
1331+
&memoryusagealarm.TiDBConfigProvider{})
13311332
do.serverMemoryLimitHandle = servermemorylimit.NewServerMemoryLimitHandle(do.exit)
13321333
do.sysProcesses = SysProcesses{mu: &sync.RWMutex{}, procMap: make(map[uint64]sysproctrack.TrackProc)}
13331334
do.initDomainSysVars()

0 commit comments

Comments
 (0)