Skip to content

Commit b228f61

Browse files
committed
WIP
``` ERROR 2024-12-16 12:15:55,115 [shard 0:main] crash_tracker - service.cc:107 - Crash loop detected. Too many consecutive crashes 6, exceeded crash_loop_limit configured value 5. To recover Redpanda from this state, manually remove file at path "/var/lib/redpanda/data/startup_log". Crash loop automatically resets 1h after last crash or with node configuration changes. The following crashes have been recorded: Crash #1 at 2024-12-16 12:14:03 - Segmentation fault on shard 0. Backtrace: 0xab575a3 0x5400cdb 0x5409de2 0x376b120 /lib/x86_64-linux-gnu/libc.so.6+0x4251f /lib/x86_64-linux-gnu/libc.so.6+0x11e88c 0xa95055d 0xa93d12e 0xa8ab47d 0xa8a8316 0xa78cf74 0xa78ae70 0x35a0393 0xadce1b9 /lib/x86_64-linux-gnu/libc.so.6+0x29d8f /lib/x86_64-linux-gnu/libc.so.6+0x29e3f 0x3598be4 Crash #2 at 2024-12-16 12:14:09 - Segmentation fault on shard 0. Backtrace: 0xab575a3 0x5400cdb 0x5409de2 0x376b120 /lib/x86_64-linux-gnu/libc.so.6+0x4251f /lib/x86_64-linux-gnu/libc.so.6+0x11e88c 0xa95055d 0xa93d12e 0xa8ab47d 0xa8a8316 0xa78cf74 0xa78ae70 0x35a0393 0xadce1b9 /lib/x86_64-linux-gnu/libc.so.6+0x29d8f /lib/x86_64-linux-gnu/libc.so.6+0x29e3f 0x3598be4 Crash #3 at 2024-12-16 12:14:14 - Segmentation fault on shard 0. Backtrace: 0xab575a3 0x5400cdb 0x5409de2 0x376b120 /lib/x86_64-linux-gnu/libc.so.6+0x4251f /lib/x86_64-linux-gnu/libc.so.6+0x11e88c 0xa95055d 0xa93d12e 0xa8ab47d 0xa8a8316 0xa78cf74 0xa78ae70 0x35a0393 0xadce1b9 /lib/x86_64-linux-gnu/libc.so.6+0x29d8f /lib/x86_64-linux-gnu/libc.so.6+0x29e3f 0x3598be4 Crash #4 at 2024-12-16 12:14:20 - Segmentation fault on shard 0. Backtrace: 0xab575a3 0x5400cdb 0x5409de2 0x376b120 /lib/x86_64-linux-gnu/libc.so.6+0x4251f /lib/x86_64-linux-gnu/libc.so.6+0x11e88c 0xa95055d 0xa93d12e 0xa8ab47d 0xa8a8316 0xa78cf74 0xa78ae70 0x35a0393 0xadce1b9 /lib/x86_64-linux-gnu/libc.so.6+0x29d8f /lib/x86_64-linux-gnu/libc.so.6+0x29e3f 0x3598be4 Crash #5 at 2024-12-16 12:14:26 - Segmentation fault on shard 0. 
Backtrace: 0xab575a3 0x5400cdb 0x5409de2 0x376b120 /lib/x86_64-linux-gnu/libc.so.6+0x4251f /lib/x86_64-linux-gnu/libc.so.6+0x11e88c 0xa95055d 0xa93d12e 0xa8ab47d 0xa8a8316 0xa78cf74 0xa78ae70 0x35a0393 0xadce1b9 /lib/x86_64-linux-gnu/libc.so.6+0x29d8f /lib/x86_64-linux-gnu/libc.so.6+0x29e3f 0x3598be4 Crash #6 at 2024-12-16 12:14:32 - Segmentation fault on shard 0. Backtrace: 0xab575a3 0x5400cdb 0x5409de2 0x376b120 /lib/x86_64-linux-gnu/libc.so.6+0x4251f /lib/x86_64-linux-gnu/libc.so.6+0x11e88c 0xa95055d 0xa93d12e 0xa8ab47d 0xa8a8316 0xa78cf74 0xa78ae70 0x35a0393 0xadce1b9 /lib/x86_64-linux-gnu/libc.so.6+0x29d8f /lib/x86_64-linux-gnu/libc.so.6+0x29e3f 0x3598be4 ```
1 parent 64eca11 commit b228f61

File tree

19 files changed

+955
-129
lines changed

19 files changed

+955
-129
lines changed

src/v/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ add_subdirectory(datalake)
136136
add_subdirectory(cloud_io)
137137
add_subdirectory(cloud_topics)
138138
add_subdirectory(schema)
139+
add_subdirectory(crash_tracker)
139140

140141
option(ENABLE_GIT_VERSION "Build with Git metadata" OFF)
141142

src/v/config/node_config.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,10 @@ struct node_config final : public config_store {
121121
return data_directory().path / "startup_log";
122122
}
123123

124+
std::filesystem::path crash_report_dir_path() const {
125+
return data_directory().path / "crash_reports";
126+
}
127+
124128
/**
125129
* Return the configured cache path if set, otherwise a default
126130
* path within the data directory.

src/v/crash_tracker/BUILD

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
load("//bazel:build.bzl", "redpanda_cc_library")
2+
3+
redpanda_cc_library(
4+
name = "crash_tracker",
5+
srcs = [
6+
"limiter.cc",
7+
"logger.cc",
8+
"recorder.cc",
9+
"service.cc",
10+
"types.cc",
11+
],
12+
hdrs = [
13+
"limiter.h",
14+
"logger.h",
15+
"recorder.h",
16+
"service.h",
17+
"types.h",
18+
],
19+
implementation_deps = [
20+
"//src/v/bytes",
21+
"//src/v/bytes:iobuf",
22+
"//src/v/config",
23+
"//src/v/hashing:xx",
24+
"//src/v/model",
25+
"//src/v/random:generators",
26+
"//src/v/serde",
27+
"//src/v/serde:bytes",
28+
"//src/v/serde:enum",
29+
"//src/v/serde:iobuf",
30+
"//src/v/serde:sstring",
31+
"//src/v/serde:vector",
32+
"//src/v/utils:file_io",
33+
"@fmt",
34+
],
35+
include_prefix = "crash_tracker",
36+
visibility = ["//visibility:public"],
37+
deps = [
38+
"//src/v/base",
39+
"//src/v/model",
40+
"//src/v/serde",
41+
"@seastar",
42+
],
43+
)

src/v/crash_tracker/CMakeLists.txt

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
v_cc_library(
2+
NAME crash_tracker
3+
SRCS
4+
limiter.cc
5+
logger.cc
6+
recorder.cc
7+
service.cc
8+
types.cc
9+
DEPS
10+
Seastar::seastar
11+
v::base
12+
v::bytes
13+
v::model
14+
v::serde
15+
v::hashing
16+
v::random
17+
)

src/v/crash_tracker/limiter.cc

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
/*
2+
* Copyright 2025 Redpanda Data, Inc.
3+
*
4+
* Use of this software is governed by the Business Source License
5+
* included in the file licenses/BSL.md
6+
*
7+
* As of the Change Date specified in that file, in accordance with
8+
* the Business Source License, use of this software will be governed
9+
* by the Apache License, Version 2.0
10+
*/
11+
12+
#include "crash_tracker/limiter.h"
13+
14+
#include "config/node_config.h"
15+
#include "crash_tracker/limiter.h"
16+
#include "crash_tracker/logger.h"
17+
#include "crash_tracker/recorder.h"
18+
#include "crash_tracker/types.h"
19+
#include "hashing/xx.h"
20+
#include "model/timestamp.h"
21+
#include "serde/rw/envelope.h"
22+
#include "utils/file_io.h"
23+
24+
#include <seastar/core/seastar.hh>
25+
#include <seastar/util/print_safe.hh>
26+
27+
#include <fmt/chrono.h>
28+
29+
#include <chrono>
30+
#include <system_error>
31+
#include <unistd.h>
32+
33+
using namespace std::chrono_literals;
34+
35+
namespace crash_tracker {
36+
37+
// Crash tracking resets every 1h.
38+
static constexpr model::timestamp_clock::duration crash_reset_duration{1h};
39+
40+
// Detects a crash loop during startup and records the current start attempt.
//
// Unless the broker boots in recovery mode, an existing tracker file is read
// and deserialized into crash_tracker_metadata. Startup is aborted with a
// std::runtime_error when all of the following hold:
//   - crash_loop_limit is set and the recorded crash count exceeds it,
//   - the checksum of the node configuration file is unchanged, and
//   - no more than crash_reset_duration has passed since the last start.
// Otherwise the tracker file is rewritten with the bumped crash count, the
// current configuration checksum and the current timestamp, and the data
// directory is synced.
ss::future<> limiter::check_for_crash_loop() const {
    auto file_path = config::node().crash_loop_tracker_path();
    std::optional<crash_tracker_metadata> maybe_crash_md;
    if (
      // Tracking is reset every time the broker boots in recovery mode.
      !config::node().recovery_mode_enabled()
      && co_await ss::file_exists(file_path.string())) {
        // Ok to read the entire file, it only contains a serialized
        // crash_tracker_metadata.
        auto buf = co_await read_fully(file_path);
        try {
            maybe_crash_md = serde::from_iobuf<crash_tracker_metadata>(
              std::move(buf));
        } catch (const serde::serde_exception&) {
            // A malformed tracker file: ignore its contents. The file is
            // overwritten with fresh metadata below.
            vlog(ctlog.warn, "Ignoring malformed tracker file {}", file_path);
        }
    }

    // Compute the checksum of the current node configuration.
    auto current_config = co_await read_fully_to_string(
      config::node().get_cfg_file_path());
    auto checksum = xxhash_64(current_config.c_str(), current_config.length());

    if (maybe_crash_md) {
        auto& crash_md = maybe_crash_md.value();
        const auto& limit = config::node().crash_loop_limit.value();

        // Check if it has been at least 1h since the last unsuccessful
        // restart. Tracking resets every 1h.
        auto time_since_last_start
          = model::duration_since_epoch(model::timestamp::now())
            - model::duration_since_epoch(crash_md._last_start_ts);

        // An unset limit disables crash loop detection entirely.
        auto crash_limit_ok = !limit || crash_md._crash_count <= limit.value();
        auto node_config_changed = crash_md._config_checksum != checksum;
        auto tracking_reset = time_since_last_start > crash_reset_duration;

        auto ok_to_proceed = crash_limit_ok || node_config_changed
                             || tracking_reset;

        if (!ok_to_proceed) {
            // crash_limit_ok is false here, so the limit is known to be set
            // and limit.value() below is safe.
            auto crashes = co_await _recorder.get_recorded_crashes();
            vlog(
              ctlog.error,
              "Crash loop detected. Too many consecutive crashes {}, exceeded "
              "{} configured value {}. To recover Redpanda from this state, "
              "manually remove file at path {}. Crash loop automatically "
              "resets 1h after last crash or with node configuration changes. "
              "{}",
              crash_md._crash_count,
              config::node().crash_loop_limit.name(),
              limit.value(),
              file_path,
              crashes.describe());
            throw std::runtime_error("Crash loop detected, aborting startup.");
        }

        vlog(
          ctlog.debug,
          "Consecutive crashes detected: {} node config changed: {} "
          "time based tracking reset: {}",
          crash_md._crash_count,
          node_config_changed,
          tracking_reset);

        // Start counting from scratch when the configuration changed or the
        // tracking window elapsed.
        if (node_config_changed || tracking_reset) {
            crash_md._crash_count = 0;
        }
    }

    // Truncate and bump the crash count. We consider a run to be unclean by
    // default unless the tracker file is removed again by
    // limiter::record_clean_shutdown().
    auto new_crash_count = maybe_crash_md
                             ? maybe_crash_md.value()._crash_count + 1
                             : 1;
    crash_tracker_metadata updated{
      ._crash_count = new_crash_count,
      ._config_checksum = checksum,
      ._last_start_ts = model::timestamp::now()};
    co_await write_fully(file_path, serde::to_iobuf(updated));
    co_await ss::sync_directory(
      config::node().data_directory.value().as_sstring());
}
125+
126+
// Removes the crash loop tracker file, if present, and syncs the data
// directory afterwards. Does nothing when the file does not exist.
ss::future<> limiter::record_clean_shutdown() const {
    const auto tracker_file
      = config::node().crash_loop_tracker_path().string();

    const bool tracker_present = co_await ss::file_exists(tracker_file);
    if (!tracker_present) {
        co_return;
    }

    co_await ss::remove_file(tracker_file);
    co_await ss::sync_directory(config::node().data_directory().as_sstring());
    vlog(ctlog.debug, "Deleted crash loop tracker file: {}", tracker_file);
}
137+
138+
} // namespace crash_tracker

src/v/crash_tracker/limiter.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/*
2+
* Copyright 2025 Redpanda Data, Inc.
3+
*
4+
* Use of this software is governed by the Business Source License
5+
* included in the file licenses/BSL.md
6+
*
7+
* As of the Change Date specified in that file, in accordance with
8+
* the Business Source License, use of this software will be governed
9+
* by the Apache License, Version 2.0
10+
*/
11+
12+
#pragma once
13+
14+
#include "crash_tracker/recorder.h"
15+
16+
namespace crash_tracker {
17+
18+
// Limits the number of restarts to a configured amount
19+
class limiter {
20+
public:
21+
explicit limiter(const recorder& recorder)
22+
: _recorder(recorder) {};
23+
ss::future<> check_for_crash_loop() const;
24+
ss::future<> record_clean_shutdown() const;
25+
26+
private:
27+
const recorder& _recorder;
28+
};
29+
30+
} // namespace crash_tracker

src/v/crash_tracker/logger.cc

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
/*
2+
* Copyright 2025 Redpanda Data, Inc.
3+
*
4+
* Use of this software is governed by the Business Source License
5+
* included in the file licenses/BSL.md
6+
*
7+
* As of the Change Date specified in that file, in accordance with
8+
* the Business Source License, use of this software will be governed
9+
* by the Apache License, Version 2.0
10+
*/
11+
#include "logger.h"
12+
13+
namespace crash_tracker {
14+
15+
// Definition of the crash_tracker logger; constructed with the name
// "crash_tracker".
seastar::logger ctlog("crash_tracker");
16+
17+
} // namespace crash_tracker

src/v/crash_tracker/logger.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/*
2+
* Copyright 2025 Redpanda Data, Inc.
3+
*
4+
* Use of this software is governed by the Business Source License
5+
* included in the file licenses/BSL.md
6+
*
7+
* As of the Change Date specified in that file, in accordance with
8+
* the Business Source License, use of this software will be governed
9+
* by the Apache License, Version 2.0
10+
*/
11+
12+
#pragma once
13+
14+
#include "base/seastarx.h"
15+
16+
#include <seastar/util/log.hh>
17+
18+
namespace crash_tracker {
19+
// Logger shared by the crash_tracker sources; declared here and defined
// out of line.
extern ss::logger ctlog;
20+
} // namespace crash_tracker

0 commit comments

Comments
 (0)