Skip to content

Commit 99622f8

Browse files
committed
WIP
``` ERROR 2024-12-16 12:15:55,115 [shard 0:main] crash_tracker - service.cc:107 - Crash loop detected. Too many consecutive crashes 6, exceeded crash_loop_limit configured value 5. To recover Redpanda from this state, manually remove file at path "/var/lib/redpanda/data/startup_log". Crash loop automatically resets 1h after last crash or with node configuration changes. The following crashes have been recorded: Crash #1 at 2024-12-16 12:14:03 - Segmentation fault on shard 0. Backtrace: 0xab575a3 0x5400cdb 0x5409de2 0x376b120 /lib/x86_64-linux-gnu/libc.so.6+0x4251f /lib/x86_64-linux-gnu/libc.so.6+0x11e88c 0xa95055d 0xa93d12e 0xa8ab47d 0xa8a8316 0xa78cf74 0xa78ae70 0x35a0393 0xadce1b9 /lib/x86_64-linux-gnu/libc.so.6+0x29d8f /lib/x86_64-linux-gnu/libc.so.6+0x29e3f 0x3598be4 Crash #2 at 2024-12-16 12:14:09 - Segmentation fault on shard 0. Backtrace: 0xab575a3 0x5400cdb 0x5409de2 0x376b120 /lib/x86_64-linux-gnu/libc.so.6+0x4251f /lib/x86_64-linux-gnu/libc.so.6+0x11e88c 0xa95055d 0xa93d12e 0xa8ab47d 0xa8a8316 0xa78cf74 0xa78ae70 0x35a0393 0xadce1b9 /lib/x86_64-linux-gnu/libc.so.6+0x29d8f /lib/x86_64-linux-gnu/libc.so.6+0x29e3f 0x3598be4 Crash #3 at 2024-12-16 12:14:14 - Segmentation fault on shard 0. Backtrace: 0xab575a3 0x5400cdb 0x5409de2 0x376b120 /lib/x86_64-linux-gnu/libc.so.6+0x4251f /lib/x86_64-linux-gnu/libc.so.6+0x11e88c 0xa95055d 0xa93d12e 0xa8ab47d 0xa8a8316 0xa78cf74 0xa78ae70 0x35a0393 0xadce1b9 /lib/x86_64-linux-gnu/libc.so.6+0x29d8f /lib/x86_64-linux-gnu/libc.so.6+0x29e3f 0x3598be4 Crash #4 at 2024-12-16 12:14:20 - Segmentation fault on shard 0. Backtrace: 0xab575a3 0x5400cdb 0x5409de2 0x376b120 /lib/x86_64-linux-gnu/libc.so.6+0x4251f /lib/x86_64-linux-gnu/libc.so.6+0x11e88c 0xa95055d 0xa93d12e 0xa8ab47d 0xa8a8316 0xa78cf74 0xa78ae70 0x35a0393 0xadce1b9 /lib/x86_64-linux-gnu/libc.so.6+0x29d8f /lib/x86_64-linux-gnu/libc.so.6+0x29e3f 0x3598be4 Crash #5 at 2024-12-16 12:14:26 - Segmentation fault on shard 0. 
Backtrace: 0xab575a3 0x5400cdb 0x5409de2 0x376b120 /lib/x86_64-linux-gnu/libc.so.6+0x4251f /lib/x86_64-linux-gnu/libc.so.6+0x11e88c 0xa95055d 0xa93d12e 0xa8ab47d 0xa8a8316 0xa78cf74 0xa78ae70 0x35a0393 0xadce1b9 /lib/x86_64-linux-gnu/libc.so.6+0x29d8f /lib/x86_64-linux-gnu/libc.so.6+0x29e3f 0x3598be4 Crash #6 at 2024-12-16 12:14:32 - Segmentation fault on shard 0. Backtrace: 0xab575a3 0x5400cdb 0x5409de2 0x376b120 /lib/x86_64-linux-gnu/libc.so.6+0x4251f /lib/x86_64-linux-gnu/libc.so.6+0x11e88c 0xa95055d 0xa93d12e 0xa8ab47d 0xa8a8316 0xa78cf74 0xa78ae70 0x35a0393 0xadce1b9 /lib/x86_64-linux-gnu/libc.so.6+0x29d8f /lib/x86_64-linux-gnu/libc.so.6+0x29e3f 0x3598be4 ```
1 parent 398c983 commit 99622f8

File tree

15 files changed

+699
-128
lines changed

15 files changed

+699
-128
lines changed

src/v/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ add_subdirectory(datalake)
136136
add_subdirectory(cloud_io)
137137
add_subdirectory(cloud_topics)
138138
add_subdirectory(schema)
139+
add_subdirectory(crash_tracker)
139140

140141
option(ENABLE_GIT_VERSION "Build with Git metadata" OFF)
141142

src/v/config/node_config.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,10 @@ struct node_config final : public config_store {
121121
return data_directory().path / "startup_log";
122122
}
123123

124+
std::filesystem::path crash_report_dir_path() const {
125+
return data_directory().path / "crash_reports";
126+
}
127+
124128
/**
125129
* Return the configured cache path if set, otherwise a default
126130
* path within the data directory.

src/v/crash_tracker/BUILD

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
load("//bazel:build.bzl", "redpanda_cc_library")
2+
3+
redpanda_cc_library(
4+
name = "crash_tracker",
5+
srcs = [
6+
"api.cc",
7+
"logger.cc",
8+
"service.cc",
9+
],
10+
hdrs = [
11+
"api.h",
12+
"logger.h",
13+
"service.h",
14+
"types.h",
15+
],
16+
implementation_deps = [
17+
"//src/v/bytes",
18+
"//src/v/bytes:iobuf",
19+
"//src/v/config",
20+
"//src/v/hashing:xx",
21+
"//src/v/model",
22+
"//src/v/serde",
23+
"//src/v/serde:bytes",
24+
"//src/v/serde:iobuf",
25+
"//src/v/serde:sstring",
26+
"//src/v/serde:vector",
27+
"//src/v/utils:file_io",
28+
"@fmt",
29+
],
30+
include_prefix = "crash_tracker",
31+
visibility = ["//visibility:public"],
32+
deps = [
33+
"//src/v/base",
34+
"//src/v/model",
35+
"//src/v/serde",
36+
"@seastar",
37+
],
38+
)

src/v/crash_tracker/CMakeLists.txt

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
v_cc_library(
2+
NAME crash_tracker
3+
SRCS
4+
api.cc
5+
service.cc
6+
logger.cc
7+
DEPS
8+
Seastar::seastar
9+
v::base
10+
v::bytes
11+
v::model
12+
v::serde
13+
v::hashing
14+
)

src/v/crash_tracker/api.cc

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
/*
2+
* Copyright 2024 Redpanda Data, Inc.
3+
*
4+
* Use of this software is governed by the Business Source License
5+
* included in the file licenses/BSL.md
6+
*
7+
* As of the Change Date specified in that file, in accordance with
8+
* the Business Source License, use of this software will be governed
9+
* by the Apache License, Version 2.0
10+
*/
11+
12+
#include "crash_tracker/api.h"
13+
14+
#include "base/vassert.h"
15+
#include "config/node_config.h"
16+
#include "crash_tracker/logger.h"
17+
#include "crash_tracker/service.h"
18+
#include "hashing/xx.h"
19+
#include "serde/rw/envelope.h"
20+
#include "utils/file_io.h"
21+
22+
#include <seastar/core/memory.hh>
23+
#include <seastar/util/print_safe.hh>
24+
25+
#include <fmt/core.h>
26+
27+
#include <chrono>
28+
#include <iterator>
29+
#include <stdio.h>
30+
31+
using namespace std::chrono_literals;
32+
33+
namespace crash_tracker {
34+
35+
namespace {
36+
37+
/// Accessor for the single crash tracker `service` used by the functions in
/// this file. The object is a function-local static, so it is constructed the
/// first time this function runs and the same object is returned afterwards.
service& instance() {
    static service tracker;
    return tracker;
}
41+
42+
} // namespace
43+
44+
/// Starts the shared crash tracker service and hands back the future
/// returned by its start().
ss::future<> initialize() {
    auto& tracker = instance();
    return tracker.start();
}
45+
46+
/// Stops the shared crash tracker service and hands back the future
/// returned by its stop().
ss::future<> record_clean_shutdown() {
    auto& tracker = instance();
    return tracker.stop();
}
47+
48+
namespace {
49+
/// Maps a signal number to the text that opens the recorded crash reason.
///
/// \param signo signal number (SIGSEGV, SIGILL, or anything else)
/// \return "Segmentation fault" for SIGSEGV, "Illegal instruction" for
///         SIGILL, and "Aborting" for every other value.
///
/// The returned views refer to string literals, so they stay valid for the
/// lifetime of the program. The function is noexcept because it is used while
/// describing a crash from the signal-reporting path and never needs to throw.
std::string_view failure_msg_prefix(int signo) noexcept {
    switch (signo) {
    case SIGSEGV:
        return "Segmentation fault";
    case SIGILL:
        return "Illegal instruction";
    default:
        // Any signal without a dedicated message is reported as an abort.
        return "Aborting";
    }
}
58+
} // namespace
59+
60+
/// Fills `cd` with the details of a crash triggered by signal `signo`.
///
/// Two fields of `cd` are written:
///  - `_crash_reason` receives "<prefix> on shard <id>. Backtrace: <bt>",
///    where <prefix> comes from failure_msg_prefix(signo);
///  - `_stacktrace` receives one entry (the frame address) per frame
///    visited by ss::backtrace.
///
/// NOTE(review): the public record_*_crash entry points in api.h are
/// described as async-signal safe. Whether formatting ss::current_backtrace()
/// and calling push_back on `_stacktrace` avoid allocation depends on types
/// declared outside this file (types.h) -- confirm.
void record_signal_crash(crash_description& cd, int signo) {
61+
// The message is formatted directly into the storage owned by `cd`.
auto& format_buf = cd._crash_reason;
62+
// At most format_buf.size() characters are written. The result of
// format_to_n (which reports the untruncated length) is discarded, so a
// truncated message is not detected here.
// NOTE(review): format_to_n does not append a terminating NUL; presumably
// `_crash_reason` is pre-sized and pre-filled by its owner -- verify.
fmt::format_to_n(
63+
format_buf.begin(),
64+
format_buf.size(),
65+
"{} on shard {}. Backtrace: {}",
66+
failure_msg_prefix(signo),
67+
ss::this_shard_id(),
68+
ss::current_backtrace());
69+
70+
// The stack is walked a second time here to store the raw frame addresses
// separately from the formatted message above.
ss::backtrace([&cd](ss::frame f) { cd._stacktrace.push_back(f.addr); });
71+
}
72+
73+
/// Reports a SIGSEGV crash to the shared crash tracker. The tracker is handed
/// a callback that fills the crash description via record_signal_crash.
void record_sigsegv_crash() {
    auto describe_crash = [](crash_description& cd) {
        record_signal_crash(cd, SIGSEGV);
    };
    instance().record_crash(describe_crash);
}
77+
78+
/// Reports a SIGABRT crash to the shared crash tracker. The tracker is handed
/// a callback that fills the crash description via record_signal_crash.
void record_sigabrt_crash() {
    auto describe_crash = [](crash_description& cd) {
        record_signal_crash(cd, SIGABRT);
    };
    instance().record_crash(describe_crash);
}
82+
83+
/// Reports a SIGILL crash to the shared crash tracker. The tracker is handed
/// a callback that fills the crash description via record_signal_crash.
void record_sigill_crash() {
    auto describe_crash = [](crash_description& cd) {
        record_signal_crash(cd, SIGILL);
    };
    instance().record_crash(describe_crash);
}
87+
88+
} // namespace crash_tracker

src/v/crash_tracker/api.h

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
/*
2+
* Copyright 2024 Redpanda Data, Inc.
3+
*
4+
* Use of this software is governed by the Business Source License
5+
* included in the file licenses/BSL.md
6+
*
7+
* As of the Change Date specified in that file, in accordance with
8+
* the Business Source License, use of this software will be governed
9+
* by the Apache License, Version 2.0
10+
*/
11+
12+
#pragma once
13+
14+
#include "base/seastarx.h"
15+
16+
#include <seastar/core/future.hh>
17+
18+
namespace crash_tracker {
19+
20+
/// Should be called on startup before any call to record_crash, otherwise
21+
/// record_crash is a noop.
22+
/// Here we check for too many consecutive unclean
23+
/// shutdowns/crashes and abort the startup sequence if the count exceeds
24+
/// crash_loop_limit until the operator intervenes. Crash tracking
25+
/// is reset if the node configuration changes or its been 1h since
26+
/// the broker last failed to start. This metadata is tracked in the
27+
/// tracker file. This is to prevent on disk state from piling up in
28+
/// each unclean run and creating more state to recover for the next run.
29+
ss::future<> initialize();
30+
31+
/// On a clean shutdown,
32+
/// the tracker file should be deleted thus resetting the crash count on the
33+
/// next run. In case of an unclean shutdown, we already bumped
34+
/// the crash count and that should be taken into account in the
35+
/// next run.
36+
ss::future<> record_clean_shutdown();
37+
38+
/// Async-signal safe
39+
void record_sigsegv_crash();
40+
41+
/// Async-signal safe
42+
void record_sigabrt_crash();
43+
44+
/// Async-signal safe
45+
void record_sigill_crash();
46+
47+
} // namespace crash_tracker

src/v/crash_tracker/logger.cc

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
/*
2+
* Copyright 2024 Redpanda Data, Inc.
3+
*
4+
* Use of this software is governed by the Business Source License
5+
* included in the file licenses/BSL.md
6+
*
7+
* As of the Change Date specified in that file, in accordance with
8+
* the Business Source License, use of this software will be governed
9+
* by the Apache License, Version 2.0
10+
*/
11+
#include "logger.h"
12+
13+
namespace crash_tracker {
14+
15+
seastar::logger ctlog("crash_tracker");
16+
17+
} // namespace crash_tracker

src/v/crash_tracker/logger.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/*
2+
* Copyright 2024 Redpanda Data, Inc.
3+
*
4+
* Use of this software is governed by the Business Source License
5+
* included in the file licenses/BSL.md
6+
*
7+
* As of the Change Date specified in that file, in accordance with
8+
* the Business Source License, use of this software will be governed
9+
* by the Apache License, Version 2.0
10+
*/
11+
12+
#pragma once
13+
14+
#include "base/seastarx.h"
15+
16+
#include <seastar/util/log.hh>
17+
18+
namespace crash_tracker {
19+
extern ss::logger ctlog;
20+
} // namespace crash_tracker

0 commit comments

Comments
 (0)