pgellert
diff --git a/‎src/v/CMakeLists.txt
Lines changed: 1 addition & 0 deletions b/‎src/v/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/v/config/node_config.h
Lines changed: 4 additions & 0 deletions b/‎src/v/config/node_config.h
Lines changed: 4 additions & 0 deletions
diff --git a/‎src/v/crash_tracker/BUILD
Lines changed: 41 additions & 0 deletions b/‎src/v/crash_tracker/BUILD
Lines changed: 41 additions & 0 deletions
diff --git a/‎src/v/crash_tracker/CMakeLists.txt
Lines changed: 16 additions & 0 deletions b/‎src/v/crash_tracker/CMakeLists.txt
Lines changed: 16 additions & 0 deletions
diff --git a/‎src/v/crash_tracker/limiter.cc
Lines changed: 140 additions & 0 deletions b/‎src/v/crash_tracker/limiter.cc
Lines changed: 140 additions & 0 deletions
diff --git a/‎src/v/crash_tracker/limiter.h
Lines changed: 35 additions & 0 deletions b/‎src/v/crash_tracker/limiter.h
Lines changed: 35 additions & 0 deletions
diff --git a/‎src/v/crash_tracker/logger.cc
Lines changed: 17 additions & 0 deletions b/‎src/v/crash_tracker/logger.cc
Lines changed: 17 additions & 0 deletions
diff --git a/‎src/v/crash_tracker/logger.h
Lines changed: 20 additions & 0 deletions b/‎src/v/crash_tracker/logger.h
Lines changed: 20 additions & 0 deletions
@@ -136,6 +136,7 @@ add_subdirectory(datalake)
 add_subdirectory(cloud_io)
 add_subdirectory(cloud_topics)
 add_subdirectory(schema)
+add_subdirectory(crash_tracker)
 
 option(ENABLE_GIT_VERSION "Build with Git metadata" OFF)
 
 
@@ -121,6 +121,10 @@ struct node_config final : public config_store {
         return data_directory().path / "startup_log";
     }
 
+    /**
+     * Return the "crash_reports" path within the data directory.
+     */
+    std::filesystem::path crash_report_dir_path() const {
+        return data_directory().path / "crash_reports";
+    }
+
     /**
      * Return the configured cache path if set, otherwise a default
      * path within the data directory.
 
@@ -0,0 +1,41 @@
+load("//bazel:build.bzl", "redpanda_cc_library")
+
+# Crash tracker library: compiles the limiter, logger, recorder and service
+# sources and exposes their headers under the "crash_tracker" include prefix.
+redpanda_cc_library(
+    name = "crash_tracker",
+    srcs = [
+        "limiter.cc",
+        "logger.cc",
+        "recorder.cc",
+        "service.cc",
+    ],
+    hdrs = [
+        "limiter.h",
+        "logger.h",
+        "recorder.h",
+        "service.h",
+        "types.h",
+    ],
+    implementation_deps = [
+        "//src/v/bytes",
+        "//src/v/bytes:iobuf",
+        "//src/v/config",
+        "//src/v/hashing:xx",
+        "//src/v/model",
+        "//src/v/random:generators",
+        "//src/v/serde",
+        "//src/v/serde:bytes",
+        "//src/v/serde:iobuf",
+        "//src/v/serde:sstring",
+        "//src/v/serde:vector",
+        "//src/v/utils:file_io",
+        "@fmt",
+    ],
+    include_prefix = "crash_tracker",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//src/v/base",
+        "//src/v/model",
+        "//src/v/serde",
+        "@seastar",
+    ],
+)
@@ -0,0 +1,16 @@
+# Crash tracker library built from the limiter, logger, recorder and service
+# sources.
+# NOTE(review): limiter.cc also includes config/node_config.h and
+# utils/file_io.h, but no config or utils target is listed in DEPS here
+# (the Bazel target lists //src/v/config and //src/v/utils:file_io) --
+# presumably they are reached transitively; confirm.
+v_cc_library(
+  NAME crash_tracker
+  SRCS
+    limiter.cc
+    logger.cc
+    recorder.cc
+    service.cc
+  DEPS
+    Seastar::seastar
+    v::base
+    v::bytes
+    v::model
+    v::serde
+    v::hashing
+    v::random
+)
@@ -0,0 +1,140 @@
+/*
+ * Copyright 2024 Redpanda Data, Inc.
+ *
+ * Use of this software is governed by the Business Source License
+ * included in the file licenses/BSL.md
+ *
+ * As of the Change Date specified in that file, in accordance with
+ * the Business Source License, use of this software will be governed
+ * by the Apache License, Version 2.0
+ */
+
+#include "crash_tracker/limiter.h"
+
+#include "base/vassert.h"
+#include "config/node_config.h"
+#include "crash_tracker/limiter.h"
+#include "crash_tracker/logger.h"
+#include "crash_tracker/recorder.h"
+#include "crash_tracker/service.h"
+#include "crash_tracker/types.h"
+#include "hashing/xx.h"
+#include "model/timestamp.h"
+#include "serde/rw/envelope.h"
+#include "utils/file_io.h"
+
+#include <seastar/core/seastar.hh>
+#include <seastar/util/print_safe.hh>
+
+#include <fmt/chrono.h>
+
+#include <chrono>
+#include <system_error>
+#include <unistd.h>
+
+using namespace std::chrono_literals;
+
+namespace crash_tracker {
+
+// Crash tracking resets every 1h.
+static constexpr model::timestamp_clock::duration crash_reset_duration{1h};
+
+// Detects a crash loop at startup and records this start in the tracker file.
+//
+// Reads the serialized crash_tracker_metadata from
+// config::node().crash_loop_tracker_path() (skipped in recovery mode, or when
+// the file is missing or malformed). The returned future fails with
+// std::runtime_error when the recorded crash count exceeds the configured
+// crash_loop_limit, unless the node config checksum changed or more than
+// crash_reset_duration elapsed since the last recorded start. Otherwise the
+// tracker file is rewritten with the bumped crash count, the current config
+// checksum and the current timestamp.
+ss::future<> limiter::check_for_crash_loop() const {
+    auto file_path = config::node().crash_loop_tracker_path();
+    std::optional<crash_tracker_metadata> maybe_crash_md;
+    if (
+      // Tracking is reset every time the broker boots in recovery mode.
+      !config::node().recovery_mode_enabled()
+      && co_await ss::file_exists(file_path.string())) {
+        // Ok to read the entire file, it holds a serialized
+        // crash_tracker_metadata.
+        auto buf = co_await read_fully(file_path);
+        try {
+            maybe_crash_md = serde::from_iobuf<crash_tracker_metadata>(
+              std::move(buf));
+        } catch (const serde::serde_exception&) {
+            // A malformed tracker file: ignore its contents. The file is
+            // overwritten with fresh metadata below.
+            vlog(ctlog.warn, "Ignoring malformed tracker file {}", file_path);
+        }
+    }
+
+    // Compute the checksum of the current node configuration.
+    auto current_config = co_await read_fully_to_string(
+      config::node().get_cfg_file_path());
+    auto checksum = xxhash_64(current_config.c_str(), current_config.length());
+
+    if (maybe_crash_md) {
+        auto& crash_md = maybe_crash_md.value();
+        auto& limit = config::node().crash_loop_limit.value();
+
+        // Check if it has been at least 1h since the last unsuccessful
+        // restart. Tracking resets every 1h.
+        auto time_since_last_start
+          = model::duration_since_epoch(model::timestamp::now())
+            - model::duration_since_epoch(crash_md._last_start_ts);
+
+        // An unset limit never trips the crash loop check.
+        auto crash_limit_ok = !limit || crash_md._crash_count <= limit.value();
+        auto node_config_changed = crash_md._config_checksum != checksum;
+        auto tracking_reset = time_since_last_start > crash_reset_duration;
+
+        auto ok_to_proceed = crash_limit_ok || node_config_changed
+                             || tracking_reset;
+
+        if (!ok_to_proceed) {
+            // limit is guaranteed to hold a value here: crash_limit_ok is
+            // false only when the limit is set.
+            auto crashes = co_await _recorder.get_recorded_crashes();
+            vlog(
+              ctlog.error,
+              "Crash loop detected. Too many consecutive crashes {}, exceeded "
+              "{} configured value {}. To recover Redpanda from this state, "
+              "manually remove file at path {}. Crash loop automatically "
+              "resets 1h after last crash or with node configuration changes. "
+              "{}",
+              crash_md._crash_count,
+              config::node().crash_loop_limit.name(),
+              limit.value(),
+              file_path,
+              crashes.describe());
+            throw std::runtime_error("Crash loop detected, aborting startup.");
+        }
+
+        vlog(
+          ctlog.debug,
+          "Consecutive crashes detected: {} node config changed: {} "
+          "time based tracking reset: {}",
+          crash_md._crash_count,
+          node_config_changed,
+          tracking_reset);
+
+        if (node_config_changed || tracking_reset) {
+            crash_md._crash_count = 0;
+        }
+    }
+
+    // Truncate and bump the crash count. We consider a run to be unclean by
+    // default unless the scheduled cleanup (that runs very late in shutdown)
+    // removes the file. See schedule_crash_tracker_file_cleanup() and
+    // limiter::record_clean_shutdown().
+    auto new_crash_count = maybe_crash_md
+                             ? maybe_crash_md.value()._crash_count + 1
+                             : 1;
+    crash_tracker_metadata updated{
+      ._crash_count = new_crash_count,
+      ._config_checksum = checksum,
+      ._last_start_ts = model::timestamp::now()};
+    co_await write_fully(file_path, serde::to_iobuf(updated));
+    co_await ss::sync_directory(
+      config::node().data_directory.value().as_sstring());
+}
+
+// Removes the crash loop tracker file, if it exists, and syncs the data
+// directory afterwards. Does nothing when the file is absent.
+ss::future<> limiter::record_clean_shutdown() const {
+    const auto tracker_file
+      = config::node().crash_loop_tracker_path().string();
+
+    // Nothing to clean up when no tracker file was written.
+    if (!co_await ss::file_exists(tracker_file)) {
+        co_return;
+    }
+
+    co_await ss::remove_file(tracker_file);
+    co_await ss::sync_directory(config::node().data_directory().as_sstring());
+    vlog(ctlog.debug, "Deleted crash loop tracker file: {}", tracker_file);
+}
+
+} // namespace crash_tracker
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2024 Redpanda Data, Inc.
+ *
+ * Use of this software is governed by the Business Source License
+ * included in the file licenses/BSL.md
+ *
+ * As of the Change Date specified in that file, in accordance with
+ * the Business Source License, use of this software will be governed
+ * by the Apache License, Version 2.0
+ */
+
+#pragma once
+
+#include "base/seastarx.h"
+#include "bytes/iobuf.h"
+#include "crash_tracker/recorder.h"
+#include "crash_tracker/types.h"
+#include "model/timestamp.h"
+#include "serde/envelope.h"
+
+namespace crash_tracker {
+
+// Limits the number of restarts to a configured amount.
+//
+// Holds a reference to the recorder passed at construction, so that recorder
+// must outlive the limiter.
+class limiter {
+public:
+    explicit limiter(const recorder& recorder)
+      : _recorder(recorder) {}
+
+    // Fails with std::runtime_error when too many consecutive crashes have
+    // been tracked; otherwise records this start in the crash loop tracker
+    // file.
+    ss::future<> check_for_crash_loop() const;
+
+    // Removes the crash loop tracker file, if present.
+    ss::future<> record_clean_shutdown() const;
+
+private:
+    // Queried for the recorded crashes when a crash loop is reported.
+    const recorder& _recorder;
+};
+
+} // namespace crash_tracker
@@ -0,0 +1,17 @@
+/*
+ * Copyright 2024 Redpanda Data, Inc.
+ *
+ * Use of this software is governed by the Business Source License
+ * included in the file licenses/BSL.md
+ *
+ * As of the Change Date specified in that file, in accordance with
+ * the Business Source License, use of this software will be governed
+ * by the Apache License, Version 2.0
+ */
+#include "logger.h"
+
+namespace crash_tracker {
+
+// Definition of the logger declared in logger.h, registered under the
+// "crash_tracker" name.
+ss::logger ctlog("crash_tracker");
+
+} // namespace crash_tracker
@@ -0,0 +1,20 @@
+/*
+ * Copyright 2024 Redpanda Data, Inc.
+ *
+ * Use of this software is governed by the Business Source License
+ * included in the file licenses/BSL.md
+ *
+ * As of the Change Date specified in that file, in accordance with
+ * the Business Source License, use of this software will be governed
+ * by the Apache License, Version 2.0
+ */
+
+#pragma once
+
+#include "base/seastarx.h"
+
+#include <seastar/util/log.hh>
+
+namespace crash_tracker {
+// Logger for the crash_tracker module; defined in logger.cc.
+extern ss::logger ctlog;
+} // namespace crash_tracker