|
| 1 | +/* |
| 2 | + * Copyright 2024 Redpanda Data, Inc. |
| 3 | + * |
| 4 | + * Use of this software is governed by the Business Source License |
| 5 | + * included in the file licenses/BSL.md |
| 6 | + * |
| 7 | + * As of the Change Date specified in that file, in accordance with |
| 8 | + * the Business Source License, use of this software will be governed |
| 9 | + * by the Apache License, Version 2.0 |
| 10 | + */ |
| 11 | + |
| 12 | +#include "crash_tracker/limiter.h" |
| 13 | + |
| 14 | +#include "base/vassert.h" |
| 15 | +#include "config/node_config.h" |
| 16 | +#include "crash_tracker/limiter.h" |
| 17 | +#include "crash_tracker/logger.h" |
| 18 | +#include "crash_tracker/recorder.h" |
| 19 | +#include "crash_tracker/service.h" |
| 20 | +#include "crash_tracker/types.h" |
| 21 | +#include "hashing/xx.h" |
| 22 | +#include "model/timestamp.h" |
| 23 | +#include "serde/rw/envelope.h" |
| 24 | +#include "utils/file_io.h" |
| 25 | + |
| 26 | +#include <seastar/core/seastar.hh> |
| 27 | +#include <seastar/util/print_safe.hh> |
| 28 | + |
| 29 | +#include <fmt/chrono.h> |
| 30 | + |
| 31 | +#include <chrono> |
| 32 | +#include <system_error> |
| 33 | +#include <unistd.h> |
| 34 | + |
| 35 | +using namespace std::chrono_literals; |
| 36 | + |
| 37 | +namespace crash_tracker { |
| 38 | + |
| 39 | +// Crash tracking resets every 1h. |
| 40 | +static constexpr model::timestamp_clock::duration crash_reset_duration{1h}; |
| 41 | + |
| 42 | +ss::future<> limiter::check_for_crash_loop() const { |
| 43 | + auto file_path = config::node().crash_loop_tracker_path(); |
| 44 | + std::optional<crash_tracker_metadata> maybe_crash_md; |
| 45 | + if ( |
| 46 | + // Tracking is reset every time the broker boots in recovery mode. |
| 47 | + !config::node().recovery_mode_enabled() |
| 48 | + && co_await ss::file_exists(file_path.string())) { |
| 49 | + // Ok to read the entire file, it contains a serialized uint32_t. |
| 50 | + auto buf = co_await read_fully(file_path); |
| 51 | + try { |
| 52 | + maybe_crash_md = serde::from_iobuf<crash_tracker_metadata>( |
| 53 | + std::move(buf)); |
| 54 | + } catch (const serde::serde_exception&) { |
| 55 | + // A malformed log file, ignore and reset it later. |
| 56 | + // We truncate it below. |
| 57 | + vlog(ctlog.warn, "Ignorning malformed tracker file {}", file_path); |
| 58 | + } |
| 59 | + } |
| 60 | + |
| 61 | + // Compute the checksum of the current node configuration. |
| 62 | + auto current_config = co_await read_fully_to_string( |
| 63 | + config::node().get_cfg_file_path()); |
| 64 | + auto checksum = xxhash_64(current_config.c_str(), current_config.length()); |
| 65 | + |
| 66 | + if (maybe_crash_md) { |
| 67 | + auto& crash_md = maybe_crash_md.value(); |
| 68 | + auto& limit = config::node().crash_loop_limit.value(); |
| 69 | + |
| 70 | + // Check if it has been atleast 1h since last unsuccessful restart. |
| 71 | + // Tracking resets every 1h. |
| 72 | + auto time_since_last_start |
| 73 | + = model::duration_since_epoch(model::timestamp::now()) |
| 74 | + - model::duration_since_epoch(crash_md._last_start_ts); |
| 75 | + |
| 76 | + auto crash_limit_ok = !limit || crash_md._crash_count <= limit.value(); |
| 77 | + auto node_config_changed = crash_md._config_checksum != checksum; |
| 78 | + auto tracking_reset = time_since_last_start > crash_reset_duration; |
| 79 | + |
| 80 | + auto ok_to_proceed = crash_limit_ok || node_config_changed |
| 81 | + || tracking_reset; |
| 82 | + |
| 83 | + if (!ok_to_proceed) { |
| 84 | + auto crashes = co_await _recorder.get_recorded_crashes(); |
| 85 | + vlog( |
| 86 | + ctlog.error, |
| 87 | + "Crash loop detected. Too many consecutive crashes {}, exceeded " |
| 88 | + "{} configured value {}. To recover Redpanda from this state, " |
| 89 | + "manually remove file at path {}. Crash loop automatically " |
| 90 | + "resets 1h after last crash or with node configuration changes. " |
| 91 | + "{}", |
| 92 | + crash_md._crash_count, |
| 93 | + config::node().crash_loop_limit.name(), |
| 94 | + limit.value(), |
| 95 | + file_path, |
| 96 | + crashes.describe()); |
| 97 | + throw std::runtime_error("Crash loop detected, aborting startup."); |
| 98 | + } |
| 99 | + |
| 100 | + vlog( |
| 101 | + ctlog.debug, |
| 102 | + "Consecutive crashes detected: {} node config changed: {} " |
| 103 | + "time based tracking reset: {}", |
| 104 | + crash_md._crash_count, |
| 105 | + node_config_changed, |
| 106 | + tracking_reset); |
| 107 | + |
| 108 | + if (node_config_changed || tracking_reset) { |
| 109 | + crash_md._crash_count = 0; |
| 110 | + } |
| 111 | + } |
| 112 | + |
| 113 | + // Truncate and bump the crash count. We consider a run to be unclean by |
| 114 | + // default unless the scheduled cleanup (that runs very late in shutdown) |
| 115 | + // resets the file. See schedule_crash_tracker_file_cleanup(). |
| 116 | + auto new_crash_count = maybe_crash_md |
| 117 | + ? maybe_crash_md.value()._crash_count + 1 |
| 118 | + : 1; |
| 119 | + crash_tracker_metadata updated{ |
| 120 | + ._crash_count = new_crash_count, |
| 121 | + ._config_checksum = checksum, |
| 122 | + ._last_start_ts = model::timestamp::now()}; |
| 123 | + co_await write_fully(file_path, serde::to_iobuf(updated)); |
| 124 | + co_await ss::sync_directory( |
| 125 | + config::node().data_directory.value().as_sstring()); |
| 126 | +} |
| 127 | + |
| 128 | +ss::future<> limiter::record_clean_shutdown() const { |
| 129 | + auto file = config::node().crash_loop_tracker_path().string(); |
| 130 | + if (co_await ss::file_exists(file)) { |
| 131 | + co_await ss::remove_file(file); |
| 132 | + co_await ss::sync_directory( |
| 133 | + config::node().data_directory().as_sstring()); |
| 134 | + vlog(ctlog.debug, "Deleted crash loop tracker file: {}", file); |
| 135 | + } |
| 136 | + |
| 137 | + co_return; |
| 138 | +} |
| 139 | + |
| 140 | +} // namespace crash_tracker |
0 commit comments