Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 24 additions & 7 deletions src/v/cluster/health_monitor_backend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1194,17 +1194,28 @@ health_monitor_backend::get_cluster_health_overview(
}
}
auto report_it = reports().find(id);
if (
report_it != reports().end()
&& report_it->second->local_state.recovery_mode_enabled) {
if (report_it == reports().end()) {
continue;
}
const auto& report = report_it->second;
if (report->local_state.recovery_mode_enabled) {
ret.nodes_in_recovery_mode.push_back(id);
}
auto disk_usage_alert = report->local_state.get_disk_alert();
switch (disk_usage_alert) {
case storage::disk_space_alert::ok:
break;
case storage::disk_space_alert::low_space:
case storage::disk_space_alert::degraded:
ret.high_disk_usage_nodes.push_back(id);
break;
}
}

std::sort(ret.all_nodes.begin(), ret.all_nodes.end());
std::sort(ret.nodes_down.begin(), ret.nodes_down.end());
std::sort(
ret.nodes_in_recovery_mode.begin(), ret.nodes_in_recovery_mode.end());
std::ranges::sort(ret.all_nodes);
std::ranges::sort(ret.nodes_down);
std::ranges::sort(ret.nodes_in_recovery_mode);
std::ranges::sort(ret.high_disk_usage_nodes);

auto aggr_report = aggregate_reports(reports());
co_await fill_aggregate_with_offline_partitions(
Expand All @@ -1228,6 +1239,12 @@ health_monitor_backend::get_cluster_health_overview(
ret.unhealthy_reasons.emplace_back("nodes_down");
}

// disk usage on a subset of nodes exceeds configured storage
// alert thresholds
if (!ret.high_disk_usage_nodes.empty()) {
ret.unhealthy_reasons.emplace_back("high_disk_usage_nodes");
}

// cluster is not healthy if some partitions do not have leaders
if (!ret.leaderless_partitions.empty()) {
ret.unhealthy_reasons.emplace_back("leaderless_partitions");
Expand Down
8 changes: 5 additions & 3 deletions src/v/cluster/health_monitor_types.cc
Original file line number Diff line number Diff line change
Expand Up @@ -366,13 +366,15 @@ std::ostream& operator<<(std::ostream& o, const cluster_health_overview& ho) {
fmt::print(
o,
"{{controller_id: {}, nodes: {}, unhealthy_reasons: {}, nodes_down: {}, "
"nodes_in_recovery_mode: {}, bytes_in_cloud_storage: {}, "
"leaderless_count: {}, under_replicated_count: {}, "
"leaderless_partitions: {}, under_replicated_partitions: {}}}",
"high_disk_usage_nodes: {}, nodes_in_recovery_mode: {}, "
"bytes_in_cloud_storage: {}, leaderless_count: {}, "
"under_replicated_count: {}, leaderless_partitions: {}, "
"under_replicated_partitions: {}}}",
ho.controller_id,
ho.all_nodes,
ho.unhealthy_reasons,
ho.nodes_down,
ho.high_disk_usage_nodes,
ho.nodes_in_recovery_mode,
ho.bytes_in_cloud_storage,
ho.leaderless_count,
Expand Down
4 changes: 4 additions & 0 deletions src/v/cluster/health_monitor_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,10 @@ struct cluster_health_overview {
// A list of known nodes which are down from the point of view of the health
// subsystem.
std::vector<model::node_id> nodes_down;
// A list of nodes whose disk usage exceeds the alert thresholds defined by
// storage_space_alert_free_threshold_percent and
// storage_space_alert_free_threshold_bytes.
std::vector<model::node_id> high_disk_usage_nodes;
// A list of nodes that have been booted up in recovery mode.
std::vector<model::node_id> nodes_in_recovery_mode;
std::vector<model::ntp> leaderless_partitions;
Expand Down
7 changes: 7 additions & 0 deletions src/v/redpanda/admin/api-doc/cluster.json
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,13 @@
},
"description": "ids of all nodes being recognized as down"
},
"high_disk_usage_nodes": {
"type": "array",
"items": {
"type": "int"
},
"description": "ids of all nodes with disk usage exceeding storage alert thresholds"
},
"nodes_in_recovery_mode": {
"type": "array",
"items": {
Expand Down
3 changes: 3 additions & 0 deletions src/v/redpanda/admin/server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4060,12 +4060,15 @@ void admin_server::register_cluster_routes() {
ret.unhealthy_reasons._set = true;
ret.all_nodes._set = true;
ret.nodes_down._set = true;
ret.high_disk_usage_nodes._set = true;
ret.leaderless_partitions._set = true;
ret.under_replicated_partitions._set = true;

ret.unhealthy_reasons = health_overview.unhealthy_reasons;
ret.all_nodes = health_overview.all_nodes;
ret.nodes_down = health_overview.nodes_down;
ret.high_disk_usage_nodes
= health_overview.high_disk_usage_nodes;
ret.nodes_in_recovery_mode
= health_overview.nodes_in_recovery_mode;

Expand Down
42 changes: 40 additions & 2 deletions tests/rptest/tests/cluster_health_overview_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from ducktape.utils.util import wait_until

from ducktape.cluster.cluster import ClusterNode
from rptest.clients.types import TopicSpec
from rptest.services.admin import Admin
from rptest.services.cluster import cluster
Expand All @@ -30,6 +31,7 @@ def __init__(self, test_context):
# https://github.com/redpanda-data/redpanda/issues/5253
"enable_leader_balancer": False,
},
environment={"__REDPANDA_TEST_DISABLE_BOUNDED_PROPERTY_CHECKS": "ON"},
)

self.admin = Admin(self.redpanda)
Expand All @@ -47,11 +49,11 @@ def create_topics(self):
self.client().create_topic(topics)
return topics

def get_health(self):
def get_health(self, node: ClusterNode | None = None):
"""Wrapper around admin.get_cluster_health_overview which validates some invariants
about each health report"""

hov = self.admin.get_cluster_health_overview()
hov = self.admin.get_cluster_health_overview(node=node)

# these invariants should always hold
if hov["is_healthy"]:
Expand All @@ -60,6 +62,7 @@ def get_health(self):
assert hov["leaderless_count"] == 0
assert len(hov["under_replicated_partitions"]) == 0
assert hov["under_replicated_count"] == 0
assert len(hov["high_disk_usage_nodes"]) == 0
assert len(hov["unhealthy_reasons"]) == 0
assert len(hov["all_nodes"]) > 0
else:
Expand Down Expand Up @@ -152,3 +155,38 @@ def two_nodes_down():
self.redpanda.start_node(second_down)

self.wait_until_healthy()

@cluster(
    # The allow-list entry tolerates the "storage space alert" log line that
    # this test provokes on purpose.
    num_nodes=5, log_allow_list=[".*cluster - storage space alert: free space.*"]
)
def cluster_health_overview_disk_usage_alert_test(self):
    """Verify that high_disk_usage_nodes is reported in the health overview.

    Steps:
      1. Create topics and wait for the cluster to report healthy.
      2. Set storage_space_alert_free_threshold_percent to 100 to fake a
         disk usage alert.
      3. Wait until the overview fetched from every started node is
         unhealthy, lists "high_disk_usage_nodes" among the unhealthy
         reasons, and reports 5 nodes with high disk usage.
      4. Set the threshold to 5 and wait for the cluster to report healthy
         again.
    """
    self.create_topics()
    self.wait_until_healthy()

    # Fake alert
    # NOTE(review): presumably a 100% free-space threshold can never be
    # satisfied, so every node raises the alert -- confirm.
    self.redpanda.set_cluster_config(
        {"storage_space_alert_free_threshold_percent": 100}
    )

    def ensure_high_disk_usage_reported(node: ClusterNode):
        # Query the overview through the given node so that each node's own
        # view of the cluster health is checked.
        hov = self.get_health(node=node)
        return (
            not hov["is_healthy"]
            and "high_disk_usage_nodes" in hov["unhealthy_reasons"]
            # 5 mirrors num_nodes in the @cluster decorator above.
            and len(hov["high_disk_usage_nodes"]) == 5
        )

    def ensure_unhealthy_report():
        return all(
            ensure_high_disk_usage_reported(node)
            for node in self.redpanda.started_nodes()
        )

    # Keyword arguments make the timeout/backoff explicit, and err_msg gives
    # a meaningful failure instead of an empty timeout error.
    wait_until(
        ensure_unhealthy_report,
        timeout_sec=30,
        backoff_sec=2,
        err_msg="Not all nodes reported high_disk_usage_nodes in the "
        "cluster health overview",
    )

    # Disable disk usage alert
    self.redpanda.set_cluster_config(
        {"storage_space_alert_free_threshold_percent": 5}
    )
    self.wait_until_healthy()