From a6cf646e455c1252412d79f02f1eb5c5ba9d9079 Mon Sep 17 00:00:00 2001
From: Boris Ranto
Date: Tue, 8 Jun 2021 09:43:23 +0200
Subject: [PATCH] dashboard: Add new prometheus alert

It was requested for us to update our alerting definitions to include a
slow OSD Ops health check.

Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1951664

Signed-off-by: Boris Ranto
(cherry picked from commit 2491d4e004c7b162216bc17e2288f05d0b049a87)
---
 roles/ceph-prometheus/files/ceph_dashboard.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/roles/ceph-prometheus/files/ceph_dashboard.yml b/roles/ceph-prometheus/files/ceph_dashboard.yml
index 7a1474416..0c95d4daf 100644
--- a/roles/ceph-prometheus/files/ceph_dashboard.yml
+++ b/roles/ceph-prometheus/files/ceph_dashboard.yml
@@ -105,3 +105,11 @@ groups:
     annotations:
       summary: "OSD(s) with High PG Count"
       description: "This indicates there are some OSDs with high PG count (275+)."
+  - alert: Slow OSD Ops
+    expr: ceph_healthcheck_slow_ops > 0
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Slow OSD Ops"
+      description: "OSD requests are taking too long to process (osd_op_complaint_time exceeded)"