# ceph-ansible/roles/ceph-prometheus/files/ceph_dashboard.yml

# Prometheus alerting rules for the Ceph dashboard.
groups:
# A single rule group named "dashboard" holds the alert definitions.
- name: dashboard
  rules:
  # Condition: ceph_health_status equals 1 (the warning state, per the
  # summary) and has held for 1m.
  - alert: 'Ceph Health Warning'
    expr: 'ceph_health_status == 1'
    for: 1m
    labels:
      severity: page
    annotations:
      summary: 'Ceph Health Warning'
      description: 'Overall Ceph Health'
  # Condition: ceph_health_status is greater than 1 (the error state, per
  # the description) and has held for 1m.
  - alert: 'Ceph Health Error'
    expr: 'ceph_health_status > 1'
    for: 1m
    labels:
      severity: page
    annotations:
      summary: 'Ceph Health Error'
      description: 'The Ceph cluster health is in an error state'
  # Condition: used bytes divided by total bytes of an OSD, expressed as a
  # percentage, is above 85 for 1m.
  - alert: 'Disk(s) Near Full'
    expr: '(ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) * 100 > 85'
    for: 1m
    labels:
      severity: page
    annotations:
      summary: 'Disk(s) Near Full'
      # Kept double-quoted because the text contains an apostrophe.
      description: "This shows how many disks are at or above 85% full. Performance may degrade beyond this threshold on filestore (XFS) backed OSD's."
  # Condition: any ceph_osd_up series with a value below 0.5 for 1m.
  # NOTE(review): presumably the metric is 0/1, so this selects OSDs
  # reporting 0 - confirm against the exporter.
  - alert: 'OSD(s) Down'
    expr: 'ceph_osd_up < 0.5'
    for: 1m
    labels:
      severity: page
    annotations:
      summary: 'OSD(s) Down'
      description: 'This indicates that one or more OSDs is currently marked down in the cluster.'
  # Condition, per instance: the number of OSD series with ceph_osd_up == 0
  # (joined to ceph_disk_occupation on ceph_daemon to obtain the instance
  # label) minus the number of ceph_disk_occupation series equals 0,
  # sustained for 1m.
  - alert: 'OSD Host(s) Down'
    # Folded scalar: the lines below are joined with single spaces.
    expr: >-
      count by(instance) (ceph_disk_occupation * on(ceph_daemon)
      group_right(instance) ceph_osd_up == 0) -
      count by(instance) (ceph_disk_occupation) == 0
    for: 1m
    labels:
      severity: page
    annotations:
      summary: 'OSD Host(s) Down'
      description: 'This indicates that one or more OSD hosts is currently down in the cluster.'
  # Condition: the largest ceph_osd_numpg value exceeds the scalar value of
  # ceph_pg_active for 1m.
  - alert: 'PG(s) Stuck'
    expr: 'max(ceph_osd_numpg) > scalar(ceph_pg_active)'
    for: 1m
    labels:
      severity: page
    annotations:
      summary: 'PG(s) Stuck'
      # Kept double-quoted because the text contains an apostrophe.
      description: "This indicates there are pg's in a stuck state, manual intervention needed to resolve."
  # Condition: 90% of the total free bytes (ceph_osd_stat_bytes minus
  # ceph_osd_stat_bytes_used, summed) is smaller than the largest
  # per-instance sum of ceph_osd_stat_bytes, sustained for 1m.
  # Adding ceph_disk_occupation*0 with group_left(instance) leaves the value
  # unchanged and takes the instance label from ceph_disk_occupation.
  - alert: 'OSD Host Loss Check'
    # Folded scalar: the lines below are joined with single spaces.
    expr: >-
      max(sum(ceph_osd_stat_bytes - ceph_osd_stat_bytes_used)) * 0.9 <
      scalar(max(sum by (instance) (ceph_osd_stat_bytes +
      on (ceph_daemon) group_left (instance) (ceph_disk_occupation*0))))
    for: 1m
    labels:
      severity: page
    annotations:
      summary: 'OSD Host Loss Check'
      description: 'This indicates that the cluster @ 90% full is not enough to support the loss of the largest OSD host.'
  # Condition: read time per completed read plus write time per completed
  # write (each denominator clamped to a minimum of 1) is above 1, limited
  # to (instance, device) pairs that also appear in ceph_disk_occupation,
  # sustained for 1m.
  - alert: 'Slow OSD Responses'
    # Folded scalar: the lines below are joined with single spaces.
    expr: >-
      ((irate(node_disk_read_time_seconds_total[5m]) /
      clamp_min(irate(node_disk_reads_completed_total[5m]), 1) +
      irate(node_disk_write_time_seconds_total[5m]) /
      clamp_min(irate(node_disk_writes_completed_total[5m]), 1))
      and on (instance, device) ceph_disk_occupation) > 1
    for: 1m
    labels:
      severity: page
    annotations:
      summary: 'Slow OSD Responses'
      description: 'This indicates that some OSD Latencies are above 1s.'
  # Condition, per (instance, device): the summed irate() of receive drops,
  # receive errors, transmit drops and transmit errors is above 10,
  # sustained for 1m. Only devices matching (eth|en|bond|ib|mlx|p).* are
  # considered.
  # irate() yields a per-second rate, so the threshold is 10 packets per
  # second; [5m] is only the lookback window used to find samples.
  - alert: Network Errors
    # Folded scalar: the lines below are joined with single spaces.
    expr: >-
      sum by (instance, device)
      (irate(node_network_receive_drop_total{device=~"(eth|en|bond|ib|mlx|p).*"}[5m]) +
      irate(node_network_receive_errs_total{device=~"(eth|en|bond|ib|mlx|p).*"}[5m]) +
      irate(node_network_transmit_drop_total{device=~"(eth|en|bond|ib|mlx|p).*"}[5m]) +
      irate(node_network_transmit_errs_total{device=~"(eth|en|bond|ib|mlx|p).*"}[5m])) > 10
    for: 1m
    labels:
      severity: page
    annotations:
      summary: "Network Errors"
      description: "This indicates that more than 10 dropped/error packets per second are seen on a network interface"
  # Condition: ceph_pool_stored as a percentage of
  # (ceph_pool_stored + ceph_pool_max_avail) is above 85 for 1m.
  # Adding ceph_pool_metadata*0 with group_left(name) leaves the value
  # unchanged and attaches the name label, matched on pool_id.
  - alert: 'Pool Capacity Low'
    # Folded scalar: the lines below are joined with single spaces.
    expr: >-
      (ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail) * 100 +
      on (pool_id) group_left (name) (ceph_pool_metadata*0)) > 85
    for: 1m
    labels:
      severity: page
    annotations:
      summary: 'Pool Capacity Low'
      description: 'This indicates a low capacity in a pool.'
  # Condition: any ceph_mon_quorum_status series whose value differs from 1
  # for 1m.
  - alert: 'MON(s) Down'
    expr: 'ceph_mon_quorum_status != 1'
    for: 1m
    labels:
      severity: page
    annotations:
      summary: 'MON(s) down'
      description: 'This indicates that one or more MON(s) is down.'
  # Condition: summed ceph_osd_stat_bytes_used divided by summed
  # ceph_osd_stat_bytes is above 0.85 (85%) for 1m.
  - alert: 'Cluster Capacity Low'
    expr: 'sum(ceph_osd_stat_bytes_used) / sum(ceph_osd_stat_bytes) > 0.85'
    for: 1m
    labels:
      severity: page
    annotations:
      summary: 'Cluster Capacity Low'
      description: 'This indicates raw used space crosses the 85% capacity threshold of the ceph cluster.'
  # Condition: any ceph_osd_numpg series with a value above 275 for 1m.
  - alert: 'OSD(s) with High PG Count'
    expr: 'ceph_osd_numpg > 275'
    for: 1m
    labels:
      severity: page
    annotations:
      summary: 'OSD(s) with High PG Count'
      description: 'This indicates there are some OSDs with high PG count (275+).'
  # Condition: ceph_healthcheck_slow_ops is above 0 for 1m.
  - alert: 'Slow OSD Ops'
    expr: 'ceph_healthcheck_slow_ops > 0'
    for: 1m
    labels:
      severity: page
    annotations:
      summary: 'Slow OSD Ops'
      description: 'OSD requests are taking too long to process (osd_op_complaint_time exceeded)'
dashboard: Add and copy alerting rules This commit adds a list of alerting rules for ceph-dashboard from the old cephmetrics project. It also installs the configuration file so that the rules get recognized by the prometheus server. Signed-off-by: Boris Ranto <branto@redhat.com> 2019-02-16 03:27:15 +08:00			`groups:`
			`- name: dashboard`
			`rules:`
			`- alert: Ceph Health Warning`
			`expr: ceph_health_status == 1`
			`for: 1m`
			`labels:`
			`severity: page`
			`annotations:`
			`summary: "Ceph Health Warning"`
			`description: "Overall Ceph Health"`
			`- alert: Ceph Health Error`
			`expr: ceph_health_status > 1`
			`for: 1m`
			`labels:`
			`severity: page`
			`annotations:`
			`summary: "Ceph Health Error"`
			`description: "The Ceph cluster health is in an error state"`
			`- alert: Disk(s) Near Full`
			`expr: (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) * 100 > 85`
			`for: 1m`
			`labels:`
			`severity: page`
			`annotations:`
			`summary: "Disk(s) Near Full"`
			`description: "This shows how many disks are at or above 85% full. Performance may degrade beyond this threshold on filestore (XFS) backed OSD's."`
			`- alert: OSD(s) Down`
			`expr: ceph_osd_up < 0.5`
			`for: 1m`
			`labels:`
			`severity: page`
			`annotations:`
			`summary: "OSD(s) Down"`
			`description: "This indicates that one or more OSDs is currently marked down in the cluster."`
			`- alert: OSD Host(s) Down`
			`expr: count by(instance) (ceph_disk_occupation * on(ceph_daemon) group_right(instance) ceph_osd_up == 0) - count by(instance) (ceph_disk_occupation) == 0`
			`for: 1m`
			`labels:`
			`severity: page`
			`annotations:`
			`summary: "OSD Host(s) Down"`
			`description: "This indicates that one or more OSD hosts is currently down in the cluster."`
			`- alert: PG(s) Stuck`
			`expr: max(ceph_osd_numpg) > scalar(ceph_pg_active)`
			`for: 1m`
			`labels:`
			`severity: page`
			`annotations:`
			`summary: "PG(s) Stuck"`
			`description: "This indicates there are pg's in a stuck state, manual intervention needed to resolve."`
			`- alert: OSD Host Loss Check`
			`expr: max(sum(ceph_osd_stat_bytes - ceph_osd_stat_bytes_used)) * 0.9 < scalar(max(sum by (instance) (ceph_osd_stat_bytes + on (ceph_daemon) group_left (instance) (ceph_disk_occupation*0))))`
			`for: 1m`
			`labels:`
			`severity: page`
			`annotations:`
			`summary: "OSD Host Loss Check"`
			`description: "This indicates that the cluster @ 90% full is not enough to support the loss of the largest OSD host."`
			`- alert: Slow OSD Responses`
			`expr: ((irate(node_disk_read_time_seconds_total[5m]) / clamp_min(irate(node_disk_reads_completed_total[5m]), 1) + irate(node_disk_write_time_seconds_total[5m]) / clamp_min(irate(node_disk_writes_completed_total[5m]), 1)) and on (instance, device) ceph_disk_occupation) > 1`
			`for: 1m`
			`labels:`
			`severity: page`
			`annotations:`
			`summary: "Slow OSD Responses"`
			`description: "This indicates that some OSD Latencies are above 1s."`
			`- alert: Network Errors`
			`expr: sum by (instance, device) (irate(node_network_receive_drop_total{device=~"(eth|en|bond|ib|mlx|p).*"}[5m]) + irate(node_network_receive_errs_total{device=~"(eth|en|bond|ib|mlx|p).*"}[5m]) + irate(node_network_transmit_drop_total{device=~"(eth|en|bond|ib|mlx|p).*"}[5m]) + irate(node_network_transmit_errs_total{device=~"(eth|en|bond|ib|mlx|p).*"}[5m])) > 10`
			`for: 1m`
			`labels:`
			`severity: page`
			`annotations:`
			`summary: "Network Errors"`
			`description: "This indicates that more than 10 dropped/error packets are seen in a 5m interval"`
			`- alert: Pool Capacity Low`
ceph-prometheus: update pool stat counter Since [1] The bytes_used pool counter in prometheus has been renamed to stored. Closes: #5781 [1] https://github.com/ceph/ceph/commit/71fe9149 Signed-off-by: Dimitri Savineau <dsavinea@redhat.com> 2020-09-15 21:30:42 +08:00			`expr: (ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail) * 100 + on (pool_id) group_left (name) (ceph_pool_metadata*0)) > 85`
dashboard: Add and copy alerting rules This commit adds a list of alerting rules for ceph-dashboard from the old cephmetrics project. It also installs the configuration file so that the rules get recognized by the prometheus server. Signed-off-by: Boris Ranto <branto@redhat.com> 2019-02-16 03:27:15 +08:00			`for: 1m`
			`labels:`
			`severity: page`
			`annotations:`
			`summary: "Pool Capacity Low"`
			`description: "This indicates a low capacity in a pool."`
			`- alert: MON(s) Down`
			`expr: ceph_mon_quorum_status != 1`
			`for: 1m`
			`labels:`
			`severity: page`
			`annotations:`
			`summary: "MON(s) down"`
			`description: "This indicates that one or more MON(s) is down."`
			`- alert: Cluster Capacity Low`
			`expr: sum(ceph_osd_stat_bytes_used) / sum(ceph_osd_stat_bytes) > 0.85`
			`for: 1m`
			`labels:`
			`severity: page`
			`annotations:`
			`summary: "Cluster Capacity Low"`
			`description: "This indicates raw used space crosses the 85% capacity threshold of the ceph cluster."`
			`- alert: OSD(s) with High PG Count`
			`expr: ceph_osd_numpg > 275`
			`for: 1m`
			`labels:`
			`severity: page`
			`annotations:`
			`summary: "OSD(s) with High PG Count"`
			`description: "This indicates there are some OSDs with high PG count (275+)."`
dashboard: Add new prometheus alert It was requested for us to update our alerting definitions to include a slow OSD Ops health check. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1951664 Signed-off-by: Boris Ranto <branto@redhat.com> 2021-06-08 15:43:23 +08:00			`- alert: Slow OSD Ops`
			`expr: ceph_healthcheck_slow_ops > 0`
			`for: 1m`
			`labels:`
			`severity: page`
			`annotations:`
			`summary: "Slow OSD Ops"`
			`description: "OSD requests are taking too long to process (osd_op_complaint_time exceeded)"`