From a9a533e398b724030e23c55f951cbead0268d134 Mon Sep 17 00:00:00 2001
From: Dimitri Savineau
Date: Thu, 13 Feb 2020 15:56:23 -0500
Subject: [PATCH] ceph-prometheus: add alertmanager HA config

When using multiple alertmanager nodes (via the grafana-server group) then
we need to specify the other peers in the configuration.

https://prometheus.io/docs/alerting/alertmanager/#high-availability
https://github.com/prometheus/alertmanager#high-availability

Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1792225

Signed-off-by: Dimitri Savineau
(cherry picked from commit b9d975385c2dceca3b06c18d4c37eadbe9f48c92)
---
Review note (this section is not part of the commit message):
In the alertmanager.service.j2 hunk the local address is handed to the
`difference` filter as a one-element list, [grafana_server_addr], rather
than as a bare string. `difference` compares two lists; with a string as
the second operand, membership is checked against the string itself, so a
peer whose address is contained in the local one (10.0.0.1 vs 10.0.0.11)
may be dropped, or the local node may not be removed from its own peer
list, depending on the Ansible version. Hunk sizes and the diffstat are
unaffected; the post-image blob id on that file's `index` line is presumably
stale after this one-line change - regenerate the patch if exact ids matter.

 group_vars/all.yml.sample                             |  1 +
 group_vars/rhcs.yml.sample                            |  1 +
 roles/ceph-defaults/defaults/main.yml                 |  1 +
 roles/ceph-infra/tasks/dashboard_firewall.yml         | 11 +++++++++++
 .../ceph-prometheus/templates/alertmanager.service.j2 |  6 +++++-
 5 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/group_vars/all.yml.sample b/group_vars/all.yml.sample
index 010d4bfbf..34778b97f 100644
--- a/group_vars/all.yml.sample
+++ b/group_vars/all.yml.sample
@@ -773,6 +773,7 @@ dummy:
 #alertmanager_data_dir: /var/lib/alertmanager
 #alertmanager_conf_dir: /etc/alertmanager
 #alertmanager_port: 9093
+#alertmanager_cluster_port: 9094
 
 
 ##################################
diff --git a/group_vars/rhcs.yml.sample b/group_vars/rhcs.yml.sample
index 94e013a43..e1a0a6f58 100644
--- a/group_vars/rhcs.yml.sample
+++ b/group_vars/rhcs.yml.sample
@@ -773,6 +773,7 @@ alertmanager_container_image: registry.redhat.io/openshift4/ose-prometheus-alert
 #alertmanager_data_dir: /var/lib/alertmanager
 #alertmanager_conf_dir: /etc/alertmanager
 #alertmanager_port: 9093
+#alertmanager_cluster_port: 9094
 
 
 ##################################
diff --git a/roles/ceph-defaults/defaults/main.yml b/roles/ceph-defaults/defaults/main.yml
index 3b76963ec..9f74fc1ce 100644
--- a/roles/ceph-defaults/defaults/main.yml
+++ b/roles/ceph-defaults/defaults/main.yml
@@ -765,6 +765,7 @@ alertmanager_container_memory: 4
 alertmanager_data_dir: /var/lib/alertmanager
 alertmanager_conf_dir: /etc/alertmanager
 alertmanager_port: 9093
+alertmanager_cluster_port: 9094
 
 
 ##################################
diff --git a/roles/ceph-infra/tasks/dashboard_firewall.yml b/roles/ceph-infra/tasks/dashboard_firewall.yml
index f3166355d..d598b9331 100644
--- a/roles/ceph-infra/tasks/dashboard_firewall.yml
+++ b/roles/ceph-infra/tasks/dashboard_firewall.yml
@@ -52,6 +52,17 @@
         permanent: true
         immediate: true
         state: enabled
+
+    - name: open alertmanager cluster port
+      firewalld:
+        port: "{{ alertmanager_cluster_port }}/{{ item }}"
+        zone: "{{ ceph_dashboard_firewall_zone }}"
+        permanent: true
+        immediate: true
+        state: enabled
+      with_items:
+        - "tcp"
+        - "udp"
   when:
     - grafana_server_group_name is defined
     - grafana_server_group_name in group_names
diff --git a/roles/ceph-prometheus/templates/alertmanager.service.j2 b/roles/ceph-prometheus/templates/alertmanager.service.j2
index 64b264e0b..173146ed4 100644
--- a/roles/ceph-prometheus/templates/alertmanager.service.j2
+++ b/roles/ceph-prometheus/templates/alertmanager.service.j2
@@ -22,9 +22,13 @@ ExecStart=/usr/bin/{{ container_binary }} run --rm --name=alertmanager \
   --memory-swap={{ alertmanager_container_memory * 2 }}GB \
   {{ alertmanager_container_image }} \
   --config.file=/etc/alertmanager/alertmanager.yml \
+  --cluster.listen-address={{ grafana_server_addr }}:{{ alertmanager_cluster_port }} \
+{% for peer in grafana_server_addrs|difference([grafana_server_addr]) %}
+  --cluster.peer={{ peer }}:{{ alertmanager_cluster_port }} \
+{% endfor %}
   --storage.path=/alertmanager \
   --web.external-url=http://{{ ansible_fqdn }}:{{ alertmanager_port }}/ \
-  --web.listen-address=:{{ alertmanager_port }}
+  --web.listen-address={{ grafana_server_addr }}:{{ alertmanager_port }}
 ExecStop=/usr/bin/{{ container_binary }} stop alertmanager
 Restart=always
 RestartSec=10s