From 5ac7559736fed8755f390cb8f4bfb0293c3b06ed Mon Sep 17 00:00:00 2001 From: Boris Ranto Date: Wed, 5 Dec 2018 19:59:47 +0100 Subject: [PATCH] Merge cephmetrics/dashboard-ansible repo This commit will merge dashboard-ansible installation scripts with ceph-ansible. This includes several new roles to setup ceph-dashboard and the underlying technologies like prometheus and grafana server. Signed-off-by: Boris Ranto & Zack Cerza Co-authored-by: Zack Cerza Co-authored-by: Guillaume Abrioux (cherry picked from commit 2f141a6e808766bb6cd406ccc67ba0353b46e780) --- group_vars/all.yml.sample | 12 ++ group_vars/rhcs.yml.sample | 12 ++ infrastructure-playbooks/purge-cluster.yml | 120 +++++++++++++ .../purge-docker-cluster.yml | 122 +++++++++++++ roles/ceph-container-common/tasks/main.yml | 4 + roles/ceph-dashboard/defaults/main.yml | 12 ++ roles/ceph-dashboard/meta/main.yml | 14 ++ .../tasks/configure_dashboard.yml | 162 ++++++++++++++++++ roles/ceph-dashboard/tasks/main.yml | 7 + roles/ceph-defaults/defaults/main.yml | 12 ++ roles/ceph-grafana/defaults/main.yml | 17 ++ .../ceph-grafana/files/grafana-server.service | 17 ++ roles/ceph-grafana/files/grafana.list | 1 + roles/ceph-grafana/handlers/main.yml | 8 + roles/ceph-grafana/meta/main.yml | 14 ++ .../ceph-grafana/tasks/configure_grafana.yml | 83 +++++++++ roles/ceph-grafana/tasks/main.yml | 6 + roles/ceph-grafana/tasks/setup_container.yml | 64 +++++++ .../templates/dashboards-ceph-dashboard.yml | 12 ++ .../templates/datasources-ceph-dashboard.yml | 26 +++ roles/ceph-grafana/templates/grafana.ini | 26 +++ roles/ceph-grafana/templates/grafana.repo | 9 + roles/ceph-handler/handlers/main.yml | 10 ++ roles/ceph-infra/tasks/configure_firewall.yml | 34 ++++ roles/ceph-mgr/tasks/main.yml | 2 +- roles/ceph-mgr/tasks/pre_requisite.yml | 9 + roles/ceph-node-exporter/defaults/main.yml | 2 + .../files/node_exporter.service | 20 +++ roles/ceph-node-exporter/meta/main.yml | 14 ++ roles/ceph-node-exporter/tasks/main.yml | 3 + 
.../tasks/setup_container.yml | 42 +++++ roles/ceph-prometheus/defaults/main.yml | 17 ++ .../files/alertmanager.service | 17 ++ .../ceph-prometheus/files/prometheus.service | 17 ++ roles/ceph-prometheus/handlers/main.yml | 12 ++ roles/ceph-prometheus/meta/main.yml | 3 + roles/ceph-prometheus/tasks/main.yml | 35 ++++ .../ceph-prometheus/tasks/setup_container.yml | 93 ++++++++++ .../templates/alertmanager.yml | 15 ++ .../ceph-prometheus/templates/prometheus.yml | 47 +++++ site-container.yml.sample | 70 ++++++++ site.yml.sample | 73 ++++++++ 42 files changed, 1294 insertions(+), 1 deletion(-) create mode 100644 roles/ceph-dashboard/defaults/main.yml create mode 100644 roles/ceph-dashboard/meta/main.yml create mode 100644 roles/ceph-dashboard/tasks/configure_dashboard.yml create mode 100644 roles/ceph-dashboard/tasks/main.yml create mode 100644 roles/ceph-grafana/defaults/main.yml create mode 100644 roles/ceph-grafana/files/grafana-server.service create mode 100644 roles/ceph-grafana/files/grafana.list create mode 100644 roles/ceph-grafana/handlers/main.yml create mode 100644 roles/ceph-grafana/meta/main.yml create mode 100644 roles/ceph-grafana/tasks/configure_grafana.yml create mode 100644 roles/ceph-grafana/tasks/main.yml create mode 100644 roles/ceph-grafana/tasks/setup_container.yml create mode 100644 roles/ceph-grafana/templates/dashboards-ceph-dashboard.yml create mode 100644 roles/ceph-grafana/templates/datasources-ceph-dashboard.yml create mode 100644 roles/ceph-grafana/templates/grafana.ini create mode 100644 roles/ceph-grafana/templates/grafana.repo create mode 100644 roles/ceph-node-exporter/defaults/main.yml create mode 100644 roles/ceph-node-exporter/files/node_exporter.service create mode 100644 roles/ceph-node-exporter/meta/main.yml create mode 100644 roles/ceph-node-exporter/tasks/main.yml create mode 100644 roles/ceph-node-exporter/tasks/setup_container.yml create mode 100644 roles/ceph-prometheus/defaults/main.yml create mode 100644 
roles/ceph-prometheus/files/alertmanager.service create mode 100644 roles/ceph-prometheus/files/prometheus.service create mode 100644 roles/ceph-prometheus/handlers/main.yml create mode 100644 roles/ceph-prometheus/meta/main.yml create mode 100644 roles/ceph-prometheus/tasks/main.yml create mode 100644 roles/ceph-prometheus/tasks/setup_container.yml create mode 100644 roles/ceph-prometheus/templates/alertmanager.yml create mode 100644 roles/ceph-prometheus/templates/prometheus.yml diff --git a/group_vars/all.yml.sample b/group_vars/all.yml.sample index c27941592..f20886e18 100644 --- a/group_vars/all.yml.sample +++ b/group_vars/all.yml.sample @@ -69,6 +69,7 @@ dummy: #ceph_nfs_firewall_zone: public #ceph_rbdmirror_firewall_zone: public #ceph_iscsi_firewall_zone: public +#ceph_dashboard_firewall_zone: public # Generate local ceph.conf in fetch directory #ceph_conf_local: false @@ -687,6 +688,17 @@ dummy: # - { name: client.openstack, caps: { mon: "profile rbd", osd: "profile rbd pool={{ openstack_glance_pool.name }}, profile rbd pool={{ openstack_nova_pool.name }}, profile rbd pool={{ openstack_cinder_pool.name }}, profile rbd pool={{ openstack_cinder_backup_pool.name }}"}, mode: "0600" } +############# +# DASHBOARD # +############# +#dashboard_enabled: False +#dashboard_network_name: ceph-dashboard +# Choose http or https +# For https, you should set dashboard.crt/key and grafana.crt/key +#dashboard_protocol: http +#dashboard_port: 8234 + + ############### # DEPRECATION # ############### diff --git a/group_vars/rhcs.yml.sample b/group_vars/rhcs.yml.sample index d431ca112..c4a3ce23c 100644 --- a/group_vars/rhcs.yml.sample +++ b/group_vars/rhcs.yml.sample @@ -69,6 +69,7 @@ fetch_directory: ~/ceph-ansible-keys #ceph_nfs_firewall_zone: public #ceph_rbdmirror_firewall_zone: public #ceph_iscsi_firewall_zone: public +#ceph_dashboard_firewall_zone: public # Generate local ceph.conf in fetch directory #ceph_conf_local: false @@ -687,6 +688,17 @@ ceph_docker_registry: 
"registry.access.redhat.com" # - { name: client.openstack, caps: { mon: "profile rbd", osd: "profile rbd pool={{ openstack_glance_pool.name }}, profile rbd pool={{ openstack_nova_pool.name }}, profile rbd pool={{ openstack_cinder_pool.name }}, profile rbd pool={{ openstack_cinder_backup_pool.name }}"}, mode: "0600" } +############# +# DASHBOARD # +############# +#dashboard_enabled: False +#dashboard_network_name: ceph-dashboard +# Choose http or https +# For https, you should set dashboard.crt/key and grafana.crt/key +#dashboard_protocol: http +#dashboard_port: 8234 + + ############### # DEPRECATION # ############### diff --git a/infrastructure-playbooks/purge-cluster.yml b/infrastructure-playbooks/purge-cluster.yml index 66700cf06..005486e51 100644 --- a/infrastructure-playbooks/purge-cluster.yml +++ b/infrastructure-playbooks/purge-cluster.yml @@ -42,12 +42,130 @@ - "{{ nfs_group_name|default('nfss') }}" - "{{ client_group_name|default('clients') }}" - "{{ mgr_group_name|default('mgrs') }}" + - grafana-server become: true tasks: - debug: msg="gather facts on all Ceph hosts for following reference" + +- name: purge node-exporter + hosts: + - "{{ mon_group_name|default('mons') }}" + - "{{ osd_group_name|default('osds') }}" + - "{{ mds_group_name|default('mdss') }}" + - "{{ rgw_group_name|default('rgws') }}" + - "{{ rbdmirror_group_name|default('rbdmirrors') }}" + - "{{ nfs_group_name|default('nfss') }}" + - "{{ client_group_name|default('clients') }}" + - "{{ mgr_group_name|default('mgrs') }}" + - agents + - grafana-server + - clients + - iscsigws + - iscsi-gws # for backward compatibility only! 
+
+  become: true
+
+  tasks:
+    - name: set ceph_docker_registry value if not set
+      set_fact:
+        ceph_docker_registry: "docker.io"
+      when: ceph_docker_registry is not defined
+
+    - name: disable node_exporter service
+      service:
+        name: node_exporter
+        state: stopped
+        enabled: no
+      failed_when: false
+
+    - name: remove node-exporter container
+      docker_container:
+        name: node_exporter
+        state: absent
+      failed_when: false
+
+    - name: remove node_exporter service file
+      file:
+        name: /etc/systemd/system/node_exporter.service
+        state: absent
+
+    - name: remove node-exporter image
+      docker_image:
+        name: "{{ ceph_docker_registry }}/prom/node-exporter"  # docker_image selects the image with 'name'; it has no 'image' option
+        state: absent
+        force: yes
+      tags:
+        - remove_img
+      failed_when: false
+
+
+- name: purge ceph grafana-server
+  hosts: grafana-server
+  become: true
+  vars:
+    grafana_services:
+      - grafana-server
+      - prometheus
+      - alertmanager
+
+  tasks:
+    - name: set ceph_docker_registry value if not set
+      set_fact:
+        ceph_docker_registry: "docker.io"
+      when: ceph_docker_registry is not defined
+
+    - name: stop services
+      service:
+        name: "{{ item }}"
+        state: stopped
+        enabled: no
+      with_items: "{{ grafana_services }}"
+      failed_when: false
+
+    - name: remove containers
+      docker_container:
+        name: "{{ item }}"
+        state: absent
+      with_items: "{{ grafana_services }}"
+      failed_when: false
+
+    - name: remove service files
+      file:
+        name: "/etc/systemd/system/{{ item }}.service"
+        state: absent
+      with_items: "{{ grafana_services }}"
+      failed_when: false
+
+    - name: remove images
+      docker_image:
+        name: "{{ item }}"
+        state: absent
+        force: yes
+      with_items:
+        - "{{ ceph_docker_registry }}/prom/prometheus"
+        - "{{ ceph_docker_registry }}/grafana/grafana"
+        - "{{ ceph_docker_registry }}/prom/alertmanager"
+      failed_when: false
+
+    - name: remove data
+      file:
+        name: "{{ item }}"
+        state: absent
+      with_items:
+        - /etc/grafana/dashboards
+        - /etc/grafana/grafana.ini
+        - /etc/grafana/provisioning
+        - /var/lib/grafana
+        - /etc/alertmanager
+        -
/var/lib/alertmanager + - /var/lib/prometheus + - /etc/prometheus + failed_when: false + + - name: purge ceph mds cluster vars: @@ -445,6 +563,7 @@ - ceph-release - ceph-radosgw - calamari-server + - ceph-grafana-dashboards ceph_remaining_packages: - libcephfs1 @@ -466,6 +585,7 @@ - "{{ nfs_group_name|default('nfss') }}" - "{{ client_group_name|default('clients') }}" - "{{ mgr_group_name|default('mgrs') }}" + - grafana-server gather_facts: false # Already gathered previously diff --git a/infrastructure-playbooks/purge-docker-cluster.yml b/infrastructure-playbooks/purge-docker-cluster.yml index 702785245..c7b9d5ed5 100644 --- a/infrastructure-playbooks/purge-docker-cluster.yml +++ b/infrastructure-playbooks/purge-docker-cluster.yml @@ -439,6 +439,128 @@ tags: remove_img ignore_errors: true + +- name: purge node-exporter + + hosts: + - "{{ mon_group_name|default('mons') }}" + - "{{ osd_group_name|default('osds') }}" + - "{{ mds_group_name|default('mdss') }}" + - "{{ rgw_group_name|default('rgws') }}" + - "{{ rbdmirror_group_name|default('rbdmirrors') }}" + - "{{ nfs_group_name|default('nfss') }}" + - "{{ mgr_group_name|default('mgrs') }}" + - agents + - grafana-server + - iscsigws + - iscsi-gws # for backward compatibility only! 
+    - clients
+
+  gather_facts: false
+
+  become: true
+
+  tasks:
+    - name: set ceph_docker_registry value if not set
+      set_fact:
+        ceph_docker_registry: "docker.io"
+      when: ceph_docker_registry is not defined
+
+    - name: disable node_exporter service
+      service:
+        name: node_exporter
+        state: stopped
+        enabled: no
+      failed_when: false
+
+    - name: remove node-exporter container
+      docker_container:
+        name: node_exporter
+        state: absent
+      failed_when: false
+
+    - name: remove node_exporter service file
+      file:
+        name: /etc/systemd/system/node_exporter.service
+        state: absent
+
+    - name: remove node-exporter image
+      docker_image:
+        name: "{{ ceph_docker_registry }}/prom/node-exporter"  # docker_image selects the image with 'name'; it has no 'image' option
+        state: absent
+        force: yes
+      tags:
+        - remove_img
+      failed_when: false
+
+
+- name: purge ceph-grafana
+
+  hosts: grafana-server
+
+  gather_facts: false
+
+  become: true
+
+  vars:
+    grafana_services:
+      - grafana-server
+      - prometheus
+      - alertmanager
+
+  tasks:
+    - name: set ceph_docker_registry value if not set
+      set_fact:
+        ceph_docker_registry: "docker.io"
+      when: ceph_docker_registry is not defined
+
+    - name: stop services
+      service:
+        name: "{{ item }}"
+        state: stopped
+        enabled: no
+      with_items: "{{ grafana_services }}"
+      failed_when: false
+
+    - name: remove containers
+      docker_container:
+        name: "{{ item }}"
+        state: absent
+      with_items: "{{ grafana_services }}"
+      failed_when: false
+
+    - name: remove service files
+      file:
+        name: "/etc/systemd/system/{{ item }}.service"
+        state: absent
+      with_items: "{{ grafana_services }}"
+      failed_when: false
+
+    - name: remove images
+      docker_image:
+        name: "{{ item }}"
+        state: absent
+        force: yes
+      with_items:
+        - "{{ ceph_docker_registry }}/prom/prometheus"
+        - "{{ ceph_docker_registry }}/grafana/grafana"
+        - "{{ ceph_docker_registry }}/prom/alertmanager"
+      failed_when: false
+
+    - name: remove data
+      file:
+        name: "{{ item }}"
+        state: absent
+      with_items:
+        - /etc/grafana/grafana.ini
+        - /etc/grafana/provisioning
+        - /var/lib/grafana
+
- /etc/alertmanager + - /var/lib/alertmanager + - /var/lib/prometheus + - /etc/prometheus + failed_when: false + - name: check container hosts hosts: diff --git a/roles/ceph-container-common/tasks/main.yml b/roles/ceph-container-common/tasks/main.yml index 9062c072b..f6d1334e8 100644 --- a/roles/ceph-container-common/tasks/main.yml +++ b/roles/ceph-container-common/tasks/main.yml @@ -18,6 +18,7 @@ - name: include fetch_image.yml include_tasks: fetch_image.yml tags: fetch_container_image + when: containerized_deployment - name: get ceph version command: > @@ -27,10 +28,13 @@ changed_when: false check_mode: no register: ceph_version + when: containerized_deployment - name: set_fact ceph_version ceph_version.stdout.split set_fact: ceph_version: "{{ ceph_version.stdout.split(' ')[2] }}" + when: containerized_deployment - name: include release.yml include_tasks: release.yml + when: containerized_deployment diff --git a/roles/ceph-dashboard/defaults/main.yml b/roles/ceph-dashboard/defaults/main.yml new file mode 100644 index 000000000..c3838e083 --- /dev/null +++ b/roles/ceph-dashboard/defaults/main.yml @@ -0,0 +1,12 @@ +--- +dashboard_admin_user: admin +dashboard_admin_password: admin +# We only need this for SSL (https) connections +dashboard_crt: '' +dashboard_key: '' +dashboard_rgw_api_user_id: ceph-dashboard +dashboard_rgw_api_host: '' +dashboard_rgw_api_port: '' +dashboard_rgw_api_scheme: '' +dashboard_rgw_api_admin_resource: '' +dashboard_rgw_api_no_ssl_verify: '' diff --git a/roles/ceph-dashboard/meta/main.yml b/roles/ceph-dashboard/meta/main.yml new file mode 100644 index 000000000..464f131bc --- /dev/null +++ b/roles/ceph-dashboard/meta/main.yml @@ -0,0 +1,14 @@ +--- +galaxy_info: + company: Red Hat + author: Boris Ranto + description: Configures Ceph Dashboard + license: Apache + min_ansible_version: 2.4 + platforms: + - name: EL + versions: + - 7 + galaxy_tags: + - system +dependencies: [] diff --git a/roles/ceph-dashboard/tasks/configure_dashboard.yml 
b/roles/ceph-dashboard/tasks/configure_dashboard.yml
new file mode 100644
index 000000000..4780e03f0
--- /dev/null
+++ b/roles/ceph-dashboard/tasks/configure_dashboard.yml
@@ -0,0 +1,162 @@
+---
+- name: set mgr_prefix default
+  set_fact:
+    mgr_prefix: ""
+
+- block:
+    - name: check to see if the mgr is containerized
+      command: "{{ container_binary }} inspect ceph-mgr-{{ ansible_hostname }}"
+      register: mgr_container
+      failed_when: false
+      changed_when: false
+
+    - name: choose the correct container name
+      set_fact:
+        container_name: "{% if mgr_container.rc == 0 %}ceph-mgr-{{ ansible_hostname }}{% endif %}"
+
+    - name: prefix the mgr command with a {{ container_binary }} command
+      set_fact:
+        mgr_prefix: "{{ container_binary }} exec {{ container_name }}"
+      when: container_name != ""
+  when: container_binary != ""
+
+- name: disable SSL for dashboard
+  shell: |
+    {{ mgr_prefix }} ceph config set mgr mgr/dashboard/ssl false || \
+    {{ mgr_prefix }} ceph config-key set mgr/dashboard/ssl false
+  when: dashboard_protocol != "https"
+
+- name: enable SSL for dashboard
+  shell: |
+    {{ mgr_prefix }} ceph config set mgr mgr/dashboard/ssl true || \
+    {{ mgr_prefix }} ceph config-key set mgr/dashboard/ssl true
+  when: dashboard_protocol == "https"
+
+- name: copy dashboard SSL certificate file
+  copy:
+    src: "{{ dashboard_crt }}"
+    dest: "/etc/ceph/ceph-dashboard.crt"
+    owner: root
+    group: root
+    mode: 0644
+  when:
+    - dashboard_crt
+    - dashboard_protocol == "https"
+
+- name: copy dashboard SSL certificate key
+  copy:
+    src: "{{ dashboard_key }}"
+    dest: "/etc/ceph/ceph-dashboard.key"
+    owner: root
+    group: root
+    mode: 0600  # private key: readable by its owner (root) only, unlike the public certificate above
+  when:
+    - dashboard_key
+    - dashboard_protocol == "https"
+
+- name: generate a Self Signed OpenSSL certificate for dashboard
+  shell: |
+    test -f /etc/ceph/ceph-dashboard.key -a -f /etc/ceph/ceph-dashboard.crt || \
+    openssl req -new -nodes -x509 -subj '/O=IT/CN=ceph-dashboard' -days 3650 -keyout /etc/ceph/ceph-dashboard.key -out
/etc/ceph/ceph-dashboard.crt -extensions v3_ca
+  when:
+    - dashboard_protocol == "https"
+    - not dashboard_key or not dashboard_crt
+
+- name: import dashboard certificate file
+  command: "{{ mgr_prefix }} ceph config-key set mgr/dashboard/crt -i /etc/ceph/ceph-dashboard.crt"
+  changed_when: false
+  when: dashboard_protocol == "https"
+
+- name: import dashboard certificate key
+  command: "{{ mgr_prefix }} ceph config-key set mgr/dashboard/key -i /etc/ceph/ceph-dashboard.key"
+  changed_when: false
+  when: dashboard_protocol == "https"
+
+- name: "set the dashboard port ({{ dashboard_port }})"
+  shell: |
+    {{ mgr_prefix }} ceph config set mgr mgr/dashboard/server_port {{ dashboard_port }} || \
+    {{ mgr_prefix }} ceph config-key set mgr/dashboard/server_port {{ dashboard_port }}
+
+- name: disable mgr dashboard module (restart)
+  command: "{{ mgr_prefix }} ceph mgr module disable dashboard"
+  changed_when: false
+
+- name: enable mgr dashboard module (restart)
+  command: "{{ mgr_prefix }} ceph mgr module enable dashboard"
+  changed_when: false
+
+- name: set or update dashboard admin username and password
+  shell: |  # user and password go through 'quote' so spaces or shell metacharacters cannot break or alter the command
+    if {{ mgr_prefix }} ceph dashboard ac-user-show {{ dashboard_admin_user | quote }}; then
+      {{ mgr_prefix }} ceph dashboard ac-user-set-password {{ dashboard_admin_user | quote }} {{ dashboard_admin_password | quote }}
+    else
+      {{ mgr_prefix }} ceph dashboard ac-user-create {{ dashboard_admin_user | quote }} {{ dashboard_admin_password | quote }} administrator
+    fi
+  retries: 6
+  delay: 5
+  register: ac_result
+  until: ac_result.rc == 0
+
+- name: set grafana url
+  command: "{{ mgr_prefix }} ceph dashboard set-grafana-api-url {{ dashboard_protocol }}://{{ groups['grafana-server'][0] }}:3000/"
+  changed_when: false
+
+- name: set alertmanager host
+  command: "{{ mgr_prefix }} ceph dashboard set-alertmanager-api-host {{ dashboard_protocol }}://{{ groups['grafana-server'][0] }}:9093/"
+  changed_when: false
+
+- name: create radosgw system user
+  shell: "timeout 20 {{ mgr_prefix }} radosgw-admin
user create --uid={{ dashboard_rgw_api_user_id }} --display-name='Ceph dashboard' --system" + register: rgw_user_output + until: rgw_user_output.rc == 0 + retries: 3 + +- name: get the rgw access and secret keys + set_fact: + rgw_access_key: "{{ (rgw_user_output.stdout | from_json)['keys'][0]['access_key'] }}" + rgw_secret_key: "{{ (rgw_user_output.stdout | from_json)['keys'][0]['secret_key'] }}" + +- name: set the rgw user + command: "{{ mgr_prefix }} ceph dashboard set-rgw-api-user-id {{ dashboard_rgw_api_user_id }}" + changed_when: false + +- name: set the rgw access key + command: "{{ mgr_prefix }} ceph dashboard set-rgw-api-access-key {{ rgw_access_key }}" + changed_when: false + +- name: set the rgw secret key + command: "{{ mgr_prefix }} ceph dashboard set-rgw-api-secret-key {{ rgw_secret_key }}" + changed_when: false + +- name: set the rgw host + command: "{{ mgr_prefix }} ceph dashboard set-rgw-api-host {{ dashboard_rgw_api_host }}" + changed_when: false + when: dashboard_rgw_api_host + +- name: set the rgw port + command: "{{ mgr_prefix }} ceph dashboard set-rgw-api-port {{ dashboard_rgw_api_port }}" + changed_when: false + when: dashboard_rgw_api_port + +- name: set the rgw scheme + command: "{{ mgr_prefix }} ceph dashboard set-rgw-api-scheme {{ dashboard_rgw_api_scheme }}" + changed_when: false + when: dashboard_rgw_api_scheme + +- name: set the rgw admin resource + command: "{{ mgr_prefix }} ceph dashboard set-rgw-api-admin-resource {{ dashboard_rgw_api_admin_resource }}" + changed_when: false + when: dashboard_rgw_api_admin_resource + +- name: disable ssl verification for rgw + command: "{{ mgr_prefix }} ceph dashboard set-rgw-api-ssl-verify False" + changed_when: false + when: dashboard_rgw_api_no_ssl_verify + +- name: disable mgr dashboard module (restart) + command: "{{ mgr_prefix }} ceph mgr module disable dashboard" + changed_when: false + +- name: enable mgr dashboard module (restart) + command: "{{ mgr_prefix }} ceph mgr module enable 
dashboard" + changed_when: false diff --git a/roles/ceph-dashboard/tasks/main.yml b/roles/ceph-dashboard/tasks/main.yml new file mode 100644 index 000000000..e92cd7abf --- /dev/null +++ b/roles/ceph-dashboard/tasks/main.yml @@ -0,0 +1,7 @@ +--- +- name: include configure_dashboard.yml + include_tasks: configure_dashboard.yml + +- name: print dashboard URL + debug: + msg: "The dashboard has been deployed! You can access your dashboard web UI at {{ dashboard_protocol }}://{{ ansible_fqdn }}:{{ dashboard_port }}/ as an '{{ dashboard_admin_user }}' user with '{{ dashboard_admin_password }}' password." diff --git a/roles/ceph-defaults/defaults/main.yml b/roles/ceph-defaults/defaults/main.yml index adb468297..4236d01b2 100644 --- a/roles/ceph-defaults/defaults/main.yml +++ b/roles/ceph-defaults/defaults/main.yml @@ -61,6 +61,7 @@ ceph_mds_firewall_zone: public ceph_nfs_firewall_zone: public ceph_rbdmirror_firewall_zone: public ceph_iscsi_firewall_zone: public +ceph_dashboard_firewall_zone: public # Generate local ceph.conf in fetch directory ceph_conf_local: false @@ -679,6 +680,17 @@ openstack_keys: - { name: client.openstack, caps: { mon: "profile rbd", osd: "profile rbd pool={{ openstack_glance_pool.name }}, profile rbd pool={{ openstack_nova_pool.name }}, profile rbd pool={{ openstack_cinder_pool.name }}, profile rbd pool={{ openstack_cinder_backup_pool.name }}"}, mode: "0600" } +############# +# DASHBOARD # +############# +dashboard_enabled: False +dashboard_network_name: ceph-dashboard +# Choose http or https +# For https, you should set dashboard.crt/key and grafana.crt/key +dashboard_protocol: http +dashboard_port: 8234 + + ############### # DEPRECATION # ############### diff --git a/roles/ceph-grafana/defaults/main.yml b/roles/ceph-grafana/defaults/main.yml new file mode 100644 index 000000000..687c31ea5 --- /dev/null +++ b/roles/ceph-grafana/defaults/main.yml @@ -0,0 +1,17 @@ +--- +grafana_admin_user: admin +grafana_admin_password: admin +# We only need this 
for SSL (https) connections +grafana_crt: '' +grafana_key: '' +grafana_container_image: "grafana/grafana:latest" +grafana_container_cpu_period: 100000 +grafana_container_cpu_cores: 2 +# container_memory is in GB +grafana_container_memory: 4 +grafana_uid: 472 +grafana_datasource: Dashboard +grafana_dashboards_path: "/etc/grafana/dashboards/ceph-dashboard" +grafana_plugins: + - vonage-status-panel + - grafana-piechart-panel diff --git a/roles/ceph-grafana/files/grafana-server.service b/roles/ceph-grafana/files/grafana-server.service new file mode 100644 index 000000000..fab8f5191 --- /dev/null +++ b/roles/ceph-grafana/files/grafana-server.service @@ -0,0 +1,17 @@ +# This file is managed by ansible, don't make changes here - they will be +# overwritten. +[Unit] +Description=grafana-server +After=docker.service + +[Service] +EnvironmentFile=-/etc/environment +ExecStart=/usr/bin/docker start --attach grafana-server +ExecStop=-/usr/bin/docker stop grafana-server +Restart=always +RestartSec=10s +TimeoutStartSec=120 +TimeoutStopSec=15 + +[Install] +WantedBy=multi-user.target diff --git a/roles/ceph-grafana/files/grafana.list b/roles/ceph-grafana/files/grafana.list new file mode 100644 index 000000000..886da8d56 --- /dev/null +++ b/roles/ceph-grafana/files/grafana.list @@ -0,0 +1 @@ +deb https://packagecloud.io/grafana/stable/debian/ jessie main diff --git a/roles/ceph-grafana/handlers/main.yml b/roles/ceph-grafana/handlers/main.yml new file mode 100644 index 000000000..fd1bbd465 --- /dev/null +++ b/roles/ceph-grafana/handlers/main.yml @@ -0,0 +1,8 @@ +--- +- name: enable service + # We use the systemd module here so we can use the daemon_reload feature, + # since we're shipping the .service file ourselves + systemd: + name: grafana-server + daemon_reload: true + enabled: true diff --git a/roles/ceph-grafana/meta/main.yml b/roles/ceph-grafana/meta/main.yml new file mode 100644 index 000000000..76a6bd6fd --- /dev/null +++ b/roles/ceph-grafana/meta/main.yml @@ -0,0 +1,14 @@ 
+--- +galaxy_info: + company: Red Hat + author: Boris Ranto + description: Configures Grafana for Ceph Dashboard + license: Apache + min_ansible_version: 2.4 + platforms: + - name: EL + versions: + - 7 + galaxy_tags: + - system +dependencies: [] diff --git a/roles/ceph-grafana/tasks/configure_grafana.yml b/roles/ceph-grafana/tasks/configure_grafana.yml new file mode 100644 index 000000000..e031e171a --- /dev/null +++ b/roles/ceph-grafana/tasks/configure_grafana.yml @@ -0,0 +1,83 @@ +--- +- name: make sure grafana is down + service: + name: grafana-server + state: stopped + +- name: wait for grafana to be stopped + wait_for: + port: 3000 + state: stopped + +- name: make sure grafana configuration directories exist + file: + path: "{{ item }}" + state: directory + recurse: yes + with_items: + - "/etc/grafana/provisioning/datasources" + - "/etc/grafana/provisioning/dashboards" + +- name: write grafana.ini + template: + src: grafana.ini + dest: /etc/grafana/grafana.ini + mode: 0640 + +- name: write datasources provisioning config file + template: + src: datasources-ceph-dashboard.yml + dest: /etc/grafana/provisioning/datasources/ceph-dashboard.yml + mode: 0640 + +- name: Write dashboards provisioning config file + template: + src: dashboards-ceph-dashboard.yml + dest: /etc/grafana/provisioning/dashboards/ceph-dashboard.yml + mode: 0640 + +- name: copy grafana SSL certificate file + copy: + src: "{{ grafana_crt }}" + dest: "/etc/grafana/ceph-dashboard.crt" + mode: 0640 + when: + - grafana_crt + - dashboard_protocol == "https" + +- name: copy grafana SSL certificate key + copy: + src: "{{ grafana_key }}" + dest: "/etc/grafana/ceph-dashboard.key" + mode: 0640 + when: + - grafana_key + - dashboard_protocol == "https" + +- name: generate a Self Signed OpenSSL certificate for dashboard + shell: | + test -f /etc/grafana/ceph-dashboard.key -a -f /etc/grafana/ceph-dashboard.crt || \ + openssl req -new -nodes -x509 -subj '/O=IT/CN=ceph-grafana' -days 3650 -keyout 
/etc/grafana/ceph-dashboard.key -out /etc/grafana/ceph-dashboard.crt -extensions v3_ca + when: + - dashboard_protocol == "https" + - not grafana_key or not grafana_crt + +- name: set owner/group on /etc/grafana + file: + path: /etc/grafana + state: directory + # This is the UID used by the grafana container + owner: "{{ grafana_uid }}" + # This group is used by the grafana rpm + group: "grafana" + recurse: true + +- name: enable and start grafana + service: + name: grafana-server + state: restarted + enabled: true + +- name: wait for grafana to start + wait_for: + port: 3000 diff --git a/roles/ceph-grafana/tasks/main.yml b/roles/ceph-grafana/tasks/main.yml new file mode 100644 index 000000000..d125a5102 --- /dev/null +++ b/roles/ceph-grafana/tasks/main.yml @@ -0,0 +1,6 @@ +--- +- name: include setup_container.yml + include_tasks: setup_container.yml + +- name: include configure_grafana.yml + include_tasks: configure_grafana.yml diff --git a/roles/ceph-grafana/tasks/setup_container.yml b/roles/ceph-grafana/tasks/setup_container.yml new file mode 100644 index 000000000..f0c164028 --- /dev/null +++ b/roles/ceph-grafana/tasks/setup_container.yml @@ -0,0 +1,64 @@ +--- +- name: include ceph-container-common + include_role: + name: ceph-container-common + allow_duplicates: false + +- name: create grafana user + user: + name: grafana + shell: '/bin/false' + createhome: false + system: true + +- name: create /etc/grafana and /var/lib/grafana + file: + path: "{{ item }}" + state: directory + owner: "{{ grafana_uid }}" + recurse: true + with_items: + - /etc/grafana + - /var/lib/grafana + +- name: make sure the grafana-server service is down + service: + name: grafana-server + state: stopped + failed_when: false + +- name: create docker container + docker_container: + name: grafana-server + image: "{{ grafana_container_image }}" + state: present + # restart to allow updates + restart: true + restart_policy: no + force_kill: yes + published_ports: '3000:3000' + detach: true + 
volumes:
+      - "/etc/grafana:/etc/grafana:Z"
+      - "/var/lib/grafana:/var/lib/grafana:Z"
+    networks:
+      - name: "{{ dashboard_network_name }}"
+    keep_volumes: true
+    pull: true
+    cpu_period: "{{ grafana_container_cpu_period }}"
+    # As of ansible-2.5.2, this module doesn't support the equivalent of the
+    # --cpus flag, so we must use period/quota for now
+    cpu_quota: "{{ grafana_container_cpu_period * grafana_container_cpu_cores }}"
+    memory: "{{ grafana_container_memory }}GB"
+    memory_swap: "{{ grafana_container_memory * 2 }}GB"
+    env:
+      GF_INSTALL_PLUGINS: "{{ grafana_plugins|join(',') }}"
+
+- name: ship systemd service
+  copy:
+    src: grafana-server.service
+    dest: "/etc/systemd/system/"
+    owner: root
+    group: root
+    mode: "0644"  # quoted so the YAML loader cannot retype the octal mode
+  notify: enable service
diff --git a/roles/ceph-grafana/templates/dashboards-ceph-dashboard.yml b/roles/ceph-grafana/templates/dashboards-ceph-dashboard.yml
new file mode 100644
index 000000000..64dbf1d1a
--- /dev/null
+++ b/roles/ceph-grafana/templates/dashboards-ceph-dashboard.yml
@@ -0,0 +1,12 @@
+apiVersion: 1
+
+providers:
+- name: 'Ceph Dashboard'
+  orgId: 1
+  folder: 'ceph-dashboard'
+  type: file
+  disableDeletion: false
+  updateIntervalSeconds: 3
+  editable: false
+  options:
+    path: '{{ grafana_dashboards_path }}'
diff --git a/roles/ceph-grafana/templates/datasources-ceph-dashboard.yml b/roles/ceph-grafana/templates/datasources-ceph-dashboard.yml
new file mode 100644
index 000000000..b2947b72a
--- /dev/null
+++ b/roles/ceph-grafana/templates/datasources-ceph-dashboard.yml
@@ -0,0 +1,26 @@
+apiVersion: 1
+
+# list of datasources that should be deleted from the database
+deleteDatasources:
+  - name: '{{ grafana_datasource }}'
+    orgId: 1
+
+# list of datasources to insert/update depending on
+# what's available in the database
+datasources:
+  # name of the datasource. Required
+- name: '{{ grafana_datasource }}'
+  # datasource type. Required
+  type: 'prometheus'
+  # access mode. proxy or direct (Server or Browser in the UI). Required
+  access: 'proxy'
+  # org id. will default to orgId 1 if not specified
+  orgId: 1
+  # url
+  url: 'http://prometheus:9090'
+  # enable/disable basic auth
+  basicAuth: false
+  # mark as default datasource. Max one per org
+  isDefault: true
+  # allow users to edit datasources from the UI.
+  editable: false
diff --git a/roles/ceph-grafana/templates/grafana.ini b/roles/ceph-grafana/templates/grafana.ini
new file mode 100644
index 000000000..0ea67e9fa
--- /dev/null
+++ b/roles/ceph-grafana/templates/grafana.ini
@@ -0,0 +1,26 @@
+# [server]
+# root_url = %(protocol)s://%(domain)s:%(http_port)s/api/grafana/proxy
+
+[users]
+default_theme = light
+
+#################################### Anonymous Auth ##########################
+[auth.anonymous]
+# enable anonymous access
+enabled = true
+
+# specify organization name that should be used for unauthenticated users
+org_name = Main Org.
+
+# specify role for unauthenticated users
+org_role = Viewer
+
+[server]
+cert_file = /etc/grafana/ceph-dashboard.crt
+cert_key = /etc/grafana/ceph-dashboard.key
+domain = {{ ansible_fqdn }}
+protocol = {{ dashboard_protocol }}
+
+[security]
+admin_user = {{ grafana_admin_user }}
+admin_password = {{ grafana_admin_password }}
diff --git a/roles/ceph-grafana/templates/grafana.repo b/roles/ceph-grafana/templates/grafana.repo
new file mode 100644
index 000000000..1ba7fb6ff
--- /dev/null
+++ b/roles/ceph-grafana/templates/grafana.repo
@@ -0,0 +1,9 @@
+[grafana]
+name=grafana
+baseurl=https://packagecloud.io/grafana/stable/el/{{ ansible_distribution_major_version }}/$basearch
+repo_gpgcheck=1
+enabled=1
+gpgcheck=1
+gpgkey=https://packagecloud.io/gpg.key https://grafanarel.s3.amazonaws.com/RPM-GPG-KEY-grafana
+sslverify=1
+sslcacert=/etc/pki/tls/certs/ca-bundle.crt
diff --git a/roles/ceph-handler/handlers/main.yml b/roles/ceph-handler/handlers/main.yml
index 9333b9cc2..2a9f16dc7 100644
--- a/roles/ceph-handler/handlers/main.yml
+++ b/roles/ceph-handler/handlers/main.yml
@@ -458,3 +458,13 @@
       set_fact:
         _rbd_target_api_handler_called: False
       listen: "restart ceph rbd-target-api"
+
+    - name: restart node-exporter service
+      listen: "restart node-exporter service"
+      # We use the systemd module here so we can use the daemon_reload feature,
+      # since we're shipping the .service file ourselves
+      systemd:
+        name: 'node_exporter'
+        daemon_reload: true
+        enabled: true
+        state: restarted
diff --git a/roles/ceph-infra/tasks/configure_firewall.yml b/roles/ceph-infra/tasks/configure_firewall.yml
index dc89664e4..3f1dbb6b4 100644
--- a/roles/ceph-infra/tasks/configure_firewall.yml
+++ b/roles/ceph-infra/tasks/configure_firewall.yml
@@ -155,4 +155,38 @@
       - iscsi_gw_group_name in group_names
     tags: firewall
 
+  - block:
+    - name: open grafana port
+      firewalld:
+        port: "3000/tcp"
+        zone: "{{ ceph_dashboard_firewall_zone }}"
+        permanent: true
+        immediate: true
+        state: enabled
+
+    - name: open node_exporter port
+      firewalld:
+        port: "9100/tcp"
+        zone: "{{ ceph_dashboard_firewall_zone }}"
+        permanent: true
+        immediate: true
+        state: enabled
+
+    - name: open mgr/prometheus port
+      firewalld:
+        port: "9283/tcp"
+        zone: "{{ ceph_dashboard_firewall_zone }}"
+        permanent: true
+        immediate: true
+        state: enabled
+
+    - name: open dashboard port
+      firewalld:
+        port: "{{ dashboard_port }}/tcp"
+        zone: "{{ ceph_dashboard_firewall_zone }}"
+        permanent: true
+        immediate: true
+        state: enabled
+    when: dashboard_enabled | bool  # cast: the value may arrive as a string (e.g. via -e)
+
 - meta: flush_handlers
diff --git a/roles/ceph-mgr/tasks/main.yml b/roles/ceph-mgr/tasks/main.yml
index 1acda2f56..29444d0b8 100644
--- a/roles/ceph-mgr/tasks/main.yml
+++ b/roles/ceph-mgr/tasks/main.yml
@@ -17,6 +17,6 @@
 - name: include mgr_modules.yml
   include_tasks: mgr_modules.yml
   when:
-    - ceph_mgr_modules | length > 0
+    - ceph_mgr_modules | length > 0 or dashboard_enabled | bool
     - ((groups[mgr_group_name] | default([]) | length == 0 and inventory_hostname == groups[mon_group_name] | last) or
       (groups[mgr_group_name] | default([]) | length > 0 and inventory_hostname == groups[mgr_group_name] | last))
diff --git a/roles/ceph-mgr/tasks/pre_requisite.yml b/roles/ceph-mgr/tasks/pre_requisite.yml
index 81bc623fa..b3bc689bf 100644
--- a/roles/ceph-mgr/tasks/pre_requisite.yml
+++ b/roles/ceph-mgr/tasks/pre_requisite.yml
@@ -7,6 +7,15 @@
   until: result is succeeded
   when: ansible_os_family in ['RedHat', 'Suse']
 
+- name: install ceph-grafana-dashboards package on RedHat or SUSE
+  package:
+    name: ceph-grafana-dashboards
+    state: "{{ (upgrade_ceph_packages|bool) | ternary('latest','present') }}"
+  register: result
+  until: result is succeeded
+  when:
+    - ansible_os_family in ['RedHat', 'Suse'] and dashboard_enabled | bool  # skip when the dashboard is off
+
 - name: install ceph-mgr packages for debian
   apt:
     name: '{{ ceph_mgr_packages }}'
diff --git a/roles/ceph-node-exporter/defaults/main.yml b/roles/ceph-node-exporter/defaults/main.yml
new file mode 100644
index 000000000..1f150b93e
--- /dev/null
+++ b/roles/ceph-node-exporter/defaults/main.yml
@@ -0,0 +1,2 @@
+---
+node_exporter_container_image: prom/node-exporter:latest
diff --git a/roles/ceph-node-exporter/files/node_exporter.service b/roles/ceph-node-exporter/files/node_exporter.service
new file mode 100644
index 000000000..ebf57b162
--- /dev/null
+++ b/roles/ceph-node-exporter/files/node_exporter.service
@@ -0,0 +1,20 @@
+# This file is managed by ansible, don't make changes here - they will be
+# overwritten.
+[Unit]
+Description=Node Exporter
+After=docker.service
+
+[Service]
+EnvironmentFile=-/etc/environment
+ExecStart=/usr/bin/docker start --attach node-exporter
+# Make sure the cfg80211 is loaded before running the container, the node
+# exporter needs this module loaded to test for presence of wi-fi devices
+ExecStartPre=-/usr/sbin/modprobe cfg80211
+ExecStop=-/usr/bin/docker stop node-exporter
+Restart=always
+RestartSec=10s
+TimeoutStartSec=120
+TimeoutStopSec=15
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/ceph-node-exporter/meta/main.yml b/roles/ceph-node-exporter/meta/main.yml
new file mode 100644
index 000000000..633df08e3
--- /dev/null
+++ b/roles/ceph-node-exporter/meta/main.yml
@@ -0,0 +1,14 @@
+---
+galaxy_info:
+  company: Red Hat
+  author: Boris Ranto
+  description: Configures Prometheus Node Exporter
+  license: Apache
+  min_ansible_version: "2.4"  # quoted: a version string, not a float
+  platforms:
+    - name: EL
+      versions:
+        - 7
+  galaxy_tags:
+    - system
+dependencies: []
diff --git a/roles/ceph-node-exporter/tasks/main.yml b/roles/ceph-node-exporter/tasks/main.yml
new file mode 100644
index 000000000..c18707688
--- /dev/null
+++ b/roles/ceph-node-exporter/tasks/main.yml
@@ -0,0 +1,3 @@
+---
+- name: include setup_container.yml
+  include_tasks: setup_container.yml
diff --git a/roles/ceph-node-exporter/tasks/setup_container.yml b/roles/ceph-node-exporter/tasks/setup_container.yml
new file mode 100644
index 000000000..09035e28f
--- /dev/null
+++ b/roles/ceph-node-exporter/tasks/setup_container.yml
@@ -0,0 +1,42 @@
+---
+- name: include ceph-container-common
+  include_role:
+    name: ceph-container-common
+    allow_duplicates: false
+
+- name: make sure the node_exporter service is down
+  service:
+    name: node_exporter
+    state: stopped
+  failed_when: false
+
+- name: start docker container
+  docker_container:
+    name: node-exporter
+    image: "{{ node_exporter_container_image }}"
+    state: started
+    command:
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      - '--no-collector.timex'
+    # restart to allow updates
+    restart: true
+    restart_policy: "no"  # quoted: a bare no is loaded as boolean false
+    force_kill: true
+    detach: true
+    volumes:
+      - '/proc:/host/proc:ro'
+      - '/sys:/host/sys:ro'
+    network_mode: host
+    keep_volumes: true
+    pull: true
+  notify: restart node-exporter service
+
+- name: ship systemd service
+  copy:
+    src: node_exporter.service
+    dest: "/etc/systemd/system/"
+    owner: root
+    group: root
+    mode: "0644"  # quoted so the YAML loader cannot retype the octal mode
+  notify: restart node-exporter service
diff --git a/roles/ceph-prometheus/defaults/main.yml b/roles/ceph-prometheus/defaults/main.yml
new file mode 100644
index 000000000..4e92142d0
--- /dev/null
+++ b/roles/ceph-prometheus/defaults/main.yml
@@ -0,0 +1,17 @@
+---
+prometheus_container_image: prom/prometheus:latest
+prometheus_container_cpu_period: 100000
+prometheus_container_cpu_cores: 2
+# container_memory is in GB
+prometheus_container_memory: 4
+prometheus_data_dir: /var/lib/prometheus
+prometheus_conf_dir: /etc/prometheus
+prometheus_user_id: '65534'  # This is the UID used by the prom/prometheus docker image
+
+alertmanager_container_image: prom/alertmanager:latest
+alertmanager_container_cpu_period: 100000
+alertmanager_container_cpu_cores: 2
+# container_memory is in GB
+alertmanager_container_memory: 4
+alertmanager_data_dir: /var/lib/alertmanager
+alertmanager_conf_dir: /etc/alertmanager
diff --git a/roles/ceph-prometheus/files/alertmanager.service b/roles/ceph-prometheus/files/alertmanager.service
new file mode 100644
index 000000000..2683c231f
--- /dev/null
+++ b/roles/ceph-prometheus/files/alertmanager.service
@@ -0,0 +1,17 @@
+# This file is managed by ansible, don't make changes here - they will be
+# overwritten.
+[Unit]
+Description=alertmanager
+After=docker.service
+
+[Service]
+EnvironmentFile=-/etc/environment
+ExecStart=/usr/bin/docker start --attach alertmanager
+ExecStop=-/usr/bin/docker stop alertmanager
+Restart=always
+RestartSec=10s
+TimeoutStartSec=120
+TimeoutStopSec=15
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/ceph-prometheus/files/prometheus.service b/roles/ceph-prometheus/files/prometheus.service
new file mode 100644
index 000000000..147093542
--- /dev/null
+++ b/roles/ceph-prometheus/files/prometheus.service
@@ -0,0 +1,17 @@
+# This file is managed by ansible, don't make changes here - they will be
+# overwritten.
+[Unit]
+Description=prometheus
+After=docker.service
+
+[Service]
+EnvironmentFile=-/etc/environment
+ExecStart=/usr/bin/docker start --attach prometheus
+ExecStop=-/usr/bin/docker stop prometheus
+Restart=always
+RestartSec=10s
+TimeoutStartSec=120
+TimeoutStopSec=15
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/ceph-prometheus/handlers/main.yml b/roles/ceph-prometheus/handlers/main.yml
new file mode 100644
index 000000000..1e84e565a
--- /dev/null
+++ b/roles/ceph-prometheus/handlers/main.yml
@@ -0,0 +1,12 @@
+---
+- name: service handler
+  # We use the systemd module here so we can use the daemon_reload feature,
+  # since we're shipping the .service file ourselves
+  systemd:
+    name: "{{ item }}"
+    daemon_reload: true
+    enabled: true
+    state: restarted
+  with_items:
+    - 'alertmanager'
+    - 'prometheus'
diff --git a/roles/ceph-prometheus/meta/main.yml b/roles/ceph-prometheus/meta/main.yml
new file mode 100644
index 000000000..e97ea337d
--- /dev/null
+++ b/roles/ceph-prometheus/meta/main.yml
@@ -0,0 +1,3 @@
+---
+dependencies:
+  - role: ceph-defaults
diff --git a/roles/ceph-prometheus/tasks/main.yml b/roles/ceph-prometheus/tasks/main.yml
new file mode 100644
index 000000000..aaa03099c
--- /dev/null
+++ b/roles/ceph-prometheus/tasks/main.yml
@@ -0,0 +1,35 @@
+---
+- name: create prometheus directories
+  file:
+    path: "{{ item }}"
+    state: directory
+    owner: "{{ prometheus_user_id }}"
+  with_items:
+    - "{{ prometheus_conf_dir }}"
+    - "{{ prometheus_data_dir }}"
+
+- name: write prometheus config file
+  template:
+    src: prometheus.yml
+    dest: "{{ prometheus_conf_dir }}/"
+    owner: "{{ prometheus_user_id }}"
+  notify: service handler
+
+- name: create alertmanager directories
+  file:
+    path: "{{ item }}"
+    state: directory
+    owner: "root"
+  with_items:
+    - "{{ alertmanager_conf_dir }}"
+    - "{{ alertmanager_data_dir }}"
+
+- name: write alertmanager config file
+  template:
+    src: alertmanager.yml
+    dest: "{{ alertmanager_conf_dir }}/"
+    owner: "root"
+  notify: service handler
+
+- name: include setup_container.yml
+  include_tasks: setup_container.yml
diff --git a/roles/ceph-prometheus/tasks/setup_container.yml b/roles/ceph-prometheus/tasks/setup_container.yml
new file mode 100644
index 000000000..2574ab4f8
--- /dev/null
+++ b/roles/ceph-prometheus/tasks/setup_container.yml
@@ -0,0 +1,93 @@
+---
+- name: include ceph-container-common
+  include_role:
+    name: ceph-container-common
+    allow_duplicates: false
+
+- name: make sure the alertmanager service is down
+  service:
+    name: alertmanager
+    state: stopped
+  failed_when: false
+
+- name: start alertmanager container
+  docker_container:
+    name: alertmanager
+    image: "{{ alertmanager_container_image }}"
+    state: started
+    command:
+      - '--config.file=/etc/alertmanager/alertmanager.yml'
+      - '--storage.path=/alertmanager'
+    # restart to allow updates
+    restart: true
+    restart_policy: "no"  # quoted: a bare no is loaded as boolean false
+    force_kill: true
+    published_ports: '9093:9093'
+    detach: true
+    volumes:
+      - "{{ alertmanager_conf_dir }}:/etc/alertmanager:Z"
+      - "{{ alertmanager_data_dir }}:/alertmanager:Z"
+    networks:
+      - name: "{{ dashboard_network_name }}"
+    keep_volumes: true
+    pull: true
+    cpu_period: "{{ alertmanager_container_cpu_period }}"
+    # As of ansible-2.5.2, this module doesn't support the equivalent of the
+    # --cpus flag, so we must use period/quota for now
+    cpu_quota: "{{ alertmanager_container_cpu_period * alertmanager_container_cpu_cores }}"
+    #memory: 0
+    #memory_swap: 0
+    memory: "{{ alertmanager_container_memory }}GB"
+    memory_swap: "{{ alertmanager_container_memory * 2 }}GB"
+  notify: service handler
+
+- name: make sure the prometheus service is down
+  service:
+    name: prometheus
+    state: stopped
+  failed_when: false
+
+- name: start prometheus docker container
+  docker_container:
+    name: prometheus
+    image: "{{ prometheus_container_image }}"
+    state: started
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--web.external-url=http://{{ inventory_hostname }}:9090/'
+    # restart to allow updates
+    restart: true
+    restart_policy: "no"  # quoted: a bare no is loaded as boolean false
+    force_kill: true
+    published_ports: '9090:9090'
+    detach: true
+    volumes:
+      - "{{ prometheus_conf_dir }}:/etc/prometheus:Z"
+      - "{{ prometheus_data_dir }}:/prometheus:Z"
+    networks:
+      - name: "{{ dashboard_network_name }}"
+    user: "{{ prometheus_user_id }}"
+    keep_volumes: true
+    pull: true
+    cpu_period: "{{ prometheus_container_cpu_period }}"
+    # As of ansible-2.5.2, this module doesn't support the equivalent of the
+    # --cpus flag, so we must use period/quota for now
+    cpu_quota: "{{ prometheus_container_cpu_period * prometheus_container_cpu_cores }}"
+    #memory: 0
+    #memory_swap: 0
+    memory: "{{ prometheus_container_memory }}GB"
+    memory_swap: "{{ prometheus_container_memory * 2 }}GB"
+  notify: service handler
+
+- name: ship systemd services
+  copy:
+    src: "{{ item }}"
+    dest: "/etc/systemd/system/"
+    owner: root
+    group: root
+    mode: "0644"  # quoted so the YAML loader cannot retype the octal mode
+  with_items:
+    - 'alertmanager.service'
+    - 'prometheus.service'
+  notify: service handler
diff --git a/roles/ceph-prometheus/templates/alertmanager.yml b/roles/ceph-prometheus/templates/alertmanager.yml
new file mode 100644
index 000000000..4408de0aa
--- /dev/null
+++ b/roles/ceph-prometheus/templates/alertmanager.yml
@@ -0,0 +1,15 @@
+global:
+  resolve_timeout: 5m
+
+route:
+  group_by: ['alertname']
+  group_wait: 10s
+  group_interval: 10s
+  repeat_interval: 1h
+  receiver: 'ceph-dashboard'
+receivers:
+- name: 'ceph-dashboard'
+  webhook_configs:
+{% for host in groups['mgrs'] | default(groups['mons'], true) %}
+  - url: '{{ dashboard_protocol }}://{{ host }}:{{ dashboard_port }}/api/prometheus_receiver'
+{% endfor %}
diff --git a/roles/ceph-prometheus/templates/prometheus.yml b/roles/ceph-prometheus/templates/prometheus.yml
new file mode 100644
index 000000000..860eb5e6c
--- /dev/null
+++ b/roles/ceph-prometheus/templates/prometheus.yml
@@ -0,0 +1,47 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+rule_files:
+  - '/etc/prometheus/alerts/*'
+
+scrape_configs:
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+  - job_name: 'ceph'
+    honor_labels: true
+    static_configs:
+{% for host in groups['mgrs'] | default(groups['mons'], true) %}
+      - targets: ['{{ host }}:9283']
+        labels:
+          instance: 'ceph_cluster'
+{% endfor %}
+  - job_name: 'node'
+    static_configs:
+{% for host in (groups['all'] | difference(groups['grafana-server'])) %}
+      - targets: ['{{ host }}:9100']
+        labels:
+          instance: "{{ hostvars[host]['ansible_nodename'] }}"
+{% endfor %}
+  - job_name: 'grafana'
+    static_configs:
+{% for host in groups['grafana-server'] %}
+      - targets: ['{{ host }}:9100']
+        labels:
+          instance: "{{ hostvars[host]['ansible_nodename'] }}"
+{% endfor %}
+{% if 'iscsigws' in groups %}
+  - job_name: 'iscsi-gws'
+    static_configs:
+{% for host in groups['iscsigws'] %}
+      - targets: ['{{ host }}:9287']
+        labels:
+          instance: "{{ hostvars[host]['ansible_nodename'] }}"
+{% endfor %}
+{% endif %}
+alerting:
+  alertmanagers:
+  - scheme: http
+    static_configs:
+    - targets: ['alertmanager:9093']
diff --git a/site-container.yml.sample b/site-container.yml.sample
index 516116951..a76394f5f 100644
--- a/site-container.yml.sample
+++ b/site-container.yml.sample
@@ -13,6 +13,7 @@
   - iscsigws
   - iscsi-gws # for backward compatibility only!
- mgrs + - grafana-server gather_facts: false become: True @@ -113,6 +114,9 @@ tags: ['ceph_update_config'] - import_role: name: ceph-handler + - import_role: + name: ceph-node-exporter + when: dashboard_enabled - import_role: name: ceph-container-common - import_role: @@ -157,6 +161,9 @@ tags: ['ceph_update_config'] - import_role: name: ceph-handler + - import_role: + name: ceph-node-exporter + when: dashboard_enabled - import_role: name: ceph-container-common - import_role: @@ -196,6 +203,9 @@ tags: ['ceph_update_config'] - import_role: name: ceph-handler + - import_role: + name: ceph-node-exporter + when: dashboard_enabled - import_role: name: ceph-container-common - import_role: @@ -235,6 +245,9 @@ tags: ['ceph_update_config'] - import_role: name: ceph-handler + - import_role: + name: ceph-node-exporter + when: dashboard_enabled - import_role: name: ceph-container-common - import_role: @@ -274,6 +287,9 @@ tags: ['ceph_update_config'] - import_role: name: ceph-handler + - import_role: + name: ceph-node-exporter + when: dashboard_enabled - import_role: name: ceph-container-common - import_role: @@ -313,6 +329,9 @@ tags: ['ceph_update_config'] - import_role: name: ceph-handler + - import_role: + name: ceph-node-exporter + when: dashboard_enabled - import_role: name: ceph-container-common - import_role: @@ -352,6 +371,9 @@ tags: ['ceph_update_config'] - import_role: name: ceph-handler + - import_role: + name: ceph-node-exporter + when: dashboard_enabled - import_role: name: ceph-container-common - import_role: @@ -391,6 +413,9 @@ tags: ['ceph_update_config'] - import_role: name: ceph-handler + - import_role: + name: ceph-node-exporter + when: dashboard_enabled - import_role: name: ceph-container-common when: inventory_hostname == groups.get('clients', ['']) | first @@ -433,6 +458,9 @@ tags: ['ceph_update_config'] - import_role: name: ceph-handler + - import_role: + name: ceph-node-exporter + when: dashboard_enabled - import_role: name: ceph-container-common - 
import_role: @@ -482,3 +510,45 @@ delegate_to: "{{ groups[mon_group_name][0] }}" run_once: true when: not ceph_status.failed + +- hosts: grafana-server + become: true + tasks: + - import_role: + name: ceph-defaults + tags: ['ceph_update_config'] + when: dashboard_enabled + - import_role: + name: ceph-facts + tags: ['ceph_update_config'] + when: dashboard_enabled + - import_role: + name: ceph-handler + when: dashboard_enabled + - import_role: + name: ceph-node-exporter + when: dashboard_enabled + - import_role: + name: ceph-common + when: dashboard_enabled + - import_role: + name: ceph-config + tags: ['ceph_update_config'] + when: dashboard_enabled + - import_role: + name: ceph-prometheus + when: dashboard_enabled + - import_role: + name: ceph-grafana + when: dashboard_enabled + +- hosts: '{{ (groups["mgrs"] | default(groups["mons"]))[0] }}' + become: true + tasks: + - import_role: + name: ceph-defaults + tags: ['ceph_update_config'] + when: dashboard_enabled + - import_role: + name: ceph-dashboard + when: dashboard_enabled diff --git a/site.yml.sample b/site.yml.sample index 9aa1923c8..23250a79a 100644 --- a/site.yml.sample +++ b/site.yml.sample @@ -13,6 +13,7 @@ - mgrs - iscsigws - iscsi-gws # for backward compatibility only! 
+  - grafana-server
 
   gather_facts: false
   any_errors_fatal: true
@@ -92,6 +93,9 @@
       tags: ['ceph_update_config']
     - import_role:
         name: ceph-handler
+    - import_role:
+        name: ceph-node-exporter
+      when: dashboard_enabled | bool
     - import_role:
         name: ceph-common
     - import_role:
@@ -134,6 +138,9 @@
       tags: ['ceph_update_config']
     - import_role:
         name: ceph-handler
+    - import_role:
+        name: ceph-node-exporter
+      when: dashboard_enabled | bool
     - import_role:
         name: ceph-common
     - import_role:
@@ -173,6 +180,9 @@
       tags: ['ceph_update_config']
     - import_role:
         name: ceph-handler
+    - import_role:
+        name: ceph-node-exporter
+      when: dashboard_enabled | bool
     - import_role:
         name: ceph-common
     - import_role:
@@ -212,6 +222,9 @@
       tags: ['ceph_update_config']
     - import_role:
         name: ceph-handler
+    - import_role:
+        name: ceph-node-exporter
+      when: dashboard_enabled | bool
     - import_role:
         name: ceph-common
     - import_role:
@@ -251,6 +264,9 @@
       tags: ['ceph_update_config']
     - import_role:
         name: ceph-handler
+    - import_role:
+        name: ceph-node-exporter
+      when: dashboard_enabled | bool
     - import_role:
         name: ceph-common
     - import_role:
@@ -290,6 +306,9 @@
       tags: ['ceph_update_config']
     - import_role:
         name: ceph-handler
+    - import_role:
+        name: ceph-node-exporter
+      when: dashboard_enabled | bool
     - import_role:
         name: ceph-common
     - import_role:
@@ -329,6 +348,9 @@
       tags: ['ceph_update_config']
     - import_role:
         name: ceph-handler
+    - import_role:
+        name: ceph-node-exporter
+      when: dashboard_enabled | bool
     - import_role:
         name: ceph-common
     - import_role:
@@ -368,6 +390,9 @@
       tags: ['ceph_update_config']
     - import_role:
         name: ceph-handler
+    - import_role:
+        name: ceph-node-exporter
+      when: dashboard_enabled | bool
     - import_role:
         name: ceph-common
     - import_role:
@@ -407,6 +432,9 @@
       tags: ['ceph_update_config']
     - import_role:
         name: ceph-handler
+    - import_role:
+        name: ceph-node-exporter
+      when: dashboard_enabled | bool
     - import_role:
         name: ceph-common
     - import_role:
@@ -448,6 +476,9 @@
       tags: ['ceph_update_config']
     - import_role:
         name: ceph-handler
+    - import_role:
+        name: ceph-node-exporter
+      when: dashboard_enabled | bool
     - import_role:
         name: ceph-common
     - import_role:
@@ -485,3 +516,45 @@
       delegate_to: "{{ groups[mon_group_name][0] }}"
       run_once: true
       when: not ceph_status.failed
+
+- hosts: grafana-server
+  become: true
+  tasks:
+    - import_role:
+        name: ceph-defaults
+      tags: ['ceph_update_config']
+      when: dashboard_enabled | bool
+    - import_role:
+        name: ceph-facts
+      tags: ['ceph_update_config']
+      when: dashboard_enabled | bool
+    - import_role:
+        name: ceph-handler
+      when: dashboard_enabled | bool
+    - import_role:
+        name: ceph-node-exporter
+      when: dashboard_enabled | bool
+    - import_role:
+        name: ceph-common
+      when: dashboard_enabled | bool
+    - import_role:
+        name: ceph-config
+      tags: ['ceph_update_config']
+      when: dashboard_enabled | bool
+    - import_role:
+        name: ceph-prometheus
+      when: dashboard_enabled | bool
+    - import_role:
+        name: ceph-grafana
+      when: dashboard_enabled | bool
+
+- hosts: '{{ (groups["mgrs"] | default(groups["mons"], true))[0] }}'  # also falls back when mgrs is empty
+  become: true
+  tasks:
+    - import_role:
+        name: ceph-defaults
+      tags: ['ceph_update_config']
+      when: dashboard_enabled | bool
+    - import_role:
+        name: ceph-dashboard
+      when: dashboard_enabled | bool