infra: add playbook to purge dashboard/monitoring

The dashboard/monitoring stack can be deployed via the dashboard_enabled
variable. But there's nothing similar if we want to remove only that
part and keep the Ceph cluster up and running.
The current purge playbooks remove everything.

Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1786691

Signed-off-by: Dimitri Savineau <dsavinea@redhat.com>
(cherry picked from commit 8e4ef7d6da)
pull/6749/head
Dimitri Savineau 2021-07-05 14:07:05 -04:00 committed by Guillaume Abrioux
parent 4695df6d2b
commit c8ca73f620
2 changed files with 227 additions and 1 deletions

View File

@ -0,0 +1,205 @@
---
# This playbook purges the Ceph MGR Dashboard and Monitoring
# (alertmanager/prometheus/grafana/node-exporter) stack.
# It removes: packages, configuration files and ALL THE DATA
#
# Use it like this:
# ansible-playbook purge-dashboard.yml
#     Prompts for confirmation to purge, defaults to no and
#     doesn't purge anything. yes purges the dashboard and
#     monitoring stack.
#
# ansible-playbook -e ireallymeanit=yes|no purge-dashboard.yml
#     Overrides the prompt using -e option. Can be used in
#     automation scripts to avoid interactive prompt.

# Safety gate: abort unless the operator explicitly confirms, either
# interactively or via `-e ireallymeanit=yes`.
- name: confirm whether user really meant to purge the dashboard
  hosts: localhost
  gather_facts: false
  vars_prompt:
    - name: ireallymeanit
      prompt: Are you sure you want to purge the dashboard?
      default: 'no'
      private: false
  tasks:
    - name: exit playbook, if user did not mean to purge dashboard
      fail:
        msg: >
          "Exiting purge-dashboard playbook, dashboard was NOT purged.
           To purge the dashboard, either say 'yes' on the prompt or
           use `-e ireallymeanit=yes` on the command line when
           invoking the playbook"
      when: ireallymeanit != 'yes'
# Collect facts from every Ceph host group once, so the purge plays
# below (which run with gather_facts: false) can reference them.
- name: gather facts on all hosts
  hosts:
    - "{{ mon_group_name|default('mons') }}"
    - "{{ osd_group_name|default('osds') }}"
    - "{{ mds_group_name|default('mdss') }}"
    - "{{ rgw_group_name|default('rgws') }}"
    - "{{ rbdmirror_group_name|default('rbdmirrors') }}"
    - "{{ nfs_group_name|default('nfss') }}"
    - "{{ client_group_name|default('clients') }}"
    - "{{ mgr_group_name|default('mgrs') }}"
    - "{{ grafana_server_group_name | default('grafana-server') }}"
  become: true
  tasks:
    # Native block-style module args instead of the key=value shorthand.
    - debug:
        msg: gather facts on all Ceph hosts for following reference
# node-exporter runs on every Ceph host, so this play targets all groups.
- name: purge node exporter
  hosts:
    - "{{ mon_group_name|default('mons') }}"
    - "{{ osd_group_name|default('osds') }}"
    - "{{ mds_group_name|default('mdss') }}"
    - "{{ rgw_group_name|default('rgws') }}"
    - "{{ rbdmirror_group_name|default('rbdmirrors') }}"
    - "{{ nfs_group_name|default('nfss') }}"
    - "{{ client_group_name|default('clients') }}"
    - "{{ mgr_group_name|default('mgrs') }}"
    - "{{ grafana_server_group_name | default('grafana-server') }}"
  gather_facts: false
  become: true
  tasks:
    - import_role:
        name: ceph-defaults

    # Sets the container_binary fact (docker/podman) used below.
    - import_role:
        name: ceph-facts
        tasks_from: container_binary

    # failed_when: false — the service may not exist on every host.
    - name: disable node_exporter service
      service:
        name: node_exporter
        state: stopped
        enabled: false
      failed_when: false

    - name: remove node_exporter service file
      file:
        name: /etc/systemd/system/node_exporter.service
        state: absent

    # Best effort: the image may already be gone.
    - name: remove node-exporter image
      command: "{{ container_binary }} rmi {{ node_exporter_container_image }}"
      changed_when: false
      failed_when: false
# Remove the alertmanager/prometheus/grafana stack from the grafana
# server host(s): services, unit files, container images, packages,
# configuration and ALL data directories.
- name: purge ceph monitoring
  hosts: "{{ grafana_server_group_name | default('grafana-server') }}"
  gather_facts: false
  become: true
  tasks:
    - import_role:
        name: ceph-defaults

    # Sets the container_binary fact (docker/podman) used below.
    - import_role:
        name: ceph-facts
        tasks_from: container_binary

    # failed_when: false — a service may not exist on every deployment.
    - name: stop services
      service:
        name: "{{ item }}"
        state: stopped
        enabled: false
      failed_when: false
      loop:
        - alertmanager
        - prometheus
        - grafana-server

    - name: remove systemd service files
      file:
        name: "/etc/systemd/system/{{ item }}.service"
        state: absent
      loop:
        - alertmanager
        - prometheus
        - grafana-server

    # Best effort: images may already be gone.
    - name: remove ceph dashboard container images
      command: "{{ container_binary }} rmi {{ item }}"
      loop:
        - "{{ alertmanager_container_image }}"
        - "{{ prometheus_container_image }}"
        - "{{ grafana_container_image }}"
      changed_when: false
      failed_when: false

    # Package only exists on non-containerized RedHat/SUSE deployments.
    - name: remove ceph-grafana-dashboards package on RedHat or SUSE
      package:
        name: ceph-grafana-dashboards
        state: absent
      when:
        - not containerized_deployment | bool
        - ansible_facts['os_family'] in ['RedHat', 'Suse']

    # Destroys configuration AND data for the whole monitoring stack.
    - name: remove data
      file:
        name: "{{ item }}"
        state: absent
      loop:
        - "{{ alertmanager_conf_dir }}"
        - "{{ prometheus_conf_dir }}"
        - /etc/grafana
        - "{{ alertmanager_data_dir }}"
        - "{{ prometheus_data_dir }}"
        - /var/lib/grafana
# Disable the MGR dashboard/prometheus modules and clean up the
# dashboard admin user, radosgw system user, TLS material and package.
- name: purge ceph dashboard
  hosts: "{{ groups[mgr_group_name] | default(groups[mon_group_name]) | default(omit) }}"
  gather_facts: false
  become: true
  environment:
    CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}"
    CEPH_CONTAINER_BINARY: "{{ container_binary }}"
  tasks:
    - import_role:
        name: ceph-defaults

    # Sets the container_binary fact (docker/podman) used below.
    - import_role:
        name: ceph-facts
        tasks_from: container_binary

    # Build the ceph/radosgw-admin command lines, wrapped in a container
    # run when the deployment is containerized.
    - name: set_fact ceph_cmd radosgw_cmd
      set_fact:
        ceph_cmd: "{{ hostvars[groups[mon_group_name][0]]['container_binary'] + ' run --net=host --rm -v /etc/ceph:/etc/ceph:z --entrypoint=ceph ' + ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else 'ceph' }}"
        radosgw_cmd: "{{ hostvars[groups[mon_group_name][0]]['container_binary'] + ' run --net=host --rm -v /etc/ceph:/etc/ceph:z --entrypoint=radosgw-admin ' + ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else 'radosgw-admin' }}"

    # Cluster-wide operations run once, delegated to the first monitor.
    - name: remove the dashboard admin user
      command: "{{ ceph_cmd }} --cluster {{ cluster }} dashboard ac-user-delete {{ dashboard_admin_user }}"
      run_once: true
      changed_when: false
      failed_when: false
      delegate_to: "{{ groups[mon_group_name][0] }}"

    # Only relevant when radosgw hosts are present in the inventory.
    - name: remove radosgw system user
      command: "{{ radosgw_cmd }} --cluster {{ cluster }} user rm --uid={{ dashboard_rgw_api_user_id }}"
      run_once: true
      changed_when: false
      failed_when: false
      delegate_to: "{{ groups[mon_group_name][0] }}"
      when: groups.get(rgw_group_name, []) | length > 0

    - name: disable mgr dashboard and prometheus modules
      command: "{{ ceph_cmd }} --cluster {{ cluster }} mgr module disable {{ item }}"
      delegate_to: "{{ groups[mon_group_name][0] }}"
      run_once: true
      changed_when: false
      loop:
        - dashboard
        - prometheus

    - name: remove TLS certificate and key files
      file:
        name: "/etc/ceph/ceph-dashboard.{{ item }}"
        state: absent
      loop:
        - crt
        - key
      when: dashboard_protocol == "https"

    # Package only installed on non-containerized deployments.
    - name: remove ceph-mgr-dashboard package
      package:
        name: ceph-mgr-dashboard
        state: absent
      when: not containerized_deployment | bool

23
tox.ini
View File

@ -1,5 +1,5 @@
[tox]
envlist = {centos,ubuntu}-{container,non_container}-{all_daemons,collocation,lvm_osds,shrink_mon,shrink_osd,shrink_mgr,shrink_mds,shrink_rbdmirror,shrink_rgw,lvm_batch,add_mons,add_osds,add_mgrs,add_mdss,add_rbdmirrors,add_rgws,rgw_multisite,purge,storage_inventory,lvm_auto_discovery,all_in_one}
envlist = {centos,ubuntu}-{container,non_container}-{all_daemons,collocation,lvm_osds,shrink_mon,shrink_osd,shrink_mgr,shrink_mds,shrink_rbdmirror,shrink_rgw,lvm_batch,add_mons,add_osds,add_mgrs,add_mdss,add_rbdmirrors,add_rgws,rgw_multisite,purge,storage_inventory,lvm_auto_discovery,all_in_one,purge_dashboard}
{centos,ubuntu}-container-{ooo_collocation}
{centos,ubuntu}-non_container-{switch_to_containers}
infra_lv_create
@ -132,6 +132,25 @@ commands=
# test that the cluster can be redeployed in a healthy state
py.test --reruns 5 --reruns-delay 1 -n 8 --durations=0 --sudo -v --connection=ansible --ansible-inventory={changedir}/{env:INVENTORY} --ssh-config={changedir}/vagrant_ssh_config {toxinidir}/tests/functional/tests
[purge-dashboard]
commands=
ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/infrastructure-playbooks/purge-dashboard.yml --extra-vars "\
ireallymeanit=yes \
ceph_docker_registry={env:CEPH_DOCKER_REGISTRY:quay.ceph.io} \
ceph_docker_image={env:CEPH_DOCKER_IMAGE:ceph-ci/daemon} \
ceph_docker_image_tag={env:CEPH_DOCKER_IMAGE_TAG:latest-nautilus} \
"
# set up the cluster again
ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/{env:PLAYBOOK:site.yml.sample} --extra-vars @ceph-override.json --extra-vars "\
ceph_stable_release={env:CEPH_STABLE_RELEASE:nautilus} \
ceph_docker_registry_auth=True \
ceph_docker_registry_username={env:DOCKER_HUB_USERNAME} \
ceph_docker_registry_password={env:DOCKER_HUB_PASSWORD} \
"
# test that the cluster can be redeployed in a healthy state
py.test --reruns 5 --reruns-delay 1 -n 8 --durations=0 --sudo -v --connection=ansible --ansible-inventory={changedir}/{env:INVENTORY} --ssh-config={changedir}/vagrant_ssh_config {toxinidir}/tests/functional/tests
[purge-lvm]
commands=
ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/infrastructure-playbooks/{env:PURGE_PLAYBOOK:purge-cluster.yml} --extra-vars "\
@ -375,6 +394,7 @@ changedir=
# tests a 1 mon, 1 osd, 1 mds and 1 rgw centos7 cluster using docker
collocation: {toxinidir}/tests/functional/collocation{env:CONTAINER_DIR:}
purge: {toxinidir}/tests/functional/all_daemons{env:CONTAINER_DIR:}
purge_dashboard: {toxinidir}/tests/functional/all_daemons{env:CONTAINER_DIR:}
switch_to_containers: {toxinidir}/tests/functional/all_daemons
lvm_osds: {toxinidir}/tests/functional/lvm-osds{env:CONTAINER_DIR:}
lvm_batch: {toxinidir}/tests/functional/lvm-batch{env:CONTAINER_DIR:}
@ -425,6 +445,7 @@ commands=
all_daemons,all_in_one,collocation: ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/{env:PLAYBOOK:site.yml.sample} --extra-vars "delegate_facts_host={env:DELEGATE_FACTS_HOST:True} ceph_stable_release={env:CEPH_STABLE_RELEASE:nautilus} ceph_docker_image_tag={env:CEPH_DOCKER_IMAGE_TAG_BIS:latest-bis-nautilus}" --extra-vars @ceph-override.json
purge: {[purge]commands}
purge_dashboard: {[purge-dashboard]commands}
switch_to_containers: {[switch-to-containers]commands}
shrink_mon: {[shrink-mon]commands}
shrink_osd: {[shrink-osd]commands}