infra: add playbook to purge dashboard/monitoring

The dashboard/monitoring stack can be deployed via the dashboard_enabled variable. But there's nothing similar if we want to remove that part only and keep the ceph cluster up and running. The current purge playbooks remove everything. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1786691 Signed-off-by: Dimitri Savineau <dsavinea@redhat.com> (cherry picked from commit 8e4ef7d6da)
2021-07-05 14:07:05 -04:00 · 2021-07-05 14:07:05 -04:00 · c8ca73f620
parent 4695df6d2b
commit c8ca73f620
2 changed files with 227 additions and 1 deletions
--- a/infrastructure-playbooks/purge-dashboard.yml
+++ b/infrastructure-playbooks/purge-dashboard.yml
@ -0,0 +1,205 @@
 ---
 # This playbook purges the Ceph MGR Dashboard and Monitoring
 # (alertmanager/prometheus/grafana/node-exporter) stack.
 # It removes: packages, configuration files and ALL THE DATA
 #
 # Use it like this:
 # ansible-playbook purge-dashboard.yml
 #     Prompts for confirmation to purge, defaults to no and
 #     doesn't purge anything. yes purges the dashboard and
 #     monitoring stack.
 #
 # ansible-playbook -e ireallymeanit=yes|no purge-dashboard.yml
 #     Overrides the prompt using -e option. Can be used in
 #     automation scripts to avoid interactive prompt.
 # Safety gate, run on localhost only: prompt for confirmation (or take the
 # answer from `-e ireallymeanit=...`) and fail unless it is exactly 'yes'.
 - name: confirm whether user really meant to purge the dashboard
   hosts: localhost
   gather_facts: false
   vars_prompt:
     - name: ireallymeanit
       prompt: Are you sure you want to purge the dashboard?
       # quoted so the default stays the string 'no' rather than a boolean
       default: 'no'
       private: false
   tasks:
     - name: exit playbook, if user did not mean to purge dashboard
       fail:
         msg: >
           "Exiting purge-dashboard playbook, dashboard was NOT purged.
            To purge the dashboard, either say 'yes' on the prompt
            or use `-e ireallymeanit=yes` on the command line when
            invoking the playbook"
       # string comparison: only the literal answer 'yes' lets the run continue
       when: ireallymeanit != 'yes'
 # Fact-gathering pass over every Ceph host group plus the monitoring host
 # group. Unlike the later plays, this one does not set gather_facts: false;
 # its only task is a debug message.
 - name: gather facts on all hosts
   become: true
   hosts:
     - "{{ mon_group_name | default('mons') }}"
     - "{{ osd_group_name | default('osds') }}"
     - "{{ mds_group_name | default('mdss') }}"
     - "{{ rgw_group_name | default('rgws') }}"
     - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
     - "{{ nfs_group_name | default('nfss') }}"
     - "{{ client_group_name | default('clients') }}"
     - "{{ mgr_group_name | default('mgrs') }}"
     - "{{ grafana_server_group_name | default('grafana-server') }}"
   tasks:
     - debug:
         msg: "gather facts on all Ceph hosts for following reference"
 # Removes node_exporter from the listed host groups: stops and disables the
 # service, deletes its systemd unit file and removes its container image.
 - name: purge node exporter
   become: true
   gather_facts: false
   hosts:
     - "{{ mon_group_name | default('mons') }}"
     - "{{ osd_group_name | default('osds') }}"
     - "{{ mds_group_name | default('mdss') }}"
     - "{{ rgw_group_name | default('rgws') }}"
     - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
     - "{{ nfs_group_name | default('nfss') }}"
     - "{{ client_group_name | default('clients') }}"
     - "{{ mgr_group_name | default('mgrs') }}"
     - "{{ grafana_server_group_name | default('grafana-server') }}"
   tasks:
     - import_role:
         name: ceph-defaults
     # presumably this is what defines container_binary used by the rmi task
     # below -- confirm against the ceph-facts role
     - import_role:
         name: ceph-facts
         tasks_from: container_binary
     # best effort: a failure to stop/disable the service is ignored
     - name: disable node_exporter service
       failed_when: false
       service:
         enabled: false
         name: node_exporter
         state: stopped
     - name: remove node_exporter service file
       file:
         state: absent
         path: /etc/systemd/system/node_exporter.service
     # best effort: never reported as changed and never fails the play
     - name: remove node-exporter image
       failed_when: false
       changed_when: false
       command: "{{ container_binary }} rmi {{ node_exporter_container_image }}"
 # Removes the alertmanager/prometheus/grafana stack from the monitoring host
 # group: services, systemd unit files, container images, the dashboards
 # package (non-containerized RedHat/SUSE only) and configuration/data paths.
 - name: purge ceph monitoring
   become: true
   gather_facts: false
   hosts: "{{ grafana_server_group_name | default('grafana-server') }}"
   tasks:
     - import_role:
         name: ceph-defaults
     # presumably this is what defines container_binary used by the rmi task
     # below -- confirm against the ceph-facts role
     - import_role:
         name: ceph-facts
         tasks_from: container_binary
     # best effort: a failure to stop/disable any of the services is ignored
     - name: stop services
       failed_when: false
       service:
         enabled: false
         name: "{{ item }}"
         state: stopped
       loop:
         - alertmanager
         - prometheus
         - grafana-server
     - name: remove systemd service files
       file:
         state: absent
         path: "/etc/systemd/system/{{ item }}.service"
       loop:
         - alertmanager
         - prometheus
         - grafana-server
     # best effort: never reported as changed and never fails the play
     - name: remove ceph dashboard container images
       failed_when: false
       changed_when: false
       command: "{{ container_binary }} rmi {{ item }}"
       loop:
         - "{{ alertmanager_container_image }}"
         - "{{ prometheus_container_image }}"
         - "{{ grafana_container_image }}"
     # the os_family fact is read here although this play sets
     # gather_facts: false, so it has to be available from an earlier play
     - name: remove ceph-grafana-dashboards package on RedHat or SUSE
       when:
         - not containerized_deployment | bool
         - ansible_facts['os_family'] in ['RedHat', 'Suse']
       package:
         state: absent
         name: ceph-grafana-dashboards
     # configuration directories first, then data directories
     - name: remove data
       file:
         state: absent
         path: "{{ item }}"
       loop:
         - "{{ alertmanager_conf_dir }}"
         - "{{ prometheus_conf_dir }}"
         - /etc/grafana
         - "{{ alertmanager_data_dir }}"
         - "{{ prometheus_data_dir }}"
         - /var/lib/grafana
 # Removes the dashboard side of the ceph-mgr: the dashboard admin user, the
 # radosgw system user used by the dashboard, the dashboard/prometheus mgr
 # modules, the TLS certificate/key files and the ceph-mgr-dashboard package.
 # Targets the mgr group, falling back to the mon group when the inventory has
 # no mgr group.
 - name: purge ceph dashboard
   # ceph-defaults is only imported in the tasks below, so the *_group_name
   # variables may be undefined when this pattern is templated; fall back to
   # the same default group names used by the other plays in this file.
   hosts: "{{ groups[mgr_group_name | default('mgrs')] | default(groups[mon_group_name | default('mons')]) | default(omit) }}"
   gather_facts: false
   become: true
   environment:
     CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}"
     CEPH_CONTAINER_BINARY: "{{ container_binary }}"
   tasks:
     - import_role:
         name: ceph-defaults
     # presumably this is what defines container_binary -- confirm against the
     # ceph-facts role
     - import_role:
         name: ceph-facts
         tasks_from: container_binary
     # Containerized: run the CLI through the first monitor's container binary
     # with /etc/ceph mounted; otherwise call the host binaries directly.
     - name: set_fact ceph_cmd radosgw_cmd
       set_fact:
         ceph_cmd: "{{ hostvars[groups[mon_group_name][0]]['container_binary'] + ' run --net=host --rm -v /etc/ceph:/etc/ceph:z --entrypoint=ceph ' + ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else 'ceph' }}"
         radosgw_cmd: "{{ hostvars[groups[mon_group_name][0]]['container_binary'] + ' run --net=host --rm -v /etc/ceph:/etc/ceph:z --entrypoint=radosgw-admin ' + ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else 'radosgw-admin' }}"
     # The cluster-wide commands below run once, delegated to the first monitor.
     # best effort: failures are ignored
     - name: remove the dashboard admin user
       command: "{{ ceph_cmd }} --cluster {{ cluster }} dashboard ac-user-delete {{ dashboard_admin_user }}"
       run_once: true
       changed_when: false
       failed_when: false
       delegate_to: "{{ groups[mon_group_name][0] }}"
     # best effort, and only when the inventory has at least one rgw host
     - name: remove radosgw system user
       command: "{{ radosgw_cmd }} --cluster {{ cluster }} user rm --uid={{ dashboard_rgw_api_user_id }}"
       run_once: true
       changed_when: false
       failed_when: false
       delegate_to: "{{ groups[mon_group_name][0] }}"
       when: groups.get(rgw_group_name, []) | length > 0
     # unlike the two tasks above, a failure here does fail the play
     - name: disable mgr dashboard and prometheus modules
       command: "{{ ceph_cmd }} --cluster {{ cluster }} mgr module disable {{ item }}"
       delegate_to: "{{ groups[mon_group_name][0] }}"
       run_once: true
       changed_when: false
       loop:
         - dashboard
         - prometheus
     - name: remove TLS certificate and key files
       file:
         name: "/etc/ceph/ceph-dashboard.{{ item }}"
         state: absent
       loop:
         - crt
         - key
       when: dashboard_protocol == "https"
     - name: remove ceph-mgr-dashboard package
       package:
         name: ceph-mgr-dashboard
         state: absent
       when: not containerized_deployment | bool
--- a/tox.ini
+++ b/tox.ini
@ -1,5 +1,5 @@
 [tox]
-envlist = {centos,ubuntu}-{container,non_container}-{all_daemons,collocation,lvm_osds,shrink_mon,shrink_osd,shrink_mgr,shrink_mds,shrink_rbdmirror,shrink_rgw,lvm_batch,add_mons,add_osds,add_mgrs,add_mdss,add_rbdmirrors,add_rgws,rgw_multisite,purge,storage_inventory,lvm_auto_discovery,all_in_one}
+envlist = {centos,ubuntu}-{container,non_container}-{all_daemons,collocation,lvm_osds,shrink_mon,shrink_osd,shrink_mgr,shrink_mds,shrink_rbdmirror,shrink_rgw,lvm_batch,add_mons,add_osds,add_mgrs,add_mdss,add_rbdmirrors,add_rgws,rgw_multisite,purge,storage_inventory,lvm_auto_discovery,all_in_one,purge_dashboard}
  {centos,ubuntu}-container-{ooo_collocation}
  {centos,ubuntu}-non_container-{switch_to_containers}
  infra_lv_create
@ -132,6 +132,25 @@ commands=
  # test that the cluster can be redeployed in a healthy state
  py.test --reruns 5 --reruns-delay 1 -n 8 --durations=0 --sudo -v --connection=ansible --ansible-inventory={changedir}/{env:INVENTORY} --ssh-config={changedir}/vagrant_ssh_config {toxinidir}/tests/functional/tests
 [purge-dashboard]
 # Scenario steps: run purge-dashboard.yml non-interactively
 # (ireallymeanit=yes), redeploy with the site playbook, then run the
 # functional test suite against the inventory.
 commands=
  ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/infrastructure-playbooks/purge-dashboard.yml --extra-vars "\
      ireallymeanit=yes \
      ceph_docker_registry={env:CEPH_DOCKER_REGISTRY:quay.ceph.io} \
      ceph_docker_image={env:CEPH_DOCKER_IMAGE:ceph-ci/daemon} \
      ceph_docker_image_tag={env:CEPH_DOCKER_IMAGE_TAG:latest-nautilus} \
  "
  # set up the cluster again
  ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/{env:PLAYBOOK:site.yml.sample} --extra-vars @ceph-override.json --extra-vars "\
      ceph_stable_release={env:CEPH_STABLE_RELEASE:nautilus} \
      ceph_docker_registry_auth=True \
      ceph_docker_registry_username={env:DOCKER_HUB_USERNAME} \
      ceph_docker_registry_password={env:DOCKER_HUB_PASSWORD} \
  "
  # test that the cluster can be redeployed in a healthy state
  py.test --reruns 5 --reruns-delay 1 -n 8 --durations=0 --sudo -v --connection=ansible --ansible-inventory={changedir}/{env:INVENTORY} --ssh-config={changedir}/vagrant_ssh_config {toxinidir}/tests/functional/tests
 [purge-lvm]
 commands=
  ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/infrastructure-playbooks/{env:PURGE_PLAYBOOK:purge-cluster.yml} --extra-vars "\
@ -375,6 +394,7 @@ changedir=
  # tests a 1 mon, 1 osd, 1 mds and 1 rgw centos7 cluster using docker
  collocation: {toxinidir}/tests/functional/collocation{env:CONTAINER_DIR:}
  purge: {toxinidir}/tests/functional/all_daemons{env:CONTAINER_DIR:}
  purge_dashboard: {toxinidir}/tests/functional/all_daemons{env:CONTAINER_DIR:}
  switch_to_containers: {toxinidir}/tests/functional/all_daemons
  lvm_osds: {toxinidir}/tests/functional/lvm-osds{env:CONTAINER_DIR:}
  lvm_batch: {toxinidir}/tests/functional/lvm-batch{env:CONTAINER_DIR:}
@ -425,6 +445,7 @@ commands=
  all_daemons,all_in_one,collocation: ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/{env:PLAYBOOK:site.yml.sample} --extra-vars "delegate_facts_host={env:DELEGATE_FACTS_HOST:True} ceph_stable_release={env:CEPH_STABLE_RELEASE:nautilus} ceph_docker_image_tag={env:CEPH_DOCKER_IMAGE_TAG_BIS:latest-bis-nautilus}" --extra-vars @ceph-override.json
  purge: {[purge]commands}
  purge_dashboard: {[purge-dashboard]commands}
  switch_to_containers: {[switch-to-containers]commands}
  shrink_mon: {[shrink-mon]commands}
  shrink_osd: {[shrink-osd]commands}