ceph-ansible/infrastructure-playbooks/purge-cluster.yml

---
# This playbook purges Ceph
# It removes: packages, configuration files and ALL THE DATA
#
# Use it like this:
# ansible-playbook purge-cluster.yml
#     Prompts for confirmation to purge, defaults to no and
#     doesn't purge the cluster. yes purges the cluster.
#
# ansible-playbook -e ireallymeanit=yes|no purge-cluster.yml
#     Overrides the prompt using -e option. Can be used in
#     automation scripts to avoid interactive prompt.

# Safety gate: runs on the control node only and aborts the whole playbook
# unless the operator answered exactly 'yes'.
- name: confirm whether user really meant to purge the cluster
  hosts: localhost
  gather_facts: false

  # The answer can also be supplied with `-e ireallymeanit=yes|no`, in which
  # case the interactive prompt is not shown (see the usage notes in the
  # file header).
  vars_prompt:
    - name: ireallymeanit
      prompt: Are you sure you want to purge the cluster?
      default: 'no'
      private: false

  tasks:
    - name: exit playbook, if user did not mean to purge cluster
      fail:
        # Folded block scalar: the lines are joined into a single sentence.
        # No surrounding double quotes here -- inside a block scalar they
        # would be printed literally as part of the message.
        msg: >
          Exiting purge-cluster playbook, cluster was NOT purged.
          To purge the cluster, either say 'yes' on the prompt or
          use `-e ireallymeanit=yes` on the command line when
          invoking the playbook
      when: ireallymeanit != 'yes'

# Targets every Ceph inventory group without setting `gather_facts`, so the
# play relies on Ansible's implicit fact gathering. Later plays that set
# `gather_facts: false` are commented as reusing what is collected here.
- name: gather facts on all hosts
  become: true

  hosts:
    - "{{ mon_group_name | default('mons') }}"
    - "{{ osd_group_name | default('osds') }}"
    - "{{ mds_group_name | default('mdss') }}"
    - "{{ rgw_group_name | default('rgws') }}"
    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
    - "{{ nfs_group_name | default('nfss') }}"
    - "{{ client_group_name | default('clients') }}"
    - "{{ mgr_group_name | default('mgrs') }}"
    - "{{ monitoring_group_name | default('monitoring') }}"

  tasks:
    # Placeholder task; it only prints a message.
    - debug:
        msg: gather facts on all Ceph hosts for following reference

# Runs on the client hosts: unmounts NFS-Ganesha exports and CephFS mounts,
# unmaps RBD devices through sysfs and unloads the rbd/ceph/libceph kernel
# modules. `any_errors_fatal` stops the playbook for all hosts on any failure.
- name: check there's no ceph kernel threads present
  hosts: "{{ client_group_name|default('clients') }}"
  become: true
  any_errors_fatal: true

  tasks:
    - import_role:
        name: ceph-defaults

    # Only relevant when the inventory actually contains nfs hosts.
    - block:
        # Facts of the nfs hosts are collected from here (delegated, once)
        # because the next task reads their 'all_ipv4_addresses' fact.
        - name: get nfs nodes ansible facts
          setup:
            gather_subset:
              - 'all'
              - '!facter'
              - '!ohai'
          delegate_to: "{{ item }}"
          delegate_facts: true
          with_items: "{{ groups[nfs_group_name] }}"
          run_once: true

        # For each nfs host, grep /proc/mounts for its first address inside
        # public_network. grep returns non-zero when nothing matches, hence
        # failed_when: false.
        - name: get all nfs-ganesha mount points
          command: grep "{{ hostvars[item]['ansible_facts']['all_ipv4_addresses'] | ips_in_ranges(public_network.split(',')) | first }}" /proc/mounts
          register: nfs_ganesha_mount_points
          failed_when: false
          with_items: "{{ groups[nfs_group_name] }}"

        # `results | map(attribute='stdout_lines')` is a list of lists (one
        # list per nfs host). It must be flattened so that each loop item is
        # a single /proc/mounts line (a string) on which split() works;
        # with_items alone only flattens one level and would hand the task
        # whole lists.
        - name: ensure nfs-ganesha mountpoint(s) are unmounted
          mount:
            # Second whitespace-separated field of a /proc/mounts line is
            # the mount point.
            path: "{{ item.split(' ')[1] }}"
            state: unmounted
          with_items: "{{ nfs_ganesha_mount_points.results | map(attribute='stdout_lines') | list | flatten }}"
          when: item | length > 0
      when: groups[nfs_group_name] | default([]) | length > 0

    - name: ensure cephfs mountpoint(s) are unmounted
      command: umount -a -t ceph
      changed_when: false

    # Each entry under /sys/bus/rbd/devices is named after a mapped rbd id.
    - name: find mapped rbd ids
      find:
        paths: /sys/bus/rbd/devices
        file_type: any
      register: rbd_mapped_ids

    # Writing the id to remove_single_major asks the kernel to unmap it.
    - name: use sysfs to unmap rbd devices
      shell: "echo {{ item.path | basename }} > /sys/bus/rbd/remove_single_major"
      changed_when: false
      with_items: "{{ rbd_mapped_ids.files }}"

    - name: unload ceph kernel modules
      modprobe:
        name: "{{ item }}"
        state: absent
      with_items:
        - rbd
        - ceph
        - libceph

# Stops the nfs-ganesha service on the nfs hosts.
- name: purge ceph nfs cluster
  hosts: "{{ nfs_group_name | default('nfss') }}"
  become: true
  # Already gathered previously
  gather_facts: false

  vars:
    nfs_group_name: nfss

  tasks:
    - name: stop ceph nfss with systemd
      when: ansible_facts['service_mgr'] == 'systemd'
      service:
        name: nfs-ganesha
        state: stopped
      # Best effort: a failure to stop the service never fails the play.
      failed_when: false

# Removes the node_exporter service, its unit file and its container image
# from every Ceph host. All of it is skipped unless dashboard_enabled is true.
- name: purge node-exporter
  become: true

  hosts:
    - "{{ mon_group_name | default('mons') }}"
    - "{{ osd_group_name | default('osds') }}"
    - "{{ mds_group_name | default('mdss') }}"
    - "{{ rgw_group_name | default('rgws') }}"
    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
    - "{{ nfs_group_name | default('nfss') }}"
    - "{{ client_group_name | default('clients') }}"
    - "{{ mgr_group_name | default('mgrs') }}"
    - "{{ monitoring_group_name | default('monitoring') }}"
    - clients
    - iscsigws

  tasks:
    - import_role:
        name: ceph-defaults

    - when: dashboard_enabled | bool
      block:
        # Imported before the `rmi` task below, which uses container_binary.
        - import_role:
            name: ceph-facts
            tasks_from: container_binary

        - name: disable node_exporter service
          service:
            name: node_exporter
            enabled: false
            state: stopped
          failed_when: false

        - name: remove node_exporter service file
          file:
            name: /etc/systemd/system/node_exporter.service
            state: absent

        # Tagged so image removal can be selected or skipped on its own.
        - name: remove node-exporter image
          command: "{{ container_binary }} rmi {{ node_exporter_container_image }}"
          failed_when: false
          tags:
            - remove_img


# Tears down the monitoring stack (grafana, prometheus, alertmanager):
# stops and disables the services, removes their unit files, container
# images and data/config directories. Skipped unless dashboard_enabled.
- name: purge ceph monitoring
  # Honour monitoring_group_name like the other plays in this file do,
  # instead of hard-coding the default group name.
  hosts: "{{ monitoring_group_name | default('monitoring') }}"
  become: true
  vars:
    grafana_services:
      - grafana-server
      - prometheus
      - alertmanager

  tasks:
    - import_role:
        name: ceph-defaults

    - block:
        # Imported before the `rmi` task below, which uses container_binary.
        - import_role:
            name: ceph-facts
            tasks_from: container_binary

        # Every step below is best effort (failed_when: false) so a partly
        # deployed stack does not abort the purge.
        - name: stop services
          service:
            name: "{{ item }}"
            state: stopped
            enabled: false
          with_items: "{{ grafana_services }}"
          failed_when: false

        - name: remove service files
          file:
            name: "/etc/systemd/system/{{ item }}.service"
            state: absent
          with_items: "{{ grafana_services }}"
          failed_when: false

        - name: remove ceph dashboard container images
          command: "{{ container_binary }} rmi {{ item }}"
          with_items:
            - "{{ prometheus_container_image }}"
            - "{{ grafana_container_image }}"
            - "{{ alertmanager_container_image }}"
          failed_when: false
          tags:
            - remove_img

        - name: remove data
          file:
            name: "{{ item }}"
            state: absent
          with_items:
            - /etc/grafana/dashboards
            - /etc/grafana/grafana.ini
            - /etc/grafana/provisioning
            - /var/lib/grafana
            - /etc/alertmanager
            - /var/lib/alertmanager
            - /var/lib/prometheus
            - /etc/prometheus
          failed_when: false
      when: dashboard_enabled | bool


# Stops and disables the per-host ceph-mds unit on the mds hosts.
- name: purge ceph mds cluster
  hosts: "{{ mds_group_name | default('mdss') }}"
  become: true
  # Already gathered previously
  gather_facts: false

  vars:
    mds_group_name: mdss

  tasks:
    - name: stop ceph mdss with systemd
      service:
        # The unit instance is named after the host's short hostname fact.
        name: ceph-mds@{{ ansible_facts['hostname'] }}
        enabled: false
        state: stopped
      # Best effort: never fail the play on this task.
      failed_when: false


# Stops and disables the per-host ceph-mgr unit on the mgr hosts.
- name: purge ceph mgr cluster
  hosts: "{{ mgr_group_name | default('mgrs') }}"
  become: true
  # Already gathered previously
  gather_facts: false

  vars:
    mgr_group_name: mgrs

  tasks:
    - name: stop ceph mgrs with systemd
      when: ansible_facts['service_mgr'] == 'systemd'
      service:
        # The unit instance is named after the host's short hostname fact.
        name: ceph-mgr@{{ ansible_facts['hostname'] }}
        enabled: false
        state: stopped
      # Best effort: never fail the play on this task.
      failed_when: false

# Stops and disables keepalived and haproxy on the rgw load balancer hosts.
- name: purge rgwloadbalancer cluster

  vars:
    rgwloadbalancer_group_name: rgwloadbalancers

  hosts:
    - "{{ rgwloadbalancer_group_name|default('rgwloadbalancers') }}"

  gather_facts: false # Already gathered previously

  become: true

  tasks:

    # The service module manages one service per invocation (`name` is a
    # single string), so loop over the services instead of passing a list.
    - name: stop rgwloadbalancer services
      service:
        name: "{{ item }}"
        state: stopped
        enabled: false
      # Best effort: a host may run only one (or none) of the two services.
      failed_when: false
      with_items:
        - keepalived
        - haproxy

# Stops and disables every radosgw instance unit on the rgw hosts.
- name: purge ceph rgw cluster
  hosts: "{{ rgw_group_name | default('rgws') }}"
  become: true
  # Already gathered previously
  gather_facts: false

  vars:
    rgw_group_name: rgws

  tasks:
    - import_role:
        name: ceph-defaults

    # Imported before the loop below, which iterates over rgw_instances.
    # NOTE(review): presumably this is where rgw_instances is set -- confirm.
    - import_role:
        name: ceph-facts
        tasks_from: set_radosgw_address

    - name: stop ceph rgws with systemd
      with_items: "{{ rgw_instances }}"
      service:
        # One unit per instance: rgw.<short hostname>.<instance name>.
        name: "ceph-radosgw@rgw.{{ ansible_facts['hostname'] }}.{{ item.instance_name }}"
        enabled: false
        state: stopped
      # Best effort: never fail the play on this task.
      failed_when: false


# Stops the per-host ceph-rbd-mirror unit on the rbd-mirror hosts.
- name: purge ceph rbd-mirror cluster
  hosts: "{{ rbdmirror_group_name | default('rbdmirrors') }}"
  become: true
  # Already gathered previously
  gather_facts: false

  vars:
    rbdmirror_group_name: rbdmirrors

  tasks:
    - name: stop ceph rbd mirror with systemd
      service:
        # The unit instance is rbd-mirror.<short hostname>.
        name: "ceph-rbd-mirror@rbd-mirror.{{ ansible_facts['hostname'] }}"
        state: stopped
      # Best effort: never fail the play on this task.
      failed_when: false


# Purges the OSD hosts: stops the OSD daemons, unmounts and removes their
# data, then wipes the Ceph partitions and zaps the underlying devices.
# Set reboot_osd_node to true to allow the reboot handlers below to be
# notified by the "is reboot needed" task.
- name: purge ceph osd cluster

  vars:
    osd_group_name: osds
    reboot_osd_node: false

  hosts: "{{ osd_group_name|default('osds') }}"

  gather_facts: false # Already gathered previously

  become: true

  handlers:
    # Fire-and-forget reboot: async/poll 0 so Ansible does not wait on a
    # connection that is about to drop; errors are ignored for the same
    # reason.
    - name: restart machine
      shell: sleep 2 && shutdown -r now "Ansible updates triggered"
      async: 1
      poll: 0
      ignore_errors: true

    # Polls the rebooted host's SSH port from the control node.
    # `delegate_to` is a task keyword, so it sits at task level rather
    # than among the wait_for module arguments (wait_for has no such
    # option).
    - name: wait for server to boot
      become: false
      wait_for:
        port: 22
        host: "{{ hostvars[inventory_hostname]['ansible_facts']['default_ipv4']['address'] }}"
        state: started
        delay: 10
        timeout: 500
      delegate_to: localhost

    - name: remove data
      shell: rm -rf /var/lib/ceph/*  # noqa 302

  # First part of the OSD purge: stop the daemons, record what must be
  # cleaned up later (encrypted partitions, mount points), then unmount and
  # remove the OSD data tree.
  tasks:

  - import_role:
      name: ceph-defaults

  # Guarantees lvm_volumes exists for the ceph-volume zap loop later on.
  - name: default lvm_volumes if not defined
    set_fact:
      lvm_volumes: []
    when: lvm_volumes is not defined

  # Lists the entries of /var/lib/ceph/osd (if the directory exists) and
  # strips everything up to the last '-' from each name, leaving the suffix
  # that is used as the ceph-osd@ unit instance below.
  - name: get osd numbers
    shell: if [ -d /var/lib/ceph/osd ] ; then ls /var/lib/ceph/osd | sed 's/.*-//' ; fi  # noqa 306
    register: osd_ids
    changed_when: false

  - name: stop ceph-osd with systemd
    service:
      name: ceph-osd@{{ item }}
      state: stopped
      enabled: no
    with_items: "{{ osd_ids.stdout_lines }}"
    when: ansible_facts['service_mgr'] == 'systemd'

  - name: remove ceph udev rules
    file:
      path: "{{ item }}"
      state: absent
    with_items:
      - /usr/lib/udev/rules.d/95-ceph-osd.rules
      - /usr/lib/udev/rules.d/60-ceph-by-parttypeuuid.rules

  # Collects the PARTUUID of every crypto_LUKS partition whose blkid line
  # matches "ceph"; the value between the first pair of double quotes after
  # "PARTUUID" is kept, one per output line.
  # NOTE(leseb): hope someone will find a more elegant way one day...
  - name: see if encrypted partitions are present
    shell: blkid -t TYPE=crypto_LUKS -s PARTLABEL -s PARTUUID | grep "ceph.*." | grep -o PARTUUID.* | cut -d '"' -f 2  # noqa 306
    register: encrypted_ceph_partuuid
    changed_when: false

  # Second field of each matching /proc/mounts line is the mount point.
  # `|| echo -n` keeps the pipeline from failing when nothing is mounted.
  - name: get osd data and lockbox mount points
    shell: (grep /var/lib/ceph/osd /proc/mounts || echo -n) | awk '{ print $2 }'  # noqa 306
    register: mounted_osd
    changed_when: false

  # Flushes dirty pages to disk, then asks the kernel to drop its caches
  # (writes 3 to /proc/sys/vm/drop_caches) before unmounting.
  - name: drop all cache
    shell: "sync && sleep 1 && echo 3 > /proc/sys/vm/drop_caches"
    changed_when: false

  - name: umount osd data partition
    mount:
      path: "{{ item }}"
      state: unmounted
    with_items: "{{ mounted_osd.stdout_lines }}"

  # Errors are ignored so the play continues; the result is registered as
  # remove_osd_mountpoints for later inspection.
  - name: remove osd mountpoint tree
    file:
      path: /var/lib/ceph/osd/
      state: absent
    register: remove_osd_mountpoints
    ignore_errors: true

  # Triggers the reboot / wait / remove-data handlers, but only when the
  # operator opted in (reboot_osd_node) AND removing the OSD mountpoint tree
  # actually failed. The registered result is checked with the `failed`
  # test: testing `.failed is defined` does not distinguish success from
  # failure, because the key is present on successful results too.
  - name: is reboot needed
    command: echo requesting reboot
    delegate_to: localhost
    become: false
    notify:
      - restart machine
      - wait for server to boot
      - remove data
    when:
      - reboot_osd_node | bool
      - remove_osd_mountpoints is failed

  # --- dm-crypt cleanup -----------------------------------------------------
  # The four tasks below loop over the PARTUUIDs registered in
  # encrypted_ceph_partuuid and are skipped when that list is empty.
  - name: wipe table on dm-crypt devices
    command: dmsetup wipe_table --force "{{ item }}"
    with_items: "{{ encrypted_ceph_partuuid.stdout_lines }}"
    when: encrypted_ceph_partuuid.stdout_lines | length > 0

  - name: delete dm-crypt devices if any
    command: dmsetup remove --retry --force {{ item }}
    with_items: "{{ encrypted_ceph_partuuid.stdout_lines }}"
    when: encrypted_ceph_partuuid.stdout_lines | length > 0

  # Extracts the third field of the "Payload offset:" line of luksDump.
  - name: get payload_offset
    shell: cryptsetup luksDump /dev/disk/by-partuuid/{{ item }} | awk '/Payload offset:/ { print $3 }'  # noqa 306
    register: payload_offset
    with_items: "{{ encrypted_ceph_partuuid.stdout_lines }}"
    when: encrypted_ceph_partuuid.stdout_lines | length > 0

  - name: get physical sector size
    command: blockdev --getpbsz /dev/disk/by-partuuid/{{ item }}
    changed_when: false
    with_items: "{{ encrypted_ceph_partuuid.stdout_lines }}"
    when: encrypted_ceph_partuuid.stdout_lines | length > 0
    register: phys_sector_size

  # Zeroes the start of each encrypted partition. with_together pairs each
  # PARTUUID (item.0) with its payload offset result (item.1, used as bs)
  # and its physical sector size result (item.2, used as count).
  - name: wipe dmcrypt device
    command: dd if=/dev/zero of=/dev/disk/by-partuuid/{{ item.0 }} bs={{ item.1.stdout }} count={{ item.2.stdout }} oflag=direct
    changed_when: false
    with_together:
      - "{{ encrypted_ceph_partuuid.stdout_lines }}"
      - "{{ payload_offset.results }}"
      - "{{ phys_sector_size.results }}"

  # --- partition discovery ---------------------------------------------------
  # Each "get ceph ... partitions" task asks blkid for the device paths
  # carrying a given PARTLABEL. failed_when: false because a label with no
  # match is not an error for the purge.
  - name: get ceph data partitions
    shell: |
      blkid -o device -t PARTLABEL="ceph data"
    changed_when: false
    failed_when: false
    register: ceph_data_partition_to_erase_path

  - name: get ceph lockbox partitions
    shell: |
      blkid -o device -t PARTLABEL="ceph lockbox"
    changed_when: false
    failed_when: false
    register: ceph_lockbox_partition_to_erase_path

  # rc of `command -v` is 0 only when ceph-volume is found; the two zap
  # tasks below are conditioned on it.
  - name: see if ceph-volume is installed # noqa : 305
    shell: command -v ceph-volume
    changed_when: false
    failed_when: false
    register: ceph_volume_present

  # Zaps every entry of lvm_volumes; optional keys are omitted when the
  # entry does not define them.
  - name: zap and destroy osds created by ceph-volume with lvm_volumes
    ceph_volume:
      data: "{{ item.data }}"
      data_vg: "{{ item.data_vg|default(omit) }}"
      journal: "{{ item.journal|default(omit) }}"
      journal_vg: "{{ item.journal_vg|default(omit) }}"
      db: "{{ item.db|default(omit) }}"
      db_vg: "{{ item.db_vg|default(omit) }}"
      wal: "{{ item.wal|default(omit) }}"
      wal_vg: "{{ item.wal_vg|default(omit) }}"
      action: "zap"
    environment:
      CEPH_VOLUME_DEBUG: "{{ ceph_volume_debug }}"
    with_items: "{{ lvm_volumes | default([]) }}"
    when: ceph_volume_present.rc == 0

  # Same, for raw devices listed in devices, dedicated_devices and
  # bluestore_wal_devices (each defaulting to an empty list).
  - name: zap and destroy osds created by ceph-volume with devices
    ceph_volume:
      data: "{{ item }}"
      action: "zap"
    environment:
      CEPH_VOLUME_DEBUG: "{{ ceph_volume_debug }}"
    with_items:
      - "{{ devices | default([]) }}"
      - "{{ dedicated_devices | default([]) }}"
      - "{{ bluestore_wal_devices | default([]) }}"
    when: ceph_volume_present.rc == 0

  - name: get ceph block partitions
    shell: |
      blkid -o device -t PARTLABEL="ceph block"
    changed_when: false
    failed_when: false
    register: ceph_block_partition_to_erase_path

  - name: get ceph journal partitions
    shell: |
      blkid -o device -t PARTLABEL="ceph journal"
    changed_when: false
    failed_when: false
    register: ceph_journal_partition_to_erase_path

  - name: get ceph db partitions
    shell: |
      blkid -o device -t PARTLABEL="ceph block.db"
    changed_when: false
    failed_when: false
    register: ceph_db_partition_to_erase_path

  - name: get ceph wal partitions
    shell: |
      blkid -o device -t PARTLABEL="ceph block.wal"
    changed_when: false
    failed_when: false
    register: ceph_wal_partition_to_erase_path

  # --- wipe -------------------------------------------------------------------
  # Concatenates the six partition lists registered above.
  - name: set_fact combined_devices_list
    set_fact:
      combined_devices_list: "{{ ceph_data_partition_to_erase_path.stdout_lines +
                                 ceph_lockbox_partition_to_erase_path.stdout_lines +
                                 ceph_block_partition_to_erase_path.stdout_lines +
                                 ceph_journal_partition_to_erase_path.stdout_lines +
                                 ceph_db_partition_to_erase_path.stdout_lines +
                                 ceph_wal_partition_to_erase_path.stdout_lines }}"

  # lsblk pkname prints the parent kernel device name of each partition.
  - name: resolve parent device
    command: lsblk --nodeps -no pkname "{{ item }}"
    register: tmp_resolved_parent_device
    changed_when: false
    with_items: "{{ combined_devices_list }}"

  # De-duplicated list of parent device names (without the /dev/ prefix).
  - name: set_fact resolved_parent_device
    set_fact:
      resolved_parent_device: "{{ tmp_resolved_parent_device.results | map(attribute='stdout') | list | unique }}"

  # Removes signatures from each Ceph partition, then zeroes its first
  # 4096 bytes.
  - name: wipe partitions
    shell: |
      wipefs --all "{{ item }}"
      dd if=/dev/zero of="{{ item }}" bs=1 count=4096
    changed_when: false
    with_items: "{{ combined_devices_list }}"

  # Reads the partition table of each parent device for the safety check
  # that follows.
  - name: check parent device partition
    parted:
      device: "/dev/{{ item }}"
    loop: "{{ resolved_parent_device }}"
    register: parted_info

  # Safety net before the whole-device zap: abort if any partition on a
  # parent device carries the 'boot' flag.
  - name: fail if there is a boot partition on the device
    fail:
      msg: "{{ item.item }} has a boot partition"
    loop: "{{ parted_info.results }}"
    when: "'boot' in (item.partitions | map(attribute='flags') | list | flatten)"

  # Destroys the partition table of each parent device, zeroes its first
  # 200 MiB, writes a fresh GPT label and waits for udev to settle.
  - name: zap ceph journal/block db/block wal partitions  # noqa 306
    shell: |
      sgdisk -Z --clear --mbrtogpt -g -- /dev/"{{ item }}"
      dd if=/dev/zero of=/dev/"{{ item }}" bs=1M count=200
      parted -s /dev/"{{ item }}" mklabel gpt
      partprobe /dev/"{{ item }}"
      udevadm settle --timeout=600
    with_items: "{{ resolved_parent_device }}"
    changed_when: false

# Stops the mon and mgr units on the monitor hosts, then deletes the monitor
# store, the bootstrap key directories and /var/lib/ceph/tmp.
- name: purge ceph mon cluster
  hosts: "{{ mon_group_name | default('mons') }}"
  become: true
  # already gathered previously
  gather_facts: false

  vars:
    mon_group_name: mons

  tasks:
    - name: stop ceph mons with systemd
      with_items:
        - mon
        - mgr
      service:
        # Expands to ceph-mon@<short hostname> and ceph-mgr@<short hostname>.
        name: "ceph-{{ item }}@{{ ansible_facts['hostname'] }}"
        enabled: false
        state: stopped
      # Best effort: never fail the play on this task.
      failed_when: false

    - name: remove monitor store and bootstrap keys
      with_items:
        - /var/lib/ceph/mon
        - /var/lib/ceph/bootstrap-mds
        - /var/lib/ceph/bootstrap-osd
        - /var/lib/ceph/bootstrap-rgw
        - /var/lib/ceph/bootstrap-rbd
        - /var/lib/ceph/bootstrap-mgr
        - /var/lib/ceph/tmp
      file:
        path: "{{ item }}"
        state: absent

# Stops and disables ceph-crash on the daemon hosts and removes its
# /var/lib/ceph/crash directory.
- name: purge ceph-crash daemons
  become: true
  gather_facts: false
  hosts:
    - "{{ mon_group_name | default('mons') }}"
    - "{{ osd_group_name | default('osds') }}"
    - "{{ mds_group_name | default('mdss') }}"
    - "{{ rgw_group_name | default('rgws') }}"
    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
    - "{{ mgr_group_name | default('mgrs') }}"

  tasks:
    - name: stop ceph-crash service
      service:
        name: ceph-crash.service
        enabled: false
        state: stopped
      # Best effort: never fail the play on this task.
      failed_when: false

    - name: remove /var/lib/ceph/crash
      file:
        state: absent
        path: /var/lib/ceph/crash


# Last pass over every Ceph host: uninstall the packages, delete config,
# sockets and logs, request removal of /var/lib/ceph/* through the
# "remove data" handlers, clean package caches and repo files, verify that
# no process still runs as the ceph user and delete leftover ceph* entries
# under /etc/systemd/system.
- name: final cleanup - check any running ceph, purge ceph packages, purge config and remove data

  vars:
    # When set to true both groups of packages are purged.
    # This can cause problem with qemu-kvm
    purge_all_packages: true

    # Always removed.
    ceph_packages:
      - ceph
      - ceph-base
      - ceph-common
      - ceph-fuse
      - ceph-mds
      - ceph-mgr
      - ceph-mon
      - ceph-osd
      - ceph-release
      - ceph-radosgw
      - ceph-grafana-dashboards
      - rbd-mirror

    # Removed only when purge_all_packages is true.
    ceph_remaining_packages:
      - libcephfs2
      - librados2
      - libradosstriper1
      - librbd1
      - librgw2
      - python3-ceph-argparse
      - python3-cephfs
      - python3-rados
      - python3-rbd
      - python3-rgw

    # Removed only when purge_all_packages is true.
    extra_packages:
      - keepalived
      - haproxy

  hosts:
    - "{{ mon_group_name|default('mons') }}"
    - "{{ osd_group_name|default('osds') }}"
    - "{{ mds_group_name|default('mdss') }}"
    - "{{ rgw_group_name|default('rgws') }}"
    - "{{ rbdmirror_group_name|default('rbdmirrors') }}"
    - "{{ nfs_group_name|default('nfss') }}"
    - "{{ client_group_name|default('clients') }}"
    - "{{ mgr_group_name|default('mgrs') }}"
    - "{{ monitoring_group_name|default('monitoring') }}"

  gather_facts: false # Already gathered previously

  become: true

  # All three handlers listen on the "remove data" topic, which the
  # "request data removal" task notifies. As written they find the OSD
  # mount points, unmount them, then delete /var/lib/ceph/*.
  handlers:
  - name: get osd data and lockbox mount points
    shell: "(grep /var/lib/ceph/osd /proc/mounts || echo -n) | awk '{ print $2 }'"
    register: mounted_osd
    changed_when: false
    listen: "remove data"

  - name: umount osd data partition
    mount:
      path: "{{ item }}"
      state: unmounted
    with_items: "{{ mounted_osd.stdout_lines }}"
    listen: "remove data"

  - name: remove data
    shell: rm -rf /var/lib/ceph/*  # noqa 302
    listen: "remove data"

  tasks:

  # Package removal: one task per package manager, selected through the
  # pkg_mgr fact.
  - name: purge ceph packages with yum
    yum:
      name: "{{ ceph_packages }}"
      state: absent
    when: ansible_facts['pkg_mgr'] == 'yum'

  - name: purge ceph packages with dnf
    dnf:
      name: "{{ ceph_packages }}"
      state: absent
    when: ansible_facts['pkg_mgr'] == 'dnf'

  - name: purge ceph packages with apt
    apt:
      name: "{{ ceph_packages }}"
      state: absent
      purge: true
    when: ansible_facts['pkg_mgr'] == 'apt'

  - name: purge remaining ceph packages with yum
    yum:
      name: "{{ ceph_remaining_packages }}"
      state: absent
    when:
      - ansible_facts['pkg_mgr'] == 'yum'
      - purge_all_packages | bool

  - name: purge remaining ceph packages with dnf
    dnf:
      name: "{{ ceph_remaining_packages }}"
      state: absent
    when:
      - ansible_facts['pkg_mgr'] == 'dnf'
      - purge_all_packages | bool

  # NOTE(review): unlike "purge ceph packages with apt", this task and the
  # extra-packages apt task do not set `purge: true` -- confirm intended.
  - name: purge remaining ceph packages with apt
    apt:
      name: "{{ ceph_remaining_packages }}"
      state: absent
    when:
      - ansible_facts['pkg_mgr'] == 'apt'
      - purge_all_packages | bool

  - name: purge extra packages with yum
    yum:
      name: "{{ extra_packages }}"
      state: absent
    when:
      - ansible_facts['pkg_mgr'] == 'yum'
      - purge_all_packages | bool

  - name: purge extra packages with dnf
    dnf:
      name: "{{ extra_packages }}"
      state: absent
    when:
      - ansible_facts['pkg_mgr'] == 'dnf'
      - purge_all_packages | bool

  - name: purge extra packages with apt
    apt:
      name: "{{ extra_packages }}"
      state: absent
    when:
      - ansible_facts['pkg_mgr'] == 'apt'
      - purge_all_packages | bool

  - name: remove config and any ceph socket left
    file:
      path: "{{ item }}"
      state: absent
    with_items:
      - /etc/ceph
      - /etc/keepalived
      - /etc/haproxy
      - /run/ceph

  - name: remove logs
    file:
     path: /var/log/ceph
     state: absent

  # Dummy command run on the control node; its only purpose is to notify
  # the "remove data" handlers defined above.
  - name: request data removal
    command: echo requesting data removal  # noqa 301
    become: false
    delegate_to: localhost
    notify: remove data

  - name: purge dnf cache
    command: dnf clean all
    when: ansible_facts['pkg_mgr'] == 'dnf'

  - name: clean apt
    command: apt-get clean  # noqa 303
    when: ansible_facts['pkg_mgr'] == 'apt'

  - name: purge ceph repo file in /etc/yum.repos.d
    file:
      path: '/etc/yum.repos.d/{{ item }}.repo'
      state: absent
    with_items:
      - ceph-dev
      - ceph_stable
    when: ansible_facts['os_family'] == 'RedHat'

  # Inverted check: the task FAILS when ps exits 0, i.e. when it found at
  # least one process for user ceph.
  - name: check for anything running ceph
    command: "ps -u ceph -U ceph"
    register: check_for_running_ceph
    changed_when: false
    failed_when: check_for_running_ceph.rc == 0

  # Recursively collects anything named ceph* (files, directories, links)
  # under /etc/systemd/system.
  - name: find ceph systemd unit files to remove
    find:
      paths: "/etc/systemd/system"
      pattern: "ceph*"
      recurse: true
      file_type: any
    register: systemd_files

  - name: remove ceph systemd unit files
    file:
      path: "{{ item.path }}"
      state: absent
    with_items: "{{ systemd_files.files }}"
    when: ansible_facts['service_mgr'] == 'systemd'


# Deletes the fetch directory on the control node ("fetch/" unless
# fetch_directory is already defined).
- name: purge fetch directory
  hosts: localhost
  gather_facts: false

  tasks:
    # Falls back to the relative path "fetch/" when the variable is unset.
    - name: set fetch_directory value if not set
      when: fetch_directory is not defined
      set_fact:
        fetch_directory: "fetch/"

    - name: purge fetch directory for localhost
      file:
        state: absent
        path: "{{ fetch_directory | default('fetch/') }}"