# ceph-ansible/infrastructure-playbooks/shrink-mon.yml

---
# This playbook shrinks the Ceph monitors from your cluster
# It can remove a Ceph monitor from the cluster and ALL ITS DATA
#
# Use it like this:
# ansible-playbook shrink-mon.yml -e mon_to_kill=ceph-mon01
#     Prompts for confirmation to shrink, defaults to no and
#     doesn't shrink the cluster. yes shrinks the cluster.
#
# ansible-playbook -e ireallymeanit=yes|no shrink-mon.yml
#     Overrides the prompt using -e option. Can be used in
#     automation scripts to avoid interactive prompt.


# First play: targets every host of the monitor group. No gather_facts
# override is set here, so the later play can read these hosts' facts
# through hostvars (it uses ansible_hostname).
- name: gather facts and check the init system
  hosts: "{{ mon_group_name | default('mons') }}"
  become: true
  tasks:
    - debug:
        msg: 'gather facts on all Ceph hosts for following reference'

# Second play: runs on the control node, validates the request and then
# performs the removal through tasks delegated to the monitors.
- name: confirm whether user really meant to remove monitor from the ceph cluster
  hosts: localhost
  become: true
  vars:
    mon_group_name: mons
  vars_prompt:
    # The answer can also be supplied up front with -e ireallymeanit=yes|no.
    - name: ireallymeanit
      prompt: 'Are you sure you want to shrink the cluster?'
      private: false
      default: 'no'

  tasks:
    # pre task for following import
    # Refuse to remove the last monitor of the group; the message points the
    # operator at the purge-cluster playbook instead.
    - name: exit playbook, if only one monitor is present in cluster
      when: groups[mon_group_name] | length | int == 1
      fail:
        msg: >-
          You are about to shrink the only monitor present in the cluster.
          If you really want to do that, please use the purge-cluster playbook.

    # mon_to_kill has no default in this playbook: it has to be passed with -e.
    # The message is a folded scalar, so every sentence needs its own
    # terminating period to stay readable once the lines are joined.
    - name: exit playbook, if no monitor was given
      fail:
        msg: >-
          mon_to_kill must be declared.
          Exiting shrink-mon playbook, no monitor was removed.
          On the command line when invoking the playbook, you can use
          -e mon_to_kill=ceph-mon01 argument.
          You can only remove a single monitor each time the playbook runs.
      when: mon_to_kill is not defined

    # mon_to_kill must be a member of the monitor inventory group.
    - name: exit playbook, if the monitor is not part of the inventory
      when: mon_to_kill not in groups[mon_group_name]
      fail:
        msg: 'It seems that the host given is not part of your inventory, please make sure it is.'

    # Anything other than the exact string 'yes' (from the prompt or from
    # -e ireallymeanit=...) aborts before any change is made.
    - name: exit playbook, if user did not mean to shrink cluster
      fail:
        msg: >-
          Exiting shrink-mon playbook, no monitor was removed.
          To shrink the cluster, either say 'yes' on the prompt
          or use `-e ireallymeanit=yes` on the command line when
          invoking the playbook
      when: ireallymeanit != 'yes'

    # NOTE(review): later tasks use variables that are not defined anywhere in
    # this file (cluster, containerized_deployment, container_binary);
    # presumably these two roles provide them - confirm against the roles.
    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts

    # post_tasks for preceding import
    # mon_host is overwritten on every non-skipped iteration, so the last
    # member of the monitor group other than mon_to_kill ends up selected.
    # The ceph commands further down are delegated to that host.
    - name: pick a monitor different than the one we want to remove
      with_items: "{{ groups[mon_group_name] }}"
      when: item != mon_to_kill
      set_fact:
        mon_host: "{{ item }}"

    # Containerized deployments only: build the prefix that runs a command
    # inside the ceph-mon-<hostname> container of the surviving monitor.
    # The fact is left untouched by this task for other deployments.
    - name: "set_fact container_exec_cmd build {{ container_binary }} exec command (containerized)"
      when: containerized_deployment | bool
      set_fact:
        container_exec_cmd: "{{ container_binary }} exec ceph-mon-{{ hostvars[mon_host]['ansible_hostname'] }}"

    # Probe the cluster from the surviving monitor before touching anything:
    # retry up to 5 times, 2 seconds apart, until the output contains HEALTH.
    # container_exec_cmd is only set by this playbook for containerized
    # deployments, hence the empty default for the non-containerized case.
    # The command is read-only, so it never reports a change.
    - name: exit playbook, if can not connect to the cluster
      command: "{{ container_exec_cmd | default('') }} timeout 5 ceph --cluster {{ cluster }} health"
      register: ceph_health
      changed_when: false
      until: ceph_health.stdout.find("HEALTH") > -1
      delegate_to: "{{ mon_host }}"
      retries: 5
      delay: 2

    # ansible_hostname fact of the monitor being removed; the service unit
    # name and the data directory below are both derived from it.
    - name: set_fact mon_to_kill_hostname
      set_fact:
        mon_to_kill_hostname: "{{ hostvars[mon_to_kill]['ansible_hostname'] }}"

    # failed_when: false - an error while stopping or disabling the unit
    # does not abort the play.
    - name: stop monitor service(s)
      delegate_to: "{{ mon_to_kill }}"
      failed_when: false
      service:
        name: "ceph-mon@{{ mon_to_kill_hostname }}"
        enabled: false
        state: stopped

    # Deletes the monitor's data directory on the host being removed.
    - name: purge monitor store
      delegate_to: "{{ mon_to_kill }}"
      file:
        path: "/var/lib/ceph/mon/{{ cluster }}-{{ mon_to_kill_hostname }}"
        state: absent

    # Runs `ceph mon remove` on the surviving monitor. Errors are ignored on
    # purpose (failed_when: false); the outcome is checked by a later task.
    # container_exec_cmd is only set by this playbook for containerized
    # deployments, hence the empty default for the non-containerized case.
    - name: remove monitor from the quorum
      command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} mon remove {{ mon_to_kill_hostname }}"
      failed_when: false
      delegate_to: "{{ mon_host }}"

    # NOTE (leseb): it will take a couple of seconds for other monitors
    # to notice that one member has left, hence the retries with a delay
    # instead of a single check.
    #
    # The status JSON is parsed with from_json and the hostname is looked up
    # in the quorum_names list itself. This avoids three problems of matching
    # against printed text: a hostname that is a prefix of another one
    # (mon1 / mon10) no longer matches, an empty output from a failed command
    # is no longer read as "monitor removed" (rc must be 0), and no python
    # interpreter is needed on the delegated host.
    # container_exec_cmd is only set by this playbook for containerized
    # deployments, hence the empty default for the non-containerized case.
    - name: verify the monitor is out of the cluster
      command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} -s -f json"
      delegate_to: "{{ mon_host }}"
      changed_when: false
      failed_when: false
      register: result
      # `and` short-circuits: stdout is only parsed when the command succeeded.
      until: result.rc == 0 and mon_to_kill_hostname not in (result.stdout | from_json)['quorum_names']
      retries: 2
      delay: 10

    - name: please remove the monitor from your ceph configuration file
      debug:
        msg: >-
          The monitor has been successfully removed from the cluster.
          Please remove the monitor entry from the rest of your ceph
          configuration files, cluster wide.
      run_once: true
      when: result.rc == 0 and mon_to_kill_hostname not in (result.stdout | from_json)['quorum_names']

    # Also fails when the status command itself did not succeed, because the
    # removal could not be confirmed in that case.
    - name: fail if monitor is still part of the cluster
      fail:
        msg: "Monitor appears to still be part of the cluster, please check what happened."
      run_once: true
      when: result.rc != 0 or mon_to_kill_hostname in (result.stdout | from_json)['quorum_names']

    # Final read-only status commands, run on the surviving monitor.
    # container_exec_cmd is only set by this playbook for containerized
    # deployments, hence the empty default for the non-containerized case.
    # Neither command modifies the cluster, so neither reports a change.
    - name: show ceph health
      command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} -s"
      delegate_to: "{{ mon_host }}"
      changed_when: false

    - name: show ceph mon status
      command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} mon stat"
      delegate_to: "{{ mon_host }}"
      changed_when: false