From a76bc46d2af0ffd57d47c52d3d489f64a020e0a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Thu, 11 Aug 2016 17:20:07 +0200
Subject: [PATCH] add shrink playbooks: mons and osds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We now have the ability to shrink a Ceph cluster with the help of 2 new
playbooks. Even if large portions of them are identical, I thought it
would make more sense to keep them separate, for several reasons:

* it is rare to remove mon(s) and osd(s) at the same time
* this remains a tricky process, so keeping the playbooks separate
  avoids any overlap

For monitors, just pass the list of monitor hostnames you want to remove
from the cluster. The hostnames must be resolvable. Then run the
playbook like this:

ansible-playbook shrink-mon.yml -e mon_host=ceph-mon-01,ceph-mon-02
Are you sure you want to shrink the cluster? [no]: yes

For OSDs, just pass the list of OSD IDs you want to remove from the
cluster and run the playbook like this:

ansible-playbook shrink-osd.yml -e osd_ids=0,2,4
Are you sure you want to shrink the cluster? [no]: yes

If you know what you're doing, you can skip the confirmation prompt and
run it like this:

ansible-playbook shrink-osd.yml -e ireallymeanit=yes -e osd_ids=0,2,4

Thanks a lot to @SamYaple for his help on the complex
variables/facts/filters.

Signed-off-by: Sébastien Han
---
 shrink-mon.yml | 142 +++++++++++++++++++++++++++++++++++++++++++++++++
 shrink-osd.yml | 131 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 273 insertions(+)
 create mode 100644 shrink-mon.yml
 create mode 100644 shrink-osd.yml

diff --git a/shrink-mon.yml b/shrink-mon.yml
new file mode 100644
index 000000000..93f74c449
--- /dev/null
+++ b/shrink-mon.yml
@@ -0,0 +1,142 @@
+---
+# This playbook removes Ceph monitor(s) from your cluster.
+# It can remove any number of monitor(s) from the cluster and ALL THEIR DATA.
+#
+# Use it like this:
+# ansible-playbook shrink-mon.yml -e mon_host=ceph-mon01,ceph-mon02
+#     Prompts for confirmation to shrink, defaults to 'no' and
+#     doesn't shrink the cluster. Answering 'yes' shrinks the cluster.
+#
+# ansible-playbook -e ireallymeanit=yes|no shrink-mon.yml
+#     Overrides the prompt using the -e option. Can be used in
+#     automation scripts to avoid the interactive prompt.
+
+
+- name: confirm whether user really meant to remove monitor(s) from the ceph cluster
+
+  hosts:
+    - localhost
+
+  gather_facts: false
+  become: true
+
+  vars_prompt:
+    - name: ireallymeanit
+      prompt: Are you sure you want to shrink the cluster?
+      default: 'no'
+      private: no
+
+  tasks:
+    - include_vars: roles/ceph-common/defaults/main.yml
+    - include_vars: group_vars/all
+
+    - name: exit playbook, if user did not mean to shrink cluster
+      fail:
+        msg: "Exiting shrink-mon playbook, no monitor(s) was/were removed.
+           To shrink the cluster, either say 'yes' on the prompt or
+           use `-e ireallymeanit=yes` on the command line when
+           invoking the playbook"
+      when: ireallymeanit != 'yes'
+
+    - name: exit playbook, if no monitor(s) was/were given
+      fail:
+        msg: "mon_host must be declared.
+           Exiting shrink-mon playbook, no monitor(s) was/were removed.
+           On the command line when invoking the playbook, you can use
+           the -e mon_host=ceph-mon01,ceph-mon02 argument."
+      when: mon_host is not defined
+
+    - name: test if the ceph command exists
+      command: command -v ceph
+      changed_when: false
+      failed_when: false
+      register: ceph_command
+
+    - name: exit playbook, if ceph command does not exist
+      debug:
+        msg: "The ceph command is not available, please install it :("
+      run_once: true
+      when:
+        - ceph_command.rc != 0
+
+    - name: exit playbook, if cluster files do not exist
+      stat:
+        path: "{{ item }}"
+      register: ceph_conf_key
+      with_items:
+        - /etc/ceph/{{ cluster }}.conf
+        - /etc/ceph/{{ cluster }}.client.admin.keyring
+      failed_when: false
+
+    - fail:
+        msg: "Ceph's configuration file is not present in /etc/ceph"
+      with_items: "{{ceph_conf_key.results}}"
+      when:
+        - item.stat.exists == false
+
+    - name: exit playbook, if we cannot connect to the cluster
+      command: timeout 5 ceph --cluster {{ cluster }} health
+      register: ceph_health
+      until: ceph_health.stdout.find("HEALTH") > -1
+      retries: 5
+      delay: 2
+
+    - name: verify given monitors are reachable
+      command: ping -c 1 {{ item }}
+      with_items: "{{mon_host.split(',')}}"
+      register: mon_reachable
+      failed_when: false
+
+    - fail:
+        msg: "One or more monitors are not reachable, please check your /etc/hosts or your DNS"
+      with_items: "{{mon_reachable.results}}"
+      when:
+        - item.rc != 0
+
+    - name: stop monitor service (systemd)
+      service:
+        name: ceph-mon@{{ item }}
+        state: stopped
+        enabled: no
+      with_items: "{{mon_host.split(',')}}"
+      delegate_to: "{{item}}"
+      failed_when: false
+
+    - name: purge monitor store
+      file:
+        path: /var/lib/ceph/mon/{{ cluster }}-{{ item }}
+        state: absent
+      with_items: "{{mon_host.split(',')}}"
+      delegate_to: "{{item}}"
+
+    - name: remove monitor from the quorum
+      command: ceph --cluster {{ cluster }} mon remove {{ item }}
+      failed_when: false
+      with_items: "{{mon_host.split(',')}}"
+
+    # NOTE (leseb): sorry for the 'sleep' command,
+    # but it will take a couple of seconds for the other monitors
+    # to notice that one member has left.
+    # 'sleep 5' is not that bad and should be sufficient.
+    - name: verify the monitor is out of the cluster
+      shell: "sleep 5 && ceph --cluster {{ cluster }} -s | grep monmap | sed 's/.*quorum//' | egrep -sq {{ item }}"
+      with_items: "{{mon_host.split(',')}}"
+      failed_when: false
+      register: ceph_health_mon
+
+    - name: please remove the monitor(s) from your ceph configuration file
+      debug:
+        msg: "The monitor(s) has/have been successfully removed from the cluster.
+           Please remove the monitor(s) entry(ies) from the rest of your ceph configuration files, cluster wide."
+      run_once: true
+      with_items: "{{ceph_health_mon.results}}"
+      when:
+        - item.rc != 0
+
+    - name: exit playbook, if monitor(s) appear(s) to still be part of the cluster
+      fail:
+        msg: "Monitor(s) appear(s) to still be part of the cluster, please check what happened."
+      run_once: true
+      with_items: "{{ceph_health_mon.results}}"
+      when:
+        - item.rc == 0
diff --git a/shrink-osd.yml b/shrink-osd.yml
new file mode 100644
index 000000000..5fb1bd60f
--- /dev/null
+++ b/shrink-osd.yml
@@ -0,0 +1,131 @@
+---
+# This playbook removes Ceph OSD(s) from your cluster.
+# It can remove any number of OSD(s) from the cluster and ALL THEIR DATA.
+#
+# Use it like this:
+# ansible-playbook shrink-osd.yml -e osd_ids=0,2,6
+#     Prompts for confirmation to shrink, defaults to 'no' and
+#     doesn't shrink the cluster. Answering 'yes' shrinks the cluster.
+#
+# ansible-playbook -e ireallymeanit=yes|no shrink-osd.yml
+#     Overrides the prompt using the -e option. Can be used in
+#     automation scripts to avoid the interactive prompt.
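+#
+# If you are unsure which IDs to pass via osd_ids, a quick way to list them
+# (a sketch, assuming the default 'ceph' cluster name and an admin keyring
+# available on the host you run this from) is:
+#
+#   ceph --cluster ceph osd tree
+#
+# The first column of that output is the numeric OSD id expected by osd_ids.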
+
+
+- name: confirm whether user really meant to remove osd(s) from the cluster
+
+  hosts:
+    - localhost
+
+  gather_facts: false
+  become: true
+
+  vars_prompt:
+    - name: ireallymeanit
+      prompt: Are you sure you want to shrink the cluster?
+      default: 'no'
+      private: no
+
+  tasks:
+    - include_vars: roles/ceph-common/defaults/main.yml
+    - include_vars: group_vars/all
+
+    - name: exit playbook, if user did not mean to shrink cluster
+      fail:
+        msg: "Exiting shrink-osd playbook, no osd(s) was/were removed.
+           To shrink the cluster, either say 'yes' on the prompt or
+           use `-e ireallymeanit=yes` on the command line when
+           invoking the playbook"
+      when: ireallymeanit != 'yes'
+
+    - name: exit playbook, if no osd(s) was/were given
+      fail:
+        msg: "osd_ids must be declared.
+           Exiting shrink-osd playbook, no OSD(s) was/were removed.
+           On the command line when invoking the playbook, you can use
+           the -e osd_ids=0,1,2,3 argument."
+      when: osd_ids is not defined
+
+    - name: test if the ceph command exists
+      command: command -v ceph
+      changed_when: false
+      failed_when: false
+      register: ceph_command
+
+    - name: exit playbook, if ceph command does not exist
+      debug:
+        msg: "The ceph command is not available, please install it :("
+      run_once: true
+      when:
+        - ceph_command.rc != 0
+
+    - name: exit playbook, if cluster files do not exist
+      stat:
+        path: "{{ item }}"
+      register: ceph_conf_key
+      with_items:
+        - /etc/ceph/{{ cluster }}.conf
+        - /etc/ceph/{{ cluster }}.client.admin.keyring
+      failed_when: false
+
+    - fail:
+        msg: "Ceph's configuration file is not present in /etc/ceph"
+      with_items: "{{ceph_conf_key.results}}"
+      when:
+        - item.stat.exists == false
+
+    - name: exit playbook, if we cannot connect to the cluster
+      command: timeout 5 ceph --cluster {{ cluster }} health
+      register: ceph_health
+      until: ceph_health.stdout.find("HEALTH") > -1
+      retries: 5
+      delay: 2
+
+# NOTE (leseb): in case the complex filter mechanism below ever stops working,
+# the commented-out task below is a quick and easy fallback, although it
+# requires using the shell module.
+#    - name: find the host where the osd(s) is/are running on
+#      shell: |
+#        ceph --cluster {{ cluster }} osd find {{ item }} | grep -Po '(?<="ip": ")[^:]*'
+#      with_items: "{{osd_ids.split(',')}}"
+#      register: osd_hosts
+#
+    - name: find the host where the osd(s) is/are running on
+      command: ceph --cluster {{ cluster }} osd find {{ item }}
+      with_items: "{{osd_ids.split(',')}}"
+      register: osd_hosts
+
+    - set_fact: ip_item="{{(item.stdout | from_json).ip}}"
+      with_items: "{{osd_hosts.results}}"
+      register: ip_result
+
+    - set_fact: ips="{{ ip_result.results | map(attribute='ansible_facts.ip_item') | list }}"
+
+    - set_fact: real_ips="{{ ips | regex_replace(':[0-9][0-9][0-9][0-9]\/[0-9][0-9][0-9][0-9]', '') }}"
+
+    - name: check if the ceph admin key exists on the osd nodes
+      stat:
+        path: "/etc/ceph/{{ cluster }}.client.admin.keyring"
+      register: ceph_admin_key
+      with_items: "{{real_ips}}"
+      delegate_to: "{{item}}"
+      failed_when: false
+
+    - fail:
+        msg: "The Ceph admin key is not present on the OSD node, please add it and remove it after the playbook is done."
+      with_items: "{{ceph_admin_key.results}}"
+      when:
+        - item.stat.exists == false
+
+    - name: deactivating osd(s)
+      command: ceph-disk deactivate --cluster {{ cluster }} --deactivate-by-id {{ item.0 }} --mark-out
+      with_together:
+        - "{{osd_ids.split(',')}}"
+        - "{{real_ips}}"
+      delegate_to: "{{item.1}}"
+
+    - name: destroying osd(s)
+      command: ceph-disk destroy --cluster {{ cluster }} --destroy-by-id {{ item.0 }} --zap
+      with_together:
+        - "{{osd_ids.split(',')}}"
+        - "{{real_ips}}"
+      delegate_to: "{{item.1}}"
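+
+# NOTE: a short illustration of the set_fact chain above, as a sketch only;
+# the exact JSON layout of 'ceph osd find' can vary between Ceph releases.
+# On a Jewel-era cluster it returns something along these lines (the host
+# name and address below are made-up examples):
+#
+#   {"osd": 0, "ip": "192.168.0.2:6800/2096", "crush_location": {"host": "osd-node-01", "root": "default"}}
+#
+# The 'ip' field has the form ADDR:PORT/NONCE, which is why the regex_replace
+# filter strips the ':PORT/NONCE' suffix (as written it expects a four-digit
+# port and nonce) so that delegate_to gets a plain address.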