ceph-ansible/infrastructure-playbooks/shrink-osd.yml

---
# This playbook shrinks Ceph OSDs.
# It can remove any number of OSD(s) from the cluster and ALL THEIR DATA
#
# Use it like this:
# ansible-playbook shrink-osd.yml -e osd_to_kill=0,2,6
#     Prompts for confirmation to shrink, defaults to no and
#     doesn't shrink the cluster. yes shrinks the cluster.
#
# ansible-playbook -e ireallymeanit=yes|no -e osd_to_kill=0,2,6 shrink-osd.yml
#     Answers the confirmation prompt through the -e option, so it can be
#     used in automation scripts to avoid the interactive prompt.
#     osd_to_kill is still required in this form.

# First play: targets every monitor and OSD host so that Ansible's default
# fact gathering runs on them. No task here inspects the init system
# explicitly; the only task is an informational debug message.
- name: gather facts and check the init system

  hosts:
    - "{{ mon_group_name|default('mons') }}"
    - "{{ osd_group_name|default('osds') }}"

  become: true
  tasks:
    # Native YAML module arguments instead of the legacy key=value string.
    - debug:
        msg: "gather facts on all Ceph hosts for following reference"

# Second play: runs from localhost and delegates individual tasks to the
# first monitor or to the hosts that carry the OSDs being removed.
- name: confirm whether user really meant to remove osd(s) from the cluster

  hosts: localhost

  become: true

  # Inventory group names used to look up the first monitor in `groups`.
  vars:
    mon_group_name: mons
    osd_group_name: osds

  # Interactive safety net; the default answer is the string 'no'.
  # The answer is echoed back because the prompt is not private.
  vars_prompt:
    - name: ireallymeanit
      prompt: Are you sure you want to shrink the cluster?
      default: "no"
      private: false

  # Guards that run before the roles and post_tasks of this play.
  pre_tasks:
    # Abort unless the operator answered exactly 'yes', either at the prompt
    # or through `-e ireallymeanit=yes`.
    - name: exit playbook, if user did not mean to shrink cluster
      fail:
        # Folded scalar: rendered as a single line. The original text had a
        # doubled period and a doubled "or" once its lines were joined.
        msg: >-
          Exiting shrink-osd playbook, no osd(s) was/were removed.
          To shrink the cluster, either say 'yes' on the prompt
          or use `-e ireallymeanit=yes` on the command line when
          invoking the playbook
      when: ireallymeanit != 'yes'

    # Abort when the list of OSD ids to remove was not supplied at all.
    - name: exit playbook, if no osd(s) was/were given
      fail:
        # Folded scalar: rendered as a single line. A period now separates
        # the first sentence from the next, which the original ran together.
        msg: >-
          osd_to_kill must be declared.
          Exiting shrink-osd playbook, no OSD(s) was/were removed.
          On the command line when invoking the playbook, you can use
          -e osd_to_kill=0,1,2,3 argument.
      when: osd_to_kill is not defined

  # Roles are applied after pre_tasks and before post_tasks.
  # NOTE(review): presumably ceph-defaults supplies the `cluster` variable
  # that the post_tasks reference — confirm against the role's defaults.
  roles:
    - ceph-defaults

  post_tasks:

    # Connectivity probe against the first monitor: `ceph health` is wrapped
    # in a 5 second timeout and retried (5 retries, 2 seconds apart) until
    # its stdout contains "HEALTH". The play fails if that never happens.
    - name: exit playbook, if can not connect to the cluster
      command: timeout 5 ceph --cluster {{ cluster }} health
      register: ceph_health
      until: ceph_health.stdout.find("HEALTH") > -1
      delegate_to: "{{ groups[mon_group_name][0] }}"
      retries: 5
      delay: 2
      # Read-only query: do not report it as a change.
      changed_when: false

    # Run `ceph osd find <id>` on the first monitor for every id in the
    # comma-separated osd_to_kill string. The per-id results are registered
    # so that the host of each OSD can be extracted afterwards.
    - name: find the host(s) where the osd(s) is/are running on
      command: ceph --cluster {{ cluster }} osd find {{ item }}
      with_items: "{{ osd_to_kill.split(',') }}"
      delegate_to: "{{ groups[mon_group_name][0] }}"
      register: find_osd_hosts
      # Read-only query: do not report it as a change.
      changed_when: false

    # Build osd_hosts by appending, for each `osd find` result, the
    # crush_location.host value parsed from its JSON stdout. The list ends up
    # in the same order as the ids in osd_to_kill, which the with_together
    # loops further down rely on.
    - name: collect the host of each osd to remove
      set_fact:
        osd_hosts: "{{ osd_hosts | default([]) + [ (item.stdout | from_json).crush_location.host ] }}"
      with_items: "{{ find_osd_hosts.results }}"

    # Stat the cluster's admin keyring on each host listed in osd_hosts.
    # failed_when is forced to false so this task itself never fails; the
    # registered results are inspected by the fail task that follows.
    - name: check if ceph admin key exists on the osd nodes
      delegate_to: "{{ item }}"
      with_items: "{{ osd_hosts }}"
      stat:
        path: "/etc/ceph/{{ cluster }}.client.admin.keyring"
      failed_when: false
      register: ceph_admin_key

    # Stop the play as soon as one of the stat results registered in
    # ceph_admin_key reports that the admin keyring does not exist.
    - name: fail if the ceph admin key is missing on an osd node
      fail:
        msg: "The Ceph admin key is not present on the OSD node, please add it and remove it after the playbook is done."
      with_items: "{{ ceph_admin_key.results }}"
      when:
        - not item.stat.exists

    # Walk the OSD ids and their hosts in lockstep (item.0 = id, item.1 =
    # host) and run `ceph-disk deactivate` for each id on its own host.
    - name: deactivating osd(s)
      delegate_to: "{{ item.1 }}"
      with_together:
        - "{{ osd_to_kill.split(',') }}"
        - "{{ osd_hosts }}"
      command: ceph-disk deactivate --cluster {{ cluster }} --deactivate-by-id {{ item.0 }} --mark-out
      run_once: true
      # Errors are tolerated here; the results are registered so that the
      # fallback task that follows can react to them.
      ignore_errors: true
      register: deactivate

    # Fallback for the deactivate step: for every OSD id whose deactivate
    # result carries a non-empty stderr, mark the OSD out from the first
    # monitor instead.
    # NOTE(review): non-empty stderr is used as the failure signal here;
    # confirm ceph-disk never writes to stderr on success.
    - name: set osd(s) out when ceph-disk deactivating fail
      delegate_to: "{{ groups[mon_group_name][0] }}"
      with_together:
        - "{{ osd_to_kill.split(',') }}"
        - "{{ deactivate.results }}"
      when: item.1.stderr|length > 0
      command: ceph --cluster {{ cluster }} osd out osd.{{ item.0 }}

    # Walk the OSD ids and their hosts in lockstep (item.0 = id, item.1 =
    # host) and run `ceph-disk destroy ... --zap` for each id on its host.
    - name: destroying osd(s)
      delegate_to: "{{ item.1 }}"
      with_together:
        - "{{ osd_to_kill.split(',') }}"
        - "{{ osd_hosts }}"
      command: ceph-disk destroy --cluster {{ cluster }} --destroy-by-id {{ item.0 }} --zap
      run_once: true
      # Errors are tolerated here; the results are registered so that the
      # fallback tasks that follow can react to them.
      ignore_errors: true
      register: destroy

    # Fallback 1 of 3 for the destroy step: for every OSD id whose destroy
    # result carries a non-empty stderr, remove the OSD from the CRUSH map
    # via the first monitor.
    - name: remove osd(s) from crush_map when ceph-disk destroy fail
      delegate_to: "{{ groups[mon_group_name][0] }}"
      run_once: true
      with_together:
        - "{{ osd_to_kill.split(',') }}"
        - "{{ destroy.results }}"
      when: item.1.stderr|length > 0
      command: ceph --cluster {{ cluster }} osd crush remove osd.{{ item.0 }}

    # Fallback 2 of 3 for the destroy step: for every OSD id whose destroy
    # result carries a non-empty stderr, delete the OSD's auth entry via the
    # first monitor.
    - name: delete osd(s) auth key when ceph-disk destroy fail
      delegate_to: "{{ groups[mon_group_name][0] }}"
      with_together:
        - "{{ osd_to_kill.split(',') }}"
        - "{{ destroy.results }}"
      when: item.1.stderr|length > 0
      command: ceph --cluster {{ cluster }} auth del osd.{{ item.0 }}

    # Fallback 3 of 3 for the destroy step: for every OSD id whose destroy
    # result carries a non-empty stderr, run `ceph osd rm <id>` via the
    # first monitor.
    - name: deallocate osd(s) id when ceph-disk destroy fail
      delegate_to: "{{ groups[mon_group_name][0] }}"
      with_together:
        - "{{ osd_to_kill.split(',') }}"
        - "{{ destroy.results }}"
      when: item.1.stderr|length > 0
      command: ceph --cluster {{ cluster }} osd rm {{ item.0 }}

    # Query the cluster status from the first monitor. The command module
    # alone does not print its output at default verbosity, so the result is
    # registered and echoed by the debug task below.
    - name: show ceph health
      command: ceph --cluster {{ cluster }} -s
      delegate_to: "{{ groups[mon_group_name][0] }}"
      register: ceph_status
      # Read-only query: do not report it as a change.
      changed_when: false

    - name: display ceph health output
      debug:
        msg: "{{ ceph_status.stdout_lines }}"

    # Query the OSD tree from the first monitor. The command module alone
    # does not print its output at default verbosity, so the result is
    # registered and echoed by the debug task below.
    - name: show ceph osd tree
      command: ceph --cluster {{ cluster }} osd tree
      delegate_to: "{{ groups[mon_group_name][0] }}"
      register: ceph_osd_tree
      # Read-only query: do not report it as a change.
      changed_when: false

    - name: display ceph osd tree output
      debug:
        msg: "{{ ceph_osd_tree.stdout_lines }}"
add shrink playbooks: mons and osds We now have the ability to shrink a ceph cluster with the help of 2 new playbooks. Even if a lot portions of those are identical I thought I would make more sense to separate both for several reasons: * it is rare to remove mon(s) and osd(s) * this remains a tricky process so to avoid any overlap we keep things * separated For monitors, just select the list of the monitor hostnames you want to delete from the cluster and execute the playbook like this. The hostname must be resolvable. Then run the playbook like this: ansible-playbook shrink-cluster.yml -e mon_host=ceph-mon-01,ceph-mon-02 Are you sure you want to shrink the cluster? [no]: yes For OSDs, just select the list of the OSD id you want to delete from the cluster and execute the playbook like this: ansible-playbook shrink-cluster.yml -e osd_ids=0,2,4 Are you sure you want to shrink the cluster? [no]: yes If you know what you're doing you can run it like this: ansible-playbook shrink-cluster.yml -e ireallymeanit=yes -e osd_ids=0,2,4 Thanks a lot to @SamYaple for his help on the complex variables/fact/filters Signed-off-by: Sébastien Han <seb@redhat.com> 2016-08-11 23:20:07 +08:00			`---`
			`# This playbook shrinks Ceph OSDs.`
			`# It can remove any number of OSD(s) from the cluster and ALL THEIR DATA`
			`#`
			`# Use it like this:`
shrink mon and osd Rework shrinking a monitor and an OSD playbook. Also adding test scenario. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1366807 Signed-off-by: Sébastien Han <seb@redhat.com> 2017-08-31 06:07:28 +08:00			`# ansible-playbook shrink-osd.yml -e osd_to_kill=0,2,6`
add shrink playbooks: mons and osds We now have the ability to shrink a ceph cluster with the help of 2 new playbooks. Even if a lot portions of those are identical I thought I would make more sense to separate both for several reasons: * it is rare to remove mon(s) and osd(s) * this remains a tricky process so to avoid any overlap we keep things * separated For monitors, just select the list of the monitor hostnames you want to delete from the cluster and execute the playbook like this. The hostname must be resolvable. Then run the playbook like this: ansible-playbook shrink-cluster.yml -e mon_host=ceph-mon-01,ceph-mon-02 Are you sure you want to shrink the cluster? [no]: yes For OSDs, just select the list of the OSD id you want to delete from the cluster and execute the playbook like this: ansible-playbook shrink-cluster.yml -e osd_ids=0,2,4 Are you sure you want to shrink the cluster? [no]: yes If you know what you're doing you can run it like this: ansible-playbook shrink-cluster.yml -e ireallymeanit=yes -e osd_ids=0,2,4 Thanks a lot to @SamYaple for his help on the complex variables/fact/filters Signed-off-by: Sébastien Han <seb@redhat.com> 2016-08-11 23:20:07 +08:00			`# Prompts for confirmation to shrink, defaults to no and`
			`# doesn't shrink the cluster. yes shrinks the cluster.`
			`#`
			`# ansible-playbook -e ireallymeanit=yes\|no shrink-osd.yml`
			`# Overrides the prompt using -e option. Can be used in`
			`# automation scripts to avoid interactive prompt.`

shrink mon and osd Rework shrinking a monitor and an OSD playbook. Also adding test scenario. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1366807 Signed-off-by: Sébastien Han <seb@redhat.com> 2017-08-31 06:07:28 +08:00			`- name: gather facts and check the init system`

			`hosts:`
			`- "{{ mon_group_name\|default('mons') }}"`
			`- "{{ osd_group_name\|default('osds') }}"`

			`become: True`
			`tasks:`
			`- debug: msg="gather facts on all Ceph hosts for following reference"`
add shrink playbooks: mons and osds We now have the ability to shrink a ceph cluster with the help of 2 new playbooks. Even if a lot portions of those are identical I thought I would make more sense to separate both for several reasons: * it is rare to remove mon(s) and osd(s) * this remains a tricky process so to avoid any overlap we keep things * separated For monitors, just select the list of the monitor hostnames you want to delete from the cluster and execute the playbook like this. The hostname must be resolvable. Then run the playbook like this: ansible-playbook shrink-cluster.yml -e mon_host=ceph-mon-01,ceph-mon-02 Are you sure you want to shrink the cluster? [no]: yes For OSDs, just select the list of the OSD id you want to delete from the cluster and execute the playbook like this: ansible-playbook shrink-cluster.yml -e osd_ids=0,2,4 Are you sure you want to shrink the cluster? [no]: yes If you know what you're doing you can run it like this: ansible-playbook shrink-cluster.yml -e ireallymeanit=yes -e osd_ids=0,2,4 Thanks a lot to @SamYaple for his help on the complex variables/fact/filters Signed-off-by: Sébastien Han <seb@redhat.com> 2016-08-11 23:20:07 +08:00
			`- name: confirm whether user really meant to remove osd(s) from the cluster`

			`hosts:`
			`- localhost`

			`become: true`

			`vars_prompt:`
			`- name: ireallymeanit`
			`prompt: Are you sure you want to shrink the cluster?`
			`default: 'no'`
			`private: no`

shrink mon and osd Rework shrinking a monitor and an OSD playbook. Also adding test scenario. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1366807 Signed-off-by: Sébastien Han <seb@redhat.com> 2017-08-31 06:07:28 +08:00			`vars:`
			`mon_group_name: mons`
			`osd_group_name: osds`

			`pre_tasks:`
			`- name: exit playbook, if user did not mean to shrink cluster`
			`fail:`
			`msg: "Exiting shrink-osd playbook, no osd(s) was/were removed..`
			`To shrink the cluster, either say 'yes' on the prompt or`
			or use `-e ireallymeanit=yes` on the command line when
			`invoking the playbook"`
			`when: ireallymeanit != 'yes'`

			`- name: exit playbook, if no osd(s) was/were given`
			`fail:`
			`msg: "osd_to_kill must be declared`
shrink-osd: fix when multiple osds The loop was being built properly so we were always getting the last item as osd host. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1490355 Signed-off-by: Sébastien Han <seb@redhat.com> 2017-09-13 07:05:34 +08:00			`Exiting shrink-osd playbook, no OSD(s) was/were removed.`
shrink mon and osd Rework shrinking a monitor and an OSD playbook. Also adding test scenario. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1366807 Signed-off-by: Sébastien Han <seb@redhat.com> 2017-08-31 06:07:28 +08:00			`On the command line when invoking the playbook, you can use`
			`-e osd_to_kill=0,1,2,3 argument."`
			`when: osd_to_kill is not defined`

			`roles:`
			`- ceph-defaults`

			`post_tasks:`

			`- name: exit playbook, if can not connect to the cluster`
			`command: timeout 5 ceph --cluster {{ cluster }} health`
			`register: ceph_health`
			`until: ceph_health.stdout.find("HEALTH") > -1`
			`delegate_to: "{{ groups[mon_group_name][0] }}"`
			`retries: 5`
			`delay: 2`

			`- name: find the host(s) where the osd(s) is/are running on`
			`command: ceph --cluster {{ cluster }} osd find {{ item }}`
			`with_items: "{{ osd_to_kill.split(',') }}"`
			`delegate_to: "{{ groups[mon_group_name][0] }}"`
			`register: find_osd_hosts`

			`- set_fact:`
shrink-osd: fix when multiple osds The loop was being built properly so we were always getting the last item as osd host. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1490355 Signed-off-by: Sébastien Han <seb@redhat.com> 2017-09-13 07:05:34 +08:00			`osd_hosts: "{{ osd_hosts \| default([]) + [ (item.stdout \| from_json).crush_location.host ] }}"`
shrink mon and osd Rework shrinking a monitor and an OSD playbook. Also adding test scenario. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1366807 Signed-off-by: Sébastien Han <seb@redhat.com> 2017-08-31 06:07:28 +08:00			`with_items: "{{ find_osd_hosts.results }}"`

			`- name: check if ceph admin key exists on the osd nodes`
			`stat:`
			`path: "/etc/ceph/{{ cluster }}.client.admin.keyring"`
			`register: ceph_admin_key`
			`with_items: "{{ osd_hosts }}"`
			`delegate_to: "{{ item }}"`
			`failed_when: false`

			`- fail:`
			`msg: "The Ceph admin key is not present on the OSD node, please add it and remove it after the playbook is done."`
			`with_items: "{{ ceph_admin_key.results }}"`
			`when:`
			`- item.stat.exists == false`

			`- name: deactivating osd(s)`
			`command: ceph-disk deactivate --cluster {{ cluster }} --deactivate-by-id {{ item.0 }} --mark-out`
			`register: deactivate`
			`ignore_errors: yes`
shrink-osd: fix when multiple osds The loop was being built properly so we were always getting the last item as osd host. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1490355 Signed-off-by: Sébastien Han <seb@redhat.com> 2017-09-13 07:05:34 +08:00			`run_once: true`
shrink mon and osd Rework shrinking a monitor and an OSD playbook. Also adding test scenario. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1366807 Signed-off-by: Sébastien Han <seb@redhat.com> 2017-08-31 06:07:28 +08:00			`with_together:`
			`- "{{ osd_to_kill.split(',') }}"`
			`- "{{ osd_hosts }}"`
			`delegate_to: "{{ item.1 }}"`

			`- name: set osd(s) out when ceph-disk deactivating fail`
			`command: ceph --cluster {{ cluster }} osd out osd.{{ item.0 }}`
			`delegate_to: "{{ groups[mon_group_name][0] }}"`
			`with_together:`
			`- "{{ osd_to_kill.split(',') }}"`
			`- "{{ deactivate.results }}"`
			`when:`
			`- item.1.stderr\|length > 0`

			`- name: destroying osd(s)`
			`command: ceph-disk destroy --cluster {{ cluster }} --destroy-by-id {{ item.0 }} --zap`
			`register: destroy`
			`ignore_errors: yes`
shrink-osd: fix when multiple osds The loop was being built properly so we were always getting the last item as osd host. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1490355 Signed-off-by: Sébastien Han <seb@redhat.com> 2017-09-13 07:05:34 +08:00			`run_once: true`
shrink mon and osd Rework shrinking a monitor and an OSD playbook. Also adding test scenario. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1366807 Signed-off-by: Sébastien Han <seb@redhat.com> 2017-08-31 06:07:28 +08:00			`with_together:`
			`- "{{ osd_to_kill.split(',') }}"`
			`- "{{ osd_hosts }}"`
			`delegate_to: "{{ item.1 }}"`

			`- name: remove osd(s) from crush_map when ceph-disk destroy fail`
			`command: ceph --cluster {{ cluster }} osd crush remove osd.{{ item.0 }}`
shrink-osd: fix when multiple osds The loop was being built properly so we were always getting the last item as osd host. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1490355 Signed-off-by: Sébastien Han <seb@redhat.com> 2017-09-13 07:05:34 +08:00			`run_once: true`
shrink mon and osd Rework shrinking a monitor and an OSD playbook. Also adding test scenario. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1366807 Signed-off-by: Sébastien Han <seb@redhat.com> 2017-08-31 06:07:28 +08:00			`delegate_to: "{{ groups[mon_group_name][0] }}"`
			`with_together:`
			`- "{{ osd_to_kill.split(',') }}"`
			`- "{{ destroy.results }}"`
			`when:`
			`- item.1.stderr\|length > 0`

			`- name: delete osd(s) auth key when ceph-disk destroy fail`
			`command: ceph --cluster {{ cluster }} auth del osd.{{ item.0 }}`
			`delegate_to: "{{ groups[mon_group_name][0] }}"`
			`with_together:`
			`- "{{ osd_to_kill.split(',') }}"`
			`- "{{ destroy.results }}"`
			`when:`
			`- item.1.stderr\|length > 0`

			`- name: deallocate osd(s) id when ceph-disk destroy fail`
			`command: ceph --cluster {{ cluster }} osd rm {{ item.0 }}`
			`delegate_to: "{{ groups[mon_group_name][0] }}"`
			`with_together:`
			`- "{{ osd_to_kill.split(',') }}"`
			`- "{{ destroy.results }}"`
			`when:`
			`- item.1.stderr\|length > 0`

			`- name: show ceph health`
			`command: ceph --cluster {{ cluster }} -s`
			`delegate_to: "{{ groups[mon_group_name][0] }}"`

			`- name: show ceph osd tree`
			`command: ceph --cluster {{ cluster }} osd tree`
			`delegate_to: "{{ groups[mon_group_name][0] }}"`