From 85a448429dceba161c1d8867f2813657b7868045 Mon Sep 17 00:00:00 2001
From: Guillaume Abrioux
Date: Wed, 3 Jul 2019 10:45:46 +0200
Subject: [PATCH] shrink-mds: refact post tasks

This commit refacts the way we check the "mds_to_kill" node is well stopped.

Signed-off-by: Guillaume Abrioux
Co-authored-by: Rishabh Dave
(cherry picked from commit 7df62fde3476aa0a79821f26775660de1ec0c68e)
---
Review notes (not part of the commit message); this copy differs from the
upstream commit in the following ways:
  - the stop check queries the "ceph-mds@" unit (was "ceph_mds@") and
    really polls, thanks to an explicit "until";
  - "standby_mdss" is a templated list instead of a literal string;
  - the mds is addressed by mds_to_kill_hostname, the same id used for the
    "ceph-mds@" unit, and "ceph tell" passes "--cluster";
  - "ceph fs rm" only runs when no active or standby mds is left.

 infrastructure-playbooks/shrink-mds.yml | 93 ++++++++++++++++++-------
 1 file changed, 69 insertions(+), 24 deletions(-)

diff --git a/infrastructure-playbooks/shrink-mds.yml b/infrastructure-playbooks/shrink-mds.yml
index 947f88cbf..470fcc824 100644
--- a/infrastructure-playbooks/shrink-mds.yml
+++ b/infrastructure-playbooks/shrink-mds.yml
@@ -59,15 +59,11 @@
 
     - name: set_fact container_exec_cmd for mon0
       set_fact:
-        container_exec_cmd: >
-          {{ container_binary }} exec ceph-mon-{{ hostvars[groups
-          [mon_group_name][0]]['ansible_hostname'] }}
+        container_exec_cmd: "{{ container_binary }} exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_hostname'] }}"
       when: containerized_deployment | bool
 
     - name: exit playbook, if can not connect to the cluster
-      command: >
-        {{ container_exec_cmd | default('') }} timeout 5 ceph --cluster
-        {{ cluster }} health
+      command: "{{ container_exec_cmd | default('') }} timeout 5 ceph --cluster {{ cluster }} health"
       register: ceph_health
       until: ceph_health is succeeded
       delegate_to: "{{ groups[mon_group_name][0] }}"
@@ -79,13 +75,72 @@
         mds_to_kill_hostname: "{{ hostvars[mds_to_kill]['ansible_hostname'] }}"
 
   tasks:
-    - name: stop mds service(s)
-      service:
-        name: ceph-mds@{{ mds_to_kill_hostname }}
-        state: stopped
-        enabled: no
-      delegate_to: "{{ mds_to_kill }}"
-      failed_when: false
+    # get rid of this as soon as "systemctl stop ceph-mds@$HOSTNAME" also
+    # removes the MDS from the FS map.
+    - name: exit mds if it the deployment is containerized
+      when: containerized_deployment | bool
+      command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} tell mds.{{ mds_to_kill_hostname }} exit"
+      delegate_to: "{{ groups[mon_group_name][0] }}"
+
+    - name: stop mds service and verify it
+      block:
+        - name: stop mds service
+          service:
+            name: ceph-mds@{{ mds_to_kill_hostname }}
+            state: stopped
+            enabled: no
+          delegate_to: "{{ mds_to_kill }}"
+          failed_when: false
+
+        # "systemctl is-active" exits 0 only while the unit is still active,
+        # so poll until it returns non-zero and fail if it never does.
+        - name: ensure that the mds is stopped
+          command: "systemctl is-active ceph-mds@{{ mds_to_kill_hostname }}"
+          register: mds_to_kill_status
+          failed_when: mds_to_kill_status.rc == 0
+          until: mds_to_kill_status.rc != 0
+          delegate_to: "{{ mds_to_kill }}"
+          retries: 5
+          delay: 2
+
+    - name: fail if the mds is reported as active or standby
+      block:
+        - name: get ceph status
+          command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} -s -f json"
+          register: ceph_status
+          delegate_to: "{{ groups[mon_group_name][0] }}"
+
+        - name: get active mds nodes list
+          set_fact:
+            active_mdss: "{{ active_mdss | default([]) + [item.name] }}"
+          with_items: "{{ (ceph_status.stdout | from_json)['fsmap']['by_rank'] }}"
+
+        - name: get ceph fs dump status
+          command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} fs dump -f json"
+          register: ceph_fs_status
+          delegate_to: "{{ groups[mon_group_name][0] }}"
+
+        - name: create a list of standby mdss
+          set_fact:
+            standby_mdss: "{{ (ceph_fs_status.stdout | from_json)['standbys'] | map(attribute='name') | list }}"
+
+        # the mds id is the instance name of the ceph-mds@ unit stopped above,
+        # so compare the reported names against mds_to_kill_hostname.
+        - name: fail if mds just killed is being reported as active or standby
+          fail:
+            msg: "mds node {{ mds_to_kill }} still up and running."
+          when:
+            - (mds_to_kill_hostname in active_mdss | default([])) or
+              (mds_to_kill_hostname in standby_mdss | default([]))
+
+    # only drop the filesystem when the status fetched above reports no
+    # remaining active or standby mds, i.e. the last mds was just removed.
+    - name: delete the filesystem too if deleted the last mds too
+      command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} fs rm --yes-i-really-mean-it {{ cephfs }}"
+      delegate_to: "{{ groups[mon_group_name][0] }}"
+      when:
+        - (ceph_status.stdout | from_json)['fsmap']['up'] | default(0) | int == 0
+        - (ceph_status.stdout | from_json)['fsmap']['up:standby'] | default(0) | int == 0
 
     - name: purge mds store
       file:
@@ -94,16 +149,6 @@
       delegate_to: "{{ mds_to_kill }}"
 
   post_tasks:
-    - name: verify that the mds has stopped
-      shell: >
-        {{ container_exec_cmd | default('') }} ceph --cluster ceph --conf
-        /etc/ceph/ceph.conf fs dump | grep mds0
-      register: result
-      failed_when: result.rc == 0
-      delegate_to: "{{ mds_to_kill }}"
-
     - name: show ceph health
-      command: >
-        {{ container_exec_cmd | default('') }} ceph --cluster
-        {{ cluster }} -s
+      command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} -s"
       delegate_to: "{{ groups[mon_group_name][0] }}"