shrink-mds: refactor post tasks

This commit refactors the way we check that the "mds_to_kill" node is properly stopped. Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> Co-authored-by: Rishabh Dave <ridave@redhat.com> (cherry picked from commit 7df62fde34)
2019-07-03 10:45:46 +02:00 · 2019-07-03 10:45:46 +02:00 · 85a448429d
parent e213163b63
commit 85a448429d
1 changed files with 59 additions and 24 deletions
--- a/infrastructure-playbooks/shrink-mds.yml
+++ b/infrastructure-playbooks/shrink-mds.yml
@ -59,15 +59,11 @@
    - name: set_fact container_exec_cmd for mon0
      set_fact:
-        container_exec_cmd: >
+        container_exec_cmd: "{{ container_binary }} exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_hostname'] }}"
          {{ container_binary }} exec ceph-mon-{{ hostvars[groups
          [mon_group_name][0]]['ansible_hostname'] }}
      when: containerized_deployment | bool
    - name: exit playbook, if can not connect to the cluster
-      command: >
+      command: "{{ container_exec_cmd | default('') }} timeout 5 ceph --cluster {{ cluster }} health"
        {{ container_exec_cmd | default('') }} timeout 5 ceph --cluster
        {{ cluster }} health
      register: ceph_health
      until: ceph_health is succeeded
      delegate_to: "{{ groups[mon_group_name][0] }}"
@ -79,13 +75,62 @@
        mds_to_kill_hostname: "{{ hostvars[mds_to_kill]['ansible_hostname'] }}"
  tasks:
-    - name: stop mds service(s)
+    # get rid of this as soon as "systemctl stop ceph-mds@$HOSTNAME" also
-      service:
+    # removes the MDS from the FS map.
-        name: ceph-mds@{{ mds_to_kill_hostname }}
+    - name: exit mds if it the deployment is containerized
-        state: stopped
+      when: containerized_deployment | bool
-        enabled: no
+      command: "{{ container_exec_cmd | default('') }} ceph tell mds.{{ mds_to_kill }} exit"
-      delegate_to: "{{ mds_to_kill }}"
+      delegate_to: "{{ groups[mon_group_name][0] }}"
-      failed_when: false
+
    # Stop and disable the systemd unit of the MDS being removed, then poll
    # systemd until it no longer reports that unit as active.
    - name: stop mds service and verify it
      block:
        - name: stop mds service
          service:
            name: ceph-mds@{{ mds_to_kill_hostname }}
            state: stopped
            enabled: false
          delegate_to: "{{ mds_to_kill }}"
          # Best effort: a failure here is ignored on purpose; the next task
          # is the one that decides whether the daemon is really down.
          failed_when: false
        - name: ensure that the mds is stopped
          # Must query the same unit that was stopped above (ceph-mds@...).
          # Querying a non-existent unit name always yields a non-zero rc and
          # would make this check pass even while the daemon is running.
          command: "systemctl is-active ceph-mds@{{ mds_to_kill_hostname }}"
          register: mds_to_kill_status
          changed_when: false
          # `is-active` exits 0 only while the unit is active, so rc == 0
          # means the MDS is still running.
          failed_when: mds_to_kill_status.rc == 0
          # `until` defines the retry loop that `retries`/`delay` configure.
          until: mds_to_kill_status.rc != 0
          delegate_to: "{{ mds_to_kill }}"
          retries: 5
          delay: 2
    # Ask the monitors for the current MDS map and abort if the daemon that
    # was just stopped still shows up as an active rank or as a standby.
    - name: fail if the mds is reported as active or standby
      block:
        - name: get ceph status
          command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} -s -f json"
          register: ceph_status
          changed_when: false
          delegate_to: "{{ groups[mon_group_name][0] }}"
        - name: get active mds nodes list
          # Collects the `name` of every entry of fsmap.by_rank.
          set_fact:
            active_mdss: "{{ active_mdss | default([]) + [item.name] }}"
          with_items: "{{ (ceph_status.stdout | from_json)['fsmap']['by_rank'] }}"
        - name: get ceph fs dump status
          command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} fs dump -f json"
          register: ceph_fs_status
          changed_when: false
          delegate_to: "{{ groups[mon_group_name][0] }}"
        - name: create a list of standby mdss
          # The expression has to be wrapped in "{{ }}"; without it the fact
          # is the literal expression text instead of the list of names, and
          # the membership test below degrades to a substring match.
          set_fact:
            standby_mdss: "{{ (ceph_fs_status.stdout | from_json)['standbys'] | map(attribute='name') | list }}"
        - name: fail if mds just killed is being reported as active or standby
          fail:
            msg: "mds node {{ mds_to_kill }} still up and running."
          # The systemd unit is instantiated with mds_to_kill_hostname, so the
          # daemon is presumably registered under that name; the inventory
          # name is still checked to keep the original behaviour.
          when:
            - (mds_to_kill in active_mdss | default([])) or
              (mds_to_kill in standby_mdss | default([])) or
              (mds_to_kill_hostname in active_mdss | default([])) or
              (mds_to_kill_hostname in standby_mdss | default([]))
    # Remove the CephFS filesystem, but only when the status gathered after
    # stopping the MDS reports no remaining up or standby MDS daemons.
    # Without this guard the filesystem would be removed on every run.
    - name: delete the filesystem too if deleted the last mds too
      command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} fs rm --yes-i-really-mean-it {{ cephfs }}"
      delegate_to: "{{ groups[mon_group_name][0] }}"
      when:
        - (ceph_status.stdout | from_json)['fsmap']['up'] | default(0) | int == 0
        - (ceph_status.stdout | from_json)['fsmap']['up:standby'] | default(0) | int == 0
    - name: purge mds store
      file:
@ -94,16 +139,6 @@
      delegate_to: "{{ mds_to_kill }}"
  post_tasks:
    # Greps the `fs dump` output for "mds0" on the removed MDS host; the task
    # is considered failed when grep finds a match (rc == 0).
    # NOTE(review): cluster name and conf path are hard-coded here, unlike the
    # other tasks which use {{ cluster }} - confirm this is intended.
    - name: verify that the mds has stopped
      delegate_to: "{{ mds_to_kill }}"
      register: result
      failed_when: result.rc == 0
      shell: >
        {{ container_exec_cmd | default('') }} ceph --cluster ceph --conf
        /etc/ceph/ceph.conf fs dump | grep mds0
    - name: show ceph health
-      command: >
+      command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} -s"
        {{ container_exec_cmd | default('') }} ceph --cluster
        {{ cluster }} -s
      delegate_to: "{{ groups[mon_group_name][0] }}"