Subject: [PATCH] shrink-rgw: refactor global workflow
The rgw instance format is $(hostname).rgw$(instance number).
name: stop rgw service - service: - name: ceph-radosgw@rgw.{{ rgw_to_kill }} - state: stopped - enabled: no - delegate_to: "{{ rgw_to_kill_hostname }}" - failed_when: false + - name: get rgw instances + command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} -s -f json" + register: rgw_instances - - name: ensure that the rgw is stopped - command: "systemctl is-active ceph_rgw@rgw.{{ rgw_to_kill }}" - register: rgw_to_kill_status - failed_when: rgw_to_kill_status.rc == 0 - delegate_to: "{{ rgw_to_kill_hostname }}" - retries: 5 - delay: 2 + + - name: exit playbook, if the rgw_to_kill doesn't exist + when: rgw_to_kill not in (rgw_instances.stdout | from_json).servicemap.services.rgw.daemons.keys() | list + fail: + msg: > + It seems that the rgw instance given is not part of the ceph cluster. Please + make sure it is. + The rgw instance format is $(hostname}.rgw$(instance number). + tasks: + - name: get rgw host running the rgw instance to kill + set_fact: + rgw_host: '{{ item }}' + with_items: '{{ groups[rgw_group_name] }}' + when: hostvars[item]['ansible_hostname'] == rgw_to_kill.split('.')[0] + + - name: stop rgw service + service: + name: ceph-radosgw@rgw.{{ rgw_to_kill }} + state: stopped + enabled: no + delegate_to: "{{ rgw_host }}" + failed_when: false + + - name: ensure that the rgw is stopped + command: "systemctl is-active ceph-radosgw@rgw.{{ rgw_to_kill }}" + register: rgw_to_kill_status + failed_when: rgw_to_kill_status.rc == 0 + delegate_to: "{{ rgw_host }}" + retries: 5 + delay: 2 - name: exit if rgw_to_kill is reported in ceph status command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} -s -f json" register: ceph_status - delegate_to: "{{ groups[mon_group_name][0] }}" failed_when: - (ceph_status.stdout | from_json).servicemap.services.rgw is defined - rgw_to_kill in (ceph_status.stdout | from_json).servicemap.services.rgw.daemons.keys() | list @@ -120,11 +120,9 @@ - name: purge directories related to rgw 
file: - path: /var/lib/ceph/radosgw/{{ cluster }}-{{ rgw_to_kill_hostname }} + path: /var/lib/ceph/radosgw/{{ cluster }}-rgw.{{ rgw_to_kill }} state: absent - delegate_to: "{{ rgw_to_kill_hostname }}" - + delegate_to: "{{ rgw_host }}" post_tasks: - name: show ceph health command: "{{ container_exec_cmd | default('')}} ceph --cluster {{ cluster }} -s" - delegate_to: "{{ groups[mon_group_name][0] }}"