mirror of https://github.com/ceph/ceph-ansible.git
common: serialise host restart
This commit allows us to restart Ceph daemons machine by machine instead of restarting all the daemons in a single shot. It also reworks the structure of the handlers for clarity.

Signed-off-by: Sébastien Han <seb@redhat.com>
parent efc49e2347
commit 40a2df5bbf
@@ -339,6 +339,20 @@ dummy:
 # if you don't want it keep the option commented
 #common_single_host_mode: true
 
+## Handlers - restarting daemons after a config change
+# if for whatever reason the content of your ceph configuration changes
+# ceph daemons will be restarted as well. At the moment, we can not detect
+# which config option changed so all the daemons will be restarted. Although
+# this restart will be serialized for each node, in between a health check
+# will be performed so we make sure we don't move to the next node until
+# ceph is healthy
+# Obviously between the checks (for monitors to be in quorum and for osds' pgs
+# to be clean) we have to wait. These retries and delays can be configured
+# for both monitors and osds.
+#handler_health_mon_check_retries: 5
+#handler_health_mon_check_delay: 10
+#handler_health_osd_check_retries: 40
+#handler_health_osd_check_delay: 30
+
 ###################
 # CONFIG OVERRIDE #
@@ -331,6 +331,20 @@ restapi_port: 5000
 # if you don't want it keep the option commented
 #common_single_host_mode: true
 
+## Handlers - restarting daemons after a config change
+# if for whatever reason the content of your ceph configuration changes
+# ceph daemons will be restarted as well. At the moment, we can not detect
+# which config option changed so all the daemons will be restarted. Although
+# this restart will be serialized for each node, in between a health check
+# will be performed so we make sure we don't move to the next node until
+# ceph is healthy
+# Obviously between the checks (for monitors to be in quorum and for osds' pgs
+# to be clean) we have to wait. These retries and delays can be configured
+# for both monitors and osds.
+handler_health_mon_check_retries: 5
+handler_health_mon_check_delay: 10
+handler_health_osd_check_retries: 40
+handler_health_osd_check_delay: 30
+
 ###################
 # CONFIG OVERRIDE #
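The four handler_health_* variables above control how patient the serialized restart is: retries is the number of health-check attempts, delay the pause in seconds between attempts. A minimal sketch of tuning them for a slower cluster, assuming the usual group_vars override mechanism (the values here are illustrative, not recommendations):

handler_health_mon_check_retries: 10   # up to 10 quorum checks per monitor
handler_health_mon_check_delay: 15     # 15 seconds between quorum checks
handler_health_osd_check_retries: 60   # PG recovery can take a while
handler_health_osd_check_delay: 20     # 20 seconds between clean-pg checks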
roles/ceph-common/handlers/main.yml
@@ -2,41 +2,19 @@
 - name: update apt cache
   apt:
     update-cache: yes
   when: ansible_os_family == 'Debian'
 
 - name: restart ceph mons
-  service:
-    name: ceph-mon@{{ monitor_name }}
-    state: restarted
-  when:
-    - socket.rc == 0
-    - mon_group_name in group_names
+  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mon.yml"
 
-# This does not just restart OSDs but everything else too. Unfortunately
-# at this time the ansible role does not have an OSD id list to use
-# for restarting them specifically.
 - name: restart ceph osds
-  service:
-    name: ceph.target
-    state: restarted
-  when:
-    - socket.rc == 0
-    - osd_group_name in group_names
+  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-osd.yml"
 
 - name: restart ceph mdss
-  service:
-    name: ceph-mds@{{ mds_name }}
-    state: restarted
-  when:
-    - socket.rc == 0
-    - mds_group_name in group_names
+  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mds.yml"
 
 - name: restart ceph rgws
-  service:
-    name: ceph-rgw@{{ ansible_hostname }}
-    state: restarted
-  when:
-    - socketrgw.rc == 0
-    - rgw_group_name in group_names
+  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-rgw.yml"
 
 - name: restart ceph nfss
   service:
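For context, these handlers are triggered through Ansible's notify mechanism when a task reports a change, typically the task that templates out the ceph configuration file. A minimal sketch of such a trigger (the task name and template source are illustrative, not taken from this commit):

- name: generate ceph configuration file
  template:
    src: ceph.conf.j2                      # illustrative template name
    dest: /etc/ceph/{{ cluster }}.conf
  notify:
    - restart ceph mons
    - restart ceph osds
    - restart ceph mdss
    - restart ceph rgws

Each notified handler now just includes its restart-*.yml file, which loops over the whole group from a single host, restarting and validating one node at a time.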
roles/ceph-common/handlers/restart-mds.yml (new file)
@@ -0,0 +1,13 @@
+---
+- name: restart ceph mdss
+  service:
+    name: ceph-mds@{{ mds_name }}
+    state: restarted
+  # serial: 1 would be the proper solution here, but that can only be set on play level
+  # upstream issue: https://github.com/ansible/ansible/issues/12170
+  run_once: true
+  with_items: "{{ groups[mds_group_name] }}"
+  delegate_to: "{{ item }}"
+  when:
+    - socket.rc == 0
+    - mds_group_name in group_names
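The in-file comment points at the limitation being worked around: serial: 1 only exists at play level, so a handler cannot use it. Hence the run_once + with_items + delegate_to combination, which runs the task once, iterating over every host in the group and delegating each iteration to one node in turn. For comparison, a hypothetical standalone playbook (not part of this commit) that gets the same one-node-at-a-time behaviour natively would look like:

- hosts: mdss          # hypothetical group name
  serial: 1            # finish each host before starting the next
  tasks:
    - name: restart ceph mds
      service:
        name: ceph-mds@{{ mds_name }}
        state: restarted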
roles/ceph-common/handlers/restart-mon.yml (new file)
@@ -0,0 +1,17 @@
+---
+- name: restart ceph mons
+  service:
+    name: ceph-mon@{{ monitor_name }}
+    state: restarted
+  # serial: 1 would be the proper solution here, but that can only be set on play level
+  # upstream issue: https://github.com/ansible/ansible/issues/12170
+  run_once: true
+  with_items: "{{ groups[mon_group_name] }}"
+  delegate_to: "{{ item }}"
+  when:
+    - socket.rc == 0
+    - mon_group_name in group_names
+
+- name: validate monitors
+  include: validate-mon.yml
+  when: mon_group_name in group_names
roles/ceph-common/handlers/restart-osd.yml (new file)
@@ -0,0 +1,22 @@
+---
+# This does not just restart OSDs but everything else too. Unfortunately
+# at this time the ansible role does not have an OSD id list to use
+# for restarting them specifically.
+- name: restart ceph osds
+  shell: |
+    for id in $(ls /var/lib/ceph/osd/ | grep -oh '[0-9]*'); do
+      systemctl restart ceph-osd@$id
+      sleep 5
+    done
+  # serial: 1 would be the proper solution here, but that can only be set on play level
+  # upstream issue: https://github.com/ansible/ansible/issues/12170
+  run_once: true
+  with_items: "{{ groups[osd_group_name] }}"
+  delegate_to: "{{ item }}"
+  when:
+    - socket.rc == 0
+    - osd_group_name in group_names
+
+- name: validate osds
+  include: validate-osd.yml
+  when: osd_group_name in group_names
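The shell loop derives the OSD ids from the data directories present on the node. On a host whose directories are /var/lib/ceph/osd/ceph-0 and /var/lib/ceph/osd/ceph-3 (illustrative names), it expands to:

    systemctl restart ceph-osd@0
    sleep 5
    systemctl restart ceph-osd@3

grep -oh '[0-9]*' pulls the numeric id out of each directory name, so only the systemd units for OSDs actually hosted on that node are restarted, unlike the old ceph.target approach, which bounced every Ceph daemon on the machine.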
roles/ceph-common/handlers/restart-rgw.yml (new file)
@@ -0,0 +1,13 @@
+---
+- name: restart ceph rgws
+  service:
+    name: ceph-rgw@{{ ansible_hostname }}
+    state: restarted
+  # serial: 1 would be the proper solution here, but that can only be set on play level
+  # upstream issue: https://github.com/ansible/ansible/issues/12170
+  run_once: true
+  with_items: "{{ groups[rgw_group_name] }}"
+  delegate_to: "{{ item }}"
+  when:
+    - socketrgw.rc == 0
+    - rgw_group_name in group_names
roles/ceph-common/handlers/validate-mon.yml (new file)
@@ -0,0 +1,28 @@
+---
+- name: wait for ceph monitor socket
+  wait_for:
+    path: "/var/run/ceph/{{ cluster }}-mon.{{ monitor_name }}.asok"
+
+- name: set mon_host_count
+  set_fact: mon_host_count={{ groups[mon_group_name] | length }}
+
+- name: select a running monitor
+  set_fact: mon_host={{ item }}
+  with_items: "{{ groups[mon_group_name] }}"
+  when:
+    - item != inventory_hostname
+    - mon_host_count | int > 1
+
+- name: select first monitor if only one monitor
+  set_fact: mon_host={{ item }}
+  with_items: "{{ groups[mon_group_name][0] }}"
+  when: mon_host_count | int == 1
+
+- name: waiting for the monitor to join the quorum...
+  shell: |
+    ceph -s --cluster {{ cluster }} | grep monmap | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }}
+  register: result
+  until: result.rc == 0
+  retries: "{{ handler_health_mon_check_retries }}"
+  delay: "{{ handler_health_mon_check_delay }}"
+  delegate_to: "{{ mon_host }}"
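The quorum task works by stripping everything up to the word quorum from the monmap line of ceph -s and checking that the restarted host's name appears in what remains. With output along these lines (illustrative, the exact format varies by Ceph release):

    monmap e1: 3 mons at {...}, election epoch 8, quorum 0,1,2 mon1,mon2,mon3

the sed leaves " 0,1,2 mon1,mon2,mon3", and egrep -sq mon1 exits 0 once mon1 has rejoined. When more than one monitor exists, the check is deliberately delegated to a different monitor (mon_host), so it does not depend on the daemon that was just restarted answering the status call.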
roles/ceph-common/handlers/validate-osd.yml (new file)
@@ -0,0 +1,19 @@
+---
+- name: collect osds
+  shell: |
+    ls /var/lib/ceph/osd/ | grep -oh '[0-9]*'
+  register: osd_ids
+
+- name: wait for ceph osd socket(s)
+  wait_for:
+    path: "/var/run/ceph/{{ cluster }}-osd.{{ item }}.asok"
+  with_items: "{{ osd_ids.stdout_lines }}"
+
+- name: waiting for clean pgs...
+  shell: |
+    test "$(ceph --cluster {{ cluster }} pg stat | sed 's/^.*pgs://;s/active+clean.*//;s/ //')" -eq "$(ceph --cluster {{ cluster }} pg stat | sed 's/pgs.*//;s/^.*://;s/ //')" && ceph --cluster {{ cluster }} health | egrep -sq "HEALTH_OK|HEALTH_WARN"
+  register: result
+  until: result.rc == 0
+  retries: "{{ handler_health_osd_check_retries }}"
+  delay: "{{ handler_health_osd_check_delay }}"
+  delegate_to: "{{ groups[mon_group_name][0] }}"
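The clean-pgs test compares two numbers scraped from the same ceph pg stat output: the count of active+clean PGs (first sed expression) and the total PG count (second sed expression). It succeeds only when the two are equal and overall health is HEALTH_OK or HEALTH_WARN, so a harmless warning does not stall the rollout forever. With illustrative output (format varies by Ceph release):

    v842: 192 pgs: 192 active+clean; 1024 MB data, ...

the first expression yields 192 (the text between "pgs:" and "active+clean"), the second yields 192 (the text between the leading colon and "pgs"), and test -eq passes, letting the handler move on to the next node.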
@@ -22,3 +22,4 @@
     - restart ceph osds
     - restart ceph mdss
     - restart ceph rgws
+    - restart ceph nfss