common: serialise host restart

This commit allows us to restart Ceph daemons machine by machine instead
of restarting all the daemons in a single shot.

Rework the structure of the handlers for clarity as well.

Signed-off-by: Sébastien Han <seb@redhat.com>
pull/995/head
Sébastien Han 2016-09-22 17:03:14 +02:00
parent efc49e2347
commit 40a2df5bbf
10 changed files with 146 additions and 27 deletions


@@ -339,6 +339,20 @@ dummy:
# if you don't want it keep the option commented
#common_single_host_mode: true
## Handlers - restarting daemons after a config change
# If for whatever reason the content of your Ceph configuration changes,
# the Ceph daemons will be restarted as well. At the moment we cannot detect
# which config option changed, so all the daemons will be restarted. This
# restart is serialized node by node, and in between a health check is
# performed so that we do not move on to the next node until Ceph is
# healthy again.
# Between the checks (monitors being in quorum and the OSDs' pgs being
# clean) we have to wait. These retries and delays are configurable
# for both monitors and OSDs.
#handler_health_mon_check_retries: 5
#handler_health_mon_check_delay: 10
#handler_health_osd_check_retries: 40
#handler_health_osd_check_delay: 30
###################
# CONFIG OVERRIDE #


@@ -331,6 +331,20 @@ restapi_port: 5000
# if you don't want it keep the option commented
#common_single_host_mode: true
## Handlers - restarting daemons after a config change
# If for whatever reason the content of your Ceph configuration changes,
# the Ceph daemons will be restarted as well. At the moment we cannot detect
# which config option changed, so all the daemons will be restarted. This
# restart is serialized node by node, and in between a health check is
# performed so that we do not move on to the next node until Ceph is
# healthy again.
# Between the checks (monitors being in quorum and the OSDs' pgs being
# clean) we have to wait. These retries and delays are configurable
# for both monitors and OSDs.
handler_health_mon_check_retries: 5
handler_health_mon_check_delay: 10
handler_health_osd_check_retries: 40
handler_health_osd_check_delay: 30
###################
# CONFIG OVERRIDE #
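For reference, a minimal sketch of how the handler_health_* defaults above might be overridden from the inventory; the group_vars/all.yml location and the values are illustrative, not part of this change:

# group_vars/all.yml -- illustrative override values
handler_health_mon_check_retries: 3
handler_health_mon_check_delay: 20
handler_health_osd_check_retries: 60
handler_health_osd_check_delay: 30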


@@ -2,41 +2,19 @@
- name: update apt cache
  apt:
    update-cache: yes
  when: ansible_os_family == 'Debian'
- name: restart ceph mons
  service:
    name: ceph-mon@{{ monitor_name }}
    state: restarted
  when:
    - socket.rc == 0
    - mon_group_name in group_names
  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mon.yml"
# This does not just restart OSDs but everything else too. Unfortunately
# at this time the ansible role does not have an OSD id list to use
# for restarting them specifically.
- name: restart ceph osds
  service:
    name: ceph.target
    state: restarted
  when:
    - socket.rc == 0
    - osd_group_name in group_names
  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-osd.yml"
- name: restart ceph mdss
  service:
    name: ceph-mds@{{ mds_name }}
    state: restarted
  when:
    - socket.rc == 0
    - mds_group_name in group_names
  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mds.yml"
- name: restart ceph rgws
  service:
    name: ceph-rgw@{{ ansible_hostname }}
    state: restarted
  when:
    - socketrgw.rc == 0
    - rgw_group_name in group_names
  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-rgw.yml"
- name: restart ceph nfss
  service:


@@ -0,0 +1,13 @@
---
- name: restart ceph mdss
  service:
    name: ceph-mds@{{ mds_name }}
    state: restarted
  # serial: 1 would be the proper solution here, but that can only be set on play level
  # upstream issue: https://github.com/ansible/ansible/issues/12170
  run_once: true
  with_items: "{{ groups[mds_group_name] }}"
  delegate_to: "{{ item }}"
  when:
    - socket.rc == 0
    - mds_group_name in group_names
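For context, the play-level serialization that the comment above refers to would look roughly like this; it is a sketch only (the group and role names are illustrative), since serial cannot be applied to handlers:

- hosts: mdss
  serial: 1
  roles:
    - ceph-mds

Hence the run_once + delegate_to + with_items loop used here, which walks the group one host at a time from a single handler invocation.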


@@ -0,0 +1,17 @@
---
- name: restart ceph mons
  service:
    name: ceph-mon@{{ monitor_name }}
    state: restarted
  # serial: 1 would be the proper solution here, but that can only be set on play level
  # upstream issue: https://github.com/ansible/ansible/issues/12170
  run_once: true
  with_items: "{{ groups[mon_group_name] }}"
  delegate_to: "{{ item }}"
  when:
    - socket.rc == 0
    - mon_group_name in group_names

- name: validate monitors
  include: validate-mon.yml
  when: mon_group_name in group_names


@@ -0,0 +1,22 @@
---
# The ansible role does not expose an OSD id list for restarting specific
# daemons, so we derive the ids from /var/lib/ceph/osd/ and restart each
# ceph-osd@<id> unit in turn.
- name: restart ceph osds
  shell: |
    for id in $(ls /var/lib/ceph/osd/ |grep -oh '[0-9]*'); do
      systemctl restart ceph-osd@$id
      sleep 5
    done
  # serial: 1 would be the proper solution here, but that can only be set on play level
  # upstream issue: https://github.com/ansible/ansible/issues/12170
  run_once: true
  with_items: "{{ groups[osd_group_name] }}"
  delegate_to: "{{ item }}"
  when:
    - socket.rc == 0
    - osd_group_name in group_names

- name: validate osds
  include: validate-osd.yml
  when: osd_group_name in group_names
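The id list comes straight from the OSD data directories, which are named after the cluster and the OSD id; the same extraction can be checked by hand on an OSD node (directory names and ids below are illustrative):

$ ls /var/lib/ceph/osd/
ceph-0  ceph-3  ceph-7
$ ls /var/lib/ceph/osd/ | grep -oh '[0-9]*'
0
3
7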


@@ -0,0 +1,13 @@
---
- name: restart ceph rgws
  service:
    name: ceph-rgw@{{ ansible_hostname }}
    state: restarted
  # serial: 1 would be the proper solution here, but that can only be set on play level
  # upstream issue: https://github.com/ansible/ansible/issues/12170
  run_once: true
  with_items: "{{ groups[rgw_group_name] }}"
  delegate_to: "{{ item }}"
  when:
    - socketrgw.rc == 0
    - rgw_group_name in group_names


@@ -0,0 +1,28 @@
---
- name: wait for ceph monitor socket
  wait_for:
    path: "/var/run/ceph/{{ cluster }}-mon.{{ monitor_name }}.asok"

- name: set mon_host_count
  set_fact: mon_host_count={{ groups[mon_group_name] | length }}

- name: select a running monitor
  set_fact: mon_host={{ item }}
  with_items: "{{ groups[mon_group_name] }}"
  when:
    - item != inventory_hostname
    - mon_host_count | int > 1

- name: select first monitor if only one monitor
  set_fact: mon_host={{ item }}
  with_items: "{{ groups[mon_group_name][0] }}"
  when: mon_host_count | int == 1

- name: waiting for the monitor to join the quorum...
  shell: |
    ceph -s --cluster {{ cluster }} | grep monmap | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }}
  register: result
  until: result.rc == 0
  retries: "{{ handler_health_mon_check_retries }}"
  delay: "{{ handler_health_mon_check_delay }}"
  delegate_to: "{{ mon_host }}"
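The quorum check can be reproduced by hand on a monitor node to see what the handler waits for: it strips everything up to the quorum member list in the status output and greps for the restarted host. A sketch, with an illustrative cluster name and hostname:

ceph -s --cluster ceph | grep monmap | sed 's/.*quorum//' | egrep -sq ceph-mon1 \
  && echo "ceph-mon1 is back in quorum"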


@@ -0,0 +1,19 @@
---
- name: collect osds
  shell: |
    ls /var/lib/ceph/osd/ |grep -oh '[0-9]*'
  register: osd_ids

- name: wait for ceph osd socket(s)
  wait_for:
    path: "/var/run/ceph/{{ cluster }}-osd.{{ item }}.asok"
  with_items: "{{ osd_ids.stdout_lines }}"

- name: waiting for clean pgs...
  shell: |
    test "$(ceph --cluster {{ cluster }} pg stat | sed 's/^.*pgs://;s/active+clean.*//;s/ //')" -eq "$(ceph --cluster {{ cluster }} pg stat | sed 's/pgs.*//;s/^.*://;s/ //')" && ceph --cluster {{ cluster }} health | egrep -sq "HEALTH_OK|HEALTH_WARN"
  register: result
  until: result.rc == 0
  retries: "{{ handler_health_osd_check_retries }}"
  delay: "{{ handler_health_osd_check_delay }}"
  delegate_to: "{{ groups[mon_group_name][0] }}"
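The clean-pg test compares two numbers pulled from the same ceph pg stat line: the count of active+clean pgs (first sed) and the total pg count (second sed), and additionally requires the overall health to be HEALTH_OK or HEALTH_WARN. Roughly, the line being parsed looks like this (values are illustrative):

$ ceph --cluster ceph pg stat
v842: 64 pgs: 64 active+clean; 512 MB data, 1181 MB used, 28996 MB / 30177 MB avail

When the two extracted counts match (64 and 64 here) and health is acceptable, the task succeeds and the playbook moves on to the next node.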


@@ -22,3 +22,4 @@
- restart ceph osds
- restart ceph mdss
- restart ceph rgws
- restart ceph nfss