mirror of https://github.com/ceph/ceph-ansible.git
common: serialise host restart
This commit allows us to restart Ceph daemons machine by machine instead
of restarting all the daemons in a single shot.
Rework the structure of the handler for clarity as well.
Signed-off-by: Sébastien Han <seb@redhat.com>
(cherry picked from commit 40a2df5bbf
)
Conflicts:
roles/ceph-common/handlers/main.yml
roles/ceph-common/tasks/generate_ceph_conf.yml
pull/1260/head
parent
7cfa152f10
commit
ce7ad225d8
|
@ -316,6 +316,20 @@ dummy:
|
|||
# if you don't want it keep the option commented
|
||||
#common_single_host_mode: true
|
||||
|
||||
## Handlers - restarting daemons after a config change
|
||||
# If, for whatever reason, the content of your ceph configuration changes,
|
||||
# ceph daemons will be restarted as well. At the moment, we can not detect
|
||||
# which config option changed so all the daemons will be restarted. Although
|
||||
# this restart will be serialized for each node, in between a health check
|
||||
# will be performed so we make sure we don't move to the next node until
|
||||
# ceph is healthy again.
|
||||
# Obviously between the checks (for monitors to be in quorum and for osd's pgs
|
||||
# to be clean) we have to wait. The retries and delays are configurable
|
||||
# for both monitors and osds.
|
||||
#handler_health_mon_check_retries: 5
|
||||
#handler_health_mon_check_delay: 10
|
||||
#handler_health_osd_check_retries: 40
|
||||
#handler_health_osd_check_delay: 30
|
||||
|
||||
###################
|
||||
# CONFIG OVERRIDE #
|
||||
|
|
|
@ -308,6 +308,20 @@ restapi_port: 5000
|
|||
# if you don't want it keep the option commented
|
||||
#common_single_host_mode: true
|
||||
|
||||
## Handlers - restarting daemons after a config change
|
||||
# If, for whatever reason, the content of your ceph configuration changes,
|
||||
# ceph daemons will be restarted as well. At the moment, we can not detect
|
||||
# which config option changed so all the daemons will be restarted. Although
|
||||
# this restart will be serialized for each node, in between a health check
|
||||
# will be performed so we make sure we don't move to the next node until
|
||||
# ceph is healthy again.
|
||||
# Obviously between the checks (for monitors to be in quorum and for osd's pgs
|
||||
# to be clean) we have to wait. The retries and delays are configurable
|
||||
# for both monitors and osds.
|
||||
handler_health_mon_check_retries: 5
|
||||
handler_health_mon_check_delay: 10
|
||||
handler_health_osd_check_retries: 40
|
||||
handler_health_osd_check_delay: 30
|
||||
|
||||
###################
|
||||
# CONFIG OVERRIDE #
|
||||
|
|
|
@ -2,125 +2,19 @@
|
|||
- name: update apt cache
|
||||
apt:
|
||||
update-cache: yes
|
||||
when: ansible_os_family == 'Debian'
|
||||
|
||||
- name: restart ceph mons
|
||||
command: service ceph restart mon
|
||||
when:
|
||||
- socket.rc == 0
|
||||
- ansible_distribution != 'Ubuntu'
|
||||
- mon_group_name in group_names
|
||||
- ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis
|
||||
|
||||
- name: restart ceph mons with systemd
|
||||
service:
|
||||
name: ceph-mon@{{ monitor_name }}
|
||||
state: restarted
|
||||
when:
|
||||
- socket.rc == 0
|
||||
- use_systemd
|
||||
- mon_group_name in group_names
|
||||
- ceph_release_num.{{ ceph_release }} > ceph_release_num.hammer
|
||||
|
||||
- name: restart ceph mons on ubuntu
|
||||
command: initctl restart ceph-mon cluster={{ cluster }} id={{ monitor_name }}
|
||||
when:
|
||||
- socket.rc == 0
|
||||
- ansible_distribution == 'Ubuntu'
|
||||
- not use_systemd
|
||||
- mon_group_name in group_names
|
||||
include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mon.yml"
|
||||
|
||||
- name: restart ceph osds
|
||||
command: service ceph restart osd
|
||||
when:
|
||||
- socket.rc == 0
|
||||
- ansible_distribution != 'Ubuntu'
|
||||
- osd_group_name in group_names
|
||||
- ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis
|
||||
|
||||
# This does not just restart OSDs but everything else too. Unfortunately
|
||||
# at this time the ansible role does not have an OSD id list to use
|
||||
# for restarting them specifically.
|
||||
- name: restart ceph osds with systemd
|
||||
service:
|
||||
name: ceph.target
|
||||
state: restarted
|
||||
when:
|
||||
- socket.rc == 0
|
||||
- use_systemd
|
||||
- osd_group_name in group_names
|
||||
- ceph_release_num.{{ ceph_release }} > ceph_release_num.hammer
|
||||
|
||||
- name: restart ceph osds on ubuntu
|
||||
shell: |
|
||||
for id in $(ls /var/lib/ceph/osd/ |grep -oh '[0-9]*'); do
|
||||
initctl restart ceph-osd cluster={{ cluster }} id=$id
|
||||
done
|
||||
when:
|
||||
- socket.rc == 0
|
||||
- ansible_distribution == 'Ubuntu'
|
||||
- not use_systemd
|
||||
- osd_group_name in group_names
|
||||
|
||||
- name: restart ceph mdss on ubuntu
|
||||
command: initctl restart ceph-mds cluster={{ cluster }} id={{ ansible_hostname }}
|
||||
when:
|
||||
- socket.rc == 0
|
||||
- ansible_distribution == 'Ubuntu'
|
||||
- not use_systemd
|
||||
- mds_group_name in group_names
|
||||
include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-osd.yml"
|
||||
|
||||
- name: restart ceph mdss
|
||||
command: service ceph restart mds
|
||||
when:
|
||||
- socket.rc == 0
|
||||
- ansible_distribution != 'Ubuntu'
|
||||
- use_systemd
|
||||
- mds_group_name in group_names
|
||||
- ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis
|
||||
|
||||
- name: restart ceph mdss with systemd
|
||||
service:
|
||||
name: ceph-mds@{{ mds_name }}
|
||||
state: restarted
|
||||
when:
|
||||
- socket.rc == 0
|
||||
- use_systemd
|
||||
- mds_group_name in group_names
|
||||
- ceph_release_num.{{ ceph_release }} > ceph_release_num.hammer
|
||||
|
||||
- name: restart ceph rgws on ubuntu
|
||||
command: initctl restart radosgw cluster={{ cluster }} id=rgw.{{ ansible_hostname }}
|
||||
when:
|
||||
- socketrgw.rc == 0
|
||||
- ansible_distribution == 'Ubuntu'
|
||||
- not use_systemd
|
||||
- rgw_group_name in group_names
|
||||
include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mds.yml"
|
||||
|
||||
- name: restart ceph rgws
|
||||
command: /etc/init.d/radosgw restart
|
||||
when:
|
||||
- socketrgw.rc == 0
|
||||
- ansible_distribution != 'Ubuntu'
|
||||
- rgw_group_name in group_names
|
||||
- ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis
|
||||
|
||||
- name: restart ceph rgws on red hat
|
||||
command: /etc/init.d/ceph-radosgw restart
|
||||
when:
|
||||
- socketrgw.rc == 0
|
||||
- ansible_os_family == 'RedHat'
|
||||
- rgw_group_name in group_names
|
||||
- ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis
|
||||
|
||||
- name: restart ceph rgws with systemd
|
||||
service:
|
||||
name: ceph-rgw@{{ ansible_hostname }}
|
||||
state: restarted
|
||||
when:
|
||||
- socketrgw.rc == 0
|
||||
- use_systemd
|
||||
- rgw_group_name in group_names
|
||||
- ceph_release_num.{{ ceph_release }} > ceph_release_num.hammer
|
||||
include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-rgw.yml"
|
||||
|
||||
- name: restart ceph nfss
|
||||
service:
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
---
|
||||
# Restart the MDS service on the hosts of the MDS group one after another:
# the task runs once and each loop iteration is delegated to one group member.
- name: restart ceph mdss
  service:
    # NOTE(review): mds_name is not derived from `item`; confirm it resolves
    # to the delegated host's MDS name when the task is delegated.
    name: "ceph-mds@{{ mds_name }}"
    state: restarted
  # serial: 1 would be the proper solution here, but that can only be set on play level
  # upstream issue: https://github.com/ansible/ansible/issues/12170
  run_once: true
  # The loop list is templated before `when` is applied to each item, so use
  # groups.get() to get an empty loop instead of an undefined-variable error
  # when the inventory defines no MDS group.
  with_items: "{{ groups.get(mds_group_name, []) }}"
  delegate_to: "{{ item }}"
  when:
    - socket.rc == 0
    - mds_group_name in group_names
|
|
@ -0,0 +1,17 @@
|
|||
---
|
||||
# Restart the monitor service on the hosts of the monitor group one after
# another: the task runs once and each loop iteration is delegated to one
# group member.
- name: restart ceph mons
  service:
    # NOTE(review): monitor_name is not derived from `item`; confirm it
    # resolves to the delegated host's monitor name when the task is delegated.
    name: "ceph-mon@{{ monitor_name }}"
    state: restarted
  # serial: 1 would be the proper solution here, but that can only be set on play level
  # upstream issue: https://github.com/ansible/ansible/issues/12170
  run_once: true
  # The loop list is templated before `when` is applied to each item, so use
  # groups.get() to get an empty loop instead of an undefined-variable error
  # when the inventory defines no monitor group.
  with_items: "{{ groups.get(mon_group_name, []) }}"
  delegate_to: "{{ item }}"
  when:
    - socket.rc == 0
    - mon_group_name in group_names

# Run the monitor validation tasks on monitor hosts only.
- name: validate monitors
  include: validate-mon.yml
  when: mon_group_name in group_names
|
|
@ -0,0 +1,22 @@
|
|||
---
|
||||
# Restart every OSD found on a host, one OSD at a time. The OSD ids are taken
# from the numeric part of the entries under /var/lib/ceph/osd/, and each
# ceph-osd@<id> unit is restarted with a 5 second pause before the next one.
- name: restart ceph osds
  shell: |
    for id in $(ls /var/lib/ceph/osd/ |grep -oh '[0-9]*'); do
      systemctl restart ceph-osd@$id
      sleep 5
    done
  # serial: 1 would be the proper solution here, but that can only be set on play level
  # upstream issue: https://github.com/ansible/ansible/issues/12170
  run_once: true
  # The loop list is templated before `when` is applied to each item, so use
  # groups.get() to get an empty loop instead of an undefined-variable error
  # when the inventory defines no OSD group.
  with_items: "{{ groups.get(osd_group_name, []) }}"
  delegate_to: "{{ item }}"
  when:
    - socket.rc == 0
    - osd_group_name in group_names

# Run the OSD validation tasks on OSD hosts only.
- name: validate osds
  include: validate-osd.yml
  when: osd_group_name in group_names
|
|
@ -0,0 +1,13 @@
|
|||
---
|
||||
# Restart the rados gateway service on the hosts of the RGW group one after
# another: the task runs once and each loop iteration is delegated to one
# group member.
- name: restart ceph rgws
  service:
    # NOTE(review): ansible_hostname is not derived from `item`; confirm it
    # resolves to the delegated host when the task is delegated, and that
    # ceph-rgw@ is the unit name installed on the target hosts.
    name: "ceph-rgw@{{ ansible_hostname }}"
    state: restarted
  # serial: 1 would be the proper solution here, but that can only be set on play level
  # upstream issue: https://github.com/ansible/ansible/issues/12170
  run_once: true
  # The loop list is templated before `when` is applied to each item, so use
  # groups.get() to get an empty loop instead of an undefined-variable error
  # when the inventory defines no RGW group.
  with_items: "{{ groups.get(rgw_group_name, []) }}"
  delegate_to: "{{ item }}"
  when:
    - socketrgw.rc == 0
    - rgw_group_name in group_names
|
|
@ -0,0 +1,28 @@
|
|||
---
|
||||
# Wait for this host's monitor admin socket to exist, then poll the cluster
# status from a monitor host until this host's name shows up in the quorum.
- name: wait for ceph monitor socket
  wait_for:
    path: "/var/run/ceph/{{ cluster }}-mon.{{ monitor_name }}.asok"

# Number of hosts in the monitor group; compared through `| int` below.
- name: set mon_host_count
  set_fact:
    mon_host_count: "{{ groups[mon_group_name] | length }}"

# With several monitors, query a monitor other than the current host. The fact
# is overwritten on every matching iteration, so the last matching host wins.
- name: select a running monitor
  set_fact:
    mon_host: "{{ item }}"
  with_items: "{{ groups[mon_group_name] }}"
  when:
    - item != inventory_hostname
    - mon_host_count | int > 1

# With a single monitor there is no other host to ask, so use the first entry
# of the monitor group.
- name: select first monitor if only one monitor
  set_fact:
    mon_host: "{{ item }}"
  with_items: "{{ groups[mon_group_name][0] }}"
  when: mon_host_count | int == 1

# Succeeds once the text after "quorum" on the monmap line of `ceph -s`
# contains this host's name; retried according to the handler_health_mon_*
# settings.
- name: waiting for the monitor to join the quorum...
  shell: |
    ceph -s --cluster {{ cluster }} | grep monmap | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }}
  register: result
  until: result.rc == 0
  retries: "{{ handler_health_mon_check_retries }}"
  delay: "{{ handler_health_mon_check_delay }}"
  delegate_to: "{{ mon_host }}"
|
|
@ -0,0 +1,19 @@
|
|||
---
|
||||
# List the OSD ids present on this host: the numeric part of each entry under
# /var/lib/ceph/osd/, one id per output line.
- name: collect osds
  register: osd_ids
  shell: |
    ls /var/lib/ceph/osd/ |grep -oh '[0-9]*'

# Wait for the admin socket of every collected OSD id to exist.
- name: wait for ceph osd socket(s)
  with_items: "{{ osd_ids.stdout_lines }}"
  wait_for:
    path: "/var/run/ceph/{{ cluster }}-osd.{{ item }}.asok"

# Executed on the first host of the monitor group. Succeeds when the two
# numbers extracted from `ceph pg stat` (the count in front of "active+clean"
# and the count in front of "pgs") are equal and `ceph health` reports
# HEALTH_OK or HEALTH_WARN; retried according to the handler_health_osd_*
# settings.
- name: waiting for clean pgs...
  delegate_to: "{{ groups[mon_group_name][0] }}"
  shell: |
    test "$(ceph --cluster {{ cluster }} pg stat | sed 's/^.*pgs://;s/active+clean.*//;s/ //')" -eq "$(ceph --cluster {{ cluster }} pg stat | sed 's/pgs.*//;s/^.*://;s/ //')" && ceph --cluster {{ cluster }} health | egrep -sq "HEALTH_OK|HEALTH_WARN"
  register: result
  until: result.rc == 0
  retries: "{{ handler_health_osd_check_retries }}"
  delay: "{{ handler_health_osd_check_delay }}"
|
|
@ -19,16 +19,7 @@
|
|||
config_type: ini
|
||||
notify:
|
||||
- restart ceph mons
|
||||
- restart ceph mons on ubuntu
|
||||
- restart ceph mons with systemd
|
||||
- restart ceph osds
|
||||
- restart ceph osds on ubuntu
|
||||
- restart ceph osds with systemd
|
||||
- restart ceph mdss
|
||||
- restart ceph mdss on ubuntu
|
||||
- restart ceph mdss with systemd
|
||||
- restart ceph rgws
|
||||
- restart ceph rgws on ubuntu
|
||||
- restart ceph rgws on red hat
|
||||
- restart ceph rgws with systemd
|
||||
- restart ceph nfss
|
||||
|
|
Loading…
Reference in New Issue