diff --git a/group_vars/all.yml.sample b/group_vars/all.yml.sample
index 82eeeb655..9eb23f1cf 100644
--- a/group_vars/all.yml.sample
+++ b/group_vars/all.yml.sample
@@ -316,6 +316,20 @@ dummy:
 # if you don't want it keep the option commented
 #common_single_host_mode: true
 
+## Handlers - restarting daemons after a config change
+# If for whatever reason the content of your ceph configuration changes,
+# ceph daemons will be restarted as well. At the moment we cannot detect
+# which config option changed, so all the daemons will be restarted. Although
+# the restart is serialized for each node, a health check is performed in
+# between so we make sure we don't move on to the next node until ceph is
+# healthy again.
+# Obviously we have to wait between the checks (for the monitors to be in
+# quorum and for the OSDs' PGs to be clean). These retries and delays are
+# configurable for both monitors and osds.
+#handler_health_mon_check_retries: 5
+#handler_health_mon_check_delay: 10
+#handler_health_osd_check_retries: 40
+#handler_health_osd_check_delay: 30
 
 ###################
 # CONFIG OVERRIDE #
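For example, a minimal sketch of tuning these knobs in group_vars/all.yml; the values below are purely illustrative, not recommendations. Each daemon type is waited on for at most retries * delay seconds before the handler gives up:

handler_health_mon_check_retries: 10
handler_health_mon_check_delay: 15
handler_health_osd_check_retries: 60
handler_health_osd_check_delay: 20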
diff --git a/roles/ceph-common/defaults/main.yml b/roles/ceph-common/defaults/main.yml
index 3000dfdcd..50bd4b3dd 100644
--- a/roles/ceph-common/defaults/main.yml
+++ b/roles/ceph-common/defaults/main.yml
@@ -308,6 +308,20 @@ restapi_port: 5000
 # if you don't want it keep the option commented
 #common_single_host_mode: true
 
+## Handlers - restarting daemons after a config change
+# If for whatever reason the content of your ceph configuration changes,
+# ceph daemons will be restarted as well. At the moment we cannot detect
+# which config option changed, so all the daemons will be restarted. Although
+# the restart is serialized for each node, a health check is performed in
+# between so we make sure we don't move on to the next node until ceph is
+# healthy again.
+# Obviously we have to wait between the checks (for the monitors to be in
+# quorum and for the OSDs' PGs to be clean). These retries and delays are
+# configurable for both monitors and osds.
+handler_health_mon_check_retries: 5
+handler_health_mon_check_delay: 10
+handler_health_osd_check_retries: 40
+handler_health_osd_check_delay: 30
 
 ###################
 # CONFIG OVERRIDE #
diff --git a/roles/ceph-common/handlers/main.yml b/roles/ceph-common/handlers/main.yml
index c7dfb7647..dd2a0d726 100644
--- a/roles/ceph-common/handlers/main.yml
+++ b/roles/ceph-common/handlers/main.yml
@@ -2,125 +2,19 @@
 - name: update apt cache
   apt:
     update-cache: yes
+  when: ansible_os_family == 'Debian'
 
 - name: restart ceph mons
-  command: service ceph restart mon
-  when:
-    - socket.rc == 0
-    - ansible_distribution != 'Ubuntu'
-    - mon_group_name in group_names
-    - ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis
-
-- name: restart ceph mons with systemd
-  service:
-    name: ceph-mon@{{ monitor_name }}
-    state: restarted
-  when:
-    - socket.rc == 0
-    - use_systemd
-    - mon_group_name in group_names
-    - ceph_release_num.{{ ceph_release }} > ceph_release_num.hammer
-
-- name: restart ceph mons on ubuntu
-  command: initctl restart ceph-mon cluster={{ cluster }} id={{ monitor_name }}
-  when:
-    - socket.rc == 0
-    - ansible_distribution == 'Ubuntu'
-    - not use_systemd
-    - mon_group_name in group_names
+  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mon.yml"
 
 - name: restart ceph osds
-  command: service ceph restart osd
-  when:
-    - socket.rc == 0
-    - ansible_distribution != 'Ubuntu'
-    - osd_group_name in group_names
-    - ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis
-
-# This does not just restart OSDs but everything else too. Unfortunately
-# at this time the ansible role does not have an OSD id list to use
-# for restarting them specifically.
-- name: restart ceph osds with systemd
-  service:
-    name: ceph.target
-    state: restarted
-  when:
-    - socket.rc == 0
-    - use_systemd
-    - osd_group_name in group_names
-    - ceph_release_num.{{ ceph_release }} > ceph_release_num.hammer
-
-- name: restart ceph osds on ubuntu
-  shell: |
-    for id in $(ls /var/lib/ceph/osd/ |grep -oh '[0-9]*'); do
-      initctl restart ceph-osd cluster={{ cluster }} id=$id
-    done
-  when:
-    - socket.rc == 0
-    - ansible_distribution == 'Ubuntu'
-    - not use_systemd
-    - osd_group_name in group_names
-
-- name: restart ceph mdss on ubuntu
-  command: initctl restart ceph-mds cluster={{ cluster }} id={{ ansible_hostname }}
-  when:
-    - socket.rc == 0
-    - ansible_distribution == 'Ubuntu'
-    - not use_systemd
-    - mds_group_name in group_names
+  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-osd.yml"
 
 - name: restart ceph mdss
-  command: service ceph restart mds
-  when:
-    - socket.rc == 0
-    - ansible_distribution != 'Ubuntu'
-    - use_systemd
-    - mds_group_name in group_names
-    - ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis
-
-- name: restart ceph mdss with systemd
-  service:
-    name: ceph-mds@{{ mds_name }}
-    state: restarted
-  when:
-    - socket.rc == 0
-    - use_systemd
-    - mds_group_name in group_names
-    - ceph_release_num.{{ ceph_release }} > ceph_release_num.hammer
-
-- name: restart ceph rgws on ubuntu
-  command: initctl restart radosgw cluster={{ cluster }} id=rgw.{{ ansible_hostname }}
-  when:
-    - socketrgw.rc == 0
-    - ansible_distribution == 'Ubuntu'
-    - not use_systemd
-    - rgw_group_name in group_names
+  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mds.yml"
 
 - name: restart ceph rgws
-  command: /etc/init.d/radosgw restart
-  when:
-    - socketrgw.rc == 0
-    - ansible_distribution != 'Ubuntu'
-    - rgw_group_name in group_names
-    - ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis
-
-- name: restart ceph rgws on red hat
-  command: /etc/init.d/ceph-radosgw restart
-  when:
-    - socketrgw.rc == 0
-    - ansible_os_family == 'RedHat'
-    - rgw_group_name in group_names
-    - ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis
-
-- name: restart ceph rgws with systemd
-  service:
-    name: ceph-rgw@{{ ansible_hostname }}
-    state: restarted
-  when:
-    - socketrgw.rc == 0
-    - use_systemd
-    - rgw_group_name in group_names
+  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-rgw.yml"
 
 - name: restart ceph nfss
   service:
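The consolidated handler names are what tasks keep notifying. A minimal sketch of a hypothetical task (not part of this change; the ini_file module and the option shown are only an example) that would trigger the serialized restarts:

- name: change a ceph.conf option
  ini_file:
    dest: /etc/ceph/{{ cluster }}.conf
    section: global
    option: osd pool default size
    value: 3
  notify:
    - restart ceph mons
    - restart ceph osds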
diff --git a/roles/ceph-common/handlers/restart-mds.yml b/roles/ceph-common/handlers/restart-mds.yml
new file mode 100644
index 000000000..e6ff5ef4e
--- /dev/null
+++ b/roles/ceph-common/handlers/restart-mds.yml
@@ -0,0 +1,13 @@
+---
+- name: restart ceph mdss
+  service:
+    name: ceph-mds@{{ mds_name }}
+    state: restarted
+  # serial: 1 would be the proper solution here, but that can only be set on play level
+  # upstream issue: https://github.com/ansible/ansible/issues/12170
+  run_once: true
+  with_items: "{{ groups[mds_group_name] }}"
+  delegate_to: "{{ item }}"
+  when:
+    - socket.rc == 0
+    - mds_group_name in group_names
diff --git a/roles/ceph-common/handlers/restart-mon.yml b/roles/ceph-common/handlers/restart-mon.yml
new file mode 100644
index 000000000..440b7f219
--- /dev/null
+++ b/roles/ceph-common/handlers/restart-mon.yml
@@ -0,0 +1,17 @@
+---
+- name: restart ceph mons
+  service:
+    name: ceph-mon@{{ monitor_name }}
+    state: restarted
+  # serial: 1 would be the proper solution here, but that can only be set on play level
+  # upstream issue: https://github.com/ansible/ansible/issues/12170
+  run_once: true
+  with_items: "{{ groups[mon_group_name] }}"
+  delegate_to: "{{ item }}"
+  when:
+    - socket.rc == 0
+    - mon_group_name in group_names
+
+- name: validate monitors
+  include: validate-mon.yml
+  when: mon_group_name in group_names
diff --git a/roles/ceph-common/handlers/restart-osd.yml b/roles/ceph-common/handlers/restart-osd.yml
new file mode 100644
index 000000000..dc6fbeebb
--- /dev/null
+++ b/roles/ceph-common/handlers/restart-osd.yml
@@ -0,0 +1,22 @@
+---
+# This does not just restart OSDs but everything else too. Unfortunately
+# at this time the ansible role does not have an OSD id list to use
+# for restarting them specifically.
+- name: restart ceph osds
+  shell: |
+    for id in $(ls /var/lib/ceph/osd/ |grep -oh '[0-9]*'); do
+      systemctl restart ceph-osd@$id
+      sleep 5
+    done
+  # serial: 1 would be the proper solution here, but that can only be set on play level
+  # upstream issue: https://github.com/ansible/ansible/issues/12170
+  run_once: true
+  with_items: "{{ groups[osd_group_name] }}"
+  delegate_to: "{{ item }}"
+  when:
+    - socket.rc == 0
+    - osd_group_name in group_names
+
+- name: validate osds
+  include: validate-osd.yml
+  when: osd_group_name in group_names
diff --git a/roles/ceph-common/handlers/restart-rgw.yml b/roles/ceph-common/handlers/restart-rgw.yml
new file mode 100644
index 000000000..5e52e9cc0
--- /dev/null
+++ b/roles/ceph-common/handlers/restart-rgw.yml
@@ -0,0 +1,13 @@
+---
+- name: restart ceph rgws
+  service:
+    name: ceph-rgw@{{ ansible_hostname }}
+    state: restarted
+  # serial: 1 would be the proper solution here, but that can only be set on play level
+  # upstream issue: https://github.com/ansible/ansible/issues/12170
+  run_once: true
+  with_items: "{{ groups[rgw_group_name] }}"
+  delegate_to: "{{ item }}"
+  when:
+    - socketrgw.rc == 0
+    - rgw_group_name in group_names
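The run_once + with_items + delegate_to combination in the four restart files is the workaround referenced in the comments: since serial: 1 can only be set at play level, the handler instead runs a single time and loops over every host of the group, delegating each iteration to one node, so daemons are bounced one node at a time. For comparison only, a sketch of the play-level behaviour being emulated (assuming the default 'mons' group name; handlers cannot do this, hence the loop above):

- hosts: mons
  serial: 1
  tasks:
    - name: restart ceph mons one node at a time
      service:
        name: ceph-mon@{{ monitor_name }}
        state: restarted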
diff --git a/roles/ceph-common/handlers/validate-mon.yml b/roles/ceph-common/handlers/validate-mon.yml
new file mode 100644
index 000000000..4c5e15acb
--- /dev/null
+++ b/roles/ceph-common/handlers/validate-mon.yml
@@ -0,0 +1,28 @@
+---
+- name: wait for ceph monitor socket
+  wait_for:
+    path: "/var/run/ceph/{{ cluster }}-mon.{{ monitor_name }}.asok"
+
+- name: set mon_host_count
+  set_fact: mon_host_count={{ groups[mon_group_name] | length }}
+
+- name: select a running monitor
+  set_fact: mon_host={{ item }}
+  with_items: "{{ groups[mon_group_name] }}"
+  when:
+    - item != inventory_hostname
+    - mon_host_count | int > 1
+
+- name: select first monitor if only one monitor
+  set_fact: mon_host={{ item }}
+  with_items: "{{ groups[mon_group_name][0] }}"
+  when: mon_host_count | int == 1
+
+- name: waiting for the monitor to join the quorum...
+  shell: |
+    ceph -s --cluster {{ cluster }} | grep monmap | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }}
+  register: result
+  until: result.rc == 0
+  retries: "{{ handler_health_mon_check_retries }}"
+  delay: "{{ handler_health_mon_check_delay }}"
+  delegate_to: "{{ mon_host }}"
diff --git a/roles/ceph-common/handlers/validate-osd.yml b/roles/ceph-common/handlers/validate-osd.yml
new file mode 100644
index 000000000..b83d0952d
--- /dev/null
+++ b/roles/ceph-common/handlers/validate-osd.yml
@@ -0,0 +1,19 @@
+---
+- name: collect osds
+  shell: |
+    ls /var/lib/ceph/osd/ |grep -oh '[0-9]*'
+  register: osd_ids
+
+- name: wait for ceph osd socket(s)
+  wait_for:
+    path: "/var/run/ceph/{{ cluster }}-osd.{{ item }}.asok"
+  with_items: "{{ osd_ids.stdout_lines }}"
+
+- name: waiting for clean pgs...
+  shell: |
+    test "$(ceph --cluster {{ cluster }} pg stat | sed 's/^.*pgs://;s/active+clean.*//;s/ //')" -eq "$(ceph --cluster {{ cluster }} pg stat | sed 's/pgs.*//;s/^.*://;s/ //')" && ceph --cluster {{ cluster }} health | egrep -sq "HEALTH_OK|HEALTH_WARN"
+  register: result
+  until: result.rc == 0
+  retries: "{{ handler_health_osd_check_retries }}"
+  delay: "{{ handler_health_osd_check_delay }}"
+  delegate_to: "{{ groups[mon_group_name][0] }}"
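Both validation files poll until the cluster settles, using the retry/delay knobs defined above. As a rough illustration only (the exact command output differs between ceph releases, so treat the sample lines below as assumptions): the quorum check reduces the monmap line of `ceph -s`, for example

    monmap e1: 3 mons at {...}, election epoch 8, quorum 0,1,2 ceph-mon0,ceph-mon1,ceph-mon2

to the text after "quorum" and greps it for the restarted host's name. The clean-PG check compares two numbers taken from `ceph pg stat`; for output shaped like

    v842: 64 pgs: 64 active+clean; 1024 MB data, 3 GB used, 27 GB avail

the first sed pipeline extracts the active+clean count (64) and the second the total PG count (64), and the task only succeeds once the two match and `ceph health` reports HEALTH_OK or HEALTH_WARN.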
diff --git a/roles/ceph-common/tasks/generate_ceph_conf.yml b/roles/ceph-common/tasks/generate_ceph_conf.yml
index 59fa03c17..dc929c736 100644
--- a/roles/ceph-common/tasks/generate_ceph_conf.yml
+++ b/roles/ceph-common/tasks/generate_ceph_conf.yml
@@ -19,16 +19,7 @@
     config_type: ini
   notify:
     - restart ceph mons
-    - restart ceph mons on ubuntu
-    - restart ceph mons with systemd
     - restart ceph osds
-    - restart ceph osds on ubuntu
-    - restart ceph osds with systemd
     - restart ceph mdss
-    - restart ceph mdss on ubuntu
-    - restart ceph mdss with systemd
     - restart ceph rgws
-    - restart ceph rgws on ubuntu
-    - restart ceph rgws on red hat
-    - restart ceph rgws with systemd
     - restart ceph nfss