From 5bec62ba7c8ecbc91ede05297dadb639c530f597 Mon Sep 17 00:00:00 2001
From: Guillaume Abrioux
Date: Mon, 3 Apr 2017 19:55:11 +0200
Subject: [PATCH] Common: Fix handlers that are not properly triggered.

Until now, only the first task was executed. The idea here is to use the
`listen` statement to be able to notify multiple handlers and group all of
them in `./handlers/main.yml`, as notifying an included handler task is not
possible.

Signed-off-by: Guillaume Abrioux
---
 roles/ceph-common/handlers/main.yml            | 62 +++++++++++++++++--
 roles/ceph-common/handlers/restart-mds.yml     | 13 ----
 roles/ceph-common/handlers/restart-mon.yml     | 17 -----
 roles/ceph-common/handlers/restart-osd.yml     | 22 -------
 roles/ceph-common/handlers/restart-rgw.yml     | 13 ----
 roles/ceph-common/handlers/validate-mon.yml    | 28 ---------
 roles/ceph-common/handlers/validate-osd.yml    | 20 ------
 .../ceph-common/tasks/checks/check_socket.yml  | 14 -----
 roles/ceph-common/tasks/main.yml               |  1 -
 .../templates/restart_mon_daemon.sh.j2         | 33 ++++++++++
 .../templates/restart_osd_daemon.sh.j2         | 36 +++++++++++
 11 files changed, 125 insertions(+), 134 deletions(-)
 delete mode 100644 roles/ceph-common/handlers/restart-mds.yml
 delete mode 100644 roles/ceph-common/handlers/restart-mon.yml
 delete mode 100644 roles/ceph-common/handlers/restart-osd.yml
 delete mode 100644 roles/ceph-common/handlers/restart-rgw.yml
 delete mode 100644 roles/ceph-common/handlers/validate-mon.yml
 delete mode 100644 roles/ceph-common/handlers/validate-osd.yml
 delete mode 100644 roles/ceph-common/tasks/checks/check_socket.yml
 create mode 100644 roles/ceph-common/templates/restart_mon_daemon.sh.j2
 create mode 100644 roles/ceph-common/templates/restart_osd_daemon.sh.j2

diff --git a/roles/ceph-common/handlers/main.yml b/roles/ceph-common/handlers/main.yml
index 8cb6bd301..9602a0ce5 100644
--- a/roles/ceph-common/handlers/main.yml
+++ b/roles/ceph-common/handlers/main.yml
@@ -4,17 +4,67 @@
     update-cache: yes
   when: ansible_os_family == 'Debian'
 
-- name: restart ceph mons
-  include: "./restart-mon.yml"
+- block:
+    - name: copy mon restart script
+      template:
+        src: restart_mon_daemon.sh.j2
+        dest: /tmp/restart_mon_daemon.sh
+        owner: root
+        group: root
+        mode: 0750
+      listen: "restart ceph mons"
 
-- name: restart ceph osds
-  include: "./restart-osd.yml"
+    - name: restart ceph mon daemon(s)
+      command: /tmp/restart_mon_daemon.sh
+      listen: "restart ceph mons"
+
+  when:
+    - mon_group_name in group_names
+
+# This does not just restart OSDs but everything else too. Unfortunately
+# at this time the ansible role does not have an OSD id list to use
+# for restarting them specifically.
+- block:
+    - name: copy osd restart script
+      template:
+        src: restart_osd_daemon.sh.j2
+        dest: /tmp/restart_osd_daemon.sh
+        owner: root
+        group: root
+        mode: 0750
+      listen: "restart ceph osds"
+
+    - name: restart ceph osd daemon(s)
+      command: /tmp/restart_osd_daemon.sh
+      listen: "restart ceph osds"
+      when:
+        - handler_health_osd_check
+  when:
+    - osd_group_name in group_names
 
 - name: restart ceph mdss
-  include: "./restart-mds.yml"
+  service:
+    name: ceph-mds@{{ mds_name }}
+    state: restarted
+  # serial: 1 would be the proper solution here, but that can only be set on play level
+  # upstream issue: https://github.com/ansible/ansible/issues/12170
+  run_once: true
+  with_items: "{{ groups.get(mds_group_name, []) }}"
+  delegate_to: "{{ item }}"
+  when:
+    - mds_group_name in group_names
 
 - name: restart ceph rgws
-  include: "./restart-rgw.yml"
+  service:
+    name: ceph-radosgw@rgw.{{ ansible_hostname }}
+    state: restarted
+  # serial: 1 would be the proper solution here, but that can only be set on play level
+  # upstream issue: https://github.com/ansible/ansible/issues/12170
+  run_once: true
+  with_items: "{{ groups.get(rgw_group_name, []) }}"
+  delegate_to: "{{ item }}"
+  when:
+    - rgw_group_name in group_names
 
 - name: restart ceph nfss
   service:
diff --git a/roles/ceph-common/handlers/restart-mds.yml b/roles/ceph-common/handlers/restart-mds.yml
deleted file mode 100644
index 142043f27..000000000
--- a/roles/ceph-common/handlers/restart-mds.yml
+++ /dev/null
@@ -1,13 +0,0 @@
----
-- name: restart ceph mdss
-  service:
-    name: ceph-mds@{{ mds_name }}
-    state: restarted
-  # serial: 1 would be the proper solution here, but that can only be set on play level
-  # upstream issue: https://github.com/ansible/ansible/issues/12170
-  run_once: true
-  with_items: "{{ groups.get(mds_group_name, []) }}"
-  delegate_to: "{{ item }}"
-  when:
-    - socket.rc == 0
-    - mds_group_name in group_names
diff --git a/roles/ceph-common/handlers/restart-mon.yml b/roles/ceph-common/handlers/restart-mon.yml
deleted file mode 100644
index 6776bd47f..000000000
--- a/roles/ceph-common/handlers/restart-mon.yml
+++ /dev/null
@@ -1,17 +0,0 @@
----
-- name: restart ceph mons
-  service:
-    name: ceph-mon@{{ monitor_name }}
-    state: restarted
-  # serial: 1 would be the proper solution here, but that can only be set on play level
-  # upstream issue: https://github.com/ansible/ansible/issues/12170
-  run_once: true
-  with_items: "{{ groups.get(mon_group_name, []) }}"
-  delegate_to: "{{ item }}"
-  when:
-    - socket.rc == 0
-    - mon_group_name in group_names
-
-- name: validate monitors
-  include: validate-mon.yml
-  when: mon_group_name in group_names
diff --git a/roles/ceph-common/handlers/restart-osd.yml b/roles/ceph-common/handlers/restart-osd.yml
deleted file mode 100644
index 93641f926..000000000
--- a/roles/ceph-common/handlers/restart-osd.yml
+++ /dev/null
@@ -1,22 +0,0 @@
----
-# This does not just restart OSDs but everything else too. Unfortunately
-# at this time the ansible role does not have an OSD id list to use
-# for restarting them specifically.
-- name: restart ceph osds
-  shell: |
-    for id in $(ls /var/lib/ceph/osd/ | sed 's/.*-//'); do
-      systemctl restart ceph-osd@$id
-      sleep 5
-    done
-  # serial: 1 would be the proper solution here, but that can only be set on play level
-  # upstream issue: https://github.com/ansible/ansible/issues/12170
-  run_once: true
-  with_items: "{{ groups.get(osd_group_name, []) }}"
-  delegate_to: "{{ item }}"
-  when:
-    - socket.rc == 0
-    - osd_group_name in group_names
-
-- name: validate osds
-  include: validate-osd.yml
-  when: osd_group_name in group_names
diff --git a/roles/ceph-common/handlers/restart-rgw.yml b/roles/ceph-common/handlers/restart-rgw.yml
deleted file mode 100644
index 479ac31ad..000000000
--- a/roles/ceph-common/handlers/restart-rgw.yml
+++ /dev/null
@@ -1,13 +0,0 @@
----
-- name: restart ceph rgws
-  service:
-    name: ceph-rgw@{{ ansible_hostname }}
-    state: restarted
-  # serial: 1 would be the proper solution here, but that can only be set on play level
-  # upstream issue: https://github.com/ansible/ansible/issues/12170
-  run_once: true
-  with_items: "{{ groups.get(rgw_group_name, []) }}"
-  delegate_to: "{{ item }}"
-  when:
-    - socketrgw.rc == 0
-    - rgw_group_name in group_names
diff --git a/roles/ceph-common/handlers/validate-mon.yml b/roles/ceph-common/handlers/validate-mon.yml
deleted file mode 100644
index 4c5e15acb..000000000
--- a/roles/ceph-common/handlers/validate-mon.yml
+++ /dev/null
@@ -1,28 +0,0 @@
----
-- name: wait for ceph monitor socket
-  wait_for:
-    path: "/var/run/ceph/{{ cluster }}-mon.{{ monitor_name }}.asok"
-
-- name: set mon_host_count
-  set_fact: mon_host_count={{ groups[mon_group_name] | length }}
-
-- name: select a running monitor
-  set_fact: mon_host={{ item }}
-  with_items: "{{ groups[mon_group_name] }}"
-  when:
-    - item != inventory_hostname
-    - mon_host_count | int > 1
-
-- name: select first monitor if only one monitor
-  set_fact: mon_host={{ item }}
-  with_items: "{{ groups[mon_group_name][0] }}"
-  when: mon_host_count | int == 1
-
-- name: waiting for the monitor to join the quorum...
-  shell: |
-    ceph -s --cluster {{ cluster }} | grep monmap | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }}
-  register: result
-  until: result.rc == 0
-  retries: "{{ handler_health_mon_check_retries }}"
-  delay: "{{ handler_health_mon_check_delay }}"
-  delegate_to: "{{ mon_host }}"
diff --git a/roles/ceph-common/handlers/validate-osd.yml b/roles/ceph-common/handlers/validate-osd.yml
deleted file mode 100644
index aefe1b9be..000000000
--- a/roles/ceph-common/handlers/validate-osd.yml
+++ /dev/null
@@ -1,20 +0,0 @@
----
-- name: collect osds
-  shell: |
-    ls /var/lib/ceph/osd/ | sed 's/.*-//'
-  register: osd_ids
-
-- name: wait for ceph osd socket(s)
-  wait_for:
-    path: "/var/run/ceph/{{ cluster }}-osd.{{ item }}.asok"
-  with_items: "{{ osd_ids.stdout_lines }}"
-
-- name: waiting for clean pgs...
-  shell: |
-    test "$(ceph --cluster {{ cluster }} pg stat | sed 's/^.*pgs://;s/active+clean.*//;s/ //')" -eq "$(ceph --cluster {{ cluster }} pg stat | sed 's/pgs.*//;s/^.*://;s/ //')" && ceph --cluster {{ cluster }} health | egrep -sq "HEALTH_OK|HEALTH_WARN"
-  register: result
-  until: result.rc == 0
-  retries: "{{ handler_health_osd_check_retries }}"
-  delay: "{{ handler_health_osd_check_delay }}"
-  delegate_to: "{{ groups[mon_group_name][0] }}"
-  when: handler_health_osd_check
diff --git a/roles/ceph-common/tasks/checks/check_socket.yml b/roles/ceph-common/tasks/checks/check_socket.yml
deleted file mode 100644
index 674d34c8d..000000000
--- a/roles/ceph-common/tasks/checks/check_socket.yml
+++ /dev/null
@@ -1,14 +0,0 @@
----
-- name: check for a ceph socket
-  shell: "stat /var/run/ceph/*.asok > /dev/null 2>&1"
-  changed_when: false
-  failed_when: false
-  always_run: true
-  register: socket
-
-- name: check for a rados gateway socket
-  shell: "stat {{ rbd_client_admin_socket_path }}*.asok > /dev/null 2>&1"
-  changed_when: false
-  failed_when: false
-  always_run: true
-  register: socketrgw
diff --git a/roles/ceph-common/tasks/main.yml b/roles/ceph-common/tasks/main.yml
index 217d48757..360c21232 100644
--- a/roles/ceph-common/tasks/main.yml
+++ b/roles/ceph-common/tasks/main.yml
@@ -87,7 +87,6 @@
   static: False
 
 - include: facts.yml
-- include: ./checks/check_socket.yml
 - include: create_ceph_initial_dirs.yml
 - include: generate_cluster_fsid.yml
 - include: generate_ceph_conf.yml
diff --git a/roles/ceph-common/templates/restart_mon_daemon.sh.j2 b/roles/ceph-common/templates/restart_mon_daemon.sh.j2
new file mode 100644
index 000000000..d918b0198
--- /dev/null
+++ b/roles/ceph-common/templates/restart_mon_daemon.sh.j2
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+RETRIES="{{ handler_health_mon_check_retries }}"
+DELAY="{{ handler_health_mon_check_delay }}"
+MONITOR_NAME="{{ monitor_name }}"
+CLUSTER="{{ cluster }}"
+SOCKET=/var/run/ceph/${CLUSTER}-mon.${MONITOR_NAME}.asok
+
+
+check_quorum() {
+while [ $RETRIES -ne 0 ]; do
+  MEMBERS=$(ceph --cluster ${CLUSTER} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
+  test "${MEMBERS/$MONITOR_NAME}" != "$MEMBERS" && exit 0
+  sleep $DELAY
+  let RETRIES=RETRIES-1
+done
+# If we reach this point, it means there is a problem with the quorum
+exit 1
+}
+
+# First, restart the daemon
+systemctl restart ceph-mon@${MONITOR_NAME}
+
+COUNT=10
+# Wait and ensure the socket exists after restarting the daemon
+while [ $COUNT -ne 0 ]; do
+  test -S $SOCKET && check_quorum
+  sleep 1
+  let COUNT=COUNT-1
+done
+# If we reach this point, it means the socket is not present.
+echo "Error while restarting mon daemon"
+exit 1
diff --git a/roles/ceph-common/templates/restart_osd_daemon.sh.j2 b/roles/ceph-common/templates/restart_osd_daemon.sh.j2
new file mode 100644
index 000000000..8b0b7d1de
--- /dev/null
+++ b/roles/ceph-common/templates/restart_osd_daemon.sh.j2
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+RETRIES="{{ handler_health_osd_check_retries }}"
+DELAY="{{ handler_health_osd_check_delay }}"
+CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"
+
+check_pgs() {
+  while [ $RETRIES -ne 0 ]; do
+    ceph $CEPH_CLI -s | grep -sq 'active+clean'
+    RET=$?
+    test $RET -eq 0 && exit 0
+    sleep $DELAY
+    let RETRIES=RETRIES-1
+  done
+  # PGs not clean, exiting with return code 1
+  echo "Error with PGs, check config"
+  exit 1
+}
+
+
+for id in $(ls /var/lib/ceph/osd/ | sed 's/.*-//'); do
+  # First, restart daemon(s)
+  systemctl restart ceph-osd@${id}
+  # We need to wait because it may take some time for the socket to actually exist
+  COUNT=10
+  # Wait and ensure the socket exists after restarting the daemon
+  SOCKET=/var/run/ceph/{{ cluster }}-osd.${id}.asok
+  while [ $COUNT -ne 0 ]; do
+    test -S $SOCKET && check_pgs
+    sleep 1
+    let COUNT=COUNT-1
+  done
+  # If we reach this point, it means the socket is not present.
+  echo "Error while restarting osd daemon"
+  exit 1
+done
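
For reference, here is a minimal standalone sketch of the `listen` mechanism this patch relies on (hypothetical file and task names, for illustration only, not part of the role). Several handlers can subscribe to the same topic; a single notify of that topic then fires all of them, in the order they appear in handlers/main.yml:

    # handlers/main.yml (illustrative)
    - name: copy restart script
      template:
        src: restart_daemon.sh.j2   # hypothetical template name
        dest: /tmp/restart_daemon.sh
        mode: 0750
      listen: "restart ceph daemons"

    - name: run restart script
      command: /tmp/restart_daemon.sh
      listen: "restart ceph daemons"

    # tasks/example.yml (illustrative)
    - name: push cluster configuration
      template:
        src: ceph.conf.j2
        dest: /etc/ceph/ceph.conf
      notify: "restart ceph daemons"

Notifying the "restart ceph daemons" topic triggers both handlers, which is what lets the copy-script plus run-script pairs above replace the previous include-based handlers that only ever ran their first task.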
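
As a side note on the quorum test in restart_mon_daemon.sh.j2, which extracts quorum_names from the JSON status with sed: if jq were available on the monitor nodes (an assumption; this role does not install it), an equivalent membership check could be written as follows, purely for illustration:

    # illustrative alternative to the sed-based quorum check; assumes jq is installed
    ceph --cluster ${CLUSTER} -s --format json \
      | jq -e --arg m "$MONITOR_NAME" '.quorum_names | index($m) != null' > /dev/null && exit 0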