#!/bin/bash
#
# Restart the ceph-mon daemon on this host and wait until the monitor is
# listed in the cluster quorum again.
#
# This file is a Jinja2 template: every {{ ... }} / {% ... %} placeholder is
# substituted before the script is executed.
#
# Exit status:
#   0 - systemctl reports a non-zero status for the unit (nothing to restart),
#       or the monitor is found in "quorum_names" after the restart
#   1 - the admin socket never showed up, or the monitor did not join the
#       quorum within the configured number of retries

RETRIES="{{ handler_health_mon_check_retries }}"
DELAY="{{ handler_health_mon_check_delay }}"
MONITOR_NAME="{{ monitor_name }}"

# Command prefix used to run commands inside the monitor container.
# Explicitly emptied first so a DOCKER_EXEC inherited from the environment can
# never leak into a non-containerized run.
DOCKER_EXEC=""
{% if containerized_deployment | bool %}
DOCKER_EXEC="{{ container_binary }} exec ceph-mon-{{ ansible_hostname }}"
{% endif %}

# Backward compatibility: the admin socket is named either after the FQDN or
# after the short hostname.
SOCKET_FQDN="/var/run/ceph/{{ cluster }}-mon.{{ ansible_fqdn }}.asok"
SOCKET_HOSTNAME="/var/run/ceph/{{ cluster }}-mon.{{ ansible_hostname }}.asok"
SOCKET=""

# if daemon is uninstalled, no restarting is needed; so exit with success
if ! systemctl status "ceph-mon@{{ ansible_hostname }}" > /dev/null; then
  exit 0
fi

#######################################
# Check that the monitor admin socket exists.
# Once a socket path has been recorded in SOCKET only that path is tested;
# until then both candidate paths are probed and the first hit is recorded.
# The short-hostname path is probed first so that it wins when both exist.
# Globals:   DOCKER_EXEC, SOCKET_FQDN, SOCKET_HOSTNAME (read), SOCKET (written)
# Returns:   0 if the socket exists, non-zero otherwise
#######################################
socket_exists() {
  local candidate

  # $DOCKER_EXEC is intentionally unquoted: it is either empty or a
  # multi-word command prefix that must be word-split.
  if [[ -n "$SOCKET" ]]; then
    # The path is quoted and known to be non-empty, so this can never
    # degrade to the one-argument form "test -S", which is always true.
    $DOCKER_EXEC test -S "$SOCKET"
    return
  fi

  for candidate in "$SOCKET_HOSTNAME" "$SOCKET_FQDN"; do
    if $DOCKER_EXEC test -S "$candidate"; then
      SOCKET="$candidate"
      return 0
    fi
  done
  return 1
}

#######################################
# Poll the quorum status until the monitor shows up in "quorum_names".
# This function never returns: it terminates the script.
# Globals:   DOCKER_EXEC, DELAY, MONITOR_NAME (read), RETRIES (decremented)
# Outputs:   on failure, cluster and monitor status are printed to stdout
# Exits:     0 when the monitor is in quorum, 1 once the retries are used up
#######################################
check_quorum() {
  # "> 0" rather than "!= 0" so a negative retry count cannot loop forever.
  while (( RETRIES > 0 )); do
    if $DOCKER_EXEC ceph --cluster {{ cluster }} quorum_status --format json \
      | "{{ discovered_interpreter_python }}" -c 'import sys, json; exit(0) if "{{ monitor_name }}" in json.load(sys.stdin)["quorum_names"] else exit(1)'; then
      exit 0
    fi
    sleep "$DELAY"
    RETRIES=$(( RETRIES - 1 ))
  done

  # If we reach this point, it means there is a problem with the quorum
  echo "Error with quorum."
  echo "cluster status:"
  $DOCKER_EXEC ceph --cluster {{ cluster }} -s
  echo "quorum status:"
  $DOCKER_EXEC ceph --cluster {{ cluster }} daemon mon.${MONITOR_NAME} mon_status
  $DOCKER_EXEC ceph --cluster {{ cluster }} daemon mon.${MONITOR_NAME} quorum_status
  exit 1
}

# Record which socket name is in use before the restart (result ignored: if no
# socket is present yet, both names are probed again after the restart).
socket_exists

# First, restart the daemon
systemctl restart "ceph-mon@{{ ansible_hostname }}"

COUNT=10
# Wait and ensure the socket exists after restarting the daemon
while (( COUNT > 0 )); do
  if socket_exists; then
    check_quorum
  fi
  sleep "$DELAY"
  COUNT=$(( COUNT - 1 ))
done

# If we reach this point, it means the socket is not present.
# Fall back to the short-hostname path in the message when no socket was ever
# seen, so the message never prints an empty path.
echo "Socket file ${SOCKET:-${SOCKET_HOSTNAME}} could not be found, which means the monitor is not running. Showing ceph-mon unit logs now:"
journalctl -u "ceph-mon@{{ ansible_hostname }}"
exit 1