ceph-ansible/roles/ceph-handler/templates/restart_mon_daemon.sh.j2

50 lines
1.9 KiB
Django/Jinja

#!/bin/bash
RETRIES="{{ handler_health_mon_check_retries }}"
DELAY="{{ handler_health_mon_check_delay }}"
MONITOR_NAME="{{ monitor_name }}"
{% if containerized_deployment | bool %}
DOCKER_EXEC="{{ container_binary }} exec ceph-mon-{{ ansible_hostname }}"
{% endif %}
# if daemon is uninstalled, no restarting is needed; so exit with success
systemctl status ceph-mon@{{ ansible_hostname }} > /dev/null
if [[ $? -ne 0 ]]; then
exit 0
fi
# Backward compatibility
$DOCKER_EXEC test -S /var/run/ceph/{{ cluster }}-mon.{{ ansible_fqdn }}.asok && SOCKET=/var/run/ceph/{{ cluster }}-mon.{{ ansible_fqdn }}.asok
$DOCKER_EXEC test -S /var/run/ceph/{{ cluster }}-mon.{{ ansible_hostname }}.asok && SOCKET=/var/run/ceph/{{ cluster }}-mon.{{ ansible_hostname }}.asok
check_quorum() {
while [ $RETRIES -ne 0 ]; do
$DOCKER_EXEC ceph --cluster {{ cluster }} -s --format json | "{{ discovered_interpreter_python }}" -c 'import sys, json; exit(0) if "{{ monitor_name }}" in json.load(sys.stdin)["quorum_names"] else exit(1)' && exit 0
sleep $DELAY
let RETRIES=RETRIES-1
done
# If we reach this point, it means there is a problem with the quorum
echo "Error with quorum."
echo "cluster status:"
$DOCKER_EXEC ceph --cluster {{ cluster }} -s
echo "quorum status:"
$DOCKER_EXEC ceph --cluster {{ cluster }} daemon mon.${MONITOR_NAME} mon_status
$DOCKER_EXEC ceph --cluster {{ cluster }} daemon mon.${MONITOR_NAME} quorum_status
exit 1
}
# First, restart the daemon
systemctl restart ceph-mon@{{ ansible_hostname }}
COUNT=10
# Wait and ensure the socket exists after restarting the daemon
while [ $COUNT -ne 0 ]; do
$DOCKER_EXEC test -S $SOCKET && check_quorum
sleep $DELAY
let COUNT=COUNT-1
done
# If we reach this point, it means the socket is not present.
echo "Socket file ${SOCKET} could not be found, which means the monitor is not running. Showing ceph-mon unit logs now:"
journalctl -u ceph-mon@{{ ansible_hostname }}
exit 1