2017-04-04 01:55:11 +08:00
|
|
|
#!/bin/bash
|
|
|
|
|
|
|
|
# Retry budget read by check_quorum before it gives up.
RETRIES="{{ handler_health_mon_check_retries }}"
# Value handed to sleep between two consecutive checks.
DELAY="{{ handler_health_mon_check_delay }}"
# Monitor name used to address the daemon ("mon.<name>") in the diagnostics.
MONITOR_NAME="{{ monitor_name }}"

# Command prefix that runs a command inside the monitor container.
# It starts out empty so that a DOCKER_EXEC inherited from the caller's
# environment can never be prepended to the commands when the deployment is
# not containerized; in that case the commands run directly on the host.
DOCKER_EXEC=""
{% if containerized_deployment | bool %}
DOCKER_EXEC="{{ container_binary }} exec ceph-mon-{{ ansible_hostname }}"
{% endif %}
|
2017-04-04 01:55:11 +08:00
|
|
|
|
2019-04-06 14:15:31 +08:00
|
|
|
# Nothing to restart when systemctl does not report a good status for the
# monitor unit (e.g. the daemon is uninstalled): leave with success.
if ! systemctl status ceph-mon@{{ ansible_hostname }} > /dev/null; then
  exit 0
fi
|
|
|
|
|
2018-07-31 21:18:28 +08:00
|
|
|
# Backward compatibility: the admin socket may be named after the FQDN or
# after the short hostname. Probe both names; the short-hostname path is
# checked last, so it is the one kept when both sockets exist.
if $DOCKER_EXEC test -S /var/run/ceph/{{ cluster }}-mon.{{ ansible_fqdn }}.asok; then
  SOCKET=/var/run/ceph/{{ cluster }}-mon.{{ ansible_fqdn }}.asok
fi
if $DOCKER_EXEC test -S /var/run/ceph/{{ cluster }}-mon.{{ ansible_hostname }}.asok; then
  SOCKET=/var/run/ceph/{{ cluster }}-mon.{{ ansible_hostname }}.asok
fi
|
2017-04-04 01:55:11 +08:00
|
|
|
|
|
|
|
#######################################
# Poll the cluster status until this monitor shows up in "quorum_names".
# Exits the script with 0 as soon as the monitor is in quorum; once the
# retry budget is used up, prints diagnostics and exits the script with 1.
# Globals:   RETRIES, DELAY, MONITOR_NAME, DOCKER_EXEC (all read only)
# Arguments: none
# Outputs:   on failure, cluster status and monitor status on stdout
# Returns:   never returns to the caller; always ends with exit 0 or exit 1
#######################################
check_quorum() {
  # Count down on a local copy so the configured RETRIES value stays intact.
  local attempts_left="$RETRIES"

  # "-gt 0" instead of "-ne 0": a negative retry count must skip the loop
  # rather than spin until the counter wraps around.
  # $DOCKER_EXEC stays unquoted on purpose: it is either empty or a command
  # prefix that has to be split into words.
  # shellcheck disable=SC2086
  while [[ "$attempts_left" -gt 0 ]]; do
    $DOCKER_EXEC ceph --cluster {{ cluster }} -s --format json | "{{ discovered_interpreter_python }}" -c 'import sys, json; exit(0) if "{{ monitor_name }}" in json.load(sys.stdin)["quorum_names"] else exit(1)' && exit 0
    sleep "$DELAY"
    attempts_left=$((attempts_left - 1))
  done

  # If we reach this point, it means there is a problem with the quorum
  echo "Error with quorum."
  echo "cluster status:"
  # shellcheck disable=SC2086
  $DOCKER_EXEC ceph --cluster {{ cluster }} -s
  echo "quorum status:"
  # shellcheck disable=SC2086
  $DOCKER_EXEC ceph --cluster {{ cluster }} daemon mon.${MONITOR_NAME} mon_status
  # shellcheck disable=SC2086
  $DOCKER_EXEC ceph --cluster {{ cluster }} daemon mon.${MONITOR_NAME} quorum_status
  exit 1
}
|
|
|
|
|
|
|
|
# First, restart the daemon
systemctl restart ceph-mon@{{ ansible_hostname }}

COUNT=10
# Wait and ensure the socket exists after restarting the daemon.
# $DOCKER_EXEC stays unquoted on purpose: it is either empty or a command
# prefix that has to be split into words.
while [[ "$COUNT" -gt 0 ]]; do
  if [[ -z "${SOCKET:-}" ]]; then
    # The socket path is still unknown, so probe both known names again.
    # As in the backward-compatibility lookup, the short-hostname path is
    # checked last and wins when both exist.
    for socket_candidate in \
      "/var/run/ceph/{{ cluster }}-mon.{{ ansible_fqdn }}.asok" \
      "/var/run/ceph/{{ cluster }}-mon.{{ ansible_hostname }}.asok"; do
      # shellcheck disable=SC2086
      if $DOCKER_EXEC test -S "$socket_candidate"; then
        SOCKET="$socket_candidate"
      fi
    done
  fi

  # "$SOCKET" must be quoted: with an empty value an unquoted expansion
  # collapses to the one-argument form "test -S", which is always true and
  # would skip the wait for the socket altogether.
  # shellcheck disable=SC2086
  if [[ -n "${SOCKET:-}" ]] && $DOCKER_EXEC test -S "$SOCKET"; then
    # check_quorum ends the script itself (exit 0 or exit 1).
    check_quorum
  fi

  sleep "$DELAY"
  COUNT=$((COUNT - 1))
done

# If we reach this point, it means the socket is not present.
echo "Socket file ${SOCKET} could not be found, which means the monitor is not running. Showing ceph-mon unit logs now:"
journalctl -u ceph-mon@{{ ansible_hostname }}
exit 1
|