#!/bin/bash
#
# Restarts the active ceph-osd systemd units found on this host, one unit
# at a time. After each restart the script waits for the OSD admin socket
# to show up and for the placement groups to be reported active+clean
# before it moves on to the next unit.
#
# This file is a Jinja2 template; the templated expressions below are
# substituted when the template is rendered.

# Value handed to `sleep` between two consecutive polls.
DELAY="{{ handler_health_osd_check_delay }}"

# Options appended to every `ceph` invocation: run as the bootstrap-osd
# client, with its keyring, against the configured cluster name.
# The variable is expanded unquoted on purpose so that it splits into words.
CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"
|
|
|
|
|
|
|
|
#######################################
# Wait until every placement group is reported as active+clean.
# Globals:
#   container_exec (read)  - optional command prefix, split into words
#   CEPH_CLI       (read)  - extra ceph options, split into words
#   RETRIES        (read, decremented) - remaining polls
#   DELAY          (read)  - argument given to sleep between polls
# Returns:
#   0 when the cluster reports zero PGs, or when the number of PGs whose
#   state contains "active+clean" equals the total number of PGs.
# Exits:
#   Terminates the whole script with status 1 once RETRIES reaches 0,
#   after printing the cluster status, OSD dump, OSD tree and crush rules.
#######################################
check_pgs() {
  local pg_counts total clean

  # A cluster without any PG has nothing to wait for.
  # shellcheck disable=SC2086  # container_exec and CEPH_CLI must word-split
  total=$($container_exec ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')
  if [[ "$total" == "0" ]]; then
    return 0
  fi

  while [ "$RETRIES" -ne 0 ]; do
    # Both counters come from a single status snapshot, so they cannot
    # disagree merely because the cluster changed between two calls.
    # shellcheck disable=SC2086
    pg_counts=$($container_exec ceph $CEPH_CLI -s -f json | python -c 'import sys, json; p = json.load(sys.stdin)["pgmap"]; print("%d %d" % (p["num_pgs"], sum(i["count"] for i in p["pgs_by_state"] if "active+clean" in i["state_name"])))')
    read -r total clean <<<"$pg_counts"
    # An empty result (ceph or python failed) counts as "not clean yet".
    if [[ -n "$total" && "$total" == "$clean" ]]; then
      return 0
    fi
    sleep "$DELAY"
    RETRIES=$((RETRIES - 1))
  done

  # PGs not clean, exiting with return code 1
  echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"
  echo "It is possible that the cluster has less OSDs than the replica configuration"
  echo "Will refuse to continue"
  # Dump cluster state to help diagnose why the PGs never became clean.
  # shellcheck disable=SC2086
  $container_exec ceph $CEPH_CLI -s
  # shellcheck disable=SC2086
  $container_exec ceph $CEPH_CLI osd dump
  # shellcheck disable=SC2086
  $container_exec ceph $CEPH_CLI osd tree
  # shellcheck disable=SC2086
  $container_exec ceph $CEPH_CLI osd crush rule dump
  exit 1
}
|
|
|
|
|
2018-11-08 17:02:37 +08:00
|
|
|
#######################################
# Wait for a Ceph admin socket (*.asok) to appear inside a container.
# Arguments:
#   $1 - container identifier passed to the container runtime's `exec`
# Globals:
#   osd_mount_point (written) - df target(s) under /var/lib/ceph/osd/
#   whoami          (written) - content of the whoami file found in that
#                               mount point; the restart loop reads it as
#                               the OSD id
# Exits:
#   Terminates the script with status 1 when no socket shows up before
#   the 10-unit `timeout` expires.
#######################################
wait_for_socket_in_container() {
  osd_mount_point=$({{ container_binary }} exec "$1" df --output=target | grep '/var/lib/ceph/osd/')
  # shellcheck disable=SC2086  # left unquoted, exactly as before, so that
  # any surrounding whitespace in the df output is dropped
  whoami=$({{ container_binary }} exec "$1" cat $osd_mount_point/whoami)
  # `ls` fails while the glob matches nothing and succeeds as soon as one
  # or more sockets exist; a glob inside `[ -e ... ]` breaks when several
  # files match.
  if ! {{ container_binary }} exec "$1" timeout 10 bash -c "until ls /var/run/ceph/*.asok >/dev/null 2>&1; do sleep 1; done"; then
    echo "Timed out while trying to look for a Ceph OSD socket."
    echo "Abort mission!"
    exit 1
  fi
}
|
|
|
|
|
|
|
|
#######################################
# Extract the device name from a containerized OSD unit name.
# Arguments:
#   $1 - systemd unit name, e.g. ceph-osd@sda.service
# Outputs:
#   The 1 to 4 lowercase letters between "ceph-osd@" and ".service"
#   (e.g. "sda"). Input that does not match is printed unchanged.
#######################################
get_dev_name() {
  # printf with a quoted argument: no word splitting or globbing of $1.
  printf '%s\n' "$1" | sed -r 's/ceph-osd@([a-z]{1,4})\.service/\1/'
}
|
|
|
|
|
2018-11-08 17:02:37 +08:00
|
|
|
#######################################
# Look up the id of the running container whose name ends with $1.
# Polls the container runtime up to 10 times, sleeping DELAY between
# attempts, because the container may not be up right after a restart.
# Arguments:
#   $1 - container name (or name suffix) to look for
# Globals:
#   DELAY (read) - argument given to sleep between attempts
# Outputs:
#   The container id on stdout; an empty line if none was found.
#######################################
get_container_id_from_dev_name() {
  local id
  local attempts=10

  while (( attempts > 0 )); do
    # The trailing "$" anchors the name filter so that, for instance,
    # "ceph-osd-1" does not also select "ceph-osd-10".
    id=$({{ container_binary }} ps -q -f "name=${1}$")
    [[ -n "$id" ]] && break
    sleep "$DELAY"
    attempts=$((attempts - 1))
  done
  echo "$id"
}
|
|
|
|
|
2018-11-08 17:02:37 +08:00
|
|
|
#######################################
# Print the OSD id derived from the socket names inside a container.
# Arguments:
#   $1 - container identifier passed to the container runtime's `exec`
# Outputs:
#   For each entry of /var/run/ceph, its second dot-separated field; for
#   a socket named <cluster>-osd.<id>.asok this is the OSD id.
# Notes:
#   Waits for the socket first; that helper exits the (sub)shell with
#   status 1 if no socket appears in time.
#######################################
get_container_osd_id() {
  wait_for_socket_in_container "$1"
  {{ container_binary }} exec "$1" ls /var/run/ceph | cut -d'.' -f2
}
|
|
|
|
|
|
|
|
# Main loop: restart each active ceph-osd unit, then wait for its admin
# socket and for the PGs to become clean before handling the next unit.
#
# For containerized deployments, the unit file looks like: ceph-osd@sda.service
# For non-containerized deployments, the unit file looks like: ceph-osd@NNN.service where NNN is OSD ID
# The dot before "service" is escaped so that it only matches a literal dot.
for unit in $(systemctl list-units | grep -E "loaded * active" | grep -oE "ceph-osd@([0-9]+|[a-z]+)\.service"); do
  # First, restart daemon(s)
  systemctl restart "${unit}"
  # We need to wait because it may take some time for the socket to actually exist
  COUNT=10
  # Wait and ensure the socket exists after restarting the daemon
  {% if containerized_deployment and osd_scenario != 'lvm' -%}
  # Unit is named after the device: resolve the container from the device
  # name, then take the OSD id from the whoami value set by the wait helper.
  id=$(get_dev_name "$unit")
  container_id=$(get_container_id_from_dev_name "$id")
  wait_for_socket_in_container "$container_id"
  osd_id=$whoami
  container_exec="{{ container_binary }} exec $container_id"
  {% elif containerized_deployment and osd_scenario == 'lvm' %}
  # Unit is named after the OSD id, and so is the container.
  osd_id=$(echo "${unit#ceph-osd@}" | grep -oE '[0-9]+')
  container_id=$(get_container_id_from_dev_name "ceph-osd-${osd_id}")
  container_exec="{{ container_binary }} exec $container_id"
  {% else %}
  # No container: commands run directly on the host (container_exec unset).
  osd_id=$(echo "${unit#ceph-osd@}" | grep -oE '[0-9]+')
  {% endif %}
  SOCKET=/var/run/ceph/{{ cluster }}-osd.${osd_id}.asok
  while [ "$COUNT" -ne 0 ]; do
    RETRIES="{{ handler_health_osd_check_retries }}"
    # Socket present and PGs clean: this unit is done, go to the next one.
    # shellcheck disable=SC2086  # container_exec must word-split
    $container_exec test -S "$SOCKET" && check_pgs && continue 2
    sleep "$DELAY"
    COUNT=$((COUNT - 1))
  done
  # If we reach this point, it means the socket is not present.
  echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running. Showing ceph-osd unit logs now:"
  journalctl -u "${unit}"
  exit 1
done
|