ceph-ansible/roles/ceph-osd/templates/systemd-device-to-id.sh.j2

#!/bin/bash
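# Rendered by ceph-ansible from systemd-device-to-id.sh.j2 (the {{ cluster }},
# {{ handler_health_osd_check_delay }} and {{ handler_health_osd_check_retries }}
# Jinja2 variables are filled in at render time). The script migrates
# containerized ceph-osd systemd units named after a block device
# (e.g. ceph-osd@sdb) to units named after the OSD id (e.g. ceph-osd@0),
# waiting for each OSD's admin socket and for the PGs to become healthy
# before moving on to the next unit.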
DELAY="{{ handler_health_osd_check_delay }}"
CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"
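# check_pgs: return 0 once every PG reports active+clean (or immediately when
# the cluster has no PGs at all); retry up to $RETRIES times, sleeping $DELAY
# between polls. On failure, dump the cluster state for debugging and exit 1.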
check_pgs() {
  num_pgs=$($docker_exec ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')
  if [[ "$num_pgs" == "0" ]]; then
    return 0
  fi
  while [ $RETRIES -ne 0 ]; do
    # Compare the total PG count with the number of PGs whose state includes
    # active+clean; 'test ... -eq ...' succeeds when they match.
    test "$($docker_exec ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')" -eq "$($docker_exec ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print(sum(i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if "active+clean" in i["state_name"]))')"
    RET=$?
    test $RET -eq 0 && return 0
    sleep $DELAY
    let RETRIES=RETRIES-1
  done
  # PGs are not all active+clean: dump the cluster state and exit with return code 1
  echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"
  echo "It is possible that the cluster has fewer OSDs than the replica configuration"
  echo "Will refuse to continue"
  $docker_exec ceph $CEPH_CLI -s
  $docker_exec ceph $CEPH_CLI osd dump
  $docker_exec ceph $CEPH_CLI osd tree
  $docker_exec ceph $CEPH_CLI osd crush rule dump
  exit 1
}
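# wait_for_socket_in_docker: given a container id, find the OSD mount point,
# read the OSD id from its 'whoami' file (left in the global $whoami), and
# wait up to 10 seconds for the OSD admin socket to appear.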
wait_for_socket_in_docker() {
  osd_mount_point=$(docker exec "$1" df --output=target | grep '/var/lib/ceph/osd/')
  whoami=$(docker exec "$1" cat "$osd_mount_point"/whoami)
  if ! docker exec "$1" timeout 10 bash -c "while [ ! -e /var/run/ceph/{{ cluster }}-osd.${whoami}.asok ]; do sleep 1 ; done"; then
    echo "Timed out while trying to look for a Ceph OSD socket."
    echo "Abort mission!"
    exit 1
  fi
}
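# get_dev_name: extract the device name (e.g. sdb or nvme0n1) from a
# ceph-osd@<device>.service unit name.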
get_dev_name() {
  echo "$1" | sed -r 's/ceph-osd@([a-z]{1,4}|nvme[0-9]+n[0-9]+)\.service/\1/'
}
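# get_docker_id_from_dev_name: poll 'docker ps' up to 10 times for a container
# whose name ends with the given name; echo its id (empty on timeout).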
get_docker_id_from_dev_name() {
  local id
  local count
  count=10
  while [ $count -ne 0 ]; do
    id=$(docker ps -q -f "name=${1}$")
    test "$id" != "" && break
    sleep $DELAY
    let count=count-1
  done
  echo "$id"
}
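# Main loop: for every loaded and active device-named ceph-osd unit, swap it
# for the equivalent id-named unit and verify the OSD comes back healthy.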
for unit in $(systemctl list-units | grep -E "loaded * active" | grep -oE "ceph-osd@([a-z]+|nvme[0-9]+n[0-9]+).service"); do
  dev_name=$(get_dev_name "$unit")
  container_id=$(get_docker_id_from_dev_name "$dev_name")
  wait_for_socket_in_docker "$container_id"
  # wait_for_socket_in_docker sets the global $whoami from the OSD's whoami file
  osd_id=$whoami
  # Stop and disable the unit based on the device name
  systemctl stop ceph-osd@${dev_name}
  systemctl disable ceph-osd@${dev_name}
  # Enable and start the unit based on the OSD id
  systemctl enable ceph-osd@${osd_id}
  systemctl start ceph-osd@${osd_id}
  container_id=$(get_docker_id_from_dev_name "ceph-osd-${osd_id}")
  docker_exec="docker exec $container_id"
  SOCKET=/var/run/ceph/{{ cluster }}-osd.${osd_id}.asok
  COUNT=10
  # Wait for the new container's admin socket, then make sure the PGs are
  # healthy before moving on ('continue 2' resumes the outer for loop).
  while [ $COUNT -ne 0 ]; do
    RETRIES="{{ handler_health_osd_check_retries }}"
    $docker_exec test -S "$SOCKET" && check_pgs && continue 2
    sleep $DELAY
    let COUNT=COUNT-1
  done
  # If we reach this point, the socket never appeared: the OSD daemon is not running.
  echo "Socket file ${SOCKET} could not be found, which means the OSD daemon is not running. Showing ceph-osd unit logs now:"
  journalctl -u "ceph-osd@${osd_id}.service"
  exit 1
done