#!/bin/bash
#
# Restart every active ceph-osd systemd unit on this host, one at a time.
# After each restart, wait for the OSD socket to show up and for the
# placement groups (PGs) to be reported as active+clean before moving on to
# the next unit. The script exits with status 1 as soon as one OSD fails
# either check.
#
# This file is a template: the retry/delay settings, the cluster name and
# the containerized / non-containerized branch are filled in at render time.

RETRIES="{{ handler_health_osd_check_retries }}"
DELAY="{{ handler_health_osd_check_delay }}"
CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"

#######################################
# Wait until every PG is reported as active+clean.
# Globals:
#   RETRIES     (read) maximum number of polls for this call
#   DELAY       (read) seconds to sleep between polls
#   CEPH_CLI    (read) extra arguments passed to the ceph command
#   docker_exec (read) optional command prefix; empty when not containerized
# Returns:
#   0 when the cluster has no PG at all or when all PGs are active+clean.
#   Exits the whole script with status 1 once the retries are exhausted.
#######################################
check_pgs() {
  local num_pgs
  local clean_pgs
  # Per-call counter: the global RETRIES is left untouched so that every OSD
  # gets the full configured number of attempts.
  local attempts_left=$RETRIES
  local num_pgs_py='import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])'
  # print(...) with parentheses yields the same "[N]" text under Python 2
  # and Python 3; the bare "print [...]" statement only parses on Python 2.
  local clean_pgs_py='import sys, json; print([ i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if i["state_name"] == "active+clean"])'

  # $docker_exec and $CEPH_CLI are deliberately unquoted: each holds several
  # words that must be split into separate arguments.
  num_pgs=$($docker_exec ceph $CEPH_CLI -s -f json | python -c "$num_pgs_py")
  if [[ "$num_pgs" == "0" ]]; then
    return 0
  fi

  while [ "$attempts_left" -ne 0 ]; do
    num_pgs=$($docker_exec ceph $CEPH_CLI -s -f json | python -c "$num_pgs_py")
    clean_pgs=$($docker_exec ceph $CEPH_CLI -s -f json | python -c "$clean_pgs_py")
    # clean_pgs is a one-element list rendered as "[N]"; all PGs are clean
    # when N equals the total number of PGs.
    if [ "[${num_pgs}]" = "$clean_pgs" ]; then
      return 0
    fi
    sleep "$DELAY"
    attempts_left=$((attempts_left - 1))
  done

  # PGs not clean, exiting with return code 1
  echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"
  echo "It is possible that the cluster has less OSDs than the replica configuration"
  echo "Will refuse to continue"
  $docker_exec ceph $CEPH_CLI -s
  exit 1
}

#######################################
# Wait (10 seconds at most) for a *.asok file to exist in /var/run/ceph
# inside the given container.
# Arguments:
#   $1 - container name or id handed to "docker exec"
# Returns:
#   Exits the whole script with status 1 on timeout.
#######################################
wait_for_socket_in_docker() {
  if ! docker exec "$1" timeout 10 bash -c "while [ ! -e /var/run/ceph/*.asok ]; do sleep 1 ; done"; then
    echo "Timed out while trying to look for a Ceph OSD socket."
    echo "Abort mission!"
    exit 1
  fi
}

#######################################
# Extract the device name from a unit name such as ceph-osd@sda.service.
# Arguments:
#   $1 - systemd unit name
# Outputs:
#   The device name (e.g. "sda") on stdout; input that does not match the
#   pattern is printed unchanged.
#######################################
get_dev_name() {
  echo "$1" | sed -r 's/ceph-osd@([a-z]{1,4})\.service/\1/'
}

#######################################
# Look up the id of the running container whose name matches a device name,
# polling up to 10 times with DELAY seconds between attempts.
# Globals:
#   DELAY (read)
# Arguments:
#   $1 - device name used as the "docker ps" name filter
# Outputs:
#   The container id on stdout (empty when none was found).
#######################################
get_docker_id_from_dev_name() {
  local id
  local count
  count=10
  while [ "$count" -ne 0 ]; do
    id=$(docker ps -q -f "name=$1")
    if [[ -n "$id" ]]; then
      break
    fi
    sleep "$DELAY"
    count=$((count - 1))
  done
  echo "$id"
}

#######################################
# Print the OSD id served by a container, taken from the second
# dot-separated field of the file names found in /var/run/ceph.
# Arguments:
#   $1 - container id
# Outputs:
#   The OSD id on stdout.
#######################################
get_docker_osd_id() {
  wait_for_socket_in_docker "$1"
  docker exec "$1" ls /var/run/ceph | cut -d'.' -f2
}

#######################################
# List the ceph-osd units that systemd reports as "loaded active".
# For containerized deployments, the unit file looks like: ceph-osd@sda.service
# For non-containerized deployments, the unit file looks like: ceph-osd@0.service
# Outputs:
#   One unit name per line on stdout. Numeric ids of any length are
#   accepted, so OSDs numbered 100 and above are included.
#######################################
list_active_osd_units() {
  systemctl list-units \
    | grep "loaded active" \
    | grep -oE "ceph-osd@([0-9]+|[a-z]+)\.service"
}

for unit in $(list_active_osd_units); do
  # First, restart daemon(s)
  systemctl restart "${unit}"
  # We need to wait because it may take some time for the socket to actually exist
  COUNT=10
  # Wait and ensure the socket exists after restarting the daemon
  {% if containerized_deployment -%}
  id=$(get_dev_name "$unit")
  container_id=$(get_docker_id_from_dev_name "$id")
  osd_id=$(get_docker_osd_id "$container_id")
  docker_exec="docker exec $container_id"
  {% else %}
  osd_id=$(echo "${unit#ceph-osd@}" | grep -oE '[0-9]+')
  {% endif %}
  SOCKET="/var/run/ceph/{{ cluster }}-osd.${osd_id}.asok"
  while [ "$COUNT" -ne 0 ]; do
    # Socket present and PGs clean: this OSD is done, move on to the next unit.
    if $docker_exec test -S "$SOCKET" && check_pgs; then
      continue 2
    fi
    sleep "$DELAY"
    COUNT=$((COUNT - 1))
  done
  # If we reach this point, it means the socket is not present.
  echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running."
  exit 1
done