#!/bin/bash
#
# Restart every ceph-osd daemon that has a data directory on this host, one
# at a time. After each restart, wait for the daemon's admin socket to show
# up and for all placement groups (PGs) to be reported active+clean before
# touching the next daemon. Abort with exit code 1 as soon as either wait
# fails.
#
# NOTE(review): the double-curly-brace placeholders below are presumably
# substituted by a templating step before this script runs -- confirm
# against the caller.

# Number of PG health polls allowed per restarted OSD.
RETRIES="{{ handler_health_osd_check_retries }}"
# Argument handed to sleep(1) between two polls.
DELAY="{{ handler_health_osd_check_delay }}"
# Connection arguments shared by every ceph invocation. Kept as an array so
# each argument stays a single word when expanded.
CEPH_CLI=(
  --name client.bootstrap-osd
  --keyring "/var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring"
  --cluster "{{ cluster }}"
)

# Python program (valid on Python 2 and 3) that reads the JSON printed by
# "ceph -s -f json" on stdin and exits 0 only when the number of PGs in
# state active+clean equals the total number of PGs. A cluster without any
# PG counts as clean. Malformed or empty input makes it exit non-zero.
PG_CHECK_PY='
import json
import sys

pgmap = json.load(sys.stdin)["pgmap"]
clean = sum(
    state["count"]
    for state in pgmap.get("pgs_by_state", [])
    if state["state_name"] == "active+clean"
)
sys.exit(0 if clean == pgmap["num_pgs"] else 1)
'

#######################################
# Poll the cluster status until every PG is active+clean.
# Globals:   RETRIES (read), DELAY (read), CEPH_CLI (read),
#            PG_CHECK_PY (read)
# Arguments: none
# Outputs:   on failure, an explanation and the current "ceph -s" output
#            on stdout
# Returns:   0 once all PGs are active+clean; never returns on failure,
#            the whole script exits with status 1 instead
#######################################
check_pgs() {
  # Work on a private copy so that retries consumed while waiting for one
  # OSD do not shrink the budget of the OSDs restarted after it.
  local retries=$RETRIES

  while (( retries > 0 )); do
    # A single status snapshot is evaluated per poll, so the total and the
    # clean count always describe the same moment.
    if ceph "${CEPH_CLI[@]}" -s -f json | python -c "$PG_CHECK_PY"; then
      return 0
    fi
    sleep "$DELAY"
    retries=$((retries - 1))
  done

  # PGs not clean, exiting with return code 1
  echo "Error while running 'ceph ${CEPH_CLI[*]} -s', PGs were not reported as active+clean"
  echo "It is possible that the cluster has less OSDs than the replica configuration"
  echo "Will refuse to continue"
  ceph "${CEPH_CLI[@]}" -s
  exit 1
}

for osd_dir in /var/lib/ceph/osd/*; do
  # An unmatched glob stays literal; skip it instead of restarting "*".
  [[ -e "$osd_dir" ]] || continue

  # The OSD id is whatever follows the last dash of the directory name.
  osd_name=${osd_dir##*/}
  id=${osd_name##*-}

  # First, restart the daemon.
  systemctl restart "ceph-osd@${id}"

  # The admin socket may take a moment to appear after the restart, so look
  # for it up to COUNT times, one second apart.
  COUNT=10
  SOCKET="/var/run/ceph/{{ cluster }}-osd.${id}.asok"
  while (( COUNT > 0 )); do
    # check_pgs exits the script itself when the PGs never become clean, so
    # reaching "continue 2" means this OSD is done and the next one is due.
    if [[ -S "$SOCKET" ]]; then
      check_pgs && continue 2
    fi
    sleep 1
    COUNT=$((COUNT - 1))
  done

  # If we reach this point, the socket never showed up.
  echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running."
  exit 1
done