ceph-ansible/roles/ceph-common/templates/restart_osd_daemon.sh.j2

#!/bin/bash

# Settings below are filled in by the Jinja2 template engine at render time.
# RETRIES: retry budget for the PG health poll performed by check_pgs.
RETRIES="{{ handler_health_osd_check_retries }}"
# DELAY: argument passed to sleep between two PG health polls.
DELAY="{{ handler_health_osd_check_delay }}"
# CEPH_CLI: options appended to every ceph invocation in this script; it is
# expanded unquoted on purpose so that it splits into separate arguments.
# It selects the bootstrap-osd client identity, its keyring and the cluster.
CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"

#######################################
# Poll the cluster status until it reports 'active+clean'.
# Globals:   RETRIES  (read) - maximum number of polls per call
#            DELAY    (read) - argument given to sleep between polls
#            CEPH_CLI (read) - extra options for the ceph command
# Outputs:   on failure, an explanation and the cluster status on stdout
# Returns:   0 as soon as 'active+clean' is seen; the caller then moves on
#            to the next OSD. When the retry budget is exhausted the whole
#            script is terminated with exit code 1.
#######################################
check_pgs() {
  # Work on a private copy so that every call (one per OSD) gets the full
  # retry budget instead of whatever the previous call left over.
  local retries=$RETRIES
  while [ "$retries" -ne 0 ]; do
    # CEPH_CLI is intentionally unquoted: it holds several arguments.
    # NOTE(review): grep succeeds as soon as the string appears anywhere in
    # the status output; it does not prove that every PG is active+clean.
    if ceph $CEPH_CLI -s | grep -sq 'active+clean'; then
      # Return instead of exiting, otherwise the script would stop after
      # the first OSD and leave the remaining daemons un-restarted.
      return 0
    fi
    sleep "$DELAY"
    retries=$((retries - 1))
  done
  # PGs not clean, exiting with return code 1
  echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"
  echo "It is possible that the cluster has less OSDs than the replica configuration"
  echo "Will refuse to continue"
  ceph $CEPH_CLI -s
  exit 1
}

# Restart the OSD daemons one at a time, waiting for the cluster to report
# 'active+clean' before touching the next one.
for osd_dir in /var/lib/ceph/osd/*; do
  # An unmatched glob stays literal; skip it when there is no OSD directory.
  [ -e "$osd_dir" ] || continue
  # The OSD id is whatever follows the last dash of the directory name.
  id=${osd_dir##*-}
  # First, restart daemon(s)
  systemctl restart "ceph-osd@${id}"
  # We need to wait because it may take some time for the socket to actually exist
  COUNT=10
  # Wait and ensure the socket exists after restarting the daemon
  SOCKET="/var/run/ceph/{{ cluster }}-osd.${id}.asok"
  while [ "$COUNT" -ne 0 ]; do
    # check_pgs only returns on success (it exits the script on failure),
    # so a zero status means this OSD is done: go on with the next one.
    if [ -S "$SOCKET" ] && check_pgs; then
      continue 2
    fi
    sleep 1
    COUNT=$((COUNT - 1))
  done
  # If we reach this point, it means the socket is not present.
  echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running."
  exit 1
done