ceph-ansible/roles/ceph-handler/templates/restart_osd_daemon.sh.j2

#!/bin/bash

# Seconds to sleep between two polling attempts. The container lookup, the
# socket check and the PG health check below all wait this long between tries.
DELAY="{{ handler_health_osd_check_delay }}"
# Arguments added to every ceph command of this script (identity, keyring and
# cluster name). It is a single string holding several words, so the callers
# expand it unquoted on purpose to let the shell split it.
CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"

# Poll the cluster status until every PG is reported as active+clean.
#
# Globals:
#   container_exec (read)   - optional command prefix used to run ceph inside
#                             a container; when empty or unset, ceph runs
#                             directly on the host.
#   CEPH_CLI (read)         - extra arguments passed to every ceph call.
#   RETRIES (read, written) - number of polls left; decremented after each
#                             unsuccessful poll, so the caller has to reset
#                             it before each call.
#   DELAY (read)            - seconds to sleep between two polls.
# Returns:
#   0 when the cluster reports no PG at all, or when the number of PGs whose
#   state contains "active+clean" equals the total number of PGs.
#   It never returns non-zero: once RETRIES reaches 0 it prints diagnostics
#   and terminates the whole script with exit code 1.
#
# $container_exec and $CEPH_CLI hold several words each and are deliberately
# left unquoted so that the shell splits them into separate arguments.
# shellcheck disable=SC2086
check_pgs() {
  local num_pgs
  num_pgs=$($container_exec ceph $CEPH_CLI -s -f json | "{{ discovered_interpreter_python }}" -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')
  # Nothing to wait for when the cluster has no PG yet.
  if [[ "$num_pgs" == "0" ]]; then
    return 0
  fi
  while [ "$RETRIES" -ne 0 ]; do
    # Take a single status snapshot per poll and compare both counters inside
    # one interpreter run. Fetching the total and the clean count from two
    # separate "ceph -s" calls would compare numbers from different points in
    # time and double the load on the cluster. A failing ceph call or
    # unparsable JSON makes the interpreter exit non-zero, which counts as
    # "not clean yet" and triggers another retry.
    if $container_exec ceph $CEPH_CLI -s -f json | "{{ discovered_interpreter_python }}" -c 'import sys, json; pgmap = json.load(sys.stdin)["pgmap"]; clean = sum([i["count"] for i in pgmap["pgs_by_state"] if "active+clean" in i["state_name"]]); sys.exit(0 if pgmap["num_pgs"] == clean else 1)'; then
      return 0
    fi
    sleep "$DELAY"
    RETRIES=$((RETRIES - 1))
  done
  # PGs not clean, exiting with return code 1
  echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"
  echo "It is possible that the cluster has less OSDs than the replica configuration"
  echo "Will refuse to continue"
  # Dump the cluster state to help whoever reads the handler output.
  $container_exec ceph $CEPH_CLI -s
  $container_exec ceph $CEPH_CLI osd dump
  $container_exec ceph $CEPH_CLI osd tree
  $container_exec ceph $CEPH_CLI osd crush rule dump
  exit 1
}

# Wait up to 10 seconds for the OSD socket file to show up inside a container.
#
# Arguments:
#   $1 - container name or ID, handed to "<container_binary> exec".
# Outputs:
#   An error message on stdout when the socket does not appear in time.
# Returns:
#   Returns normally once the socket exists; terminates the whole script with
#   exit code 1 on timeout.
wait_for_socket_in_container() {
  # Keep the helpers local so that they do not leak into the global scope.
  local osd_mount_point
  local osd_id
  # NOTE(review): assumes the container has exactly one mount point below
  # /var/lib/ceph/osd/ - confirm against the container layout.
  osd_mount_point=$({{ container_binary }} exec "$1" df --output=target | grep '/var/lib/ceph/osd/')
  # The "whoami" file of the OSD directory provides the ID used in the socket name.
  osd_id=$({{ container_binary }} exec "$1" cat "$osd_mount_point/whoami")
  if ! {{ container_binary }} exec "$1" timeout 10 bash -c "while [ ! -e /var/run/ceph/{{ cluster }}-osd.${osd_id}.asok ]; do sleep 1 ; done"; then
    echo "Timed out while trying to look for a Ceph OSD socket."
    echo "Abort mission!"
    exit 1
  fi
}

# Extract the device name from a device-based OSD unit name.
#
# Arguments:
#   $1 - unit name, e.g. ceph-osd@sda.service
# Outputs:
#   The device part (1 to 4 lowercase letters, e.g. "sda") on stdout. Input
#   that does not contain the expected pattern is printed unchanged.
get_dev_name() {
  # printf with a quoted argument keeps the input intact: no word splitting,
  # no glob expansion and no special treatment of values such as "-n".
  printf '%s\n' "$1" | sed -E 's/ceph-osd@([a-z]{1,4})\.service/\1/'
}

# Look up the ID of a running container from the end of its "ps" line.
#
# Arguments:
#   $1 - extended regular expression that has to match the end of the
#        container's line in the "ps" listing (the caller passes the
#        container name, e.g. ceph-osd-3).
# Globals:
#   DELAY (read) - seconds to sleep between two lookups.
# Outputs:
#   The first space-separated field of the matching line(s) on stdout, or an
#   empty line when nothing matched after 10 attempts.
get_container_id_from_dev_name() {
  local found_id
  local attempts_left
  for (( attempts_left = 10; attempts_left > 0; attempts_left-- )); do
    found_id=$({{ container_binary }} ps | grep -E "${1}$" | cut -d' ' -f1)
    if [[ -n "$found_id" ]]; then
      break
    fi
    # Sleeping after the very last failed attempt would only delay the
    # (empty) answer, so wait only when another lookup follows.
    if (( attempts_left > 1 )); then
      sleep "$DELAY"
    fi
  done
  printf '%s\n' "$found_id"
}

# Restart the active ceph-osd units of this host one after the other.
# Unit files look like: ceph-osd@NNN.service where NNN is the OSD ID. The
# pattern below only matches numeric instance names, for containerized and
# non-containerized deployments alike.
# After each restart, wait for the OSD socket to exist and for the PGs to be
# active+clean before moving on to the next unit.
for unit in $(systemctl list-units | grep -E "loaded * active" | grep -oE "ceph-osd@([0-9]+)\.service"); do
  # First, restart daemon(s)
  systemctl restart "${unit}"
  # We need to wait because it may take some time for the socket to actually exist
  COUNT=10
  # Strip the "ceph-osd@" prefix and the ".service" suffix to get the OSD ID
  osd_id=${unit#ceph-osd@}
  osd_id=${osd_id%.service}
  # Containerized deployments run the checks inside the OSD container;
  # otherwise container_exec is not set here and the checks run on the host.
  {% if containerized_deployment | bool %}
  container_id=$(get_container_id_from_dev_name "ceph-osd-${osd_id}")
  container_exec="{{ container_binary }} exec $container_id"
  {% endif %}
  SOCKET=/var/run/ceph/{{ cluster }}-osd.${osd_id}.asok
  # Wait and ensure the socket exists after restarting the daemon
  while [ "$COUNT" -ne 0 ]; do
    # check_pgs decrements RETRIES, so give it a fresh budget for every OSD
    RETRIES="{{ handler_health_osd_check_retries }}"
    # Socket present and PGs clean: go on with the next unit of the outer loop.
    # $container_exec holds several words and must be split by the shell.
    # shellcheck disable=SC2086
    $container_exec test -S "$SOCKET" && check_pgs && continue 2
    sleep "$DELAY"
    COUNT=$((COUNT - 1))
  done
  # If we reach this point, it means the socket is not present.
  echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running. Showing ceph-osd unit logs now:"
  journalctl -u "${unit}"
  exit 1
done
Add handlers for containerized deployment Until now, there is no handlers for containerized deployments. Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> 2017-07-29 07:00:06 +08:00			`#!/bin/bash`

			`DELAY="{{ handler_health_osd_check_delay }}"`
			`CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"`

			`check_pgs() {`
common: use discovered_interpreter_python fact in order to use the right binary name when using python cli in command or shell module. Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> 2019-08-14 15:56:41 +08:00			`num_pgs=$($container_exec ceph $CEPH_CLI -s -f json \| "{{ discovered_interpreter_python }}" -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')`
handler: do not test if pgs_num = 0 We don't need to wait if they are no PGS. Signed-off-by: Sébastien Han <seb@redhat.com> 2017-10-06 20:59:23 +08:00			`if [[ "$num_pgs" == "0" ]]; then`
			`return 0`
			`fi`
Add handlers for containerized deployment Until now, there is no handlers for containerized deployments. Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> 2017-07-29 07:00:06 +08:00			`while [ $RETRIES -ne 0 ]; do`
common: use discovered_interpreter_python fact in order to use the right binary name when using python cli in command or shell module. Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> 2019-08-14 15:56:41 +08:00			`test "$($container_exec ceph $CEPH_CLI -s -f json \| "{{ discovered_interpreter_python }}" -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')" -eq "$($container_exec ceph $CEPH_CLI -s -f json \| "{{ discovered_interpreter_python }}" -c 'import sys, json; print(sum ( [ i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if "active+clean" in i["state_name"]]))')"`
Add handlers for containerized deployment Until now, there is no handlers for containerized deployments. Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> 2017-07-29 07:00:06 +08:00			`RET=$?`
			`test $RET -eq 0 && return 0`
			`sleep $DELAY`
			`let RETRIES=RETRIES-1`
			`done`
			`# PGs not clean, exiting with return code 1`
			`echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"`
			`echo "It is possible that the cluster has less OSDs than the replica configuration"`
			`echo "Will refuse to continue"`
Add new container scenario Test with podman instead of docker and also support for python 3 only. Signed-off-by: Sébastien Han <seb@redhat.com> 2018-11-08 17:02:37 +08:00			`$container_exec ceph $CEPH_CLI -s`
			`$container_exec ceph $CEPH_CLI osd dump`
			`$container_exec ceph $CEPH_CLI osd tree`
			`$container_exec ceph $CEPH_CLI osd crush rule dump`
Add handlers for containerized deployment Until now, there is no handlers for containerized deployments. Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> 2017-07-29 07:00:06 +08:00			`exit 1`
			`}`

Add new container scenario Test with podman instead of docker and also support for python 3 only. Signed-off-by: Sébastien Han <seb@redhat.com> 2018-11-08 17:02:37 +08:00			`wait_for_socket_in_container() {`
			`osd_mount_point=$({{ container_binary }} exec "$1" df --output=target \| grep '/var/lib/ceph/osd/')`
			`whoami=$({{ container_binary }} exec "$1" cat $osd_mount_point/whoami)`
ceph-handler: fix cluster name in socket path c90f605b5 introduces the default ceph cluster name value in the rgw socket path for the rgw restart script. But this should use the `cluster` variable instead. This commit also fixes this in the osd restart script. Signed-off-by: Dimitri Savineau <dsavinea@redhat.com> 2019-07-03 23:26:42 +08:00			`if ! {{ container_binary }} exec "$1" timeout 10 bash -c "while [ ! -e /var/run/ceph/{{ cluster }}-osd.${whoami}.asok ]; do sleep 1 ; done"; then`
switch-from-non-containerized-to-containerized: simplify This commit eases the use of the infrastructure-playbooks/switch-from-non-containerized-to-containerized-ceph-daemons.yml playbook. We basically run it with a couple of pre-tasks and then we let the playbook run the docker roles. It obviously expect to have proper variables configured in order to work. Signed-off-by: Sébastien Han <seb@redhat.com> 2017-08-22 22:43:01 +08:00			`echo "Timed out while trying to look for a Ceph OSD socket."`
			`echo "Abort mission!"`
Add handlers for containerized deployment Until now, there is no handlers for containerized deployments. Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> 2017-07-29 07:00:06 +08:00			`exit 1`
			`fi`
			`}`

			`get_dev_name() {`
			`echo $1 \| sed -r 's/ceph-osd@([a-z]{1,4})\.service/\1/'`
			`}`

Add new container scenario Test with podman instead of docker and also support for python 3 only. Signed-off-by: Sébastien Han <seb@redhat.com> 2018-11-08 17:02:37 +08:00			`get_container_id_from_dev_name() {`
Add handlers for containerized deployment Until now, there is no handlers for containerized deployments. Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> 2017-07-29 07:00:06 +08:00			`local id`
			`local count`
			`count=10`
			`while [ $count -ne 0 ]; do`
handler: fix bug in osd handlers fbf4ed42aee8fa5fd18c4c289cbb80ffeda8f72e introduced a bug when container binary is podman. podman doesn't support ps -f using regular expression, the container id is never set in the restart script causing the handler to fail. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1721536 Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> 2019-07-18 20:06:23 +08:00			`id=$({{ container_binary }} ps \| grep -E "${1}$" \| cut -d' ' -f1)`
Add handlers for containerized deployment Until now, there is no handlers for containerized deployments. Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> 2017-07-29 07:00:06 +08:00			`test "$id" != "" && break`
defaults: restart docker daemon higher delay Use default delay since the mon (in particular) can take more time to restart. Solves error with: STDERR: Error response from daemon: No such container: ceph-mon-mon0 Signed-off-by: Sébastien Han <seb@redhat.com> 2017-09-15 03:38:11 +08:00			`sleep $DELAY`
Add handlers for containerized deployment Until now, there is no handlers for containerized deployments. Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> 2017-07-29 07:00:06 +08:00			`let count=count-1`
			`done`
			`echo "$id"`
			`}`

			`# For containerized deployments, the unit file looks like: ceph-osd@sda.service`
Fix in regular expression matching OSD ID on non-contenerized deployment. restart_osd_daemon.sh is used to discover and restart all OSDs on a host. To do it the scripts loops the list of ceph-osd@ services in the system. This commit fixes bug in the regular expression responsile for extraction of OSDs - prior version uses `[0-9]{1,2}` expression which is ignoring all OSDS which numbers are greater than 99 (thus longer than 2 digits). Fix removed upper limit of digits in the number. This problem existed in two places in the script. Closes: #2964 Signed-off-by: Artur Fijalkowski <artur.fijalkowski@ing.com> 2018-08-02 19:28:44 +08:00			`# For non-containerized deployments, the unit file looks like: ceph-osd@NNN.service where NNN is OSD ID`
common: support OSDs with more than 2 digits When running environment with OSDs having ID with more than 2 digits, some tasks don't match the system units and therefore, playbook can fail. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1805643 Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> 2020-02-21 17:22:32 +08:00			`for unit in $(systemctl list-units \| grep -E "loaded * active" \| grep -oE "ceph-osd@([0-9]+).service"); do`
Add handlers for containerized deployment Until now, there is no handlers for containerized deployments. Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> 2017-07-29 07:00:06 +08:00			`# First, restart daemon(s)`
			`systemctl restart "${unit}"`
			`# We need to wait because it may take some time for the socket to actually exists`
			`COUNT=10`
			`# Wait and ensure the socket exists after restarting the daemon`
add missing boolean filter Otherwise this will generate an ansible warning about the missing filter. [DEPRECATION WARNING]: evaluating xxx as a bare variable, this behaviour will go away and you might need to add \|bool to the expression in the future. Also see CONDITIONAL_BARE_VARS configuration toggle.. This feature will be removed in version 2.12. Signed-off-by: Dimitri Savineau <dsavinea@redhat.com> 2020-09-26 00:15:02 +08:00			`{% if containerized_deployment \| bool %}`
handler: add support for ceph-volume containerized restart The restart script wasn't working with the current new addition of ceph-volume in container where now OSDs have the OSD id name in the container name. Signed-off-by: Sébastien Han <seb@redhat.com> 2018-10-03 00:10:19 +08:00			`osd_id=$(echo ${unit#ceph-osd@} \| grep -oE '[0-9]+')`
Add new container scenario Test with podman instead of docker and also support for python 3 only. Signed-off-by: Sébastien Han <seb@redhat.com> 2018-11-08 17:02:37 +08:00			`container_id=$(get_container_id_from_dev_name "ceph-osd-${osd_id}")`
			`container_exec="{{ container_binary }} exec $container_id"`
Add handlers for containerized deployment Until now, there is no handlers for containerized deployments. Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> 2017-07-29 07:00:06 +08:00			`{% else %}`
restart_osd_daemon.sh.j2 - use `+` rather than `{1,}` in regex `+` is more idiomatic for "one or more" in a regex than `{1,}`; the latter was introduced in a previous fix for an incorrect `{1,2}` restriction. Signed-off-by: Matthew Vernon <mv3@sanger.ac.uk> 2018-09-19 21:25:15 +08:00			`osd_id=$(echo ${unit#ceph-osd@} \| grep -oE '[0-9]+')`
Add handlers for containerized deployment Until now, there is no handlers for containerized deployments. Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> 2017-07-29 07:00:06 +08:00			`{% endif %}`
switch-from-non-containerized-to-containerized: simplify This commit eases the use of the infrastructure-playbooks/switch-from-non-containerized-to-containerized-ceph-daemons.yml playbook. We basically run it with a couple of pre-tasks and then we let the playbook run the docker roles. It obviously expect to have proper variables configured in order to work. Signed-off-by: Sébastien Han <seb@redhat.com> 2017-08-22 22:43:01 +08:00			`SOCKET=/var/run/ceph/{{ cluster }}-osd.${osd_id}.asok`
Add handlers for containerized deployment Until now, there is no handlers for containerized deployments. Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> 2017-07-29 07:00:06 +08:00			`while [ $COUNT -ne 0 ]; do`
restart_osd_daemon.sh.j2 - Reset RETRIES between calls of check_pgs Previously RETRIES was set (by default to 40) once at the start of the script; this meant that it would only ever wait for up to 40 lots of 30s across all the OSDs on a host before bombing out. In fact, we want to be prepared to wait for the same amount of time after each OSD restart for the clusters' pgs to be happy again before continuing. Closes: #3154 Signed-off-by: Matthew Vernon <mv3@sanger.ac.uk> 2018-09-22 00:55:01 +08:00			`RETRIES="{{ handler_health_osd_check_retries }}"`
Add new container scenario Test with podman instead of docker and also support for python 3 only. Signed-off-by: Sébastien Han <seb@redhat.com> 2018-11-08 17:02:37 +08:00			`$container_exec test -S "$SOCKET" && check_pgs && continue 2`
defaults: restart docker daemon higher delay Use default delay since the mon (in particular) can take more time to restart. Solves error with: STDERR: Error response from daemon: No such container: ceph-mon-mon0 Signed-off-by: Sébastien Han <seb@redhat.com> 2017-09-15 03:38:11 +08:00			`sleep $DELAY`
Add handlers for containerized deployment Until now, there is no handlers for containerized deployments. Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> 2017-07-29 07:00:06 +08:00			`let COUNT=COUNT-1`
			`done`
			`# If we reach this point, it means the socket is not present.`
handler: show unit logs on error This will tremendously help debugging daemons that fail on restart by showing the systemd unit logs. Signed-off-by: Sébastien Han <seb@redhat.com> 2018-11-27 17:45:05 +08:00			`echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running. Showing ceph-osd unit logs now:"`
			`journalctl -u "${unit}"`
Add handlers for containerized deployment Until now, there is no handlers for containerized deployments. Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com> 2017-07-29 07:00:06 +08:00			`exit 1`
			`done`