From d4a3e26534334cdbb9111fca3c430b3b2913885a Mon Sep 17 00:00:00 2001 From: Dimitri Savineau Date: Tue, 7 May 2019 16:33:21 -0400 Subject: [PATCH] ceph-handler: Fix rgw socket in restart script Since Mimic, the radosgw socket has two extra fields in the socket name (before the .asok suffix): <pid>.<cctid> Before: /var/run/ceph/ceph-client.rgw.cephaio-1.asok After: /var/run/ceph/ceph-client.rgw.cephaio-1.16913.23928832.asok The radosgw restart script doesn't handle this and could fail during an upgrade. If the SOCKETS variable isn't defined in the script then the test command won't fail because the return code is 0 $ test -S $ echo $? 0 There are multiple issues in that script: - The default SOCKETS value isn't defined due to a typo SOCKET vs SOCKETS. - Because the socket name includes the pid, we need to check the socket name after the service restart. - After restarting the radosgw service we need to wait a few seconds otherwise the socket won't be created. - Update the wget parameters because the command is doing a loop. We now use the same options as curl. - The check_rest function doesn't test the radosgw at all due to a wrong test command (test against a string) and always returns 0. This needs to use the DOCKER_EXECS variable in order to execute the command. $ test 'wget http://192.168.100.11:8080' $ echo $? 0 Also remove the test based on the ansible_fqdn because we only use the ansible_hostname + rgw instance name. Finally group all the for loops into a single one. 
Resolves: #3926 Signed-off-by: Dimitri Savineau (cherry picked from commit c90f605b5148d179790cec545d02db1086579994) --- .../templates/restart_rgw_daemon.sh.j2 | 29 +++++++------------ 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/roles/ceph-handler/templates/restart_rgw_daemon.sh.j2 b/roles/ceph-handler/templates/restart_rgw_daemon.sh.j2 index e78a54102..4f27de050 100644 --- a/roles/ceph-handler/templates/restart_rgw_daemon.sh.j2 +++ b/roles/ceph-handler/templates/restart_rgw_daemon.sh.j2 @@ -14,19 +14,14 @@ fi declare -a DOCKER_EXECS for ((i=0; i<${RGW_NUMS}; i++)); do DOCKER_EXECS[i]="" -{% if containerized_deployment %} +{% if containerized_deployment | bool %} CONTAINER_NAME="ceph-rgw-${HOST_NAME}-rgw${i}" DOCKER_EXECS[i]="{{ container_binary }} exec ${CONTAINER_NAME}" {% endif %} done declare -a SOCKETS -# Backward compatibility -for ((i=0; i<${RGW_NUMS}; i++)); do - SOCKET[i]="EMPTY_SOCKET" - ${DOCKER_EXECS[i]} test -S /var/run/ceph/{{ cluster }}-client.rgw.{{ ansible_fqdn }}.asok && SOCKETS[i]=/var/run/ceph/{{ cluster }}-client.rgw.{{ ansible_fqdn }}.asok - ${DOCKER_EXECS[i]} test -S /var/run/ceph/{{ cluster }}-client.rgw.${HOST_NAME}.rgw${i}.asok && SOCKETS[i]=/var/run/ceph/{{ cluster }}-client.rgw.${HOST_NAME}.rgw${i}.asok -done RGW_IP={{ hostvars[inventory_hostname]['_radosgw_address'] }} +SOCKET_PREFIX="/var/run/ceph/ceph-client.rgw.${HOST_NAME}.rgw" check_socket() { local i=$1 @@ -34,6 +29,7 @@ check_socket() { local count=10 # Wait and ensure the socket exists after restarting the daemon while [ $count -ne 0 ]; do + ${DOCKER_EXECS[i]} test -S ${SOCKET_PREFIX}${i}.*.*.asok && SOCKETS[i]=$(stat --printf=%n ${SOCKET_PREFIX}${i}.*.*.asok) ${DOCKER_EXECS[i]} test -S ${SOCKETS[i]} && succ=$((succ+1)) && break sleep $DELAY let count=count-1 @@ -48,7 +44,7 @@ check_socket() { check_for_curl_or_wget() { local i=$1 if ${DOCKER_EXECS[i]} command -v wget &>/dev/null; then - rgw_test_command="wget --quiet" + rgw_test_command="wget --tries 1 
--quiet -O /dev/null" elif ${DOCKER_EXECS[i]} command -v curl &>/dev/null; then rgw_test_command="curl --fail --silent --output /dev/null" else @@ -63,28 +59,23 @@ check_rest() { check_for_curl_or_wget ${i} local succ=0 while [ $RETRIES -ne 0 ]; do - test "$rgw_test_command $RGW_PROTOCOL://$RGW_IP:$((RGW_BASE_PORT+i))" && succ=$((succ+1)) && break + ${DOCKER_EXECS[i]} $rgw_test_command $RGW_PROTOCOL://$RGW_IP:$((RGW_BASE_PORT+i)) && succ=$((succ+1)) && break sleep $DELAY let RETRIES=RETRIES-1 done if [ $succ -ne 1 ]; then # If we reach this point, it means there is a problem with the connection to rgw - echo "Error connecting locally to Rados Gateway service: $RGW_PROTOCOL://$rgw_listen" + echo "Error connecting locally to Rados Gateway service: $RGW_PROTOCOL://$RGW_IP:$((RGW_BASE_PORT+i))" exit 1 fi } -# First, restart the daemon for ((i=0; i<${RGW_NUMS}; i++)); do + SOCKETS[i]="EMPTY_SOCKET" + # First, restart the daemon systemctl restart ceph-radosgw@rgw.${HOST_NAME}.rgw${i} -done - -# Check socket files -for ((i=0; i<${RGW_NUMS}; i++)); do + # Check socket files check_socket ${i} -done - -# Check rest -for ((i=0; i<${RGW_NUMS}; i++)); do + # Check rest check_rest ${i} done