From 9737947dde0a8b3b0d878dd9e5e14d5e047e6aee Mon Sep 17 00:00:00 2001
From: Seena Fallah <seenafallah@gmail.com>
Date: Mon, 10 Jun 2024 12:11:55 +0200
Subject: [PATCH] ceph-handler: use haproxy maintenance for rgw restarts

RGW currently restarts without waiting for existing connections to
close. When handler_rgw_use_haproxy_maintenance is enabled, set the
server's HAProxy weight to 0 and wait for its connections to drain
before restarting, so that active connections are not disrupted.

Signed-off-by: Seena Fallah <seenafallah@gmail.com>
(cherry picked from commit 4fa9057a3ebb4c8929df23c1a2b12554eb3957e4)
---
 group_vars/all.yml.sample                     |  1 +
 roles/ceph-defaults/defaults/main.yml         |  1 +
 .../templates/restart_rgw_daemon.sh.j2        | 30 +++++++++++++++----
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/group_vars/all.yml.sample b/group_vars/all.yml.sample
index aa293481f..6a66b871d 100644
--- a/group_vars/all.yml.sample
+++ b/group_vars/all.yml.sample
@@ -420,6 +420,7 @@ dummy:
 # RGW handler checks
 #handler_health_rgw_check_retries: 5
 #handler_health_rgw_check_delay: 10
+#handler_rgw_use_haproxy_maintenance: false
 
 # NFS handler checks
 #handler_health_nfs_check_retries: 5
diff --git a/roles/ceph-defaults/defaults/main.yml b/roles/ceph-defaults/defaults/main.yml
index bf1a13141..eb34908c8 100644
--- a/roles/ceph-defaults/defaults/main.yml
+++ b/roles/ceph-defaults/defaults/main.yml
@@ -412,6 +412,7 @@ handler_health_mds_check_delay: 10
 # RGW handler checks
 handler_health_rgw_check_retries: 5
 handler_health_rgw_check_delay: 10
+handler_rgw_use_haproxy_maintenance: false
 
 # NFS handler checks
 handler_health_nfs_check_retries: 5
diff --git a/roles/ceph-handler/templates/restart_rgw_daemon.sh.j2 b/roles/ceph-handler/templates/restart_rgw_daemon.sh.j2
index 5ea0f3c7d..d7eb36a72 100644
--- a/roles/ceph-handler/templates/restart_rgw_daemon.sh.j2
+++ b/roles/ceph-handler/templates/restart_rgw_daemon.sh.j2
@@ -11,6 +11,7 @@ else
     RGW_PROTOCOL=http
 fi
 INSTANCES_NAME=({% for i in rgw_instances %}{{ i.instance_name }} {% endfor %})
+HAPROXY_BACKEND=({% for i in rgw_instances %}{{ i.haproxy_backend | default('rgw-backend') }} {% endfor %})
 RGW_IPS=({% for i in rgw_instances %}{{ i.radosgw_address }} {% endfor %})
 RGW_PORTS=({% for i in rgw_instances %}{{ i.radosgw_frontend_port }} {% endfor %})
 RGW_ZONE="{{ rgw_zone }}"
@@ -78,19 +79,38 @@ check_rest() {
 }
 
 for ((i=0; i<${RGW_NUMS}; i++)); do
-  # First, restart the daemon
-
   # Check if systemd unit exists
   # This is needed for new instances as the restart might trigger before the deployment
-  if systemctl list-units --full --all | grep -q "ceph-radosgw@rgw.${RGW_ZONE}.${HOST_NAME}.${INSTANCES_NAME[i]}"; then
-    systemctl restart ceph-radosgw@rgw.${RGW_ZONE}.${HOST_NAME}.${INSTANCES_NAME[i]}
-  else
+  if ! systemctl list-units --full --all | grep -q "ceph-radosgw@rgw.${RGW_ZONE}.${HOST_NAME}.${INSTANCES_NAME[i]}"; then
     echo "Systemd unit ceph-radosgw@rgw.${RGW_ZONE}.${HOST_NAME}.${INSTANCES_NAME[i]} does not exist."
     continue
   fi
 
+{% if handler_rgw_use_haproxy_maintenance %}
+  # drain the server: set its haproxy weight to 0 before the restart
+  echo "set weight ${HAPROXY_BACKEND[i]}/${INSTANCES_NAME[i]} 0" | socat stdio {{ haproxy_socket_path }}
+
+  # wait until field 7 of "show servers conn" (used_cur per the haproxy docs) is 0, one retry per second
+  retries={{ handler_rgw_haproxy_maintenance_retries | default(60) }}
+  while [ "$retries" -gt 0 ]; do
+    if [ "$(echo "show servers conn ${HAPROXY_BACKEND[i]}" | socat stdio {{ haproxy_socket_path }} | awk -v srv="${HAPROXY_BACKEND[i]}/${INSTANCES_NAME[i]}" '$1 == srv { print $7 }')" = "0" ]; then
+      break
+    fi
+    sleep 1
+    retries=$((retries - 1))
+  done
+{% endif %}
+
+  # Restart the daemon
+  systemctl restart ceph-radosgw@rgw.${RGW_ZONE}.${HOST_NAME}.${INSTANCES_NAME[i]}
+
   # Check socket files
   check_socket ${i}
   # Check rest
   check_rest ${i}
+
+{% if handler_rgw_use_haproxy_maintenance %}
+  # restore the configured weight: "100%" is relative to the initial weight, unlike an absolute 100
+  echo "set weight ${HAPROXY_BACKEND[i]}/${INSTANCES_NAME[i]} 100%" | socat stdio {{ haproxy_socket_path }}
+{% endif %}
 done