ceph-defaults: fix handlers for mds and rgw

The way we handle the restart for both mds and rgw is not ideal: we try
to restart the daemon on hosts that do not run it, which results in a
stray service file being created (see the bug description).

Now each daemon is restarted individually and in a serialized fashion.

Note: the current implementation does NOT support multiple mds or rgw on
the same node.

Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1469781
Signed-off-by: Sébastien Han <seb@redhat.com>
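For context on the note above: the restart scripts introduced below derive the daemon's systemd unit name from the host name alone, which is why multiple mds or rgw instances on one node are not supported, and each host restarts only its own daemon before waiting for it to come back. A minimal sketch of the idea, using hostname -s in place of Ansible's ansible_hostname fact (the real, templated scripts are shown further down):

#!/bin/bash
# One copy of this logic runs on each mds host when the handler fires.
MDS_NAME=$(hostname -s)                  # ansible_hostname in the template
systemctl restart ceph-mds@${MDS_NAME}   # a second mds on this node would need its own instance id
# ...then poll the daemon's admin socket until it answers, or give up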


@@ -327,7 +327,6 @@ dummy:
 #radosgw_dns_name: your.subdomain.tld # subdomains used by radosgw. See http://ceph.com/docs/master/radosgw/config/#enabling-subdomain-s3-calls
 #radosgw_resolve_cname: false # enable for radosgw to resolve DNS CNAME based bucket names
 #radosgw_civetweb_port: 8080
-#radosgw_civetweb_bind_ip: "{{ ansible_default_ipv4.address }}" # when using ipv6 enclose with brackets: "[{{ ansible_default_ipv6.address }}]"
 #radosgw_civetweb_num_threads: 100
 # For additional civetweb configuration options available such as SSL, logging,
 # keepalive, and timeout settings, please see the civetweb docs at
@@ -366,11 +365,23 @@ dummy:
 # Obviously between the checks (for monitors to be in quorum and for osd's pgs
 # to be clean) we have to wait. These retries and delays can be configurable
 # for both monitors and osds.
+#
+# Monitor handler checks
 #handler_health_mon_check_retries: 5
 #handler_health_mon_check_delay: 10
+#
+# OSD handler checks
 #handler_health_osd_check_retries: 40
 #handler_health_osd_check_delay: 30
 #handler_health_osd_check: true
+#
+# MDS handler checks
+#handler_health_mds_check_retries: 5
+#handler_health_mds_check_delay: 10
+#
+# RGW handler checks
+#handler_health_rgw_check_retries: 5
+#handler_health_rgw_check_delay: 10
 # Confiure the type of NFS gatway access. At least one must be enabled for an
 # NFS role to be useful
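The new mds/rgw knobs feed directly into the restart scripts added later in this commit, where they become RETRIES and DELAY. A minimal sketch of the resulting wait loop, assuming the default mds values above plus a cluster named ceph and a host named mds0 (both hypothetical):

RETRIES=5    # handler_health_mds_check_retries
DELAY=10     # handler_health_mds_check_delay
while [ $RETRIES -ne 0 ]; do
    test -S /var/run/ceph/ceph-mds.mds0.asok && exit 0   # admin socket is back, daemon is up
    sleep $DELAY
    let RETRIES=RETRIES-1
done
exit 1   # worst case we waited about 5 x 10s = 50s before failing the handler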


@@ -327,7 +327,6 @@ ceph_repository: rhcs
 #radosgw_dns_name: your.subdomain.tld # subdomains used by radosgw. See http://ceph.com/docs/master/radosgw/config/#enabling-subdomain-s3-calls
 #radosgw_resolve_cname: false # enable for radosgw to resolve DNS CNAME based bucket names
 #radosgw_civetweb_port: 8080
-#radosgw_civetweb_bind_ip: "{{ ansible_default_ipv4.address }}" # when using ipv6 enclose with brackets: "[{{ ansible_default_ipv6.address }}]"
 #radosgw_civetweb_num_threads: 100
 # For additional civetweb configuration options available such as SSL, logging,
 # keepalive, and timeout settings, please see the civetweb docs at
@@ -366,11 +365,23 @@ ceph_repository: rhcs
 # Obviously between the checks (for monitors to be in quorum and for osd's pgs
 # to be clean) we have to wait. These retries and delays can be configurable
 # for both monitors and osds.
+#
+# Monitor handler checks
 #handler_health_mon_check_retries: 5
 #handler_health_mon_check_delay: 10
+#
+# OSD handler checks
 #handler_health_osd_check_retries: 40
 #handler_health_osd_check_delay: 30
 #handler_health_osd_check: true
+#
+# MDS handler checks
+#handler_health_mds_check_retries: 5
+#handler_health_mds_check_delay: 10
+#
+# RGW handler checks
+#handler_health_rgw_check_retries: 5
+#handler_health_rgw_check_delay: 10
 # Confiure the type of NFS gatway access. At least one must be enabled for an
 # NFS role to be useful


@@ -319,7 +319,6 @@ mds_max_mds: 3
 #radosgw_dns_name: your.subdomain.tld # subdomains used by radosgw. See http://ceph.com/docs/master/radosgw/config/#enabling-subdomain-s3-calls
 radosgw_resolve_cname: false # enable for radosgw to resolve DNS CNAME based bucket names
 radosgw_civetweb_port: 8080
-radosgw_civetweb_bind_ip: "{{ ansible_default_ipv4.address }}" # when using ipv6 enclose with brackets: "[{{ ansible_default_ipv6.address }}]"
 radosgw_civetweb_num_threads: 100
 # For additional civetweb configuration options available such as SSL, logging,
 # keepalive, and timeout settings, please see the civetweb docs at
@@ -358,11 +357,23 @@ restapi_port: 5000
 # Obviously between the checks (for monitors to be in quorum and for osd's pgs
 # to be clean) we have to wait. These retries and delays can be configurable
 # for both monitors and osds.
+#
+# Monitor handler checks
 handler_health_mon_check_retries: 5
 handler_health_mon_check_delay: 10
+#
+# OSD handler checks
 handler_health_osd_check_retries: 40
 handler_health_osd_check_delay: 30
 handler_health_osd_check: true
+#
+# MDS handler checks
+handler_health_mds_check_retries: 5
+handler_health_mds_check_delay: 10
+#
+# RGW handler checks
+handler_health_rgw_check_retries: 5
+handler_health_rgw_check_delay: 10
 # Confiure the type of NFS gatway access. At least one must be enabled for an
 # NFS role to be useful


@@ -64,28 +64,44 @@
     - inventory_hostname in play_hosts
     - osd_group_name in group_names
 
-- name: restart ceph mdss
-  service:
-    name: ceph-mds@{{ mds_name }}
-    state: restarted
-  # serial: 1 would be the proper solution here, but that can only be set on play level
-  # upstream issue: https://github.com/ansible/ansible/issues/12170
-  run_once: true
-  with_items: "{{ groups.get(mds_group_name, []) }}"
-  delegate_to: "{{ item }}"
+- name: copy mds restart script
+  template:
+    src: restart_mds_daemon.sh.j2
+    dest: /tmp/restart_mds_daemon.sh
+    owner: root
+    group: root
+    mode: 0750
+  listen: "restart ceph mdss"
   when:
+    - inventory_hostname in play_hosts
     - mds_group_name in group_names
 
-- name: restart ceph rgws
-  service:
-    name: ceph-radosgw@rgw.{{ ansible_hostname }}
-    state: restarted
-  # serial: 1 would be the proper solution here, but that can only be set on play level
-  # upstream issue: https://github.com/ansible/ansible/issues/12170
-  run_once: true
-  with_items: "{{ groups.get(rgw_group_name, []) }}"
-  delegate_to: "{{ item }}"
+- name: restart ceph mds daemon(s)
+  command: /tmp/restart_mds_daemon.sh
+  listen: "restart ceph mdss"
   when:
+    # We do not want to run these checks on initial deployment (`socket.rc == 0`)
+    - socket.rc == 0
+    - mds_group_name in group_names
+
+- name: copy rgw restart script
+  template:
+    src: restart_rgw_daemon.sh.j2
+    dest: /tmp/restart_rgw_daemon.sh
+    owner: root
+    group: root
+    mode: 0750
+  listen: "restart ceph rgws"
+  when:
+    - inventory_hostname in play_hosts
+    - rgw_group_name in group_names
+
+- name: restart ceph rgw daemon(s)
+  command: /tmp/restart_rgw_daemon.sh
+  listen: "restart ceph rgws"
+  when:
+    # We do not want to run these checks on initial deployment (`socket.rc == 0`)
+    - socket.rc == 0
     - rgw_group_name in group_names
 
 - name: restart ceph nfss
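The socket.rc == 0 guard refers to a socket result registered earlier in this handlers file; I believe it is essentially a probe for an existing ceph admin socket, roughly along these lines (simplified sketch, the exact task and path may differ):

# What the earlier check boils down to: does any ceph admin socket exist?
stat --printf=%n /var/run/ceph/*.asok
# rc == 0 -> a daemon was already running here, so the restart scripts above are executed
# rc != 0 -> initial deployment, there is nothing to restart and the handlers are skipped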


@@ -0,0 +1,20 @@
#!/bin/bash
RETRIES="{{ handler_health_mds_check_retries }}"
DELAY="{{ handler_health_mds_check_delay }}"
MDS_NAME="{{ ansible_hostname }}"
SOCKET=/var/run/ceph/{{ cluster }}-mds.${MDS_NAME}.asok
# First, restart the daemon
systemctl restart ceph-mds@${MDS_NAME}
# Wait and ensure the socket exists after restarting the daemon
while [ $RETRIES -ne 0 ]; do
{{ docker_exec_cmd }} test -S $SOCKET && exit 0
sleep $DELAY
let RETRIES=RETRIES-1
done
# If we reach this point, it means the socket is not present.
echo "Socket file ${SOCKET} could not be found, which means the Metadata Server is not running."
exit 1
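Only the {{ ... }} expressions are substituted at template time, so the script that lands on a non-containerized node is the same text with concrete values. Rendered for a hypothetical host mds0, cluster ceph and the default retries/delay, the key lines become:

RETRIES="5"
DELAY="10"
MDS_NAME="mds0"
SOCKET=/var/run/ceph/ceph-mds.${MDS_NAME}.asok
systemctl restart ceph-mds@${MDS_NAME}
while [ $RETRIES -ne 0 ]; do
    test -S $SOCKET && exit 0
    sleep $DELAY
    let RETRIES=RETRIES-1
done

In containerized deployments docker_exec_cmd is non-empty, so the test -S probe runs inside the daemon's container while systemctl restart still runs on the host.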


@@ -0,0 +1,65 @@
#!/bin/bash
RETRIES="{{ handler_health_rgw_check_retries }}"
DELAY="{{ handler_health_rgw_check_delay }}"
RGW_NAME="{{ ansible_hostname }}"
RGW_PORT="{{ radosgw_civetweb_port }}"
SOCKET=/var/run/ceph/{{ cluster }}-client.rgw.${RGW_NAME}.asok
{% if radosgw_address_block | length > 0 %}
{% if ip_version == 'ipv4' -%}
RGW_IP={{ hostvars[inventory_hostname]['ansible_all_' + ip_version + '_addresses'] | ipaddr(radosgw_address_block) | first }}
{%- elif ip_version == 'ipv6' -%}
RGW_IP=[{{ hostvars[inventory_hostname]['ansible_all_' + ip_version + '_addresses'] | ipaddr(radosgw_address_block) | first }}]
{%- endif %}
{% elif hostvars[inventory_hostname]['radosgw_address'] is defined and hostvars[inventory_hostname]['radosgw_address'] != '0.0.0.0' -%}
{% if ip_version == 'ipv4' -%}
RGW_IP={{ hostvars[inventory_hostname]['radosgw_address'] }}
{%- elif ip_version == 'ipv6' -%}
RGW_IP=[{{ hostvars[inventory_hostname]['radosgw_address'] }}]
{% endif %}
{%- else -%}
{% set interface = ["ansible_",radosgw_interface]|join %}
{% if ip_version == 'ipv6' -%}
RGW_IP=[{{ hostvars[inventory_hostname][interface][ip_version][0]['address'] }}]
{%- elif ip_version == 'ipv4' -%}
RGW_IP={{ hostvars[inventory_hostname][interface][ip_version]['address'] }}
{% endif %}
{%- endif %}
check_for_curl_or_wget() {
if {{ docker_exec_cmd }} command -v wget &>/dev/null; then
rgw_test_command="wget --quiet"
elif {{ docker_exec_cmd }} command -v curl &>/dev/null; then
rgw_test_command="curl --fail --silent --output /dev/null"
else
echo "It seems that neither curl or wget are available on your system."
echo "Cannot test rgw connection."
exit 0
fi
}
check_rest() {
check_for_curl_or_wget
while [ $RETRIES -ne 0 ]; do
test "$rgw_test_command http://$RGW_IP:$RGW_PORT" && exit 0
sleep $DELAY
let RETRIES=RETRIES-1
done
# If we reach this point, it means there is a problem with the connection to rgw
echo "Error connecting locally to Rados Gateway service: http://$rgw_listen"
exit 1
}
# First, restart the daemon
systemctl restart ceph-radosgw@rgw.${RGW_NAME}
COUNT=10
# Wait and ensure the socket exists after restarting the daemon
while [ $COUNT -ne 0 ]; do
{{ docker_exec_cmd }} test -S $SOCKET && check_rest
sleep 1
let COUNT=COUNT-1
done
echo "Socket file ${SOCKET} could not be found, which means Rados Gateway is not running."
exit 1
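The Jinja block at the top only decides which address the REST check should hit. On a hypothetical IPv4 host rgw0 where radosgw_address_block is empty and radosgw_address is set to 192.168.1.10, with the default civetweb port and a cluster named ceph, the rendered script reduces to:

RGW_NAME="rgw0"
RGW_PORT="8080"
SOCKET=/var/run/ceph/ceph-client.rgw.${RGW_NAME}.asok
RGW_IP=192.168.1.10
# after the restart, check_rest ends up running one of:
#   wget --quiet http://192.168.1.10:8080
#   curl --fail --silent --output /dev/null http://192.168.1.10:8080

For IPv6 the template wraps the address in square brackets so the resulting URL stays valid.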