ceph-defaults: fix handlers for mds and rgw

The way we handle the restart for both mds and rgw is not ideal: it
tries to restart the daemon on hosts that don't run it, which results
in a service file being created (see bug description).

Now we restart each daemon precisely and in a serialized fashion.

Note: the current implementation does NOT support multiple mds or rgw on
the same node.

Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1469781
Signed-off-by: Sébastien Han <seb@redhat.com>
pull/1838/head
Sébastien Han 2017-08-31 11:22:33 +02:00
parent 7ccd10a15e
commit 3dd47a45cb
7 changed files with 161 additions and 27 deletions

View File

@ -327,7 +327,6 @@ dummy:
#radosgw_dns_name: your.subdomain.tld # subdomains used by radosgw. See http://ceph.com/docs/master/radosgw/config/#enabling-subdomain-s3-calls
#radosgw_resolve_cname: false # enable for radosgw to resolve DNS CNAME based bucket names
#radosgw_civetweb_port: 8080
#radosgw_civetweb_bind_ip: "{{ ansible_default_ipv4.address }}" # when using ipv6 enclose with brackets: "[{{ ansible_default_ipv6.address }}]"
#radosgw_civetweb_num_threads: 100
# For additional civetweb configuration options available such as SSL, logging,
# keepalive, and timeout settings, please see the civetweb docs at
@ -366,11 +365,23 @@ dummy:
# Obviously between the checks (for monitors to be in quorum and for osd's pgs
# to be clean) we have to wait. These retries and delays are configurable
# for monitors, osds, mdss and rgws.
#
# Monitor handler checks
#handler_health_mon_check_retries: 5
#handler_health_mon_check_delay: 10
#
# OSD handler checks
#handler_health_osd_check_retries: 40
#handler_health_osd_check_delay: 30
#handler_health_osd_check: true
#
# MDS handler checks
#handler_health_mds_check_retries: 5
#handler_health_mds_check_delay: 10
#
# RGW handler checks
#handler_health_rgw_check_retries: 5
#handler_health_rgw_check_delay: 10
# Configure the type of NFS gateway access. At least one must be enabled for an
# NFS role to be useful

View File

@ -327,7 +327,6 @@ ceph_repository: rhcs
#radosgw_dns_name: your.subdomain.tld # subdomains used by radosgw. See http://ceph.com/docs/master/radosgw/config/#enabling-subdomain-s3-calls
#radosgw_resolve_cname: false # enable for radosgw to resolve DNS CNAME based bucket names
#radosgw_civetweb_port: 8080
#radosgw_civetweb_bind_ip: "{{ ansible_default_ipv4.address }}" # when using ipv6 enclose with brackets: "[{{ ansible_default_ipv6.address }}]"
#radosgw_civetweb_num_threads: 100
# For additional civetweb configuration options available such as SSL, logging,
# keepalive, and timeout settings, please see the civetweb docs at
@ -366,11 +365,23 @@ ceph_repository: rhcs
# Obviously between the checks (for monitors to be in quorum and for osd's pgs
# to be clean) we have to wait. These retries and delays are configurable
# for monitors, osds, mdss and rgws.
#
# Monitor handler checks
#handler_health_mon_check_retries: 5
#handler_health_mon_check_delay: 10
#
# OSD handler checks
#handler_health_osd_check_retries: 40
#handler_health_osd_check_delay: 30
#handler_health_osd_check: true
#
# MDS handler checks
#handler_health_mds_check_retries: 5
#handler_health_mds_check_delay: 10
#
# RGW handler checks
#handler_health_rgw_check_retries: 5
#handler_health_rgw_check_delay: 10
# Configure the type of NFS gateway access. At least one must be enabled for an
# NFS role to be useful

View File

@ -319,7 +319,6 @@ mds_max_mds: 3
#radosgw_dns_name: your.subdomain.tld # subdomains used by radosgw. See http://ceph.com/docs/master/radosgw/config/#enabling-subdomain-s3-calls
radosgw_resolve_cname: false # enable for radosgw to resolve DNS CNAME based bucket names
radosgw_civetweb_port: 8080
radosgw_civetweb_bind_ip: "{{ ansible_default_ipv4.address }}" # when using ipv6 enclose with brackets: "[{{ ansible_default_ipv6.address }}]"
radosgw_civetweb_num_threads: 100
# For additional civetweb configuration options available such as SSL, logging,
# keepalive, and timeout settings, please see the civetweb docs at
@ -358,11 +357,23 @@ restapi_port: 5000
# Obviously between the checks (for monitors to be in quorum and for osd's pgs
# to be clean) we have to wait. These retries and delays are configurable
# for monitors, osds, mdss and rgws.
#
# Monitor handler checks
handler_health_mon_check_retries: 5
handler_health_mon_check_delay: 10
#
# OSD handler checks
handler_health_osd_check_retries: 40
handler_health_osd_check_delay: 30
handler_health_osd_check: true
#
# MDS handler checks
handler_health_mds_check_retries: 5
handler_health_mds_check_delay: 10
#
# RGW handler checks
handler_health_rgw_check_retries: 5
handler_health_rgw_check_delay: 10
# Configure the type of NFS gateway access. At least one must be enabled for an
# NFS role to be useful

View File

@ -18,7 +18,7 @@
command: /tmp/restart_mon_daemon.sh
listen: "restart ceph mons"
when:
# We do not want to run these checks on initial deployment (`socket.rc == 0`)
# We do not want to run these checks on initial deployment (`socket.rc == 0`)
- socket.rc == 0
- mon_group_name in group_names
@ -42,8 +42,8 @@
listen: "restart ceph osds"
with_items: "{{ socket_osd_container.results | default([]) }}"
when:
# We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
# except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
# We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
# except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
- containerized_deployment
- ((crush_location is defined and crush_location) or item.get('rc') == 0)
- handler_health_osd_check
@ -55,8 +55,8 @@
command: /tmp/restart_osd_daemon.sh
listen: "restart ceph osds"
when:
# We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
# except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
# We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
# except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
- ((crush_location is defined and crush_location) or socket.rc == 0)
- ceph_current_fsid.rc == 0
- handler_health_osd_check
@ -64,28 +64,44 @@
- inventory_hostname in play_hosts
- osd_group_name in group_names
# Restart the ceph-mds unit through the service module. The task runs once and
# is delegated in turn to every entry of the mds group (an empty list when the
# group does not exist).
# NOTE(review): mds_name is presumably templated for the host that runs the
# task rather than for the delegated host - confirm before relying on it.
- name: restart ceph mdss
service:
name: ceph-mds@{{ mds_name }}
state: restarted
# serial: 1 would be the proper solution here, but that can only be set on play level
# upstream issue: https://github.com/ansible/ansible/issues/12170
run_once: true
with_items: "{{ groups.get(mds_group_name, []) }}"
delegate_to: "{{ item }}"
# Render the MDS restart script into /tmp on every MDS host that takes part in
# the current play, so the "restart ceph mdss" handlers can execute it.
- name: copy mds restart script
  template:
    src: restart_mds_daemon.sh.j2
    dest: /tmp/restart_mds_daemon.sh
    owner: root
    group: root
    # Quoted on purpose: an unquoted 0750 is typed by the YAML parser (octal
    # or decimal depending on the YAML version) before Ansible ever sees it.
    mode: "0750"
  listen: "restart ceph mdss"
  when:
    - inventory_hostname in play_hosts
    - mds_group_name in group_names
# Restart the ceph-radosgw unit through the service module. The task runs once
# and is delegated in turn to every entry of the rgw group (an empty list when
# the group does not exist).
# NOTE(review): ansible_hostname in the unit name is presumably the fact of the
# host that runs the task, not of the delegated host - confirm before relying
# on it.
- name: restart ceph rgws
service:
name: ceph-radosgw@rgw.{{ ansible_hostname }}
state: restarted
# serial: 1 would be the proper solution here, but that can only be set on play level
# upstream issue: https://github.com/ansible/ansible/issues/12170
run_once: true
with_items: "{{ groups.get(rgw_group_name, []) }}"
delegate_to: "{{ item }}"
# Execute the rendered MDS restart script on hosts of the MDS group.
- name: restart ceph mds daemon(s)
  listen: "restart ceph mdss"
  command: /tmp/restart_mds_daemon.sh
  when:
    # Initial deployments are skipped: the script only runs when the
    # registered `socket` result (set outside this view) has rc == 0.
    - socket.rc == 0
    - mds_group_name in group_names
# Render the RGW restart script into /tmp on every RGW host that takes part in
# the current play, so the "restart ceph rgws" handlers can execute it.
- name: copy rgw restart script
  template:
    src: restart_rgw_daemon.sh.j2
    dest: /tmp/restart_rgw_daemon.sh
    owner: root
    group: root
    # Quoted on purpose: an unquoted 0750 is typed by the YAML parser (octal
    # or decimal depending on the YAML version) before Ansible ever sees it.
    mode: "0750"
  listen: "restart ceph rgws"
  when:
    - inventory_hostname in play_hosts
    - rgw_group_name in group_names
# Execute the rendered RGW restart script on hosts of the RGW group.
- name: restart ceph rgw daemon(s)
  listen: "restart ceph rgws"
  command: /tmp/restart_rgw_daemon.sh
  when:
    # Initial deployments are skipped: the script only runs when the
    # registered `socket` result (set outside this view) has rc == 0.
    - socket.rc == 0
    - rgw_group_name in group_names
- name: restart ceph nfss

View File

@ -0,0 +1,20 @@
#!/bin/bash
# Restart the ceph-mds unit of this host, then poll for the daemon's admin
# socket. Exits 0 as soon as the socket exists and 1 once the retries are
# used up. Retry count and delay come from the handler_health_mds_check_*
# settings filled in when the template is rendered.

RETRIES="{{ handler_health_mds_check_retries }}"
DELAY="{{ handler_health_mds_check_delay }}"
MDS_NAME="{{ ansible_hostname }}"
SOCKET="/var/run/ceph/{{ cluster }}-mds.${MDS_NAME}.asok"

# First, restart the daemon
systemctl restart "ceph-mds@${MDS_NAME}"

# Wait and ensure the socket exists after restarting the daemon.
# -gt (rather than -ne) keeps a negative retry count from looping forever.
while [ "$RETRIES" -gt 0 ]; do
  {{ docker_exec_cmd }} test -S "$SOCKET" && exit 0
  sleep "$DELAY"
  RETRIES=$((RETRIES - 1))
done

# If we reach this point, it means the socket is not present.
echo "Socket file ${SOCKET} could not be found, which means the Metadata Server is not running."
exit 1

View File

@ -8,7 +8,7 @@ SOCKET=/var/run/ceph/{{ cluster }}-mon.${MONITOR_NAME}.asok
check_quorum() {
while [ $RETRIES -ne 0 ]; do
MEMBERS=$({{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
MEMBERS=$({{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
test "${MEMBERS/$MONITOR_NAME}" != "$MEMBERS" && exit 0
sleep $DELAY
let RETRIES=RETRIES-1

View File

@ -0,0 +1,65 @@
#!/bin/bash
# Template of the script that restarts the Rados Gateway of this host and then
# checks that it is reachable. This first part sets the retry settings, the
# daemon name, the civetweb port and the admin socket path.
RETRIES="{{ handler_health_rgw_check_retries }}"
DELAY="{{ handler_health_rgw_check_delay }}"
RGW_NAME="{{ ansible_hostname }}"
RGW_PORT="{{ radosgw_civetweb_port }}"
SOCKET=/var/run/ceph/{{ cluster }}-client.rgw.${RGW_NAME}.asok
# Pick RGW_IP: from radosgw_address_block when it is non-empty, else from the
# host's radosgw_address when that is defined and not 0.0.0.0, else from the
# facts of radosgw_interface. IPv6 addresses are wrapped in square brackets.
# NOTE(review): the whitespace-control markers on the template tags below
# decide where line breaks land in the rendered script - check the rendered
# output before moving any of them.
{% if radosgw_address_block | length > 0 %}
{% if ip_version == 'ipv4' -%}
RGW_IP={{ hostvars[inventory_hostname]['ansible_all_' + ip_version + '_addresses'] | ipaddr(radosgw_address_block) | first }}
{%- elif ip_version == 'ipv6' -%}
RGW_IP=[{{ hostvars[inventory_hostname]['ansible_all_' + ip_version + '_addresses'] | ipaddr(radosgw_address_block) | first }}]
{%- endif %}
{% elif hostvars[inventory_hostname]['radosgw_address'] is defined and hostvars[inventory_hostname]['radosgw_address'] != '0.0.0.0' -%}
{% if ip_version == 'ipv4' -%}
RGW_IP={{ hostvars[inventory_hostname]['radosgw_address'] }}
{%- elif ip_version == 'ipv6' -%}
RGW_IP=[{{ hostvars[inventory_hostname]['radosgw_address'] }}]
{% endif %}
{%- else -%}
{% set interface = ["ansible_",radosgw_interface]|join %}
{% if ip_version == 'ipv6' -%}
RGW_IP=[{{ hostvars[inventory_hostname][interface][ip_version][0]['address'] }}]
{%- elif ip_version == 'ipv4' -%}
RGW_IP={{ hostvars[inventory_hostname][interface][ip_version]['address'] }}
{% endif %}
{%- endif %}
# Select the HTTP client used to probe RGW and store the command line in
# rgw_test_command. When neither wget nor curl is found the script stops with
# exit code 0: the probe is best-effort and must not fail the restart.
check_for_curl_or_wget() {
  if {{ docker_exec_cmd }} command -v wget &>/dev/null; then
    # Discard the response body; plain `wget --quiet` would save it as a file
    # in the current directory on every successful probe.
    rgw_test_command="wget --quiet --output-document=/dev/null"
  elif {{ docker_exec_cmd }} command -v curl &>/dev/null; then
    rgw_test_command="curl --fail --silent --output /dev/null"
  else
    echo "It seems that neither curl or wget are available on your system."
    echo "Cannot test rgw connection."
    exit 0
  fi
}

# Probe the RGW HTTP endpoint until it answers or the retries are used up.
# Terminates the whole script: exit 0 on the first successful request, exit 1
# when every attempt failed.
check_rest() {
  check_for_curl_or_wget
  # -gt (rather than -ne) keeps a negative retry count from looping forever.
  while [ "$RETRIES" -gt 0 ]; do
    # Actually run the probe. The former `test "<command string>"` only
    # checked that the string was non-empty, so it always succeeded without
    # contacting RGW. The client is run through the same prefix it was
    # detected with. $rgw_test_command stays unquoted on purpose: it holds a
    # command plus its options and has to be word-split.
    {{ docker_exec_cmd }} $rgw_test_command "http://$RGW_IP:$RGW_PORT" && exit 0
    sleep "$DELAY"
    RETRIES=$((RETRIES - 1))
  done
  # If we reach this point, it means there is a problem with the connection to rgw
  # (the message used to print $rgw_listen, which is never set in this script).
  echo "Error connecting locally to Rados Gateway service: http://$RGW_IP:$RGW_PORT"
  exit 1
}
# Entry point: bounce the radosgw unit of this host, then give its admin
# socket ten one-second attempts to show up. As soon as the socket is there,
# check_rest takes over and ends the script with its own exit code.
systemctl restart ceph-radosgw@rgw.${RGW_NAME}

attempts_left=10
until [ $attempts_left -eq 0 ]; do
  if {{ docker_exec_cmd }} test -S $SOCKET; then
    check_rest
  fi
  sleep 1
  attempts_left=$((attempts_left - 1))
done

# Every attempt was used without the socket appearing.
echo "Socket file ${SOCKET} could not be found, which means Rados Gateway is not running."
exit 1