diff --git a/group_vars/all.yml.sample b/group_vars/all.yml.sample
index b8a538dca..17bcda687 100644
--- a/group_vars/all.yml.sample
+++ b/group_vars/all.yml.sample
@@ -327,7 +327,6 @@ dummy:
 #radosgw_dns_name: your.subdomain.tld # subdomains used by radosgw. See http://ceph.com/docs/master/radosgw/config/#enabling-subdomain-s3-calls
 #radosgw_resolve_cname: false # enable for radosgw to resolve DNS CNAME based bucket names
 #radosgw_civetweb_port: 8080
-#radosgw_civetweb_bind_ip: "{{ ansible_default_ipv4.address }}" # when using ipv6 enclose with brackets: "[{{ ansible_default_ipv6.address }}]"
 #radosgw_civetweb_num_threads: 100
 # For additional civetweb configuration options available such as SSL, logging,
 # keepalive, and timeout settings, please see the civetweb docs at
@@ -366,11 +365,23 @@ dummy:
 # Obviously between the checks (for monitors to be in quorum and for osd's pgs
 # to be clean) we have to wait. These retries and delays can be configurable
 # for both monitors and osds.
+#
+# Monitor handler checks
 #handler_health_mon_check_retries: 5
 #handler_health_mon_check_delay: 10
+#
+# OSD handler checks
 #handler_health_osd_check_retries: 40
 #handler_health_osd_check_delay: 30
 #handler_health_osd_check: true
+#
+# MDS handler checks
+#handler_health_mds_check_retries: 5
+#handler_health_mds_check_delay: 10
+#
+# RGW handler checks
+#handler_health_rgw_check_retries: 5
+#handler_health_rgw_check_delay: 10
 
 # Confiure the type of NFS gatway access. At least one must be enabled for an
 # NFS role to be useful
diff --git a/group_vars/rhcs.yml.sample b/group_vars/rhcs.yml.sample
index 9b8aa0a1c..24093b9c0 100644
--- a/group_vars/rhcs.yml.sample
+++ b/group_vars/rhcs.yml.sample
@@ -327,7 +327,6 @@ ceph_repository: rhcs
 #radosgw_dns_name: your.subdomain.tld # subdomains used by radosgw. See http://ceph.com/docs/master/radosgw/config/#enabling-subdomain-s3-calls
 #radosgw_resolve_cname: false # enable for radosgw to resolve DNS CNAME based bucket names
 #radosgw_civetweb_port: 8080
-#radosgw_civetweb_bind_ip: "{{ ansible_default_ipv4.address }}" # when using ipv6 enclose with brackets: "[{{ ansible_default_ipv6.address }}]"
 #radosgw_civetweb_num_threads: 100
 # For additional civetweb configuration options available such as SSL, logging,
 # keepalive, and timeout settings, please see the civetweb docs at
@@ -366,11 +365,23 @@ ceph_repository: rhcs
 # Obviously between the checks (for monitors to be in quorum and for osd's pgs
 # to be clean) we have to wait. These retries and delays can be configurable
 # for both monitors and osds.
+#
+# Monitor handler checks
 #handler_health_mon_check_retries: 5
 #handler_health_mon_check_delay: 10
+#
+# OSD handler checks
 #handler_health_osd_check_retries: 40
 #handler_health_osd_check_delay: 30
 #handler_health_osd_check: true
+#
+# MDS handler checks
+#handler_health_mds_check_retries: 5
+#handler_health_mds_check_delay: 10
+#
+# RGW handler checks
+#handler_health_rgw_check_retries: 5
+#handler_health_rgw_check_delay: 10
 
 # Confiure the type of NFS gatway access. At least one must be enabled for an
 # NFS role to be useful
diff --git a/roles/ceph-defaults/defaults/main.yml b/roles/ceph-defaults/defaults/main.yml
index 473d9cdfb..e34263380 100644
--- a/roles/ceph-defaults/defaults/main.yml
+++ b/roles/ceph-defaults/defaults/main.yml
@@ -319,7 +319,6 @@ mds_max_mds: 3
 #radosgw_dns_name: your.subdomain.tld # subdomains used by radosgw. See http://ceph.com/docs/master/radosgw/config/#enabling-subdomain-s3-calls
 radosgw_resolve_cname: false # enable for radosgw to resolve DNS CNAME based bucket names
 radosgw_civetweb_port: 8080
-radosgw_civetweb_bind_ip: "{{ ansible_default_ipv4.address }}" # when using ipv6 enclose with brackets: "[{{ ansible_default_ipv6.address }}]"
 radosgw_civetweb_num_threads: 100
 # For additional civetweb configuration options available such as SSL, logging,
 # keepalive, and timeout settings, please see the civetweb docs at
@@ -358,11 +357,23 @@ restapi_port: 5000
 # Obviously between the checks (for monitors to be in quorum and for osd's pgs
 # to be clean) we have to wait. These retries and delays can be configurable
 # for both monitors and osds.
+#
+# Monitor handler checks
 handler_health_mon_check_retries: 5
 handler_health_mon_check_delay: 10
+#
+# OSD handler checks
 handler_health_osd_check_retries: 40
 handler_health_osd_check_delay: 30
 handler_health_osd_check: true
+#
+# MDS handler checks
+handler_health_mds_check_retries: 5
+handler_health_mds_check_delay: 10
+#
+# RGW handler checks
+handler_health_rgw_check_retries: 5
+handler_health_rgw_check_delay: 10
 
 # Confiure the type of NFS gatway access. At least one must be enabled for an
 # NFS role to be useful
diff --git a/roles/ceph-defaults/handlers/main.yml b/roles/ceph-defaults/handlers/main.yml
index 3a053d435..8f21db762 100644
--- a/roles/ceph-defaults/handlers/main.yml
+++ b/roles/ceph-defaults/handlers/main.yml
@@ -18,7 +18,7 @@
   command: /tmp/restart_mon_daemon.sh
   listen: "restart ceph mons"
   when:
-# We do not want to run these checks on initial deployment (`socket.rc == 0`)
+    # We do not want to run these checks on initial deployment (`socket.rc == 0`)
     - socket.rc == 0
     - mon_group_name in group_names
 
@@ -42,8 +42,8 @@
   listen: "restart ceph osds"
   with_items: "{{ socket_osd_container.results | default([]) }}"
   when:
-  # We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
-  # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
+    # We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
+    # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
     - containerized_deployment
     - ((crush_location is defined and crush_location) or item.get('rc') == 0)
     - handler_health_osd_check
@@ -55,8 +55,8 @@
   command: /tmp/restart_osd_daemon.sh
   listen: "restart ceph osds"
   when:
-  # We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
-  # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
+    # We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
+    # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
     - ((crush_location is defined and crush_location) or socket.rc == 0)
     - ceph_current_fsid.rc == 0
     - handler_health_osd_check
@@ -64,28 +64,44 @@
     - inventory_hostname in play_hosts
     - osd_group_name in group_names
 
-- name: restart ceph mdss
-  service:
-    name: ceph-mds@{{ mds_name }}
-    state: restarted
-  # serial: 1 would be the proper solution here, but that can only be set on play level
-  # upstream issue: https://github.com/ansible/ansible/issues/12170
-  run_once: true
-  with_items: "{{ groups.get(mds_group_name, []) }}"
-  delegate_to: "{{ item }}"
+- name: copy mds restart script
+  template:
+    src: restart_mds_daemon.sh.j2
+    dest: /tmp/restart_mds_daemon.sh
+    owner: root
+    group: root
+    mode: 0750
+  listen: "restart ceph mdss"
   when:
+    - inventory_hostname in play_hosts
     - mds_group_name in group_names
 
-- name: restart ceph rgws
-  service:
-    name: ceph-radosgw@rgw.{{ ansible_hostname }}
-    state: restarted
-  # serial: 1 would be the proper solution here, but that can only be set on play level
-  # upstream issue: https://github.com/ansible/ansible/issues/12170
-  run_once: true
-  with_items: "{{ groups.get(rgw_group_name, []) }}"
-  delegate_to: "{{ item }}"
+- name: restart ceph mds daemon(s)
+  command: /tmp/restart_mds_daemon.sh
+  listen: "restart ceph mdss"
   when:
+    # We do not want to run these checks on initial deployment (`socket.rc == 0`)
+    - socket.rc == 0
+    - mds_group_name in group_names
+
+- name: copy rgw restart script
+  template:
+    src: restart_rgw_daemon.sh.j2
+    dest: /tmp/restart_rgw_daemon.sh
+    owner: root
+    group: root
+    mode: 0750
+  listen: "restart ceph rgws"
+  when:
+    - inventory_hostname in play_hosts
+    - rgw_group_name in group_names
+
+- name: restart ceph rgw daemon(s)
+  command: /tmp/restart_rgw_daemon.sh
+  listen: "restart ceph rgws"
+  when:
+    # We do not want to run these checks on initial deployment (`socket.rc == 0`)
+    - socket.rc == 0
     - rgw_group_name in group_names
 
 - name: restart ceph nfss
diff --git a/roles/ceph-defaults/templates/restart_mds_daemon.sh.j2 b/roles/ceph-defaults/templates/restart_mds_daemon.sh.j2
new file mode 100644
index 000000000..557ac7f56
--- /dev/null
+++ b/roles/ceph-defaults/templates/restart_mds_daemon.sh.j2
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+RETRIES="{{ handler_health_mds_check_retries }}"
+DELAY="{{ handler_health_mds_check_delay }}"
+MDS_NAME="{{ ansible_hostname }}"
+SOCKET=/var/run/ceph/{{ cluster }}-mds.${MDS_NAME}.asok
+
+# First, restart the daemon
+systemctl restart ceph-mds@${MDS_NAME}
+
+COUNT=10
+# Wait and ensure the socket exists after restarting the daemon
+while [ $RETRIES -ne 0 ]; do
+  {{ docker_exec_cmd }} test -S $SOCKET && exit 0
+  sleep $DELAY
+  let RETRIES=RETRIES-1
+done
+# If we reach this point, it means the socket is not present.
+echo "Socket file ${SOCKET} could not be found, which means the Metadata Server is not running."
+exit 1
diff --git a/roles/ceph-defaults/templates/restart_mon_daemon.sh.j2 b/roles/ceph-defaults/templates/restart_mon_daemon.sh.j2
index 745f6915f..9c86ffccb 100644
--- a/roles/ceph-defaults/templates/restart_mon_daemon.sh.j2
+++ b/roles/ceph-defaults/templates/restart_mon_daemon.sh.j2
@@ -8,7 +8,7 @@ SOCKET=/var/run/ceph/{{ cluster }}-mon.${MONITOR_NAME}.asok
 
 check_quorum() {
 while [ $RETRIES -ne 0 ]; do
-  MEMBERS=$({{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
+  MEMBERS=$({{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
   test "${MEMBERS/$MONITOR_NAME}" != "$MEMBERS" && exit 0
   sleep $DELAY
   let RETRIES=RETRIES-1
diff --git a/roles/ceph-defaults/templates/restart_rgw_daemon.sh.j2 b/roles/ceph-defaults/templates/restart_rgw_daemon.sh.j2
new file mode 100644
index 000000000..612559f7a
--- /dev/null
+++ b/roles/ceph-defaults/templates/restart_rgw_daemon.sh.j2
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+RETRIES="{{ handler_health_rgw_check_retries }}"
+DELAY="{{ handler_health_rgw_check_delay }}"
+RGW_NAME="{{ ansible_hostname }}"
+RGW_PORT="{{ radosgw_civetweb_port }}"
+SOCKET=/var/run/ceph/{{ cluster }}-client.rgw.${RGW_NAME}.asok
+
+{% if radosgw_address_block | length > 0 %}
+  {% if ip_version == 'ipv4' -%}
+RGW_IP={{ hostvars[inventory_hostname]['ansible_all_' + ip_version + '_addresses'] | ipaddr(radosgw_address_block) | first }}
+  {%- elif ip_version == 'ipv6' -%}
+RGW_IP=[{{ hostvars[inventory_hostname]['ansible_all_' + ip_version + '_addresses'] | ipaddr(radosgw_address_block) | first }}]
+  {%- endif %}
+{% elif hostvars[inventory_hostname]['radosgw_address'] is defined and hostvars[inventory_hostname]['radosgw_address'] != '0.0.0.0' -%}
+  {% if ip_version == 'ipv4' -%}
+RGW_IP={{ hostvars[inventory_hostname]['radosgw_address'] }}
+  {%- elif ip_version == 'ipv6' -%}
+RGW_IP=[{{ hostvars[inventory_hostname]['radosgw_address'] }}]
+  {% endif %}
+{%- else -%}
+  {% set interface = ["ansible_",radosgw_interface]|join %}
+  {% if ip_version == 'ipv6' -%}
+RGW_IP=[{{ hostvars[inventory_hostname][interface][ip_version][0]['address'] }}]
+  {%- elif ip_version == 'ipv4' -%}
+RGW_IP={{ hostvars[inventory_hostname][interface][ip_version]['address'] }}
+  {% endif %}
+{%- endif %}
+
+check_for_curl_or_wget() {
+  if {{ docker_exec_cmd }} command -v wget &>/dev/null; then
+    rgw_test_command="wget --quiet"
+  elif {{ docker_exec_cmd }} command -v curl &>/dev/null; then
+    rgw_test_command="curl --fail --silent --output /dev/null"
+  else
+    echo "It seems that neither curl nor wget is available on your system."
+    echo "Cannot test rgw connection."
+    exit 0
+  fi
+}
+
+check_rest() {
+  check_for_curl_or_wget
+  while [ $RETRIES -ne 0 ]; do
+    $rgw_test_command http://$RGW_IP:$RGW_PORT && exit 0
+    sleep $DELAY
+    let RETRIES=RETRIES-1
+  done
+  # If we reach this point, it means there is a problem with the connection to rgw
+  echo "Error connecting locally to Rados Gateway service: http://$RGW_IP:$RGW_PORT"
+  exit 1
+}
+
+# First, restart the daemon
+systemctl restart ceph-radosgw@rgw.${RGW_NAME}
+
+COUNT=10
+# Wait and ensure the socket exists after restarting the daemon
+while [ $COUNT -ne 0 ]; do
+  {{ docker_exec_cmd }} test -S $SOCKET && check_rest
+  sleep 1
+  let COUNT=COUNT-1
+done
+echo "Socket file ${SOCKET} could not be found, which means Rados Gateway is not running."
+exit 1
diff --git a/roles/ceph-rgw/tasks/start_radosgw.yml b/roles/ceph-rgw/tasks/start_radosgw.yml
index 96e806fe9..cdb40666e 100644
--- a/roles/ceph-rgw/tasks/start_radosgw.yml
+++ b/roles/ceph-rgw/tasks/start_radosgw.yml
@@ -1,18 +1,10 @@
 ---
-- name: check if rgw is started
-  command: /etc/init.d/radosgw status
-  register: rgwstatus
-  changed_when: false
-  failed_when: false
-  always_run: true
-
 - name: ensure systemd service override directory exists
   file:
     state: directory
     path: "/etc/systemd/system/ceph-rgw@.service.d/"
   when:
     - ceph_rgw_systemd_overrides is defined
-    - ansible_service_mgr == 'systemd'
 
 - name: add ceph-rgw systemd service overrides
   config_template:
@@ -22,7 +14,6 @@
   config_type: "ini"
   when:
     - ceph_rgw_systemd_overrides is defined
-    - ansible_service_mgr == 'systemd'
 
 - name: start rgw
   service:
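
Note: the new MDS/RGW handlers above fire through Ansible's standard notify/listen
mechanism, so any task whose change should trigger a health-checked restart simply
notifies the matching listen topic. A minimal sketch for illustration only (the task
name, template file, and destination path here are hypothetical and not part of this
patch; only the "restart ceph mdss" topic comes from the handlers above):

    - name: push ceph.conf            # hypothetical example task, not in this patch
      template:
        src: ceph.conf.j2             # assumed template name
        dest: /etc/ceph/{{ cluster }}.conf
      notify: restart ceph mdss       # matches the listen topic added by this patch

When the task reports "changed", Ansible runs both "copy mds restart script" and
"restart ceph mds daemon(s)" at the end of the play, and the restart script then
waits up to handler_health_mds_check_retries * handler_health_mds_check_delay
seconds for the daemon socket to reappear.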