From e113d94c9a72fac486ae3be88a8644390e58c319 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Thu, 21 Sep 2017 23:47:36 +0200
Subject: [PATCH 1/9] tests: implement reboot after a deployment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We don't currently test server reboots, and a lot of things can break
after one. So now we deploy, reboot the machines, then run testinfra
again.

Signed-off-by: Sébastien Han
---
 tox.ini | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tox.ini b/tox.ini
index 2143b2a8c..8f8416776 100644
--- a/tox.ini
+++ b/tox.ini
@@ -117,6 +117,7 @@ whitelist_externals =
     bash
     pip
     cp
+    sleep
 passenv=*
 sitepackages=True
 setenv=
@@ -214,6 +215,17 @@ commands=
   ansible-playbook -vv -i {changedir}/hosts {toxinidir}/tests/functional/setup.yml
 
+  # wait 5 minutes for services to be ready
+  sleep 300
+  # test cluster state using ceph-ansible tests
+  testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {toxinidir}/tests/functional/tests
+
+  # reboot all vms
+  vagrant reload --no-provision
+
+  # wait 5 minutes for services to be ready
+  sleep 300
+
+  # retest to ensure cluster came back up correctly after rebooting
   testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {toxinidir}/tests/functional/tests
 
   purge_cluster: {[purge]commands}

From fc29ccd0adb8384217fd42d18f54b2b3838ebe66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Tue, 26 Sep 2017 14:12:11 +0200
Subject: [PATCH 2/9] rbd-mirror: force service enable for ceph-rbd-mirror.target
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a bug in the rbd-mirror unit file; the upstream fix is
https://github.com/ceph/ceph/pull/17969. This workaround should be
reverted once that patch is merged and backported.

Signed-off-by: Sébastien Han
---
 roles/ceph-rbd-mirror/tasks/start_rbd_mirror.yml | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/roles/ceph-rbd-mirror/tasks/start_rbd_mirror.yml b/roles/ceph-rbd-mirror/tasks/start_rbd_mirror.yml
index 62a804372..3b5da3e34 100644
--- a/roles/ceph-rbd-mirror/tasks/start_rbd_mirror.yml
+++ b/roles/ceph-rbd-mirror/tasks/start_rbd_mirror.yml
@@ -35,6 +35,17 @@
   when:
     - ceph_release_num.{{ ceph_release }} >= ceph_release_num.luminous
 
+# This task is a workaround for rbd-mirror not starting after reboot
+# The upstream fix is: https://github.com/ceph/ceph/pull/17969
+# It affects ceph version 12.2.0 (32ce2a3ae5239ee33d6150705cdb24d43bab910c) luminous (rc) and earlier
+- name: enable ceph-rbd-mirror.target
+  service:
+    name: "ceph-rbd-mirror.target"
+    enabled: yes
+  changed_when: false
+  when:
+    - ceph_release_num.{{ ceph_release }} >= ceph_release_num.luminous
+
 - name: start and add the rbd-mirror service instance
   service:
     name: "ceph-rbd-mirror@rbd-mirror.{{ ansible_hostname }}"

From b4bec524429cd0cf1d0b95e8255557aa41cbd28b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Tue, 26 Sep 2017 14:21:37 +0200
Subject: [PATCH 3/9] tests: add tests for rgw-nfs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

rgw-nfs is part of the servicemap, so we should use it to make sure the
process is up and running.
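For reference, the check boils down to a servicemap lookup in the output
of "ceph -s -f json". A minimal sketch of that lookup (illustrative only:
the key layout mirrors what the tests below parse, and the sample daemon
name "ceph-nfs0" is made up):

    import json

    # `output` stands in for: ceph --connect-timeout 5 -f json -s
    output = '{"servicemap": {"services": {"rgw-nfs": {"daemons": {"ceph-nfs0": {}}}}}}'
    daemons = json.loads(output)["servicemap"]["services"]["rgw-nfs"]["daemons"]
    assert "ceph-nfs0" in daemons  # the daemon registered itself, so it is up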
Signed-off-by: Sébastien Han
---
 .../functional/tests/nfs/test_nfs_ganesha.py | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/tests/functional/tests/nfs/test_nfs_ganesha.py b/tests/functional/tests/nfs/test_nfs_ganesha.py
index e0ad01080..8136ff459 100644
--- a/tests/functional/tests/nfs/test_nfs_ganesha.py
+++ b/tests/functional/tests/nfs/test_nfs_ganesha.py
@@ -1,3 +1,4 @@
+import json
 import pytest
 
 class TestNFSs(object):
@@ -22,6 +23,30 @@ class TestNFSs(object):
     def test_nfs_config_override(self, node, host):
         assert host.file("/etc/ganesha/ganesha.conf").contains("Entries_HWMark")
 
+    @pytest.mark.no_docker
+    def test_nfs_is_up(self, node, host):
+        hostname = node["vars"]["inventory_hostname"]
+        cluster = node['cluster_name']
+        cmd = "sudo ceph --name client.rgw.{hostname} --keyring /var/lib/ceph/radosgw/{cluster}-rgw.{hostname}/keyring --cluster={cluster} --connect-timeout 5 -f json -s".format(
+            hostname=hostname,
+            cluster=cluster
+        )
+        output = host.check_output(cmd)
+        daemons = json.loads(output)["servicemap"]["services"]["rgw-nfs"]["daemons"]
+        assert hostname in daemons
+
+    @pytest.mark.docker
+    def test_docker_nfs_is_up(self, node, host):
+        hostname = node["vars"]["inventory_hostname"]
+        cluster = node['cluster_name']
+        cmd = "sudo docker exec ceph-nfs-{hostname} ceph --name client.rgw.{hostname} --keyring /var/lib/ceph/radosgw/{cluster}-rgw.{hostname}/keyring --cluster={cluster} --connect-timeout 5 -f json -s".format(
+            hostname=hostname,
+            cluster=cluster
+        )
+        output = host.check_output(cmd)
+        daemons = json.loads(output)["servicemap"]["services"]["rgw-nfs"]["daemons"]
+        assert hostname in daemons
+
     #NOTE (guits): This check must be fixed. (Permission denied error)
     # @pytest.mark.no_docker
     # def test_nfs_rgw_fsal_export(self, node, host):

From 341c9e077b98b23a38db69d2704ddffb9c752ee6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Tue, 26 Sep 2017 23:16:43 +0200
Subject: [PATCH 4/9] nfs: fix container setup and re-arrange files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Sébastien Han
---
 group_vars/all.yml.sample                     |  4 +++
 group_vars/rhcs.yml.sample                    |  4 +++
 roles/ceph-defaults/defaults/main.yml         |  4 +++
 .../templates/restart_nfs_daemon.sh.j2        | 23 ++++++++++++++
 roles/ceph-nfs/tasks/create_rgw_nfs_user.yml  |  1 +
 roles/ceph-nfs/tasks/docker/main.yml          |  3 --
 .../tasks/docker/start_docker_nfs.yml         | 18 -----------
 roles/ceph-nfs/tasks/ganesha_selinux_fix.yml  | 28 +++++++++++++++++
 roles/ceph-nfs/tasks/main.yml                 | 22 +++++++++-----
 ...onfigs.yml => pre_requisite_container.yml} |  0
 ...te.yml => pre_requisite_non_container.yml} | 14 +++++----
 roles/ceph-nfs/tasks/start_nfs.yml            | 30 +++++++++++++++++++
 roles/ceph-nfs/templates/ceph-nfs.service.j2  | 30 ++++++++++---------
 13 files changed, 132 insertions(+), 49 deletions(-)
 create mode 100644 roles/ceph-defaults/templates/restart_nfs_daemon.sh.j2
 delete mode 100644 roles/ceph-nfs/tasks/docker/main.yml
 delete mode 100644 roles/ceph-nfs/tasks/docker/start_docker_nfs.yml
 create mode 100644 roles/ceph-nfs/tasks/ganesha_selinux_fix.yml
 rename roles/ceph-nfs/tasks/{docker/copy_configs.yml => pre_requisite_container.yml} (100%)
 rename roles/ceph-nfs/tasks/{pre_requisite.yml => pre_requisite_non_container.yml} (86%)

diff --git a/group_vars/all.yml.sample b/group_vars/all.yml.sample
index 9c10d5adf..e2d54f78b 100644
--- a/group_vars/all.yml.sample
+++ b/group_vars/all.yml.sample
@@ -394,6 +394,10 @@ dummy:
 #handler_health_rgw_check_retries: 5
 #handler_health_rgw_check_delay: 10
 
+# NFS handler checks
+#handler_health_nfs_check_retries: 5
+#handler_health_nfs_check_delay: 10
+
 ###############
 # NFS-GANESHA #
 ###############
diff --git a/group_vars/rhcs.yml.sample b/group_vars/rhcs.yml.sample
index 5e9f926ea..112efa41e 100644
--- a/group_vars/rhcs.yml.sample
+++ b/group_vars/rhcs.yml.sample
@@ -394,6 +394,10 @@ ceph_repository: rhcs
 #handler_health_rgw_check_retries: 5
 #handler_health_rgw_check_delay: 10
 
+# NFS handler checks
+#handler_health_nfs_check_retries: 5
+#handler_health_nfs_check_delay: 10
+
 ###############
 # NFS-GANESHA #
 ###############
diff --git a/roles/ceph-defaults/defaults/main.yml b/roles/ceph-defaults/defaults/main.yml
index f8786be1c..cfdbbbdca 100644
--- a/roles/ceph-defaults/defaults/main.yml
+++ b/roles/ceph-defaults/defaults/main.yml
@@ -386,6 +386,10 @@ handler_health_mds_check_delay: 10
 handler_health_rgw_check_retries: 5
 handler_health_rgw_check_delay: 10
 
+# NFS handler checks
+handler_health_nfs_check_retries: 5
+handler_health_nfs_check_delay: 10
+
 ###############
 # NFS-GANESHA #
 ###############
diff --git a/roles/ceph-defaults/templates/restart_nfs_daemon.sh.j2 b/roles/ceph-defaults/templates/restart_nfs_daemon.sh.j2
new file mode 100644
index 000000000..cbc78e989
--- /dev/null
+++ b/roles/ceph-defaults/templates/restart_nfs_daemon.sh.j2
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+RETRIES="{{ handler_health_nfs_check_retries }}"
+DELAY="{{ handler_health_nfs_check_delay }}"
+NFS_NAME="{{ ansible_hostname }}"
+PID=/var/run/ganesha.pid
+
+# First, restart the daemon
+{% if containerized_deployment -%}
+systemctl restart ceph-nfs@${NFS_NAME}
+COUNT=10
+# Wait and ensure the pid exists after restarting the daemon
+while [ $RETRIES -ne 0 ]; do
+  {{ docker_exec_cmd }} test -f $PID && exit 0
+  sleep $DELAY
+  let RETRIES=RETRIES-1
+done
+# If we reach this point, it means the pid is not present.
+echo "PID file ${PID} could not be found, which means Ganesha is not running."
+exit 1
+{% else %}
+systemctl restart nfs-ganesha
+{% endif %}
diff --git a/roles/ceph-nfs/tasks/create_rgw_nfs_user.yml b/roles/ceph-nfs/tasks/create_rgw_nfs_user.yml
index 9eb82832e..1f51fbf1a 100644
--- a/roles/ceph-nfs/tasks/create_rgw_nfs_user.yml
+++ b/roles/ceph-nfs/tasks/create_rgw_nfs_user.yml
@@ -8,6 +8,7 @@
 - name: create rgw nfs user
   command: "{{ docker_exec_cmd_nfs | default('') }} radosgw-admin --cluster {{ cluster }} user create --uid={{ ceph_nfs_rgw_user }} --display-name='RGW NFS User'"
   register: rgwuser
+  changed_when: false
   delegate_to: "{{ groups[mon_group_name][0] }}"
   when:
     - nfs_obj_gw
diff --git a/roles/ceph-nfs/tasks/docker/main.yml b/roles/ceph-nfs/tasks/docker/main.yml
deleted file mode 100644
index f05ce4ae9..000000000
--- a/roles/ceph-nfs/tasks/docker/main.yml
+++ /dev/null
@@ -1,3 +0,0 @@
----
-- name: include start_docker_nfs.yml
-  include: start_docker_nfs.yml
diff --git a/roles/ceph-nfs/tasks/docker/start_docker_nfs.yml b/roles/ceph-nfs/tasks/docker/start_docker_nfs.yml
deleted file mode 100644
index 45bc18eb2..000000000
--- a/roles/ceph-nfs/tasks/docker/start_docker_nfs.yml
+++ /dev/null
@@ -1,18 +0,0 @@
----
-- name: generate systemd unit file
-  become: true
-  template:
-    src: "{{ role_path }}/templates/ceph-nfs.service.j2"
-    dest: /etc/systemd/system/ceph-nfs@.service
-    owner: "root"
-    group: "root"
-    mode: "0644"
-
-- name: systemd start nfs container
-  systemd:
-    name: "ceph-nfs@{{ ansible_hostname }}.service"
-    state: started
-    enabled: yes
-    daemon_reload: yes
-  when:
-    - ceph_nfs_enable_service
diff --git a/roles/ceph-nfs/tasks/ganesha_selinux_fix.yml b/roles/ceph-nfs/tasks/ganesha_selinux_fix.yml
new file mode 100644
index 000000000..0aa3c66ed
--- /dev/null
+++ b/roles/ceph-nfs/tasks/ganesha_selinux_fix.yml
@@ -0,0 +1,28 @@
+---
+- name: check if selinux is enabled
+  command: getenforce
+  register: selinuxstatus
+  changed_when: false
+  failed_when: false
+  always_run: true
+
+- name: install policycoreutils-python to get semanage
+  package:
+    name: policycoreutils-python
+    state: present
+  when:
+    - selinuxstatus.stdout != 'Disabled'
+
+- name: test if ganesha_t is already permissive
+  shell: |
+    semanage permissive -l | grep -soq ganesha_t
+  changed_when: false
+  failed_when: false
+  register: ganesha_t_permissive
+
+- name: run semanage permissive -a ganesha_t
+  command: semanage permissive -a ganesha_t
+  changed_when: false
+  when:
+    - selinuxstatus.stdout != 'Disabled'
+    - ganesha_t_permissive.rc != 0
diff --git a/roles/ceph-nfs/tasks/main.yml b/roles/ceph-nfs/tasks/main.yml
index fe70f3c40..435628ed3 100644
--- a/roles/ceph-nfs/tasks/main.yml
+++ b/roles/ceph-nfs/tasks/main.yml
@@ -1,18 +1,24 @@
 ---
-- name: include pre_requisite.yml
-  include: pre_requisite.yml
+- name: include pre_requisite_non_container.yml
+  include: pre_requisite_non_container.yml
   when:
     - not containerized_deployment
 
+- name: include pre_requisite_container.yml
+  include: pre_requisite_container.yml
+  when:
+    - containerized_deployment
+
 - name: include create_rgw_nfs_user.yml
   include: create_rgw_nfs_user.yml
 
-- name: include start_nfs.yml
-  include: start_nfs.yml
+# NOTE (leseb): workaround for issues with ganesha and librgw
+- name: include ganesha_selinux_fix.yml
+  include: ganesha_selinux_fix.yml
   when:
     - not containerized_deployment
+    - ansible_os_family == 'RedHat'
+    - ansible_distribution_version >= '7.4'
 
-- name: include docker/main.yml
-  include: docker/main.yml
-  when:
-    - containerized_deployment
+- name: include start_nfs.yml
+  include: start_nfs.yml
diff --git a/roles/ceph-nfs/tasks/docker/copy_configs.yml b/roles/ceph-nfs/tasks/pre_requisite_container.yml
similarity index 100%
rename from roles/ceph-nfs/tasks/docker/copy_configs.yml
rename to roles/ceph-nfs/tasks/pre_requisite_container.yml
diff --git a/roles/ceph-nfs/tasks/pre_requisite.yml b/roles/ceph-nfs/tasks/pre_requisite_non_container.yml
similarity index 86%
rename from roles/ceph-nfs/tasks/pre_requisite.yml
rename to roles/ceph-nfs/tasks/pre_requisite_non_container.yml
index afe1c7094..0ab6bd22b 100644
--- a/roles/ceph-nfs/tasks/pre_requisite.yml
+++ b/roles/ceph-nfs/tasks/pre_requisite_non_container.yml
@@ -1,4 +1,6 @@
 ---
+# NOTE (leseb): we use root:ceph for permissions since ganesha
+# does not have the right selinux context to read ceph directories.
 - name: create rados gateway and ganesha directories
   file:
     path: "{{ item }}"
@@ -11,8 +13,8 @@
     - /var/lib/ceph/radosgw
     - /var/lib/ceph/radosgw/{{ cluster }}-rgw.{{ ansible_hostname }}
     - "{{ rbd_client_admin_socket_path }}"
-    - /var/lib/nfs/ganesha
-    - /var/run/ganesha
+    - /var/log/ceph
+    - /var/run/ceph/
   when:
     - nfs_obj_gw
@@ -51,7 +53,7 @@
 
 - name: change ownership on /var/log/ganesha
   file:
-    path: '/var/log/ganesha'
-    owner: 'root'
-    group: 'root'
-    mode: '0755'
+    path: /var/log/ganesha
+    owner: "root"
+    group: "root"
+    mode: "0755"
diff --git a/roles/ceph-nfs/tasks/start_nfs.yml b/roles/ceph-nfs/tasks/start_nfs.yml
index 02513149d..da5e4c3d7 100644
--- a/roles/ceph-nfs/tasks/start_nfs.yml
+++ b/roles/ceph-nfs/tasks/start_nfs.yml
@@ -1,4 +1,12 @@
 ---
+- name: create /etc/ganesha
+  file:
+    path: /etc/ganesha
+    state: directory
+    owner: root
+    group: root
+    mode: "0755"
+
 - name: generate ganesha configuration file
   action: config_template
   args:
@@ -11,6 +19,27 @@
   notify:
     - restart ceph nfss
 
+- name: generate systemd unit file
+  become: true
+  template:
+    src: "{{ role_path }}/templates/ceph-nfs.service.j2"
+    dest: /etc/systemd/system/ceph-nfs@.service
+    owner: "root"
+    group: "root"
+    mode: "0644"
+  when:
+    - containerized_deployment
+
+- name: systemd start nfs container
+  systemd:
+    name: "ceph-nfs@{{ ansible_hostname }}.service"
+    state: started
+    enabled: yes
+    daemon_reload: yes
+  when:
+    - ceph_nfs_enable_service
+    - containerized_deployment
+
 - name: start nfs gateway service
   service:
     name: nfs-ganesha
@@ -18,3 +47,4 @@
     enabled: yes
   when:
     - ceph_nfs_enable_service
+    - not containerized_deployment
diff --git a/roles/ceph-nfs/templates/ceph-nfs.service.j2 b/roles/ceph-nfs/templates/ceph-nfs.service.j2
index 1b0834ab1..fadfc3499 100644
--- a/roles/ceph-nfs/templates/ceph-nfs.service.j2
+++ b/roles/ceph-nfs/templates/ceph-nfs.service.j2
@@ -8,20 +8,22 @@ EnvironmentFile=-/etc/environment
 ExecStartPre=-/usr/bin/docker rm ceph-nfs-%i
 ExecStartPre=/usr/bin/mkdir -p /etc/ceph /etc/ganesha /var/lib/nfs/ganesha
 ExecStart=/usr/bin/docker run --rm --net=host \
-  {% if not containerized_deployment_with_kv -%}
-  -v /etc/ceph:/etc/ceph \
-  -v /etc/ganesha:/etc/ganesha \
-  {% else -%}
-  -e KV_TYPE={{kv_type}} \
-  -e KV_IP={{kv_endpoint}}\
-  -e KV_PORT={{kv_port}} \
-  {% endif -%}
-  -v /etc/localtime:/etc/localtime:ro \
-  --privileged \
-  -e CEPH_DAEMON=NFS \
-  {{ ceph_nfs_docker_extra_env }} \
-  --name=ceph-nfs-{{ ansible_hostname }} \
-  {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }}
+  {% if not containerized_deployment_with_kv -%}
+  -v /var/lib/ceph:/var/lib/ceph \
+  -v /etc/ceph:/etc/ceph \
+  -v /var/lib/ganesha:/var/lib/ganesha \
+  -v /etc/ganesha:/etc/ganesha \
+  {% else -%}
+  -e KV_TYPE={{kv_type}} \
+  -e KV_IP={{kv_endpoint}}\
+  -e KV_PORT={{kv_port}} \
+  {% endif -%}
+  -v /etc/localtime:/etc/localtime:ro \
+  -e CLUSTER={{ cluster }} \
+  -e CEPH_DAEMON=NFS \
+  {{ ceph_nfs_docker_extra_env }} \
+  --name=ceph-nfs-{{ ansible_hostname }} \
+  {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }}
 ExecStopPost=-/usr/bin/docker stop ceph-nfs-%i
 Restart=always
 RestartSec=10s

From 048b55be4ab357cce23d9cb42449c9682e82efc2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Wed, 27 Sep 2017 00:55:31 +0200
Subject: [PATCH 5/9] defaults: only run socket checks on their specific roles
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Running the socket check on all the hosts will override the default
value of docker_exec_cmd, leaving it with the last value set (currently
the rbd-mirror one). As a result, every subsequent use of
docker_exec_cmd pointed at the wrong container. The socket checks, and
the docker_exec_cmd fact they set, are now gated on the host actually
being in the matching daemon group.
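To make the failure mode concrete, here is a minimal sketch (illustrative
only, not taken verbatim from the patch): set_fact has host scope, but
since every host used to run every check, the last unguarded set_fact won.

    # Before: runs on every host, so even a mon ends up with this value
    - name: set_fact docker_exec_cmd
      set_fact:
        docker_exec_cmd: "docker exec ceph-rbd-mirror-{{ ansible_hostname }}"
      when:
        - containerized_deployment

    # After: only set on hosts that are actually in the matching group
    - name: set_fact docker_exec_cmd
      set_fact:
        docker_exec_cmd: "docker exec ceph-rbd-mirror-{{ ansible_hostname }}"
      when:
        - inventory_hostname in groups.get(rbdmirror_group_name, [])
        - containerized_deployment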
Signed-off-by: Sébastien Han
---
 roles/ceph-defaults/handlers/main.yml      | 64 +++++++++++++--------
 roles/ceph-defaults/tasks/check_socket.yml | 67 ++++++++++++++++++++++
 2 files changed, 106 insertions(+), 25 deletions(-)

diff --git a/roles/ceph-defaults/handlers/main.yml b/roles/ceph-defaults/handlers/main.yml
index f911344a3..885a42abe 100644
--- a/roles/ceph-defaults/handlers/main.yml
+++ b/roles/ceph-defaults/handlers/main.yml
@@ -20,8 +20,8 @@
   listen: "restart ceph mons"
   when:
     # We do not want to run these checks on initial deployment (`socket.rc == 0`)
-    - mon_socket_stat.rc == 0
     - mon_group_name in group_names
+    - mon_socket_stat.rc == 0
 
 # This does not just restart OSDs but everything else too. Unfortunately
 # at this time the ansible role does not have an OSD id list to use
@@ -35,8 +35,8 @@
     mode: 0750
   listen: "restart ceph osds"
   when:
-    - inventory_hostname in play_hosts
     - osd_group_name in group_names
+    - inventory_hostname in play_hosts
 
 - name: restart containerized ceph osds daemon(s)
   command: /tmp/restart_osd_daemon.sh
@@ -45,17 +45,18 @@
   when:
     # We do not want to run these checks on initial deployment (`socket_osd_container_stat.results[n].rc == 0`)
    # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
+    - osd_group_name in group_names
     - containerized_deployment
     - ((crush_location is defined and crush_location) or item.get('rc') == 0)
     - handler_health_osd_check
     # See https://github.com/ceph/ceph-ansible/issues/1457 for the condition below
     - inventory_hostname in play_hosts
-    - osd_group_name in group_names
 
 - name: restart non-containerized ceph osds daemon(s)
   command: /tmp/restart_osd_daemon.sh
   listen: "restart ceph osds"
   when:
+    - osd_group_name in group_names
     - not containerized_deployment
     # We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
     # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
@@ -64,27 +65,27 @@
     - handler_health_osd_check
     # See https://github.com/ceph/ceph-ansible/issues/1457 for the condition below
     - inventory_hostname in play_hosts
-    - osd_group_name in group_names
 
-- name: copy mds restart script
-  template:
-    src: restart_mds_daemon.sh.j2
-    dest: /tmp/restart_mds_daemon.sh
-    owner: root
-    group: root
-    mode: 0750
-  listen: "restart ceph mdss"
-  when:
-    - inventory_hostname in play_hosts
-    - mds_group_name in group_names
+- block:
+  - name: copy mds restart script
+    template:
+      src: restart_mds_daemon.sh.j2
+      dest: /tmp/restart_mds_daemon.sh
+      owner: root
+      group: root
+      mode: 0750
+    listen: "restart ceph mdss"
+    when:
+      - mds_group_name in group_names
+      - inventory_hostname in play_hosts
 
-- name: restart ceph mds daemon(s)
-  command: /tmp/restart_mds_daemon.sh
-  listen: "restart ceph mdss"
+  - name: restart ceph mds daemon(s)
+    command: /tmp/restart_mds_daemon.sh
+    listen: "restart ceph mdss"
   when:
     # We do not want to run these checks on initial deployment (`socket.rc == 0`)
-    - mds_socket_stat.rc == 0
     - mds_group_name in group_names
+    - mds_socket_stat.rc == 0
 
 - name: copy rgw restart script
   template:
     src: restart_rgw_daemon.sh.j2
@@ -95,20 +96,33 @@
     mode: 0750
   listen: "restart ceph rgws"
   when:
-    - inventory_hostname in play_hosts
     - rgw_group_name in group_names
+    - inventory_hostname in play_hosts
 
 - name: restart ceph rgw daemon(s)
   command: /tmp/restart_rgw_daemon.sh
   listen: "restart ceph rgws"
   when:
     # We do not want to run these checks on initial deployment (`socket.rc == 0`)
-    - rgw_socket_stat.rc == 0
     - rgw_group_name in group_names
+    - rgw_socket_stat.rc == 0
 
-- name: restart ceph nfss
-  service:
-    name: nfs-ganesha
-    state: restarted
+- name: copy nfs restart script
+  template:
+    src: restart_nfs_daemon.sh.j2
+    dest: /tmp/restart_nfs_daemon.sh
+    owner: root
+    group: root
+    mode: 0750
+  listen: "restart ceph nfss"
   when:
     - nfs_group_name in group_names
+    - inventory_hostname in play_hosts
+
+- name: restart ceph nfs daemon(s)
+  command: /tmp/restart_nfs_daemon.sh
+  listen: "restart ceph nfss"
+  when:
+    # We do not want to run these checks on initial deployment (`socket.rc == 0`)
+    - nfs_group_name in group_names
+    - nfs_socket_stat.rc == 0
diff --git a/roles/ceph-defaults/tasks/check_socket.yml b/roles/ceph-defaults/tasks/check_socket.yml
index 21b5de054..70ed0e433 100644
--- a/roles/ceph-defaults/tasks/check_socket.yml
+++ b/roles/ceph-defaults/tasks/check_socket.yml
@@ -4,6 +4,7 @@
   set_fact:
     docker_exec_cmd: "docker exec ceph-mon-{{ ansible_hostname }}"
   when:
+    - inventory_hostname in groups.get(mon_group_name, [])
     - containerized_deployment
 
 - name: check for a ceph mon socket
@@ -13,6 +14,8 @@
   failed_when: false
   always_run: true
   register: mon_socket_stat
+  when:
+    - inventory_hostname in groups.get(mon_group_name, [])
 
 - name: check if the ceph mon socket is in-use
   shell: |
@@ -22,6 +25,7 @@
   always_run: true
   register: mon_socket
   when:
+    - inventory_hostname in groups.get(mon_group_name, [])
     - mon_socket_stat.rc == 0
 
 - name: remove ceph mon socket if exists and not used by a process
@@ -29,6 +33,7 @@
     name: "{{ mon_socket_stat.stdout }}"
     state: absent
   when:
+    - inventory_hostname in groups.get(mon_group_name, [])
     - not containerized_deployment
     - mon_socket_stat.rc == 0
     - mon_socket.rc != 0
@@ -41,6 +46,7 @@
   always_run: true
   register: osd_socket_stat
   when:
+    - inventory_hostname in groups.get(osd_group_name, [])
     - not containerized_deployment
 
 - name: check if the ceph osd socket is in-use
@@ -51,6 +57,7 @@
   always_run: true
   register: osd_socket
   when:
+    - inventory_hostname in groups.get(osd_group_name, [])
     - not containerized_deployment
     - osd_socket_stat.rc == 0
@@ -59,6 +66,7 @@
     name: "{{ osd_socket_stat.stdout }}"
     state: absent
   when:
+    - inventory_hostname in groups.get(osd_group_name, [])
     - not containerized_deployment
     - osd_socket_stat.rc == 0
     - osd_socket.rc != 0
@@ -67,6 +75,7 @@
   set_fact:
     docker_exec_cmd: "docker exec ceph-mds-{{ ansible_hostname }}"
   when:
+    - inventory_hostname in groups.get(mds_group_name, [])
     - containerized_deployment
 
 - name: check for a ceph mds socket
@@ -76,6 +85,8 @@
   failed_when: false
   always_run: true
   register: mds_socket_stat
+  when:
+    - inventory_hostname in groups.get(mds_group_name, [])
 
 - name: check if the ceph mds socket is in-use
   shell: |
@@ -85,6 +96,7 @@
   always_run: true
   register: mds_socket
   when:
+    - inventory_hostname in groups.get(mds_group_name, [])
     - mds_socket_stat.rc == 0
 
 - name: remove ceph mds socket if exists and not used by a process
@@ -92,6 +104,7 @@
     name: "{{ mds_socket_stat.stdout }}"
     state: absent
   when:
+    - inventory_hostname in groups.get(mds_group_name, [])
     - not containerized_deployment
     - mds_socket_stat.rc == 0
     - mds_socket.rc != 0
@@ -100,6 +113,7 @@
   set_fact:
     docker_exec_cmd: "docker exec ceph-rgw-{{ ansible_hostname }}"
   when:
+    - inventory_hostname in groups.get(rgw_group_name, [])
     - containerized_deployment
 
 - name: check for a ceph rgw socket
@@ -109,6 +123,8 @@
   failed_when: false
   always_run: true
   register: rgw_socket_stat
+  when:
+    - inventory_hostname in groups.get(rgw_group_name, [])
 
 - name: check if the ceph rgw socket is in-use
   shell: |
@@ -118,6 +134,7 @@
   always_run: true
   register: rgw_socket
   when:
+    - inventory_hostname in groups.get(rgw_group_name, [])
     - rgw_socket_stat.rc == 0
 
 - name: remove ceph rgw socket if exists and not used by a process
@@ -125,6 +142,7 @@
     name: "{{ rgw_socket_stat.stdout }}"
     state: absent
   when:
+    - inventory_hostname in groups.get(rgw_group_name, [])
     - not containerized_deployment
     - rgw_socket_stat.rc == 0
     - rgw_socket.rc != 0
@@ -133,6 +151,7 @@
   set_fact:
     docker_exec_cmd: "docker exec ceph-mgr-{{ ansible_hostname }}"
   when:
+    - inventory_hostname in groups.get(mgr_group_name, [])
     - containerized_deployment
 
 - name: check for a ceph mgr socket
@@ -142,6 +161,8 @@
   failed_when: false
   always_run: true
   register: mgr_socket_stat
+  when:
+    - inventory_hostname in groups.get(mgr_group_name, [])
 
 - name: check if the ceph mgr socket is in-use
   shell: |
@@ -151,6 +172,7 @@
   always_run: true
   register: mgr_socket
   when:
+    - inventory_hostname in groups.get(mgr_group_name, [])
     - mgr_socket_stat.rc == 0
 
 - name: remove ceph mgr socket if exists and not used by a process
@@ -158,6 +180,7 @@
     name: "{{ mgr_socket_stat.stdout }}"
     state: absent
   when:
+    - inventory_hostname in groups.get(mgr_group_name, [])
     - not containerized_deployment
     - mgr_socket_stat.rc == 0
     - mgr_socket.rc != 0
@@ -166,6 +189,7 @@
   set_fact:
     docker_exec_cmd: "docker exec ceph-rbd-mirror-{{ ansible_hostname }}"
   when:
+    - inventory_hostname in groups.get(rbdmirror_group_name, [])
     - containerized_deployment
 
 - name: check for a ceph rbd mirror socket
@@ -175,6 +199,8 @@
   failed_when: false
   always_run: true
   register: rbd_mirror_socket_stat
+  when:
+    - inventory_hostname in groups.get(rbdmirror_group_name, [])
 
 - name: check if the ceph rbd mirror socket is in-use
   shell: |
@@ -184,6 +210,7 @@
   always_run: true
   register: rbd_mirror_socket
   when:
+    - inventory_hostname in groups.get(rbdmirror_group_name, [])
     - rbd_mirror_socket_stat.rc == 0
 
 - name: remove ceph rbd mirror socket if exists and not used by a process
@@ -191,10 +218,49 @@
     name: "{{ rbd_mirror_socket_stat.stdout }}"
     state: absent
   when:
+    - inventory_hostname in groups.get(rbdmirror_group_name, [])
     - not containerized_deployment
     - rbd_mirror_socket_stat.rc == 0
     - rbd_mirror_socket.rc != 0
+
+- name: set_fact docker_exec_cmd nfs ganesha
+  set_fact:
+    docker_exec_cmd: "docker exec ceph-nfs-{{ ansible_hostname }}"
+  when:
+    - inventory_hostname in groups.get(nfs_group_name, [])
+    - containerized_deployment
+
+- name: check for a ceph nfs ganesha socket
+  shell: |
+    {{ docker_exec_cmd | default('') }} bash -c 'stat --printf=%n /var/run/ganesha.pid'
+  changed_when: false
+  failed_when: false
+  always_run: true
+  register: nfs_socket_stat
+  when:
+    - inventory_hostname in groups.get(nfs_group_name, [])
+
+- name: check if the ceph nfs ganesha socket is in-use
+  shell: |
+    {{ docker_exec_cmd | default('') }} bash -c 'fuser --silent {{ nfs_socket_stat.stdout }}'
+  changed_when: false
+  failed_when: false
+  always_run: true
+  register: nfs_socket
+  when:
+    - inventory_hostname in groups.get(nfs_group_name, [])
+    - nfs_socket_stat.rc == 0
+
+- name: remove ceph nfs ganesha socket if exists and not used by a process
+  file:
+    name: "{{ nfs_socket_stat.stdout }}"
+    state: absent
+  when:
+    - inventory_hostname in groups.get(nfs_group_name, [])
+    - not containerized_deployment
+    - nfs_socket_stat.rc == 0
+    - nfs_socket.rc != 0
+
 - name: check for a ceph socket in containerized deployment (osds)
   shell: |
     docker exec ceph-osd-"{{ ansible_hostname }}"-"{{ item | replace('/', '') }}" bash -c 'stat --printf=%n /var/run/ceph/*.asok'
@@ -206,3 +272,4 @@
   when:
     - containerized_deployment
     - inventory_hostname in groups.get(osd_group_name, [])
+

From e121bc58e9dc63fccf1ed014ddf02701162513cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Wed, 27 Sep 2017 02:08:40 +0200
Subject: [PATCH 6/9] defaults: add missing handlers for rbd mirror and mgr
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Sébastien Han
---
 group_vars/all.yml.sample                     |  8 ++++
 group_vars/rhcs.yml.sample                    |  8 ++++
 roles/ceph-config/tasks/main.yml              |  2 +
 roles/ceph-defaults/defaults/main.yml         |  8 ++++
 roles/ceph-defaults/handlers/main.yml         | 40 +++++++++++++++++++
 .../templates/restart_mgr_daemon.sh.j2        | 20 ++++++++++
 .../templates/restart_rbd_mirror_daemon.sh.j2 | 20 ++++++++++
 7 files changed, 106 insertions(+)
 create mode 100644 roles/ceph-defaults/templates/restart_mgr_daemon.sh.j2
 create mode 100644 roles/ceph-defaults/templates/restart_rbd_mirror_daemon.sh.j2

diff --git a/group_vars/all.yml.sample b/group_vars/all.yml.sample
index e2d54f78b..22110f879 100644
--- a/group_vars/all.yml.sample
+++ b/group_vars/all.yml.sample
@@ -398,6 +398,14 @@ dummy:
 #handler_health_nfs_check_retries: 5
 #handler_health_nfs_check_delay: 10
 
+# RBD MIRROR handler checks
+#handler_health_rbd_mirror_check_retries: 5
+#handler_health_rbd_mirror_check_delay: 10
+
+# MGR handler checks
+#handler_health_mgr_check_retries: 5
+#handler_health_mgr_check_delay: 10
+
 ###############
 # NFS-GANESHA #
 ###############
diff --git a/group_vars/rhcs.yml.sample b/group_vars/rhcs.yml.sample
index 112efa41e..5ba40b7fc 100644
--- a/group_vars/rhcs.yml.sample
+++ b/group_vars/rhcs.yml.sample
@@ -398,6 +398,14 @@ ceph_repository: rhcs
 #handler_health_nfs_check_retries: 5
 #handler_health_nfs_check_delay: 10
 
+# RBD MIRROR handler checks
+#handler_health_rbd_mirror_check_retries: 5
+#handler_health_rbd_mirror_check_delay: 10
+
+# MGR handler checks
+#handler_health_mgr_check_retries: 5
+#handler_health_mgr_check_delay: 10
+
 ###############
 # NFS-GANESHA #
 ###############
diff --git a/roles/ceph-config/tasks/main.yml b/roles/ceph-config/tasks/main.yml
index 3ecf5d9e7..4a5df3350 100644
--- a/roles/ceph-config/tasks/main.yml
+++ b/roles/ceph-config/tasks/main.yml
@@ -117,6 +117,8 @@
     - restart ceph osds
     - restart ceph mdss
     - restart ceph rgws
+    - restart ceph rbdmirrors
+    - restart ceph mgrs
 
 - name: set fsid fact when generate_fsid = true
   set_fact:
diff --git a/roles/ceph-defaults/defaults/main.yml b/roles/ceph-defaults/defaults/main.yml
index cfdbbbdca..20a31d3e7 100644
--- a/roles/ceph-defaults/defaults/main.yml
+++ b/roles/ceph-defaults/defaults/main.yml
@@ -390,6 +390,14 @@ handler_health_rgw_check_delay: 10
 handler_health_nfs_check_retries: 5
 handler_health_nfs_check_delay: 10
 
+# RBD MIRROR handler checks
+handler_health_rbd_mirror_check_retries: 5
+handler_health_rbd_mirror_check_delay: 10
+
+# MGR handler checks
+handler_health_mgr_check_retries: 5
+handler_health_mgr_check_delay: 10
+
 ###############
 # NFS-GANESHA #
 ###############
diff --git a/roles/ceph-defaults/handlers/main.yml b/roles/ceph-defaults/handlers/main.yml
index 885a42abe..98799bd9a 100644
--- a/roles/ceph-defaults/handlers/main.yml
+++ b/roles/ceph-defaults/handlers/main.yml
@@ -126,3 +126,43 @@
     # We do not want to run these checks on initial deployment (`socket.rc == 0`)
     - nfs_group_name in group_names
     - nfs_socket_stat.rc == 0
+
+- name: copy rbd mirror restart script
+  template:
+    src: restart_rbd_mirror_daemon.sh.j2
+    dest: /tmp/restart_rbd_mirror_daemon.sh
+    owner: root
+    group: root
+    mode: 0750
+  listen: "restart ceph rbdmirrors"
+  when:
+    - rbdmirror_group_name in group_names
+    - inventory_hostname in play_hosts
+
+- name: restart ceph rbd mirror daemon(s)
+  command: /tmp/restart_rbd_mirror_daemon.sh
+  listen: "restart ceph rbdmirrors"
+  when:
+    # We do not want to run these checks on initial deployment (`socket.rc == 0`)
+    - rbdmirror_group_name in group_names
+    - rbd_mirror_socket_stat.rc == 0
+
+- name: copy mgr restart script
+  template:
+    src: restart_mgr_daemon.sh.j2
+    dest: /tmp/restart_mgr_daemon.sh
+    owner: root
+    group: root
+    mode: 0750
+  listen: "restart ceph mgrs"
+  when:
+    - mgr_group_name in group_names
+    - inventory_hostname in play_hosts
+
+- name: restart ceph mgr daemon(s)
+  command: /tmp/restart_mgr_daemon.sh
+  listen: "restart ceph mgrs"
+  when:
+    # We do not want to run these checks on initial deployment (`socket.rc == 0`)
+    - mgr_group_name in group_names
+    - mgr_socket_stat.rc == 0
diff --git a/roles/ceph-defaults/templates/restart_mgr_daemon.sh.j2 b/roles/ceph-defaults/templates/restart_mgr_daemon.sh.j2
new file mode 100644
index 000000000..bfc85ba40
--- /dev/null
+++ b/roles/ceph-defaults/templates/restart_mgr_daemon.sh.j2
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+RETRIES="{{ handler_health_mgr_check_retries }}"
+DELAY="{{ handler_health_mgr_check_delay }}"
+MGR_NAME="{{ ansible_hostname }}"
+SOCKET=/var/run/ceph/{{ cluster }}-mgr.${MGR_NAME}.asok
+
+# First, restart the daemon
+systemctl restart ceph-mgr@${MGR_NAME}
+
+COUNT=10
+# Wait and ensure the socket exists after restarting the daemon
+while [ $RETRIES -ne 0 ]; do
+  {{ docker_exec_cmd }} test -S $SOCKET && exit 0
+  sleep $DELAY
+  let RETRIES=RETRIES-1
+done
+# If we reach this point, it means the socket is not present.
+echo "Socket file ${SOCKET} could not be found, which means ceph manager is not running."
+exit 1
diff --git a/roles/ceph-defaults/templates/restart_rbd_mirror_daemon.sh.j2 b/roles/ceph-defaults/templates/restart_rbd_mirror_daemon.sh.j2
new file mode 100644
index 000000000..a9e9f6e29
--- /dev/null
+++ b/roles/ceph-defaults/templates/restart_rbd_mirror_daemon.sh.j2
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+RETRIES="{{ handler_health_rbd_mirror_check_retries }}"
+DELAY="{{ handler_health_rbd_mirror_check_delay }}"
+RBD_MIRROR_NAME="{{ ansible_hostname }}"
+SOCKET=/var/run/ceph/{{ cluster }}-client.rbd-mirror.${RBD_MIRROR_NAME}.asok
+
+# First, restart the daemon
+systemctl restart ceph-rbd-mirror@rbd-mirror.${RBD_MIRROR_NAME}
+
+COUNT=10
+# Wait and ensure the socket exists after restarting the daemon
+while [ $RETRIES -ne 0 ]; do
+  {{ docker_exec_cmd }} test -S $SOCKET && exit 0
+  sleep $DELAY
+  let RETRIES=RETRIES-1
+done
+# If we reach this point, it means the socket is not present.
+echo "Socket file ${SOCKET} could not be found, which means rbd mirror is not running."
+exit 1

From 083c53c6bd961203d83d24cc6cbe06b4aafb7d87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Thu, 28 Sep 2017 16:06:44 +0200
Subject: [PATCH 7/9] ci: run purge cluster on centos not ubuntu
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Sébastien Han
---
 tox.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tox.ini b/tox.ini
index 8f8416776..53be8a1e8 100644
--- a/tox.ini
+++ b/tox.ini
@@ -177,7 +177,7 @@ changedir=
   purge_docker_cluster: {toxinidir}/tests/functional/centos/7/docker
   docker_dedicated_journal: {toxinidir}/tests/functional/centos/7/docker-ded-jrn
   docker_dmcrypt_journal_collocation: {toxinidir}/tests/functional/centos/7/docker-crypt-jrn-col
-  purge_cluster: {toxinidir}/tests/functional/ubuntu/16.04/cluster
+  purge_cluster: {toxinidir}/tests/functional/centos/7/cluster
   purge_dmcrypt: {toxinidir}/tests/functional/centos/7/crypt-ded-jrn
   update_dmcrypt: {toxinidir}/tests/functional/centos/7/crypt-ded-jrn
   update_cluster: {toxinidir}/tests/functional/centos/7/cluster

From ca76c469819d12bcc0f407c3049d86605f92c614 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3%A9bastien=20Han?=
Date: Thu, 28 Sep 2017 18:24:15 +0200
Subject: [PATCH 8/9] site: only support nfs on luminous and above
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Sébastien Han
---
 site-docker.yml.sample | 8 ++++----
 site.yml.sample        | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/site-docker.yml.sample b/site-docker.yml.sample
index e9e33088e..6e3bcedf5 100644
--- a/site-docker.yml.sample
+++ b/site-docker.yml.sample
@@ -73,10 +73,10 @@
   become: True
   gather_facts: false
   roles:
-    - ceph-defaults
-    - ceph-docker-common
-    - ceph-config
-    - ceph-nfs
+    - { role: ceph-defaults, when: "ceph_release_num.{{ ceph_stable_release }} >= ceph_release_num.luminous" }
+    - { role: ceph-docker-common, when: "ceph_release_num.{{ ceph_stable_release }} >= ceph_release_num.luminous" }
+    - { role: ceph-config, when: "ceph_release_num.{{ ceph_stable_release }} >= ceph_release_num.luminous" }
+    - { role: ceph-nfs, when: "ceph_release_num.{{ ceph_stable_release }} >= ceph_release_num.luminous" }
 
 - hosts: rbdmirrors
   become: True
diff --git a/site.yml.sample b/site.yml.sample
index 39898ed12..4ae0cc54f 100644
--- a/site.yml.sample
+++ b/site.yml.sample
@@ -110,10 +110,10 @@
   gather_facts: false
   become: True
   roles:
-    - ceph-defaults
-    - ceph-common
-    - ceph-config
-    - ceph-nfs
+    - { role: ceph-defaults, when: "ceph_release_num.{{ ceph_stable_release }} >= ceph_release_num.luminous" }
+    - { role: ceph-common, when: "ceph_release_num.{{ ceph_stable_release }} >= ceph_release_num.luminous" }
+    - { role: ceph-config, when: "ceph_release_num.{{ ceph_stable_release }} >= ceph_release_num.luminous" }
+    - { role: ceph-nfs, when: "ceph_release_num.{{ ceph_stable_release }} >= ceph_release_num.luminous" }
 
 - hosts: restapis
   gather_facts: false

From b9050d62297dcf2698c699a79c039701f7206bbb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Fri, 29 Sep 2017 12:05:43 +0200
Subject: [PATCH 9/9] update: fix var register
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Even if a task is skipped, Ansible still registers the variable (with a
'skipped' result), which broke the next task consuming that variable.
Instead of registering the same variable from two mutually exclusive
tasks, use a single task driven by docker_exec_cmd_update_osd for both
containerized and non-containerized deployments.
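A quick way to see the behaviour this works around (sketch only, not part
of the patch): registering a variable on a skipped task still defines it.

    - name: task that is skipped
      command: echo hello
      register: result
      when: false

    - name: result is defined anyway, and result.skipped is true
      debug:
        msg: "{{ result }}"

Because of that, registering the same variable (e.g. ceph_versions) from
two mutually exclusive tasks does not work: whichever task runs last,
even skipped, overwrites the value. Hence the single task driven by
docker_exec_cmd_update_osd below.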
"{{ groups[mon_group_name][0] }}" - when: - - containerized_deployment - - (ceph_versions.stdout|from_json).osd | length == 1 - - ceph_versions_osd | string | search("ceph version 12") - name: upgrade ceph mdss cluster