diff --git a/infrastructure-playbooks/purge-docker-cluster.yml b/infrastructure-playbooks/purge-docker-cluster.yml
index 9ef7ccc82..03c623f61 100644
--- a/infrastructure-playbooks/purge-docker-cluster.yml
+++ b/infrastructure-playbooks/purge-docker-cluster.yml
@@ -97,13 +97,21 @@
 
   tasks:
 
-  - name: disable ceph rgw service
+# For backward compatibility
+  - name: disable ceph rgw service (old unit name, for backward compatibility)
     service:
       name: "ceph-rgw@{{ ansible_hostname }}"
       state: stopped
       enabled: no
     ignore_errors: true
 
+  - name: disable ceph rgw service (new unit name)
+    service:
+      name: "ceph-radosgw@{{ ansible_hostname }}"
+      state: stopped
+      enabled: no
+    ignore_errors: true
+
   - name: remove ceph rgw container
     docker:
       image: "{{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }}"
@@ -113,8 +121,12 @@
 
   - name: remove ceph rgw service
     file:
-      path: /etc/systemd/system/ceph-rgw@.service
+      path: "{{ item }}"
       state: absent
+    with_items:
+# For backward compatibility
+      - /etc/systemd/system/ceph-rgw@.service
+      - /etc/systemd/system/ceph-radosgw@.service
 
   - name: remove ceph rgw image
     docker_image:
diff --git a/infrastructure-playbooks/rolling_update.yml b/infrastructure-playbooks/rolling_update.yml
index 5c0919732..9ddd4a77c 100644
--- a/infrastructure-playbooks/rolling_update.yml
+++ b/infrastructure-playbooks/rolling_update.yml
@@ -450,7 +450,7 @@
 
     - name: restart containerized ceph rgws with systemd
       service:
-        name: ceph-rgw@{{ ansible_hostname }}
+        name: ceph-radosgw@{{ ansible_hostname }}
         state: restarted
         enabled: yes
       when:
diff --git a/roles/ceph-common/tasks/checks/check_socket.yml b/roles/ceph-common/tasks/checks/check_socket.yml
deleted file mode 100644
index 79b512c8e..000000000
--- a/roles/ceph-common/tasks/checks/check_socket.yml
+++ /dev/null
@@ -1,15 +0,0 @@
----
-# These checks are used to avoid running handlers at initial deployment.
-- name: check for a ceph socket
-  shell: "stat /var/run/ceph/*.asok > /dev/null 2>&1"
-  changed_when: false
-  failed_when: false
-  always_run: true
-  register: socket
-
-- name: check for a rados gateway socket
-  shell: "stat {{ rbd_client_admin_socket_path }}*.asok > /dev/null 2>&1"
-  changed_when: false
-  failed_when: false
-  always_run: true
-  register: socketrgw
diff --git a/roles/ceph-common/tasks/main.yml b/roles/ceph-common/tasks/main.yml
index b2c1a0e3f..9c65a27bf 100644
--- a/roles/ceph-common/tasks/main.yml
+++ b/roles/ceph-common/tasks/main.yml
@@ -105,7 +105,6 @@
     - ceph_current_fsid.rc == 0
     - mon_group_name in group_names
 
-- include: ./checks/check_socket.yml
 - include: create_ceph_initial_dirs.yml
 - include: generate_ceph_conf.yml
 - include: create_rbd_client_dir.yml
diff --git a/roles/ceph-common/templates/restart_osd_daemon.sh.j2 b/roles/ceph-common/templates/restart_osd_daemon.sh.j2
deleted file mode 100644
index ae31f405e..000000000
--- a/roles/ceph-common/templates/restart_osd_daemon.sh.j2
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-
-RETRIES="{{ handler_health_osd_check_retries }}"
-DELAY="{{ handler_health_osd_check_delay }}"
-CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"
-
-check_pgs() {
-  while [ $RETRIES -ne 0 ]; do
-    test "[""$(ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')""]" = "$(ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print [ i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if i["state_name"] == "active+clean"]')"
-    RET=$?
-    test $RET -eq 0 && return 0
-    sleep $DELAY
-    let RETRIES=RETRIES-1
-  done
-  # PGs not clean, exiting with return code 1
-  echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"
-  echo "It is possible that the cluster has less OSDs than the replica configuration"
-  echo "Will refuse to continue"
-  ceph $CEPH_CLI -s
-  exit 1
-}
-
-for id in $(ls /var/lib/ceph/osd/ | sed 's/.*-//'); do
-  # First, restart daemon(s)
-  systemctl restart ceph-osd@${id}
-  # We need to wait because it may take some time for the socket to actually exists
-  COUNT=10
-  # Wait and ensure the socket exists after restarting the daemon
-  SOCKET=/var/run/ceph/{{ cluster }}-osd.${id}.asok
-  while [ $COUNT -ne 0 ]; do
-    test -S $SOCKET && check_pgs && continue 2
-    sleep 1
-    let COUNT=COUNT-1
-  done
-  # If we reach this point, it means the socket is not present.
-  echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running."
-  exit 1
-done
diff --git a/roles/ceph-defaults/defaults/main.yml b/roles/ceph-defaults/defaults/main.yml
index 2669e1bc9..3a41ed5fb 100644
--- a/roles/ceph-defaults/defaults/main.yml
+++ b/roles/ceph-defaults/defaults/main.yml
@@ -367,7 +367,7 @@ os_tuning_params:
 ##########
 # DOCKER #
 ##########
-
+docker_exec_cmd:
 docker: false
 ceph_docker_image: "ceph/daemon"
 ceph_docker_image_tag: latest
diff --git a/roles/ceph-common/handlers/main.yml b/roles/ceph-defaults/handlers/main.yml
similarity index 56%
rename from roles/ceph-common/handlers/main.yml
rename to roles/ceph-defaults/handlers/main.yml
index cf3ed1d7c..e8659f904 100644
--- a/roles/ceph-common/handlers/main.yml
+++ b/roles/ceph-defaults/handlers/main.yml
@@ -17,39 +17,51 @@
 
 - name: restart ceph mon daemon(s)
   command: /tmp/restart_mon_daemon.sh
   listen: "restart ceph mons"
   when:
 # We do not want to run these checks on initial deployment (`socket.rc == 0`)
     - socket.rc == 0
-    - ceph_current_fsid.rc == 0
     - mon_group_name in group_names
 
 # This does not just restart OSDs but everything else too. Unfortunately
 # at this time the ansible role does not have an OSD id list to use
 # for restarting them specifically.
-- block:
-  - name: copy osd restart script
-    template:
-      src: restart_osd_daemon.sh.j2
-      dest: /tmp/restart_osd_daemon.sh
-      owner: root
-      group: root
-      mode: 0750
-    listen: "restart ceph osds"
-
-  - name: restart ceph osds daemon(s)
-    command: /tmp/restart_osd_daemon.sh
-    listen: "restart ceph osds"
-    when: handler_health_osd_check
-
+- name: copy osd restart script
+  template:
+    src: restart_osd_daemon.sh.j2
+    dest: /tmp/restart_osd_daemon.sh
+    owner: root
+    group: root
+    mode: 0750
+  listen: "restart ceph osds"
   when:
-# We do not want to run these checks on initial deployment (`socket.rc == 0`)
-# except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
+    - inventory_hostname in play_hosts
+    - osd_group_name in group_names
+
+- name: restart containerized ceph osds daemon(s)
+  command: /tmp/restart_osd_daemon.sh
+  listen: "restart ceph osds"
+  with_items: "{{ socket_osd_container.results }}"
+  when:
+    # We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
+    # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
+    - ((crush_location is defined and crush_location) or item.get('rc') == 0)
+    - handler_health_osd_check
+    # See https://github.com/ceph/ceph-ansible/issues/1457 for the condition below
+    - inventory_hostname in play_hosts
+    - osd_group_name in group_names
+
+- name: restart non-containerized ceph osds daemon(s)
+  command: /tmp/restart_osd_daemon.sh
+  listen: "restart ceph osds"
+  when:
+    # We do not want to run these checks on initial deployment (`socket.rc == 0`)
+    # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
     - ((crush_location is defined and crush_location) or socket.rc == 0)
-    - ceph_current_fsid.rc == 0
-    - osd_group_name in group_names
-# See https://github.com/ceph/ceph-ansible/issues/1457 for the condition below
+    - handler_health_osd_check
+    # See https://github.com/ceph/ceph-ansible/issues/1457 for the condition below
     - inventory_hostname in play_hosts
+    - osd_group_name in group_names
 
 - name: restart ceph mdss
   service:
diff --git a/roles/ceph-defaults/tasks/check_socket.yml b/roles/ceph-defaults/tasks/check_socket.yml
new file mode 100644
index 000000000..11f04f6d3
--- /dev/null
+++ b/roles/ceph-defaults/tasks/check_socket.yml
@@ -0,0 +1,21 @@
+---
+# These checks are used to avoid running handlers at initial deployment.
+- name: check for a ceph socket
+  shell: |
+    {{ docker_exec_cmd }} bash -c 'stat {{ rbd_client_admin_socket_path }}/*.asok > /dev/null 2>&1'
+  changed_when: false
+  failed_when: false
+  always_run: true
+  register: socket
+
+- name: check for a ceph socket in containerized deployment (osds)
+  shell: |
+    docker exec ceph-osd-"{{ ansible_hostname }}"-"{{ item | replace('/', '') }}" bash -c 'stat /var/run/ceph/*.asok > /dev/null 2>&1'
+  changed_when: false
+  failed_when: false
+  always_run: true
+  register: socket_osd_container
+  with_items: "{{ devices }}"
+  when:
+    - containerized_deployment
+    - inventory_hostname in groups.get(osd_group_name)
diff --git a/roles/ceph-defaults/tasks/main.yml b/roles/ceph-defaults/tasks/main.yml
index 163464872..25887efa7 100644
--- a/roles/ceph-defaults/tasks/main.yml
+++ b/roles/ceph-defaults/tasks/main.yml
@@ -1,2 +1,3 @@
 ---
 - include: facts.yml
+- include: check_socket.yml
diff --git a/roles/ceph-common/templates/restart_mon_daemon.sh.j2 b/roles/ceph-defaults/templates/restart_mon_daemon.sh.j2
similarity index 72%
rename from roles/ceph-common/templates/restart_mon_daemon.sh.j2
rename to roles/ceph-defaults/templates/restart_mon_daemon.sh.j2
index 4424ccacb..745f6915f 100644
--- a/roles/ceph-common/templates/restart_mon_daemon.sh.j2
+++ b/roles/ceph-defaults/templates/restart_mon_daemon.sh.j2
@@ -3,13 +3,12 @@
 RETRIES="{{ handler_health_mon_check_retries }}"
 DELAY="{{ handler_health_mon_check_delay }}"
 MONITOR_NAME="{{ monitor_name }}"
 
-CLUSTER="{{ cluster }}"
-SOCKET=/var/run/ceph/${CLUSTER}-mon.${MONITOR_NAME}.asok
+SOCKET=/var/run/ceph/{{ cluster }}-mon.${MONITOR_NAME}.asok
 
 check_quorum() {
 while [ $RETRIES -ne 0 ]; do
-  MEMBERS=$(ceph --cluster ${CLUSTER} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
+  MEMBERS=$({{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
   test "${MEMBERS/$MONITOR_NAME}" != "$MEMBERS" && exit 0
   sleep $DELAY
   let RETRIES=RETRIES-1
@@ -17,7 +16,7 @@ done
 # If we reach this point, it means there is a problem with the quorum
 echo "Error with quorum."
echo "cluster status:" -ceph --cluster ${CLUSTER} -s +{{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s exit 1 } @@ -27,7 +26,7 @@ systemctl restart ceph-mon@${MONITOR_NAME} COUNT=10 # Wait and ensure the socket exists after restarting the daemon while [ $COUNT -ne 0 ]; do - test -S $SOCKET && check_quorum + {{ docker_exec_cmd }} test -S $SOCKET && check_quorum sleep 1 let COUNT=COUNT-1 done diff --git a/roles/ceph-defaults/templates/restart_osd_daemon.sh.j2 b/roles/ceph-defaults/templates/restart_osd_daemon.sh.j2 new file mode 100644 index 000000000..de1fe101b --- /dev/null +++ b/roles/ceph-defaults/templates/restart_osd_daemon.sh.j2 @@ -0,0 +1,78 @@ +#!/bin/bash + +RETRIES="{{ handler_health_osd_check_retries }}" +DELAY="{{ handler_health_osd_check_delay }}" +CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}" + +check_pgs() { + while [ $RETRIES -ne 0 ]; do + test "[""$($docker_exec ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')""]" = "$($docker_exec ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print [ i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if i["state_name"] == "active+clean"]')" + RET=$? + test $RET -eq 0 && return 0 + sleep $DELAY + let RETRIES=RETRIES-1 + done + # PGs not clean, exiting with return code 1 + echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean" + echo "It is possible that the cluster has less OSDs than the replica configuration" + echo "Will refuse to continue" + $docker_exec ceph "$CEPH_CLI" -s + exit 1 +} + +wait_for_socket_in_docker() { + if ! docker exec "$1" timeout 10 bash -c "while [ ! -e /var/run/ceph/*.asok ]; do sleep 1 ; done"; then + log "Timed out while trying to look for a Ceph OSD socket." + log "Abort mission!" + exit 1 + fi +} + +get_dev_name() { + echo $1 | sed -r 's/ceph-osd@([a-z]{1,4})\.service/\1/' +} + +get_docker_id_from_dev_name() { + local id + local count + count=10 + while [ $count -ne 0 ]; do + id=$(docker ps -q -f "name=$1") + test "$id" != "" && break + sleep 1 + let count=count-1 + done + echo "$id" +} + +get_docker_osd_id() { + wait_for_socket_in_docker $1 + docker exec "$1" ls /var/run/ceph | cut -d'.' -f2 +} + +# For containerized deployments, the unit file looks like: ceph-osd@sda.service +# For non-containerized deployments, the unit file looks like: ceph-osd@0.service +for unit in $(systemctl list-units | grep -oE "ceph-osd@([0-9]{1,2}|[a-z]+).service"); do + # First, restart daemon(s) + systemctl restart "${unit}" + # We need to wait because it may take some time for the socket to actually exists + COUNT=10 + # Wait and ensure the socket exists after restarting the daemon + {% if containerized_deployment -%} + id=$(get_dev_name "$unit") + container_id=$(get_docker_id_from_dev_name "$id") + osd_id=$(get_docker_osd_id "$container_id") + docker_exec="docker exec $container_id" + {% else %} + osd_id=$(echo ${unit#ceph-osd@} | grep -oE '[0-9]{1,2}') + {% endif %} + SOCKET=/var/run/ceph/test-osd.${osd_id}.asok + while [ $COUNT -ne 0 ]; do + $docker_exec test -S "$SOCKET" && check_pgs && continue 2 + sleep 1 + let COUNT=COUNT-1 + done + # If we reach this point, it means the socket is not present. + echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running." 
+ exit 1 +done diff --git a/roles/ceph-docker-common/tasks/create_configs.yml b/roles/ceph-docker-common/tasks/create_configs.yml index d1e6a2926..a33f72f4b 100644 --- a/roles/ceph-docker-common/tasks/create_configs.yml +++ b/roles/ceph-docker-common/tasks/create_configs.yml @@ -44,10 +44,15 @@ config_type: ini when: - (not mon_containerized_default_ceph_conf_with_kv and - (inventory_hostname in groups.get(mon_group_name, []))) or + (inventory_hostname in groups.get(mon_group_name, []) or inventory_hostname in groups.get(osd_group_name, []))) or (not mon_containerized_default_ceph_conf_with_kv and ((groups.get(nfs_group_name, []) | length > 0) and (inventory_hostname == groups.get(nfs_group_name, [])[0]))) + notify: + - restart ceph mons + - restart ceph osds + - restart ceph mdss + - restart ceph rgws - name: set fsid fact when generate_fsid = true set_fact: diff --git a/roles/ceph-mon/defaults/main.yml b/roles/ceph-mon/defaults/main.yml index 28ec80cfb..f81b254eb 100644 --- a/roles/ceph-mon/defaults/main.yml +++ b/roles/ceph-mon/defaults/main.yml @@ -105,7 +105,6 @@ openstack_keys: ########## # DOCKER # ########## -docker_exec_cmd: ceph_mon_docker_subnet: "{{ public_network }}"# subnet of the monitor_interface # ceph_mon_docker_extra_env: diff --git a/roles/ceph-rgw/tasks/docker/start_docker_rgw.yml b/roles/ceph-rgw/tasks/docker/start_docker_rgw.yml index e3bef3603..3bce3d4fe 100644 --- a/roles/ceph-rgw/tasks/docker/start_docker_rgw.yml +++ b/roles/ceph-rgw/tasks/docker/start_docker_rgw.yml @@ -2,14 +2,21 @@ - name: generate systemd unit file become: true template: - src: "{{ role_path }}/templates/ceph-rgw.service.j2" - dest: /etc/systemd/system/ceph-rgw@.service + src: "{{ role_path }}/templates/ceph-radosgw.service.j2" + dest: /etc/systemd/system/ceph-radosgw@.service owner: "root" group: "root" mode: "0644" +# For backward compatibility +- name: disable old systemd unit ('ceph-rgw@') if present + service: + name: ceph-rgw@{{ ansible_hostname }} + state: disable + ignore_errors: true + - name: enable systemd unit file for rgw instance - shell: systemctl enable ceph-rgw@{{ ansible_hostname }}.service + shell: systemctl enable ceph-radosgw@{{ ansible_hostname }}.service failed_when: false changed_when: false @@ -20,7 +27,7 @@ - name: systemd start rgw container service: - name: ceph-rgw@{{ ansible_hostname }} + name: ceph-radosgw@{{ ansible_hostname }} state: started enabled: yes changed_when: false diff --git a/roles/ceph-rgw/templates/ceph-rgw.service.j2 b/roles/ceph-rgw/templates/ceph-radosgw.service.j2 similarity index 100% rename from roles/ceph-rgw/templates/ceph-rgw.service.j2 rename to roles/ceph-rgw/templates/ceph-radosgw.service.j2