From bedc0ab69df6a8cadc3993d8b56288b511846802 Mon Sep 17 00:00:00 2001
From: Dimitri Savineau
Date: Mon, 13 May 2019 17:18:52 -0400
Subject: [PATCH] ceph-osd: use OSD id with systemd ceph-disk

When using containerized deployment we have to create the systemd
service unit from a template.
The current implementation with ceph-disk uses the device name as
parameter to the systemd service and for the container name too.

$ systemctl start ceph-osd@sdb
$ docker ps --filter 'name=ceph-osd-*'
CONTAINER ID IMAGE                       NAMES
065530d0a27f ceph/daemon:latest-luminous ceph-osd-strg0-sdb

This is the only scenario (compared to non-containerized or
ceph-volume based deployments) that isn't using the OSD id.

$ systemctl start ceph-osd@0
$ docker ps --filter 'name=ceph-osd-*'
CONTAINER ID IMAGE                       NAMES
d34552ec157e ceph/daemon:latest-luminous ceph-osd-0

Also, if the device mapping doesn't persist across a system reboot
(i.e. sdb might be remapped to sde) then the OSD service won't come
back after the reboot.

This patch allows the ceph-osd systemd service to use the OSD id, but
it requires activating the OSD manually with ceph-disk first in order
to assign the id to that OSD.

Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1670734

Signed-off-by: Dimitri Savineau
---
 .../templates/restart_osd_daemon.sh.j2              | 19 +---
 roles/ceph-osd/tasks/start_osds.yml                 | 98 +++++++++++++++----
 roles/ceph-osd/templates/ceph-osd-run.sh.j2         | 22 ++++-
 .../templates/systemd-device-to-id.sh.j2            | 81 +++++++++++++++
 4 files changed, 181 insertions(+), 39 deletions(-)
 create mode 100644 roles/ceph-osd/templates/systemd-device-to-id.sh.j2

diff --git a/roles/ceph-handler/templates/restart_osd_daemon.sh.j2 b/roles/ceph-handler/templates/restart_osd_daemon.sh.j2
index d7fb3e1e0..e39fde10a 100644
--- a/roles/ceph-handler/templates/restart_osd_daemon.sh.j2
+++ b/roles/ceph-handler/templates/restart_osd_daemon.sh.j2
@@ -36,10 +36,6 @@ wait_for_socket_in_docker() {
   fi
 }
 
-get_dev_name() {
-  echo $1 | sed -r 's/ceph-osd@([a-z]{1,4})\.service/\1/'
-}
-
 get_docker_id_from_dev_name() {
   local id
   local count
@@ -53,26 +49,17 @@ get_docker_id_from_dev_name() {
   echo "$id"
 }
 
-# For containerized deployments, the unit file looks like: ceph-osd@sda.service
-# For non-containerized deployments, the unit file looks like: ceph-osd@NNN.service where NNN is OSD ID
-for unit in $(systemctl list-units | grep -E "loaded * active" | grep -oE "ceph-osd@([0-9]+|[a-z]+).service"); do
+# The unit file looks like: ceph-osd@NNN.service where NNN is OSD ID
+for unit in $(systemctl list-units | grep -E "loaded * active" | grep -oE "ceph-osd@[0-9]+.service"); do
   # First, restart daemon(s)
   systemctl restart "${unit}"
   # We need to wait because it may take some time for the socket to actually exists
   COUNT=10
   # Wait and ensure the socket exists after restarting the daemon
-  {% if containerized_deployment and osd_scenario != 'lvm' -%}
-  id=$(get_dev_name "$unit")
-  container_id=$(get_docker_id_from_dev_name "$id")
-  wait_for_socket_in_docker "$container_id"
-  osd_id=$whoami
-  docker_exec="docker exec $container_id"
-  {% elif containerized_deployment and osd_scenario == 'lvm' %}
   osd_id=$(echo ${unit#ceph-osd@} | grep -oE '[0-9]+')
+  {% if containerized_deployment -%}
   container_id=$(get_docker_id_from_dev_name "ceph-osd-${osd_id}")
   docker_exec="docker exec $container_id"
-  {% else %}
-  osd_id=$(echo ${unit#ceph-osd@} | grep -oE '[0-9]+')
   {% endif %}
   SOCKET=/var/run/ceph/{{ cluster }}-osd.${osd_id}.asok
   while [ $COUNT -ne 0 ]; do
diff --git a/roles/ceph-osd/tasks/start_osds.yml b/roles/ceph-osd/tasks/start_osds.yml
index d3dab5f53..a6503dfc5 100644
--- a/roles/ceph-osd/tasks/start_osds.yml
+++ b/roles/ceph-osd/tasks/start_osds.yml
@@ -11,21 +11,52 @@
       when:
         - ceph_docker_on_openstack
 
-    - name: test if the container image has directory {{ container_bin_path }}
-      command: "docker run --rm --entrypoint=test {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }} -d {{ container_bin_path }}"
-      changed_when: false
-      failed_when: false
-      register: test_container_bin_path
-      when:
-        - osd_scenario != 'lvm'
+    - name: with non lvm scenario
+      when: osd_scenario != 'lvm'
+      block:
+        - name: test if the container image has directory {{ container_bin_path }}
+          command: "docker run --rm --entrypoint=test {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }} -d {{ container_bin_path }}"
+          changed_when: false
+          failed_when: false
+          register: test_container_bin_path
 
-    - name: test if the container image has the disk_list function
-      command: "docker run --rm --entrypoint=stat {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }} {{ container_bin_path + '/disk_list.sh' if test_container_bin_path.rc == 0 else 'disk_list.sh' }}"
-      changed_when: false
-      failed_when: false
-      register: disk_list
-      when:
-        - osd_scenario != 'lvm'
+        - name: test if the container image has the disk_list function
+          command: "docker run --rm --entrypoint=stat {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }} {{ container_bin_path + '/disk_list.sh' if test_container_bin_path.rc == 0 else 'disk_list.sh' }}"
+          changed_when: false
+          failed_when: false
+          register: disk_list
+
+        - name: test activated ceph-disk osds
+          shell: |
+            ls /var/lib/ceph/osd/ | sed 's/.*-//'
+          register: activated_osds
+
+        - name: activate containerized osd(s)
+          shell: |
+            DOCKER_ENV=$(docker run --rm --net=host --ulimit nofile=1024:1024 \
+                         --privileged=true -v /dev/:/dev/ -v /etc/ceph:/etc/ceph:z \
+                         -e CLUSTER={{ cluster }} -e OSD_DEVICE={{ item }} \
+                         {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }} \
+                         disk_list)
+            docker run --rm --net=host \
+            --ulimit nofile=1024:1024 \
+            --ipc=host --pid=host --privileged=true \
+            -v /etc/ceph:/etc/ceph:z \
+            -v /var/lib/ceph/:/var/lib/ceph/:z \
+            -v /dev:/dev \
+            -v /etc/localtime:/etc/localtime:ro \
+            -e DEBUG=verbose \
+            -e CLUSTER={{ cluster }} \
+            -e CEPH_DAEMON=OSD_CEPH_DISK_ACTIVATE_ONLY \
+            -e OSD_DEVICE={{ item }} \
+            ${DOCKER_ENV} \
+            {{ docker_env_args }} \
+            {{ ceph_osd_docker_prepare_env }} \
+            {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }}
+          with_items: "{{ devices }}"
+          when:
+            - devices is defined
+            - devices | length > activated_osds.stdout_lines | length
 
     - name: generate ceph osd docker run script
       become: true
@@ -44,18 +75,21 @@
 - name: get osd ids
   shell: |
     ls /var/lib/ceph/osd/ | sed 's/.*-//'
-  register: osd_ids_non_container
+  register: ceph_disk_osd_ids
+  when: osd_scenario != 'lvm'
 
 - name: set_fact docker_exec_start_osd
   set_fact:
     docker_exec_start_osd: "{{ 'docker run --rm --ulimit nofile=1024:1024 --privileged=true -v /run/lvm/lvmetad.socket:/run/lvm/lvmetad.socket -v /var/run/udev/:/var/run/udev/:z -v /etc/ceph:/etc/ceph:z -v /dev:/dev --entrypoint=ceph-volume ' + ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment else 'ceph-volume' }}"
+  when: osd_scenario == 'lvm'
 
 - name: collect osd ids
   shell: >
     {{ docker_exec_start_osd }} lvm list --format json
   changed_when: false
   failed_when: false
-  register: ceph_osd_ids
+  register: ceph_volume_osd_ids
+  when: osd_scenario == 'lvm'
 
 - name: generate systemd unit file
   become: true
@@ -70,13 +104,41 @@
   when:
     - containerized_deployment
 
+- name: device to ID migration
+  when:
+    - containerized_deployment | bool
+    - osd_scenario != 'lvm'
+  block:
+    - name: check ceph-osd service using device name
+      shell: |
+        systemctl list-units | grep -E "loaded * active" | grep -coE "ceph-osd@([a-z]+|nvme[0-9]+n[0-9]+).service"
+      register: ceph_osd_device_name
+      changed_when: false
+      failed_when: false
+
+    - name: copy systemd-device-to-id.sh script
+      template:
+        src: systemd-device-to-id.sh.j2
+        dest: /tmp/systemd-device-to-id.sh
+        owner: root
+        group: root
+        mode: 0750
+      when: ceph_osd_device_name.stdout|int != 0
+
+    - name: run the systemd-device-to-id.sh script
+      command: /usr/bin/env bash /tmp/systemd-device-to-id.sh
+      when: ceph_osd_device_name.stdout|int != 0
+      with_items: "{{ groups[osd_group_name] }}"
+      delegate_to: "{{ item }}"
+      run_once: true
+
 - name: systemd start osd
   systemd:
-    name: ceph-osd@{{ item | regex_replace('/dev/', '') if osd_scenario != 'lvm' and containerized_deployment else item }}
+    name: ceph-osd@{{ item }}
     state: started
     enabled: yes
     daemon_reload: yes
-  with_items: "{{ devices if osd_scenario != 'lvm' and containerized_deployment else ((ceph_osd_ids.stdout | from_json).keys() | list) if osd_scenario == 'lvm' and not containerized_deployment else osd_ids_non_container.stdout_lines }}"
+  with_items: "{{ ((ceph_volume_osd_ids.stdout | from_json).keys() | list) if osd_scenario == 'lvm' else ceph_disk_osd_ids.stdout_lines }}"
 
 - name: ensure systemd service override directory exists
   file:
diff --git a/roles/ceph-osd/templates/ceph-osd-run.sh.j2 b/roles/ceph-osd/templates/ceph-osd-run.sh.j2
index a98dc6578..e7e0e29cf 100644
--- a/roles/ceph-osd/templates/ceph-osd-run.sh.j2
+++ b/roles/ceph-osd/templates/ceph-osd-run.sh.j2
@@ -12,8 +12,20 @@ DOCKER_ENV=""
 #############
 # FUNCTIONS #
 #############
+function id_to_device () {
+{% if dmcrypt | bool %}
+  docker run --rm --net=host --ulimit nofile=1024:1024 --ipc=host --pid=host --privileged=true -v /etc/ceph:/etc/ceph:z -v /var/lib/ceph/:/var/lib/ceph/:z -v /dev:/dev -v /etc/localtime:/etc/localtime:ro -e DEBUG=verbose -e CLUSTER={{ cluster }} {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }} osd_ceph_disk_dmcrypt_data_map
+{% endif %}
+  DATA_PART=$(docker run --rm --ulimit nofile=1024:1024 --privileged=true -v /dev/:/dev/ -v /etc/ceph:/etc/ceph:z --entrypoint ceph-disk {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }} list | grep ", osd\.${1}," | awk '{ print $1 }')
+  if [[ "${DATA_PART}" =~ ^/dev/(cciss|nvme) ]]; then
+    OSD_DEVICE=${DATA_PART:0:-2}
+  else
+    OSD_DEVICE=${DATA_PART:0:-1}
+  fi
+}
+
 function expose_partitions () {
-DOCKER_ENV=$(docker run --rm --net=host --name expose_partitions_${1} --privileged=true -v /dev/:/dev/ -v /etc/ceph:/etc/ceph:z -e CLUSTER={{ cluster }} -e OSD_DEVICE=/dev/${1} {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }} disk_list)
+  DOCKER_ENV=$(docker run --rm --net=host --privileged=true -v /dev/:/dev/ -v /etc/ceph:/etc/ceph:z -e CLUSTER={{ cluster }} -e OSD_DEVICE=${1} {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }} disk_list)
 }
 {% else -%}
 # NOTE(leseb): maintains backwards compatibility with old ceph-docker Jewel images
@@ -54,7 +66,8 @@ function expose_partitions {
 {% endif -%}
 
-expose_partitions "$1"
+id_to_device "$1"
+expose_partitions "${OSD_DEVICE}"
 
 # discover osd_objectstore for ceph-disk based osds
 if [[ $DOCKER_ENV =~ "BLUESTORE" ]]; then
@@ -122,12 +135,11 @@ numactl \
   -v /run/lvm/lvmetad.socket:/run/lvm/lvmetad.socket \
   -e CEPH_DAEMON=OSD_CEPH_VOLUME_ACTIVATE \
   -e OSD_ID="$1" \
-  --name=ceph-osd-"$1" \
 {% else -%}
   $DOCKER_ENV \
   -e CEPH_DAEMON=OSD_CEPH_DISK_ACTIVATE \
-  -e OSD_DEVICE=/dev/"${1}" \
-  --name=ceph-osd-{{ ansible_hostname }}-"${1}" \
+  -e OSD_DEVICE="${OSD_DEVICE}" \
 {% endif -%}
+  --name=ceph-osd-"$1" \
   {{ ceph_osd_docker_extra_env }} \
   {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }}
diff --git a/roles/ceph-osd/templates/systemd-device-to-id.sh.j2 b/roles/ceph-osd/templates/systemd-device-to-id.sh.j2
new file mode 100644
index 000000000..b73e1c1b9
--- /dev/null
+++ b/roles/ceph-osd/templates/systemd-device-to-id.sh.j2
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+DELAY="{{ handler_health_osd_check_delay }}"
+CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"
+
+check_pgs() {
+  num_pgs=$($docker_exec ceph $CEPH_CLI -s -f json|python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')
+  if [[ "$num_pgs" == "0" ]]; then
+    return 0
+  fi
+  while [ $RETRIES -ne 0 ]; do
+    test "$($docker_exec ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')" -eq "$($docker_exec ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print sum ( [ i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if "active+clean" in i["state_name"]])')"
+    RET=$?
+    test $RET -eq 0 && return 0
+    sleep $DELAY
+    let RETRIES=RETRIES-1
+  done
+  # PGs not clean, exiting with return code 1
+  echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"
+  echo "It is possible that the cluster has less OSDs than the replica configuration"
+  echo "Will refuse to continue"
+  $docker_exec ceph $CEPH_CLI -s
+  $docker_exec ceph $CEPH_CLI osd dump
+  $docker_exec ceph $CEPH_CLI osd tree
+  $docker_exec ceph $CEPH_CLI osd crush rule dump
+  exit 1
+}
+
+wait_for_socket_in_docker() {
+  osd_mount_point=$(docker exec "$1" df --output=target | grep '/var/lib/ceph/osd/')
+  whoami=$(docker exec "$1" cat $osd_mount_point/whoami)
+  if ! docker exec "$1" timeout 10 bash -c "while [ ! -e /var/run/ceph/{{ cluster }}-osd.${whoami}.asok ]; do sleep 1 ; done"; then
+    echo "Timed out while trying to look for a Ceph OSD socket."
+    echo "Abort mission!"
+    exit 1
+  fi
+}
+
+get_dev_name() {
+  echo $1 | sed -r 's/ceph-osd@([a-z]{1,4}|nvme[0-9]+n[0-9]+)\.service/\1/'
+}
+
+get_docker_id_from_dev_name() {
+  local id
+  local count
+  count=10
+  while [ $count -ne 0 ]; do
+    id=$(docker ps -q -f "name=${1}$")
+    test "$id" != "" && break
+    sleep $DELAY
+    let count=count-1
+  done
+  echo "$id"
+}
+
+for unit in $(systemctl list-units | grep -E "loaded * active" | grep -oE "ceph-osd@([a-z]+|nvme[0-9]+n[0-9]+).service"); do
+  dev_name=$(get_dev_name "$unit")
+  container_id=$(get_docker_id_from_dev_name "$dev_name")
+  wait_for_socket_in_docker "$container_id"
+  osd_id=$whoami
+  # Stop and Disable the unit based on device name
+  systemctl stop ceph-osd@${dev_name}
+  systemctl disable ceph-osd@${dev_name}
+  # Enable and Start the unit based on OSD id
+  systemctl enable ceph-osd@${osd_id}
+  systemctl start ceph-osd@${osd_id}
+  container_id=$(get_docker_id_from_dev_name "ceph-osd-${osd_id}")
+  docker_exec="docker exec $container_id"
+  SOCKET=/var/run/ceph/{{ cluster }}-osd.${osd_id}.asok
+  COUNT=10
+  while [ $COUNT -ne 0 ]; do
+    RETRIES="{{ handler_health_osd_check_retries }}"
+    $docker_exec test -S "$SOCKET" && check_pgs && continue 2
+    sleep $DELAY
+    let COUNT=COUNT-1
+  done
+  # If we reach this point, it means the socket is not present.
+  echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running. Showing ceph-osd unit logs now:"
+  journalctl -u "ceph-osd@${osd_id}.service"
+  exit 1
+done
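
For a single OSD, the device-to-id migration automated by the new
systemd-device-to-id.sh.j2 template can be sketched manually as below. This is
only an illustrative sketch, not part of the patch: the device name "sdb" is an
example, and the commands mirror the ones used in the template (docker ps name
filter on the device name, reading the whoami file under the OSD mount point,
then switching the systemd unit from the device name to the OSD id).

$ dev=sdb                                          # example device-named OSD
$ container_id=$(docker ps -q -f "name=${dev}$")   # old container name ends with the device
$ osd_mount=$(docker exec "$container_id" df --output=target | grep '/var/lib/ceph/osd/')
$ osd_id=$(docker exec "$container_id" cat ${osd_mount}/whoami)
$ systemctl stop ceph-osd@${dev} && systemctl disable ceph-osd@${dev}
$ systemctl enable ceph-osd@${osd_id} && systemctl start ceph-osd@${osd_id}

Unlike this sketch, the template also waits for the OSD socket to appear and
checks that PGs return to active+clean before moving on to the next unit.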