Add handlers for containerized deployment

Until now, there were no handlers for containerized deployments.

Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com>
pull/1727/head
Guillaume Abrioux 2017-07-29 01:00:06 +02:00
parent fc6b6e9859
commit 7a333d05ce
15 changed files with 170 additions and 90 deletions

View File

@ -97,13 +97,21 @@
tasks:
- name: disable ceph rgw service
# For backward compatibility
- name: disable ceph rgw service (old unit name, for backward compatibility)
service:
name: "ceph-rgw@{{ ansible_hostname }}"
state: stopped
enabled: no
ignore_errors: true
- name: disable ceph rgw service (new unit name)
service:
name: "ceph-radosgw@{{ ansible_hostname }}"
state: stopped
enabled: no
ignore_errors: true
- name: remove ceph rgw container
docker:
image: "{{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }}"
@ -113,8 +121,12 @@
- name: remove ceph rgw service
file:
path: /etc/systemd/system/ceph-rgw@.service
path: "{{ item }}"
state: absent
with_items:
# For backward compatibility
- /etc/systemd/system/ceph-rgw@.service
- /etc/systemd/system/ceph-radosgw@.service
- name: remove ceph rgw image
docker_image:

View File

@ -450,7 +450,7 @@
- name: restart containerized ceph rgws with systemd
service:
name: ceph-rgw@{{ ansible_hostname }}
name: ceph-radosgw@{{ ansible_hostname }}
state: restarted
enabled: yes
when:

View File

@ -1,15 +0,0 @@
---
# These checks are used to avoid running handlers at initial deployment.
- name: check for a ceph socket
shell: "stat /var/run/ceph/*.asok > /dev/null 2>&1"
changed_when: false
failed_when: false
always_run: true
register: socket
- name: check for a rados gateway socket
shell: "stat {{ rbd_client_admin_socket_path }}*.asok > /dev/null 2>&1"
changed_when: false
failed_when: false
always_run: true
register: socketrgw

View File

@ -105,7 +105,6 @@
- ceph_current_fsid.rc == 0
- mon_group_name in group_names
- include: ./checks/check_socket.yml
- include: create_ceph_initial_dirs.yml
- include: generate_ceph_conf.yml
- include: create_rbd_client_dir.yml

View File

@ -1,38 +0,0 @@
#!/bin/bash
RETRIES="{{ handler_health_osd_check_retries }}"
DELAY="{{ handler_health_osd_check_delay }}"
CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"
check_pgs() {
while [ $RETRIES -ne 0 ]; do
test "[""$(ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')""]" = "$(ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print [ i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if i["state_name"] == "active+clean"]')"
RET=$?
test $RET -eq 0 && return 0
sleep $DELAY
let RETRIES=RETRIES-1
done
# PGs not clean, exiting with return code 1
echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"
echo "It is possible that the cluster has less OSDs than the replica configuration"
echo "Will refuse to continue"
ceph $CEPH_CLI -s
exit 1
}
for id in $(ls /var/lib/ceph/osd/ | sed 's/.*-//'); do
# First, restart daemon(s)
systemctl restart ceph-osd@${id}
# We need to wait because it may take some time for the socket to actually exists
COUNT=10
# Wait and ensure the socket exists after restarting the daemon
SOCKET=/var/run/ceph/{{ cluster }}-osd.${id}.asok
while [ $COUNT -ne 0 ]; do
test -S $SOCKET && check_pgs && continue 2
sleep 1
let COUNT=COUNT-1
done
# If we reach this point, it means the socket is not present.
echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running."
exit 1
done

View File

@ -367,7 +367,7 @@ os_tuning_params:
##########
# DOCKER #
##########
docker_exec_cmd:
docker: false
ceph_docker_image: "ceph/daemon"
ceph_docker_image_tag: latest

View File

@ -17,39 +17,51 @@
- name: restart ceph mon daemon(s)
command: /tmp/restart_mon_daemon.sh
listen: "restart ceph mons"
when:
# We do not want to run these checks on initial deployment (`socket.rc == 0`)
- socket.rc == 0
- ceph_current_fsid.rc == 0
- mon_group_name in group_names
# This does not just restart OSDs but everything else too. Unfortunately
# at this time the ansible role does not have an OSD id list to use
# for restarting them specifically.
- block:
- name: copy osd restart script
template:
src: restart_osd_daemon.sh.j2
dest: /tmp/restart_osd_daemon.sh
owner: root
group: root
mode: 0750
listen: "restart ceph osds"
- name: restart ceph osds daemon(s)
command: /tmp/restart_osd_daemon.sh
listen: "restart ceph osds"
when: handler_health_osd_check
- name: copy osd restart script
template:
src: restart_osd_daemon.sh.j2
dest: /tmp/restart_osd_daemon.sh
owner: root
group: root
mode: 0750
listen: "restart ceph osds"
when:
# We do not want to run these checks on initial deployment (`socket.rc == 0`)
# except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
- inventory_hostname in play_hosts
- osd_group_name in group_names
- name: restart containerized ceph osds daemon(s)
command: /tmp/restart_osd_daemon.sh
listen: "restart ceph osds"
with_items: "{{ socket_osd_container.results }}"
when:
# We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
# except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
- ((crush_location is defined and crush_location) or item.get('rc') == 0)
- handler_health_osd_check
# See https://github.com/ceph/ceph-ansible/issues/1457 for the condition below
- inventory_hostname in play_hosts
- osd_group_name in group_names
- name: restart non-containerized ceph osds daemon(s)
command: /tmp/restart_osd_daemon.sh
listen: "restart ceph osds"
when:
# We do not want to run these checks on initial deployment (`socket.rc == 0`)
# except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
- ((crush_location is defined and crush_location) or socket.rc == 0)
- ceph_current_fsid.rc == 0
- osd_group_name in group_names
# See https://github.com/ceph/ceph-ansible/issues/1457 for the condition below
- handler_health_osd_check
# See https://github.com/ceph/ceph-ansible/issues/1457 for the condition below
- inventory_hostname in play_hosts
- osd_group_name in group_names
- name: restart ceph mdss
service:

View File

@ -0,0 +1,21 @@
---
# These checks are used to avoid running handlers at initial deployment.
# Probe for any admin socket (*.asok) under rbd_client_admin_socket_path.
# docker_exec_cmd is prepended to the command as-is. The task never fails and
# never reports a change: its only purpose is to record the return code of
# `stat` in the `socket` variable.
- name: check for a ceph socket
  register: socket
  always_run: true
  failed_when: false
  changed_when: false
  shell: |
    {{ docker_exec_cmd }} bash -c 'stat {{ rbd_client_admin_socket_path }}/*.asok > /dev/null 2>&1'
# Probe each OSD container (one per entry in `devices`) for an admin socket.
# The container name is built from the hostname and the device path with every
# '/' removed. The per-device results are registered in `socket_osd_container`;
# the task itself never fails and never reports a change.
- name: check for a ceph socket in containerized deployment (osds)
  shell: |
    docker exec ceph-osd-"{{ ansible_hostname }}"-"{{ item | replace('/', '') }}" bash -c 'stat /var/run/ceph/*.asok > /dev/null 2>&1'
  changed_when: false
  failed_when: false
  always_run: true
  register: socket_osd_container
  with_items: "{{ devices }}"
  when:
    - containerized_deployment
    # Default to an empty list: without it, a play whose inventory has no OSD
    # group would evaluate `in` against None and abort with a templating error.
    - inventory_hostname in groups.get(osd_group_name, [])

View File

@ -1,2 +1,3 @@
---
- include: facts.yml
- include: check_socket.yml

View File

@ -3,13 +3,12 @@
RETRIES="{{ handler_health_mon_check_retries }}"
DELAY="{{ handler_health_mon_check_delay }}"
MONITOR_NAME="{{ monitor_name }}"
CLUSTER="{{ cluster }}"
SOCKET=/var/run/ceph/${CLUSTER}-mon.${MONITOR_NAME}.asok
SOCKET=/var/run/ceph/{{ cluster }}-mon.${MONITOR_NAME}.asok
check_quorum() {
while [ $RETRIES -ne 0 ]; do
MEMBERS=$(ceph --cluster ${CLUSTER} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
MEMBERS=$({{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
test "${MEMBERS/$MONITOR_NAME}" != "$MEMBERS" && exit 0
sleep $DELAY
let RETRIES=RETRIES-1
@ -17,7 +16,7 @@ done
# If we reach this point, it means there is a problem with the quorum
echo "Error with quorum."
echo "cluster status:"
ceph --cluster ${CLUSTER} -s
{{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s
exit 1
}
@ -27,7 +26,7 @@ systemctl restart ceph-mon@${MONITOR_NAME}
COUNT=10
# Wait and ensure the socket exists after restarting the daemon
while [ $COUNT -ne 0 ]; do
test -S $SOCKET && check_quorum
{{ docker_exec_cmd }} test -S $SOCKET && check_quorum
sleep 1
let COUNT=COUNT-1
done

View File

@ -0,0 +1,78 @@
#!/bin/bash
RETRIES="{{ handler_health_osd_check_retries }}"
DELAY="{{ handler_health_osd_check_delay }}"
CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"
# Poll `ceph -s` until the number of active+clean PGs equals the total number
# of PGs. Returns 0 as soon as they match. If they still differ after RETRIES
# attempts (sleeping DELAY seconds between attempts), prints the cluster status
# and exits the whole script with code 1.
# Reads the globals RETRIES, DELAY, CEPH_CLI and docker_exec (the latter is
# empty/unset unless the caller set it).
check_pgs() {
  # Work on a per-call copy so that every OSD gets the full retry budget
  # instead of whatever the previous OSD left over in the global counter.
  local retries=$RETRIES
  while [ "$retries" -ne 0 ]; do
    test "[""$($docker_exec ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')""]" = "$($docker_exec ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print [ i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if i["state_name"] == "active+clean"]')"
    RET=$?
    test $RET -eq 0 && return 0
    sleep $DELAY
    retries=$((retries - 1))
  done
  # PGs not clean, exiting with return code 1
  echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"
  echo "It is possible that the cluster has less OSDs than the replica configuration"
  echo "Will refuse to continue"
  # CEPH_CLI holds several options and must be word-split: quoting it would
  # hand the whole option string to ceph as one single argument.
  $docker_exec ceph $CEPH_CLI -s
  exit 1
}
# Wait (at most 10 seconds, enforced by `timeout` inside the container) for an
# admin socket to show up under /var/run/ceph in container $1.
# On failure, reports the problem and exits with code 1.
wait_for_socket_in_docker() {
  if ! docker exec "$1" timeout 10 bash -c "while [ ! -e /var/run/ceph/*.asok ]; do sleep 1 ; done"; then
    # Use echo like the rest of this script: no `log` helper is defined here,
    # so calling it would only produce "command not found".
    echo "Timed out while trying to look for a Ceph OSD socket."
    echo "Abort mission!"
    exit 1
  fi
}
# Print the device name embedded in a containerized OSD unit name,
# e.g. "ceph-osd@sda.service" -> "sda".
# The letters-only pattern has no length limit so that it accepts every unit
# name the main loop's `[a-z]+` selection can produce. Input that does not
# match the pattern is printed unchanged.
get_dev_name() {
  echo "$1" | sed -r 's/ceph-osd@([a-z]+)\.service/\1/'
}
# Print the id(s) reported by `docker ps -q` for containers whose name matches
# the filter "name=$1". The lookup is attempted up to 10 times, one second
# apart, and stops at the first non-empty answer. Prints an empty line when
# nothing was found.
get_docker_id_from_dev_name() {
  local container_id
  local attempts_left=10
  until [ "$attempts_left" -eq 0 ]; do
    container_id=$(docker ps -q -f "name=$1")
    if [ -n "$container_id" ]; then
      break
    fi
    sleep 1
    attempts_left=$((attempts_left - 1))
  done
  echo "$container_id"
}
# Wait for the admin socket of container $1, then list /var/run/ceph inside
# that container and print the second dot-separated field of each entry.
# For a socket named "<cluster>-osd.<id>.asok" that field is the OSD id.
get_docker_osd_id() {
  # Quote the container id so it is always passed on as exactly one argument.
  wait_for_socket_in_docker "$1"
  docker exec "$1" ls /var/run/ceph | cut -d'.' -f2
}
# Restart every OSD unit known to systemd, one at a time, and wait for the
# cluster to be healthy again before moving on to the next one.
# For containerized deployments, the unit file looks like: ceph-osd@sda.service
# For non-containerized deployments, the unit file looks like: ceph-osd@0.service
# The numeric alternative has no length limit so that OSD ids >= 100 are not
# silently skipped; the dot is escaped so that it only matches a literal ".".
for unit in $(systemctl list-units | grep -oE "ceph-osd@([0-9]+|[a-z]+)\.service"); do
  # First, restart daemon(s)
  systemctl restart "${unit}"
  # We need to wait because it may take some time for the socket to actually exist
  COUNT=10
  # Wait and ensure the socket exists after restarting the daemon
{% if containerized_deployment -%}
  id=$(get_dev_name "$unit")
  container_id=$(get_docker_id_from_dev_name "$id")
  if [ -z "$container_id" ]; then
    echo "No running container could be found for ${unit}, which means the osd daemon is not running."
    exit 1
  fi
  # The helper runs in a subshell: an `exit 1` in there does not stop this
  # script, so its status has to be checked explicitly.
  osd_id=$(get_docker_osd_id "$container_id") || exit 1
  docker_exec="docker exec $container_id"
{% else %}
  osd_id=$(echo ${unit#ceph-osd@} | grep -oE '[0-9]+')
{% endif %}
  # The socket is named after the cluster, which is not necessarily "test".
  SOCKET=/var/run/ceph/{{ cluster }}-osd.${osd_id}.asok
  while [ $COUNT -ne 0 ]; do
    # check_pgs either returns 0 or exits the script, so on success we move on
    # to the next unit of the outer loop.
    $docker_exec test -S "$SOCKET" && check_pgs && continue 2
    sleep 1
    let COUNT=COUNT-1
  done
  # If we reach this point, it means the socket is not present.
  echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running."
  exit 1
done

View File

@ -44,10 +44,15 @@
config_type: ini
when:
- (not mon_containerized_default_ceph_conf_with_kv and
(inventory_hostname in groups.get(mon_group_name, []))) or
(inventory_hostname in groups.get(mon_group_name, []) or inventory_hostname in groups.get(osd_group_name, []))) or
(not mon_containerized_default_ceph_conf_with_kv and
((groups.get(nfs_group_name, []) | length > 0)
and (inventory_hostname == groups.get(nfs_group_name, [])[0])))
notify:
- restart ceph mons
- restart ceph osds
- restart ceph mdss
- restart ceph rgws
- name: set fsid fact when generate_fsid = true
set_fact:

View File

@ -105,7 +105,6 @@ openstack_keys:
##########
# DOCKER #
##########
docker_exec_cmd:
ceph_mon_docker_subnet: "{{ public_network }}"  # subnet of the monitor_interface
# ceph_mon_docker_extra_env:

View File

@ -2,14 +2,21 @@
- name: generate systemd unit file
become: true
template:
src: "{{ role_path }}/templates/ceph-rgw.service.j2"
dest: /etc/systemd/system/ceph-rgw@.service
src: "{{ role_path }}/templates/ceph-radosgw.service.j2"
dest: /etc/systemd/system/ceph-radosgw@.service
owner: "root"
group: "root"
mode: "0644"
# For backward compatibility: the unit used to be named 'ceph-rgw@'.
# Stop it and disable it at boot. `disable` is not a valid value for `state`,
# and because errors are ignored below, that mistake would otherwise go
# unnoticed and leave the old unit untouched.
- name: disable old systemd unit ('ceph-rgw@') if present
  service:
    name: "ceph-rgw@{{ ansible_hostname }}"
    state: stopped
    enabled: false
  # The old unit does not exist on fresh deployments; that is not an error.
  ignore_errors: true
- name: enable systemd unit file for rgw instance
shell: systemctl enable ceph-rgw@{{ ansible_hostname }}.service
shell: systemctl enable ceph-radosgw@{{ ansible_hostname }}.service
failed_when: false
changed_when: false
@ -20,7 +27,7 @@
- name: systemd start rgw container
service:
name: ceph-rgw@{{ ansible_hostname }}
name: ceph-radosgw@{{ ansible_hostname }}
state: started
enabled: yes
changed_when: false