Add handlers for containerized deployment

Until now, there were no handlers for containerized deployments. Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com>
2017-07-29 01:00:06 +02:00 · 2017-07-29 01:00:06 +02:00 · 7a333d05ce
parent fc6b6e9859
commit 7a333d05ce
15 changed files with 170 additions and 90 deletions
--- a/infrastructure-playbooks/purge-docker-cluster.yml
+++ b/infrastructure-playbooks/purge-docker-cluster.yml
@ -97,13 +97,21 @@

  tasks:

-  - name: disable ceph rgw service
+# For backward compatibility
+  - name: disable ceph rgw service (old unit name, for backward compatibility)
    service:
      name: "ceph-rgw@{{ ansible_hostname }}"
      state: stopped
      enabled: no
    ignore_errors: true

+  - name: disable ceph rgw service (new unit name)
+    service:
+      name: "ceph-radosgw@{{ ansible_hostname }}"
+      state: stopped
+      enabled: no
+    ignore_errors: true
+
  - name: remove ceph rgw container
    docker:
      image: "{{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }}"
@ -113,8 +121,12 @@

  - name: remove ceph rgw service
    file:
-      path: /etc/systemd/system/ceph-rgw@.service
+      path: "{{ item }}"
      state: absent
+    with_items:
+# For backward compatibility
+      - /etc/systemd/system/ceph-rgw@.service
+      - /etc/systemd/system/ceph-radosgw@.service

  - name: remove ceph rgw image
    docker_image:
--- a/infrastructure-playbooks/rolling_update.yml
+++ b/infrastructure-playbooks/rolling_update.yml
@ -450,7 +450,7 @@

    - name: restart containerized ceph rgws with systemd
      service:
-        name: ceph-rgw@{{ ansible_hostname }}
+        name: ceph-radosgw@{{ ansible_hostname }}
        state: restarted
        enabled: yes
      when:
--- a/roles/ceph-common/tasks/checks/check_socket.yml
+++ b/roles/ceph-common/tasks/checks/check_socket.yml
@ -1,15 +0,0 @@
---
-# These checks are used to avoid running handlers at initial deployment.
- name: check for a ceph socket
-  shell: "stat /var/run/ceph/*.asok > /dev/null 2>&1"
-  changed_when: false
-  failed_when: false
-  always_run: true
-  register: socket
-
- name: check for a rados gateway socket
-  shell: "stat {{ rbd_client_admin_socket_path }}*.asok > /dev/null 2>&1"
-  changed_when: false
-  failed_when: false
-  always_run: true
-  register: socketrgw
--- a/roles/ceph-common/tasks/main.yml
+++ b/roles/ceph-common/tasks/main.yml
@ -105,7 +105,6 @@
    - ceph_current_fsid.rc == 0
    - mon_group_name in group_names

- include: ./checks/check_socket.yml
 - include: create_ceph_initial_dirs.yml
 - include: generate_ceph_conf.yml
 - include: create_rbd_client_dir.yml
--- a/roles/ceph-common/templates/restart_osd_daemon.sh.j2
+++ b/roles/ceph-common/templates/restart_osd_daemon.sh.j2
@ -1,38 +0,0 @@
-#!/bin/bash
-
-RETRIES="{{ handler_health_osd_check_retries }}"
-DELAY="{{ handler_health_osd_check_delay }}"
-CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"
-
-check_pgs() {
-  while [ $RETRIES -ne 0 ]; do
-    test "[""$(ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')""]" = "$(ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print [ i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if i["state_name"] == "active+clean"]')"
-    RET=$?
-    test $RET -eq 0 && return 0
-    sleep $DELAY
-    let RETRIES=RETRIES-1
-  done
-  # PGs not clean, exiting with return code 1
-  echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"
-  echo "It is possible that the cluster has less OSDs than the replica configuration"
-  echo "Will refuse to continue"
-  ceph $CEPH_CLI -s
-  exit 1
-}
-
-for id in $(ls /var/lib/ceph/osd/ | sed 's/.*-//'); do
-  # First, restart daemon(s)
-  systemctl restart ceph-osd@${id}
-  # We need to wait because it may take some time for the socket to actually exists
-  COUNT=10
-  # Wait and ensure the socket exists after restarting the daemon
-  SOCKET=/var/run/ceph/{{ cluster }}-osd.${id}.asok
-  while [ $COUNT -ne 0 ]; do
-    test -S $SOCKET && check_pgs && continue 2
-    sleep 1
-    let COUNT=COUNT-1
-  done
-  # If we reach this point, it means the socket is not present.
-  echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running."
-  exit 1
-done
--- a/roles/ceph-defaults/defaults/main.yml
+++ b/roles/ceph-defaults/defaults/main.yml
@ -367,7 +367,7 @@ os_tuning_params:
 ##########
 # DOCKER #
 ##########
-
+docker_exec_cmd:
 docker: false
 ceph_docker_image: "ceph/daemon"
 ceph_docker_image_tag: latest
--- a/roles/ceph-defaults/handlers/main.yml
+++ b/roles/ceph-defaults/handlers/main.yml
@ -17,39 +17,51 @@
  - name: restart ceph mon daemon(s)
    command: /tmp/restart_mon_daemon.sh
    listen: "restart ceph mons"
-
  when:
 # We do not want to run these checks on initial deployment (`socket.rc == 0`)
    - socket.rc == 0
-    - ceph_current_fsid.rc == 0
    - mon_group_name in group_names

 # This does not just restart OSDs but everything else too. Unfortunately
 # at this time the ansible role does not have an OSD id list to use
 # for restarting them specifically.
- block:
-  - name: copy osd restart script
-    template:
-      src: restart_osd_daemon.sh.j2
-      dest: /tmp/restart_osd_daemon.sh
-      owner: root
-      group: root
-      mode: 0750
-    listen: "restart ceph osds"
-
-  - name: restart ceph osds daemon(s)
-    command: /tmp/restart_osd_daemon.sh
-    listen: "restart ceph osds"
-    when: handler_health_osd_check
-
+- name: copy osd restart script
+  template:
+    src: restart_osd_daemon.sh.j2
+    dest: /tmp/restart_osd_daemon.sh
+    owner: root
+    group: root
+    mode: 0750
+  listen: "restart ceph osds"
  when:
-# We do not want to run these checks on initial deployment (`socket.rc == 0`)
-# except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
+    - inventory_hostname in play_hosts
+    - osd_group_name in group_names
+
+- name: restart containerized ceph osds daemon(s)
+  command: /tmp/restart_osd_daemon.sh
+  listen: "restart ceph osds"
+  # Fall back to an empty list if the socket check registered no per-device results
+  with_items: "{{ socket_osd_container.results | default([]) }}"
+  when:
+  # We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
+  # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
+    - ((crush_location is defined and crush_location) or item.get('rc') == 0)
+    - handler_health_osd_check
+    # The per-device socket check only runs on containerized deployments; without this
+    # guard a defined crush_location would trigger this handler on non-containerized hosts too
+    - containerized_deployment
+    # See https://github.com/ceph/ceph-ansible/issues/1457 for the condition below
+    - inventory_hostname in play_hosts
+    - osd_group_name in group_names
+
+- name: restart non-containerized ceph osds daemon(s)
+  command: /tmp/restart_osd_daemon.sh
+  listen: "restart ceph osds"
+  when:
+  # We do not want to run these checks on initial deployment (`socket.rc == 0`)
+  # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
    - ((crush_location is defined and crush_location) or socket.rc == 0)
    - ceph_current_fsid.rc == 0
-    - osd_group_name in group_names
-# See https://github.com/ceph/ceph-ansible/issues/1457 for the condition below
+    - handler_health_osd_check
+    # Containerized hosts are restarted by the containerized handler; never run both on one host
+    - not containerized_deployment
+    # See https://github.com/ceph/ceph-ansible/issues/1457 for the condition below
    - inventory_hostname in play_hosts
+    - osd_group_name in group_names

 - name: restart ceph mdss
  service:
--- a/roles/ceph-defaults/tasks/check_socket.yml
+++ b/roles/ceph-defaults/tasks/check_socket.yml
@ -0,0 +1,21 @@
+---
+# These checks are used to avoid running handlers at initial deployment.
+- name: check for a ceph socket
+  shell: |
+    {{ docker_exec_cmd }} bash -c 'stat {{ rbd_client_admin_socket_path }}/*.asok > /dev/null 2>&1'
+  changed_when: false
+  failed_when: false
+  always_run: true
+  register: socket
+
+# One check per OSD device; each result (with its rc) ends up in socket_osd_container.results.
+- name: check for a ceph socket in containerized deployment (osds)
+  shell: |
+    docker exec ceph-osd-"{{ ansible_hostname }}"-"{{ item | replace('/', '') }}" bash -c 'stat /var/run/ceph/*.asok > /dev/null 2>&1'
+  changed_when: false
+  failed_when: false
+  always_run: true
+  register: socket_osd_container
+  with_items: "{{ devices }}"
+  when:
+    - containerized_deployment
+    # Default to an empty list so that a missing OSD group does not break the membership test
+    - inventory_hostname in groups.get(osd_group_name, [])
--- a/roles/ceph-defaults/tasks/main.yml
+++ b/roles/ceph-defaults/tasks/main.yml
@ -1,2 +1,3 @@
 ---
 - include: facts.yml
+- include: check_socket.yml
--- a/roles/ceph-defaults/templates/restart_mon_daemon.sh.j2
+++ b/roles/ceph-defaults/templates/restart_mon_daemon.sh.j2
@ -3,13 +3,12 @@
 RETRIES="{{ handler_health_mon_check_retries }}"
 DELAY="{{ handler_health_mon_check_delay }}"
 MONITOR_NAME="{{ monitor_name }}"
-CLUSTER="{{ cluster }}"
-SOCKET=/var/run/ceph/${CLUSTER}-mon.${MONITOR_NAME}.asok
+SOCKET=/var/run/ceph/{{ cluster }}-mon.${MONITOR_NAME}.asok


 check_quorum() {
 while [ $RETRIES -ne 0 ]; do
-  MEMBERS=$(ceph --cluster ${CLUSTER} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
+  MEMBERS=$({{ docker_exec_cmd }} ceph --cluster {{ cluster }}   -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
  test "${MEMBERS/$MONITOR_NAME}" != "$MEMBERS" && exit 0
  sleep $DELAY
  let RETRIES=RETRIES-1
@ -17,7 +16,7 @@ done
 # If we reach this point, it means there is a problem with the quorum
 echo "Error with quorum."
 echo "cluster status:"
-ceph --cluster ${CLUSTER} -s
+{{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s
 exit 1
 }

@ -27,7 +26,7 @@ systemctl restart ceph-mon@${MONITOR_NAME}
 COUNT=10
 # Wait and ensure the socket exists after restarting the daemon
 while [ $COUNT -ne 0 ]; do
-  test -S $SOCKET && check_quorum
+  {{ docker_exec_cmd }} test -S $SOCKET && check_quorum
  sleep 1
  let COUNT=COUNT-1
 done
--- a/roles/ceph-defaults/templates/restart_osd_daemon.sh.j2
+++ b/roles/ceph-defaults/templates/restart_osd_daemon.sh.j2
@ -0,0 +1,78 @@
+#!/bin/bash
+# Restart the ceph-osd systemd units of this host one by one, checking the PGs after each restart.
+RETRIES="{{ handler_health_osd_check_retries }}"
+DELAY="{{ handler_health_osd_check_delay }}"
+CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"
+# Poll 'ceph -s' until every PG is reported active+clean; exit 1 once the retries are used up.
+check_pgs() {
+  while [ $RETRIES -ne 0 ]; do
+    test "[""$($docker_exec ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')""]" = "$($docker_exec ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print [ i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if i["state_name"] == "active+clean"]')"
+    RET=$?
+    test $RET -eq 0 && return 0
+    sleep $DELAY
+    let RETRIES=RETRIES-1
+  done
+  # PGs not clean, exiting with return code 1 ($CEPH_CLI stays unquoted: it holds several arguments)
+  echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"
+  echo "It is possible that the cluster has less OSDs than the replica configuration"
+  echo "Will refuse to continue"
+  $docker_exec ceph $CEPH_CLI -s
+  exit 1
+}
+# Wait up to 10 seconds for an admin socket to appear in container $1; messages go to stderr (stdout is captured by callers).
+wait_for_socket_in_docker() {
+  if ! docker exec "$1" timeout 10 bash -c "while [ ! -e /var/run/ceph/*.asok ]; do sleep 1 ; done"; then
+    echo "Timed out while trying to look for a Ceph OSD socket." >&2
+    echo "Abort mission!" >&2
+    exit 1
+  fi
+}
+# Extract the device name from a unit name, e.g. ceph-osd@sda.service -> sda.
+get_dev_name() {
+  echo $1 | sed -r 's/ceph-osd@([a-z]{1,4})\.service/\1/'
+}
+# Print the id of the running container whose name matches $1 (10 attempts, one second apart).
+get_docker_id_from_dev_name() {
+  local id
+  local count
+  count=10
+  while [ $count -ne 0 ]; do
+    id=$(docker ps -q -f "name=$1")
+    test "$id" != "" && break
+    sleep 1
+    let count=count-1
+  done
+  echo "$id"
+}
+# Print the OSD id, taken from the second dot-separated field of the socket file name in container $1.
+get_docker_osd_id() {
+  wait_for_socket_in_docker $1
+  docker exec "$1" ls /var/run/ceph | cut -d'.' -f2
+}
+
+# For containerized deployments, the unit file looks like: ceph-osd@sda.service
+# For non-containerized deployments, the unit file looks like: ceph-osd@0.service
+for unit in $(systemctl list-units | grep -oE "ceph-osd@([0-9]+|[a-z]+)\.service"); do
+  # First, restart daemon(s)
+  systemctl restart "${unit}"
+  # We need to wait because it may take some time for the socket to actually exists
+  COUNT=10
+  # Wait and ensure the socket exists after restarting the daemon
+  {% if containerized_deployment -%}
+  id=$(get_dev_name "$unit")
+  container_id=$(get_docker_id_from_dev_name "$id")
+  osd_id=$(get_docker_osd_id "$container_id")
+  docker_exec="docker exec $container_id"
+  {% else %}
+  osd_id=$(echo ${unit#ceph-osd@} | grep -oE '[0-9]+')
+  {% endif %}
+  SOCKET=/var/run/ceph/{{ cluster }}-osd.${osd_id}.asok
+  while [ $COUNT -ne 0 ]; do
+    $docker_exec test -S "$SOCKET" && check_pgs && continue 2
+    sleep 1
+    let COUNT=COUNT-1
+  done
+  # If we reach this point, it means the socket is not present.
+  echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running."
+  exit 1
+done
--- a/roles/ceph-docker-common/tasks/create_configs.yml
+++ b/roles/ceph-docker-common/tasks/create_configs.yml
@ -44,10 +44,15 @@
    config_type: ini
  when:
    - (not mon_containerized_default_ceph_conf_with_kv and
-        (inventory_hostname in groups.get(mon_group_name, []))) or
+        (inventory_hostname in groups.get(mon_group_name, []) or inventory_hostname in groups.get(osd_group_name, []))) or
      (not mon_containerized_default_ceph_conf_with_kv and
        ((groups.get(nfs_group_name, []) | length > 0)
          and (inventory_hostname == groups.get(nfs_group_name, [])[0])))
+  notify:
+    - restart ceph mons
+    - restart ceph osds
+    - restart ceph mdss
+    - restart ceph rgws

 - name: set fsid fact when generate_fsid = true
  set_fact:
--- a/roles/ceph-mon/defaults/main.yml
+++ b/roles/ceph-mon/defaults/main.yml
@ -105,7 +105,6 @@ openstack_keys:
 ##########
 # DOCKER #
 ##########
-docker_exec_cmd:
 ceph_mon_docker_subnet: "{{ public_network }}"# subnet of the monitor_interface

 # ceph_mon_docker_extra_env:
--- a/roles/ceph-rgw/tasks/docker/start_docker_rgw.yml
+++ b/roles/ceph-rgw/tasks/docker/start_docker_rgw.yml
@ -2,14 +2,21 @@
 - name: generate systemd unit file
  become: true
  template:
-    src: "{{ role_path }}/templates/ceph-rgw.service.j2"
-    dest: /etc/systemd/system/ceph-rgw@.service
+    src: "{{ role_path }}/templates/ceph-radosgw.service.j2"
+    dest: /etc/systemd/system/ceph-radosgw@.service
    owner: "root"
    group: "root"
    mode: "0644"

+# For backward compatibility: stop and disable the unit under its former name ('ceph-rgw@').
+# 'disable' is not a valid value for the service module's state; disabling is done with 'enabled'.
+- name: disable old systemd unit ('ceph-rgw@') if present
+  service:
+    name: ceph-rgw@{{ ansible_hostname }}
+    state: stopped
+    enabled: no
+  # The old unit may not exist on this host (see "if present" above), so failures are tolerated
+  ignore_errors: true
+
 - name: enable systemd unit file for rgw instance
-  shell: systemctl enable ceph-rgw@{{ ansible_hostname }}.service
+  shell: systemctl enable ceph-radosgw@{{ ansible_hostname }}.service
  failed_when: false
  changed_when: false

@ -20,7 +27,7 @@

 - name: systemd start rgw container
  service:
-    name: ceph-rgw@{{ ansible_hostname }}
+    name: ceph-radosgw@{{ ansible_hostname }}
    state: started
    enabled: yes
  changed_when: false
--- a/roles/ceph-rgw/templates/ceph-radosgw.service.j2
+++ b/roles/ceph-rgw/templates/ceph-radosgw.service.j2