Add handlers for containerized deployment

Until now, there have been no handlers for containerized deployments.
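
For context, the handlers below follow Ansible's notify/listen pattern. A
minimal, hypothetical sketch of that wiring (task names and the container
name are invented for illustration, not taken from this commit):

    # A config task notifies a topic...
    - name: template ceph.conf
      template:
        src: ceph.conf.j2
        dest: /etc/ceph/ceph.conf
      notify:
        - restart ceph mons

    # ...and every handler listening on that topic runs, here restarting
    # the containerized daemon only if it was already up before the change.
    - name: restart containerized ceph mon
      command: docker restart ceph-mon-{{ ansible_hostname }}
      listen: "restart ceph mons"
      when: socket.rc == 0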

Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com>
pull/1727/head
Guillaume Abrioux 2017-07-29 01:00:06 +02:00
parent fc6b6e9859
commit 7a333d05ce
15 changed files with 170 additions and 90 deletions


@@ -97,13 +97,21 @@
   tasks:

-  - name: disable ceph rgw service
+  # For backward compatibility
+  - name: disable ceph rgw service (old unit name, for backward compatibility)
     service:
       name: "ceph-rgw@{{ ansible_hostname }}"
       state: stopped
       enabled: no
     ignore_errors: true

+  - name: disable ceph rgw service (new unit name)
+    service:
+      name: "ceph-radosgw@{{ ansible_hostname }}"
+      state: stopped
+      enabled: no
+    ignore_errors: true
+
   - name: remove ceph rgw container
     docker:
       image: "{{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }}"
@@ -113,8 +121,12 @@
   - name: remove ceph rgw service
     file:
-      path: /etc/systemd/system/ceph-rgw@.service
+      path: "{{ item }}"
       state: absent
+    with_items:
+      # For backward compatibility
+      - /etc/systemd/system/ceph-rgw@.service
+      - /etc/systemd/system/ceph-radosgw@.service

   - name: remove ceph rgw image
     docker_image:


@@ -450,7 +450,7 @@
   - name: restart containerized ceph rgws with systemd
     service:
-      name: ceph-rgw@{{ ansible_hostname }}
+      name: ceph-radosgw@{{ ansible_hostname }}
       state: restarted
       enabled: yes
     when:


@@ -1,15 +0,0 @@
----
-# These checks are used to avoid running handlers at initial deployment.
-- name: check for a ceph socket
-  shell: "stat /var/run/ceph/*.asok > /dev/null 2>&1"
-  changed_when: false
-  failed_when: false
-  always_run: true
-  register: socket
-
-- name: check for a rados gateway socket
-  shell: "stat {{ rbd_client_admin_socket_path }}*.asok > /dev/null 2>&1"
-  changed_when: false
-  failed_when: false
-  always_run: true
-  register: socketrgw


@@ -105,7 +105,6 @@
     - ceph_current_fsid.rc == 0
     - mon_group_name in group_names

-- include: ./checks/check_socket.yml
 - include: create_ceph_initial_dirs.yml
 - include: generate_ceph_conf.yml
 - include: create_rbd_client_dir.yml


@@ -1,38 +0,0 @@
-#!/bin/bash
-
-RETRIES="{{ handler_health_osd_check_retries }}"
-DELAY="{{ handler_health_osd_check_delay }}"
-CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"
-
-check_pgs() {
-  while [ $RETRIES -ne 0 ]; do
-    test "[""$(ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')""]" = "$(ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print [ i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if i["state_name"] == "active+clean"]')"
-    RET=$?
-    test $RET -eq 0 && return 0
-    sleep $DELAY
-    let RETRIES=RETRIES-1
-  done
-  # PGs not clean, exiting with return code 1
-  echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"
-  echo "It is possible that the cluster has less OSDs than the replica configuration"
-  echo "Will refuse to continue"
-  ceph $CEPH_CLI -s
-  exit 1
-}
-
-for id in $(ls /var/lib/ceph/osd/ | sed 's/.*-//'); do
-  # First, restart daemon(s)
-  systemctl restart ceph-osd@${id}
-  # We need to wait because it may take some time for the socket to actually exists
-  COUNT=10
-  # Wait and ensure the socket exists after restarting the daemon
-  SOCKET=/var/run/ceph/{{ cluster }}-osd.${id}.asok
-  while [ $COUNT -ne 0 ]; do
-    test -S $SOCKET && check_pgs && continue 2
-    sleep 1
-    let COUNT=COUNT-1
-  done
-  # If we reach this point, it means the socket is not present.
-  echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running."
-  exit 1
-done


@@ -367,7 +367,7 @@ os_tuning_params:
 ##########
 # DOCKER #
 ##########
+docker_exec_cmd:
 docker: false
 ceph_docker_image: "ceph/daemon"
 ceph_docker_image_tag: latest


@@ -17,17 +17,14 @@
 - name: restart ceph mon daemon(s)
   command: /tmp/restart_mon_daemon.sh
   listen: "restart ceph mons"
   when:
     # We do not want to run these checks on initial deployment (`socket.rc == 0`)
     - socket.rc == 0
-    - ceph_current_fsid.rc == 0
     - mon_group_name in group_names

 # This does not just restart OSDs but everything else too. Unfortunately
 # at this time the ansible role does not have an OSD id list to use
 # for restarting them specifically.
-- block:
 - name: copy osd restart script
   template:
     src: restart_osd_daemon.sh.j2
     dest: /tmp/restart_osd_daemon.sh
@@ -36,20 +33,35 @@
     owner: root
     group: root
     mode: 0750
   listen: "restart ceph osds"
+  when:
+    - inventory_hostname in play_hosts
+    - osd_group_name in group_names

-- name: restart ceph osds daemon(s)
+- name: restart containerized ceph osds daemon(s)
   command: /tmp/restart_osd_daemon.sh
   listen: "restart ceph osds"
-  when: handler_health_osd_check
+  with_items: "{{ socket_osd_container.results }}"
   when:
-    # We do not want to run these checks on initial deployment (`socket.rc == 0`)
+    # We do not want to run these checks on initial deployment (`socket_osd_container.results[n].rc == 0`)
     # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
-    - ((crush_location is defined and crush_location) or socket.rc == 0)
-    - ceph_current_fsid.rc == 0
-    - osd_group_name in group_names
+    - ((crush_location is defined and crush_location) or item.get('rc') == 0)
+    - handler_health_osd_check
+    # See https://github.com/ceph/ceph-ansible/issues/1457 for the condition below
+    - inventory_hostname in play_hosts
+    - osd_group_name in group_names
+
+- name: restart non-containerized ceph osds daemon(s)
+  command: /tmp/restart_osd_daemon.sh
+  listen: "restart ceph osds"
+  when:
+    # We do not want to run these checks on initial deployment (`socket.rc == 0`)
+    # except when a crush location is specified. ceph-disk will start the osds before the osd crush location is specified
+    - ((crush_location is defined and crush_location) or socket.rc == 0)
+    - ceph_current_fsid.rc == 0
+    - handler_health_osd_check
     # See https://github.com/ceph/ceph-ansible/issues/1457 for the condition below
     - inventory_hostname in play_hosts
+    - osd_group_name in group_names

 - name: restart ceph mdss
   service:


@@ -0,0 +1,21 @@
+---
+# These checks are used to avoid running handlers at initial deployment.
+- name: check for a ceph socket
+  shell: |
+    {{ docker_exec_cmd }} bash -c 'stat {{ rbd_client_admin_socket_path }}/*.asok > /dev/null 2>&1'
+  changed_when: false
+  failed_when: false
+  always_run: true
+  register: socket
+
+- name: check for a ceph socket in containerized deployment (osds)
+  shell: |
+    docker exec ceph-osd-"{{ ansible_hostname }}"-"{{ item | replace('/', '') }}" bash -c 'stat /var/run/ceph/*.asok > /dev/null 2>&1'
+  changed_when: false
+  failed_when: false
+  always_run: true
+  register: socket_osd_container
+  with_items: "{{ devices }}"
+  when:
+    - containerized_deployment
+    - inventory_hostname in groups.get(osd_group_name)
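
A note on the task above: `register` on a `with_items` task stores one result
per item under `.results`, each with its own `rc`. The "restart containerized
ceph osds daemon(s)" handler earlier in this commit loops over
`socket_osd_container.results` and keys off `item.get('rc')`. A hedged
illustration of the registered variable's shape (device names invented):

    socket_osd_container:
      results:
        - item: /dev/sda
          rc: 0    # socket stat succeeded, daemon was running, safe to restart
        - item: /dev/sdb
          rc: 1    # no socket yet, initial deployment for this OSD, skip it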


@@ -1,2 +1,3 @@
 ---
 - include: facts.yml
+- include: check_socket.yml


@@ -3,13 +3,12 @@
 RETRIES="{{ handler_health_mon_check_retries }}"
 DELAY="{{ handler_health_mon_check_delay }}"
 MONITOR_NAME="{{ monitor_name }}"
-CLUSTER="{{ cluster }}"
-SOCKET=/var/run/ceph/${CLUSTER}-mon.${MONITOR_NAME}.asok
+SOCKET=/var/run/ceph/{{ cluster }}-mon.${MONITOR_NAME}.asok

 check_quorum() {
 while [ $RETRIES -ne 0 ]; do
-        MEMBERS=$(ceph --cluster ${CLUSTER} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
+        MEMBERS=$({{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s --format json | sed -r 's/.*"quorum_names":(\[[^]]+\]).*/\1/')
         test "${MEMBERS/$MONITOR_NAME}" != "$MEMBERS" && exit 0
         sleep $DELAY
         let RETRIES=RETRIES-1
@@ -17,7 +16,7 @@ done
 # If we reach this point, it means there is a problem with the quorum
 echo "Error with quorum."
 echo "cluster status:"
-ceph --cluster ${CLUSTER} -s
+{{ docker_exec_cmd }} ceph --cluster {{ cluster }} -s
 exit 1
 }
@@ -27,7 +26,7 @@ systemctl restart ceph-mon@${MONITOR_NAME}
 COUNT=10
 # Wait and ensure the socket exists after restarting the daemon
 while [ $COUNT -ne 0 ]; do
-  test -S $SOCKET && check_quorum
+  {{ docker_exec_cmd }} test -S $SOCKET && check_quorum
   sleep 1
   let COUNT=COUNT-1
 done


@@ -0,0 +1,78 @@
+#!/bin/bash
+
+RETRIES="{{ handler_health_osd_check_retries }}"
+DELAY="{{ handler_health_osd_check_delay }}"
+CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/{{ cluster }}.keyring --cluster {{ cluster }}"
+
+check_pgs() {
+  while [ $RETRIES -ne 0 ]; do
+    test "[""$($docker_exec ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')""]" = "$($docker_exec ceph $CEPH_CLI -s -f json | python -c 'import sys, json; print [ i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if i["state_name"] == "active+clean"]')"
+    RET=$?
+    test $RET -eq 0 && return 0
+    sleep $DELAY
+    let RETRIES=RETRIES-1
+  done
+  # PGs not clean, exiting with return code 1
+  echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"
+  echo "It is possible that the cluster has less OSDs than the replica configuration"
+  echo "Will refuse to continue"
+  $docker_exec ceph $CEPH_CLI -s
+  exit 1
+}
+
+wait_for_socket_in_docker() {
+  if ! docker exec "$1" timeout 10 bash -c "while [ ! -e /var/run/ceph/*.asok ]; do sleep 1 ; done"; then
+    echo "Timed out while trying to look for a Ceph OSD socket."
+    echo "Abort mission!"
+    exit 1
+  fi
+}
+
+get_dev_name() {
+  echo $1 | sed -r 's/ceph-osd@([a-z]{1,4})\.service/\1/'
+}
+
+get_docker_id_from_dev_name() {
+  local id
+  local count
+  count=10
+  while [ $count -ne 0 ]; do
+    id=$(docker ps -q -f "name=$1")
+    test "$id" != "" && break
+    sleep 1
+    let count=count-1
+  done
+  echo "$id"
+}
+
+get_docker_osd_id() {
+  wait_for_socket_in_docker $1
+  docker exec "$1" ls /var/run/ceph | cut -d'.' -f2
+}
+
+# For containerized deployments, the unit file looks like: ceph-osd@sda.service
+# For non-containerized deployments, the unit file looks like: ceph-osd@0.service
+for unit in $(systemctl list-units | grep -oE "ceph-osd@([0-9]{1,2}|[a-z]+).service"); do
+  # First, restart daemon(s)
+  systemctl restart "${unit}"
+  # We need to wait because it may take some time for the socket to actually exist
+  COUNT=10
+  # Wait and ensure the socket exists after restarting the daemon
+  {% if containerized_deployment -%}
+  id=$(get_dev_name "$unit")
+  container_id=$(get_docker_id_from_dev_name "$id")
+  osd_id=$(get_docker_osd_id "$container_id")
+  docker_exec="docker exec $container_id"
+  {% else %}
+  osd_id=$(echo ${unit#ceph-osd@} | grep -oE '[0-9]{1,2}')
+  {% endif %}
+  SOCKET=/var/run/ceph/{{ cluster }}-osd.${osd_id}.asok
+  while [ $COUNT -ne 0 ]; do
+    $docker_exec test -S "$SOCKET" && check_pgs && continue 2
+    sleep 1
+    let COUNT=COUNT-1
+  done
+  # If we reach this point, it means the socket is not present.
+  echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running."
+  exit 1
+done


@@ -44,10 +44,15 @@
     config_type: ini
   when:
     - (not mon_containerized_default_ceph_conf_with_kv and
-       (inventory_hostname in groups.get(mon_group_name, []))) or
+       (inventory_hostname in groups.get(mon_group_name, []) or inventory_hostname in groups.get(osd_group_name, []))) or
       (not mon_containerized_default_ceph_conf_with_kv and
        ((groups.get(nfs_group_name, []) | length > 0)
         and (inventory_hostname == groups.get(nfs_group_name, [])[0])))
+  notify:
+    - restart ceph mons
+    - restart ceph osds
+    - restart ceph mdss
+    - restart ceph rgws

 - name: set fsid fact when generate_fsid = true
   set_fact:


@@ -105,7 +105,6 @@ openstack_keys:
 ##########
 # DOCKER #
 ##########
-docker_exec_cmd:
 ceph_mon_docker_subnet: "{{ public_network }}" # subnet of the monitor_interface
 # ceph_mon_docker_extra_env:


@@ -2,14 +2,21 @@
 - name: generate systemd unit file
   become: true
   template:
-    src: "{{ role_path }}/templates/ceph-rgw.service.j2"
-    dest: /etc/systemd/system/ceph-rgw@.service
+    src: "{{ role_path }}/templates/ceph-radosgw.service.j2"
+    dest: /etc/systemd/system/ceph-radosgw@.service
     owner: "root"
     group: "root"
     mode: "0644"

+# For backward compatibility
+- name: disable old systemd unit ('ceph-rgw@') if present
+  service:
+    name: ceph-rgw@{{ ansible_hostname }}
+    enabled: no
+  ignore_errors: true
+
 - name: enable systemd unit file for rgw instance
-  shell: systemctl enable ceph-rgw@{{ ansible_hostname }}.service
+  shell: systemctl enable ceph-radosgw@{{ ansible_hostname }}.service
   failed_when: false
   changed_when: false
@@ -20,7 +27,7 @@
 - name: systemd start rgw container
   service:
-    name: ceph-rgw@{{ ansible_hostname }}
+    name: ceph-radosgw@{{ ansible_hostname }}
     state: started
     enabled: yes
   changed_when: false