Merge pull request #1455 from ceph/restart_daemons

Common: Restore check_socket
2017-04-24 06:54:07 -07:00 · 2017-04-24 06:54:07 -07:00 · b28424334a
parent 58e7d39bcc 800b439667
commit b28424334a
6 changed files with 31 additions and 9 deletions
--- a/roles/ceph-common/handlers/main.yml
+++ b/roles/ceph-common/handlers/main.yml
@ -19,6 +19,9 @@
    listen: "restart ceph mons"

  when:
+# Skip these handlers on initial deployment: run only when a ceph socket already exists (`socket.rc == 0`)
+    - socket.rc == 0
+    - ceph_current_fsid.rc == 0
    - mon_group_name in group_names

 # This does not just restart OSDs but everything else too. Unfortunately
@ -37,10 +40,15 @@
  - name: restart ceph osds daemon(s)
    command: /tmp/restart_osd_daemon.sh
    listen: "restart ceph osds"
-    when:
-      - handler_health_osd_check
+    when: handler_health_osd_check
+
  when:
+# Skip these handlers on initial deployment: run only when a ceph socket already exists (`socket.rc == 0`)
+    - socket.rc == 0
+    - ceph_current_fsid.rc == 0
    - osd_group_name in group_names
+# See https://github.com/ceph/ceph-ansible/issues/1457 for the condition below
+    - item in play_hosts

 - name: restart ceph mdss
  service:
--- a/roles/ceph-common/tasks/checks/check_socket.yml
+++ b/roles/ceph-common/tasks/checks/check_socket.yml
@ -0,0 +1,15 @@
+---
+# Probe for existing admin sockets; handlers test the registered `rc` to skip restarts at initial deployment.
+- name: check for a ceph socket
+  shell: "stat /var/run/ceph/*.asok > /dev/null 2>&1"
+  changed_when: false  # read-only probe, never reports a change
+  failed_when: false  # a missing socket is not an error; consumers inspect `socket.rc`
+  check_mode: false  # replaces deprecated `always_run: true`: still run in --check mode so `socket` is registered
+  register: socket
+
+- name: check for a rados gateway socket
+  shell: "stat {{ rbd_client_admin_socket_path }}*.asok > /dev/null 2>&1"
+  changed_when: false  # read-only probe, never reports a change
+  failed_when: false  # a missing socket is not an error; the result is kept in `socketrgw`
+  check_mode: false  # replaces deprecated `always_run: true`: still run in --check mode so `socketrgw` is registered
+  register: socketrgw
--- a/roles/ceph-common/tasks/main.yml
+++ b/roles/ceph-common/tasks/main.yml
@ -87,6 +87,7 @@
  static: False

 - include: facts.yml
+- include: ./checks/check_socket.yml
 - include: create_ceph_initial_dirs.yml
 - include: generate_cluster_fsid.yml
 - include: generate_ceph_conf.yml
--- a/roles/ceph-common/templates/restart_mon_daemon.sh.j2
+++ b/roles/ceph-common/templates/restart_mon_daemon.sh.j2
@ -15,6 +15,9 @@ while [ $RETRIES -ne 0 ]; do
  let RETRIES=RETRIES-1
 done
 # If we reach this point, it means there is a problem with the quorum
+echo "Error with quorum."
+echo "cluster status:"
+ceph --cluster ${CLUSTER} -s
 exit 1
 }

@ -29,5 +32,5 @@ while [ $COUNT -ne 0 ]; do
  let COUNT=COUNT-1
 done
 # If we reach this point, it means the socket is not present.
-echo "Error while restarting mon daemon"
+echo "Socket file ${SOCKET} could not be found, which means the monitor is not running."
 exit 1
--- a/roles/ceph-common/templates/restart_osd_daemon.sh.j2
+++ b/roles/ceph-common/templates/restart_osd_daemon.sh.j2
@ -17,7 +17,6 @@ check_pgs() {
  exit 1
 }

-
 for id in $(ls /var/lib/ceph/osd/ | sed 's/.*-//'); do
  # First, restart daemon(s)
  systemctl restart ceph-osd@${id}
@ -31,6 +30,6 @@ for id in $(ls /var/lib/ceph/osd/ | sed 's/.*-//'); do
    let COUNT=COUNT-1
  done
  # If we reach this point, it means the socket is not present.
-  echo "Error while restarting mon daemon"
+  echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running."
  exit 1
 done
--- a/tests/functional/centos/7/journal-collocation/group_vars/all
+++ b/tests/functional/centos/7/journal-collocation/group_vars/all
@ -12,7 +12,3 @@ journal_collocation: True
 os_tuning_params:
  - { name: kernel.pid_max, value: 4194303 }
  - { name: fs.file-max, value: 26234859 }
-ceph_conf_overrides:
-  global:
-    osd_pool_default_pg_num: 8
-    osd_pool_default_size: 1