From 52ff9ce5d1a09facdd55a49256fa292f83401a78 Mon Sep 17 00:00:00 2001 From: Guillaume Abrioux Date: Thu, 20 Jun 2019 14:45:07 +0200 Subject: [PATCH] facts: add a retry on get current fsid task Sometimes the following task fails: ``` TASK [ceph-facts : get current fsid] ******************************************* task path: /home/jenkins-build/build/workspace/ceph-ansible-prs-dev-centos-container-update/roles/ceph-facts/tasks/facts.yml:78 Wednesday 19 June 2019 18:12:49 +0000 (0:00:00.203) 0:02:39.995 ******** fatal: [mon2 -> mon1]: FAILED! => changed=true cmd: - timeout - --foreground - -s - KILL - 600s - docker - exec - ceph-mon-mon1 - ceph - --cluster - ceph - daemon - mon.mon1 - config - get - fsid delta: '0:00:00.239339' end: '2019-06-19 18:12:49.812099' msg: non-zero return code rc: 22 start: '2019-06-19 18:12:49.572760' stderr: 'admin_socket: exception getting command descriptions: [Errno 2] No such file or directory' stderr_lines: stdout: '' stdout_lines: ``` It is not clear exactly why this happens, since just before this task mon1 appears to be up; otherwise it wouldn't have passed the task `waiting for the containerized monitor to join the quorum`. As a quick fix/workaround, let's add a retry which allows us to get around this situation: ``` TASK [ceph-facts : get current fsid] ******************************************* task path: /home/jenkins-build/build/workspace/ceph-ansible-scenario/roles/ceph-facts/tasks/facts.yml:78 Thursday 20 June 2019 15:35:07 +0000 (0:00:00.201) 0:03:47.288 ********* FAILED - RETRYING: get current fsid (3 retries left). 
changed: [mon2 -> mon1] => changed=true attempts: 2 cmd: - timeout - --foreground - -s - KILL - 600s - docker - exec - ceph-mon-mon1 - ceph - --cluster - ceph - daemon - mon.mon1 - config - get - fsid delta: '0:00:00.290252' end: '2019-06-20 15:35:13.960188' rc: 0 start: '2019-06-20 15:35:13.669936' stderr: '' stderr_lines: stdout: |- { "fsid": "153e159d-7ade-42a7-842c-4d04348b901e" } stdout_lines: ``` Signed-off-by: Guillaume Abrioux (cherry picked from commit 46a268394490cb37bdb56fb839ecc8711bda1ec0) --- roles/ceph-facts/tasks/facts.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/roles/ceph-facts/tasks/facts.yml b/roles/ceph-facts/tasks/facts.yml index c6f978179..987c11181 100644 --- a/roles/ceph-facts/tasks/facts.yml +++ b/roles/ceph-facts/tasks/facts.yml @@ -79,6 +79,7 @@ command: "{{ timeout_command }} {{ container_exec_cmd }} ceph --cluster {{ cluster }} daemon mon.{{ hostvars[mon_host | default(groups[mon_group_name][0])]['ansible_hostname'] }} config get fsid" register: rolling_update_fsid delegate_to: "{{ mon_host | default(groups[mon_group_name][0]) }}" + until: rolling_update_fsid is succeeded when: rolling_update | bool - name: set_fact fsid