Merge pull request #1077 from font/rolling_update

Support containerized rolling update
2016-11-22 16:56:46 +01:00 · 2016-11-22 16:56:46 +01:00 · 829e2b6598
parent d6bfacdbf7 255e816e28
commit 829e2b6598
10 changed files with 178 additions and 106 deletions
--- a/group_vars/all.docker.sample
+++ b/group_vars/all.docker.sample
@ -39,6 +39,9 @@ dummy:
 #ceph_osd_docker_devices:
 # - /dev/sdb
 # - /dev/sdc
+#journal_size: 5120 # OSD journal size in MB
+#public_network: 0.0.0.0/0
+#cluster_network: "{{ public_network }}"

 #######
 # MDS #
--- a/infrastructure-playbooks/rolling_update.yml
+++ b/infrastructure-playbooks/rolling_update.yml
@ -28,10 +28,11 @@
      msg: >
        "Exiting rolling_update.yml playbook, cluster was NOT upgraded.
         To upgrade the cluster, either say 'yes' on the prompt or
-         or use `-e ireallymeanit=yes` on the command line when
+         use `-e ireallymeanit=yes` on the command line when
         invoking the playbook"
    when: ireallymeanit != 'yes'

+
 - name: gather facts and check the init system
  vars:
    mon_group_name:       mons
@ -48,6 +49,9 @@
  become: True
  tasks:
    - debug: msg="gather facts on all Ceph hosts for following reference"
+
+    - set_fact: rolling_update=true
+
    - name: check if sysvinit
      stat:
        path: /etc/rc?.d/S??ceph
@ -68,7 +72,6 @@

  vars:
    mon_group_name:       mons
-    restapi_group_name:   restapis
    health_mon_check_retries: 5
    health_mon_check_delay: 10
    upgrade_ceph_packages: True
@ -80,15 +83,11 @@
  become: True

  pre_tasks:
-    - include_vars: roles/ceph-common/defaults/main.yml
-    - include_vars: roles/ceph-mon/defaults/main.yml
-    - include_vars: roles/ceph-restapi/defaults/main.yml
-    - include_vars: group_vars/all
-      failed_when: false
-    - include_vars: group_vars/{{ mon_group_name }}
-      failed_when: false
-    - include_vars: group_vars/{{ restapi_group_name }}
-      failed_when: false
+    - name: set mon_host_count
+      set_fact: mon_host_count={{ groups.mons | length }}
+
+    - debug: msg="WARNING - upgrading a ceph cluster with only one monitor node ({{ inventory_hostname }})"
+      when: mon_host_count | int == 1

    - name: stop ceph mons with upstart
      service:
@ -108,23 +107,14 @@
        name: ceph-mon@{{ ansible_hostname }}
        state: stopped
        enabled: yes
-      when: is_systemd
+      when:
+        - is_systemd
+        - not mon_containerized_deployment

  roles:
-    - ceph-common
    - ceph-mon

  post_tasks:
-    - include_vars: roles/ceph-common/defaults/main.yml
-    - include_vars: roles/ceph-mon/defaults/main.yml
-    - include_vars: roles/ceph-restapi/defaults/main.yml
-    - include_vars: group_vars/all
-      failed_when: false
-    - include_vars: group_vars/{{ mon_group_name }}
-      failed_when: false
-    - include_vars: group_vars/{{ restapi_group_name }}
-      failed_when: false
-
    - name: start ceph mons with upstart
      service:
        name: ceph-mon
@ -143,21 +133,54 @@
        name: ceph-mon@{{ ansible_hostname }}
        state: started
        enabled: yes
-      when: is_systemd
+      when:
+        - is_systemd
+        - not mon_containerized_deployment

-    - name: select a running monitor
+    - name: restart containerized ceph mons with systemd
+      service:
+        name: ceph-mon@{{ ansible_hostname }}
+        state: restarted
+        enabled: yes
+      when:
+        - is_systemd
+        - mon_containerized_deployment
+
+    - name: set mon_host_count
+      set_fact: mon_host_count={{ groups.mons | length }}
+
+    - name: select a running monitor if multiple monitors
      set_fact: mon_host={{ item }}
      with_items: "{{ groups.mons }}"
-      when: item != inventory_hostname
+      when:
+        - mon_host_count | int > 1
+        - item != inventory_hostname  # every matching peer overwrites mon_host, so the last one wins
+
+    - name: select first monitor if only one monitor
+      set_fact: mon_host={{ item }}
+      with_items: "{{ groups.mons[0] }}"
+      when:
+        - mon_host_count | int == 1  # single-mon cluster: mon_host becomes groups.mons[0]

    - name: waiting for the monitor to join the quorum...
      shell: |
-        ceph -s  --cluster {{ cluster }} | grep monmap | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }}
+        ceph -s --cluster {{ cluster }} | grep monmap | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }}
      register: result
      until: result.rc == 0
      retries: "{{ health_mon_check_retries }}"
      delay: "{{ health_mon_check_delay }}"
      delegate_to: "{{ mon_host }}"
+      when: not mon_containerized_deployment
+
+    - name: waiting for the containerized monitor to join the quorum...
+      shell: |
+        docker exec {{ hostvars[mon_host]['ansible_hostname'] }} ceph -s --cluster {{ cluster }} | grep quorum | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }}
+      register: result
+      until: result.rc == 0
+      retries: "{{ health_mon_check_retries }}"
+      delay: "{{ health_mon_check_delay }}"
+      delegate_to: "{{ mon_host }}"
+      when: mon_containerized_deployment


 - name: upgrade ceph osds cluster
@ -175,13 +198,6 @@
  become: True

  pre_tasks:
-    - include_vars: roles/ceph-common/defaults/main.yml
-    - include_vars: roles/ceph-osd/defaults/main.yml
-    - include_vars: group_vars/all
-      failed_when: false
-    - include_vars: group_vars/{{ osd_group_name }}
-      failed_when: false
-
    - name: set osd flags
      command: ceph osd set {{ item }} --cluster {{ cluster }}
      with_items:
@ -189,68 +205,87 @@
        - noscrub
        - nodeep-scrub
      delegate_to: "{{ groups.mons[0] }}"
+      when: not mon_containerized_deployment
+
+    - name: set containerized osd flags
+      command: |
+          docker exec {{ hostvars[groups.mons[0]]['ansible_hostname'] }} ceph osd set {{ item }} --cluster {{ cluster }}
+      with_items:
+        - noout
+        - noscrub
+        - nodeep-scrub
+      delegate_to: "{{ groups.mons[0] }}"
+      when: mon_containerized_deployment  # NOTE(review): the matching "unset containerized osd flags" task tests osd_containerized_deployment - confirm which variable is intended

    - name: get osd numbers
      shell: "if [ -d /var/lib/ceph/osd ] ; then ls /var/lib/ceph/osd | cut -d '-' -f 2 ; fi"
      register: osd_ids
      changed_when: false
+      when: not osd_containerized_deployment

-    - name: stop ceph osds (upstart)
+    - name: stop ceph osds with upstart
      service:
        name: ceph-osd-all
        state: stopped
      when: is_upstart.stat.exists == True

-    - name: stop ceph osds (sysvinit)
+    - name: stop ceph osds with sysvinit
      service:
        name: ceph
        state: stopped
      when: is_sysvinit.stat.exists == True

-    - name: stop ceph osds (systemd)
+    - name: stop ceph osds with systemd
      service:
        name: ceph-osd@{{item}}
        state: stopped
        enabled: yes
      with_items: "{{ osd_ids.stdout_lines }}"
-      when: is_systemd
+      when:
+        - is_systemd
+        - not osd_containerized_deployment

  roles:
-    - ceph-common
    - ceph-osd

  post_tasks:
-    - include_vars: roles/ceph-common/defaults/main.yml
-    - include_vars: roles/ceph-osd/defaults/main.yml
-    - include_vars: group_vars/all
-      failed_when: false
-    - include_vars: group_vars/{{ osd_group_name }}
-      failed_when: false
-
    - name: get osd numbers
      shell: "if [ -d /var/lib/ceph/osd ] ; then ls /var/lib/ceph/osd | cut -d '-' -f 2 ; fi"
      register: osd_ids
      changed_when: false
+      when: not osd_containerized_deployment

-    - name: start ceph osds (upstart)
+    - name: start ceph osds with upstart
      service:
        name: ceph-osd-all
        state: started
      when: is_upstart.stat.exists == True

-    - name: start ceph osds (sysvinit)
+    - name: start ceph osds with sysvinit
      service:
        name: ceph
        state: started
      when: is_sysvinit.stat.exists == True

-    - name: start ceph osds (systemd)
+    - name: start ceph osds with systemd
      service:
        name: ceph-osd@{{item}}
        state: started
        enabled: yes
      with_items: "{{ osd_ids.stdout_lines }}"
-      when: is_systemd
+      when:
+        - is_systemd
+        - not osd_containerized_deployment
+
+    - name: restart containerized ceph osds with systemd
+      service:
+        name: ceph-osd@{{ item | basename }}  # unit instance is the device basename, e.g. /dev/sdb -> sdb
+        state: restarted
+        enabled: yes
+      with_items: "{{ ceph_osd_docker_devices }}"  # one restart per configured OSD device
+      when:
+        - is_systemd
+        - osd_containerized_deployment

    - name: waiting for clean pgs...
      shell: |
@ -260,6 +295,17 @@
      retries: "{{ health_osd_check_retries }}"
      delay: "{{ health_osd_check_delay }}"
      delegate_to: "{{ groups.mons[0] }}"
+      when: not osd_containerized_deployment
+
+    - name: container - waiting for clean pgs...
+      shell: |
+        test "$(docker exec {{ hostvars[groups.mons[0]]['ansible_hostname'] }} ceph pg stat --cluster {{ cluster }} | sed 's/^.*pgs://;s/active+clean.*//;s/ //')" -eq "$(docker exec {{ hostvars[groups.mons[0]]['ansible_hostname'] }} ceph pg stat --cluster {{ cluster }} | sed 's/pgs.*//;s/^.*://;s/ //')" && docker exec {{ hostvars[groups.mons[0]]['ansible_hostname'] }} ceph health --cluster {{ cluster }}  | egrep -sq "HEALTH_OK|HEALTH_WARN"
+      register: result
+      until: result.rc == 0
+      retries: "{{ health_osd_check_retries }}"
+      delay: "{{ health_osd_check_delay }}"
+      delegate_to: "{{ groups.mons[0] }}"
+      when: osd_containerized_deployment

    - name: unset osd flags
      command: ceph osd unset {{ item }} --cluster {{ cluster }}
@ -268,6 +314,17 @@
        - noscrub
        - nodeep-scrub
      delegate_to: "{{ groups.mons[0] }}"
+      when: not osd_containerized_deployment
+
+    - name: unset containerized osd flags
+      command: |
+          docker exec {{ hostvars[groups.mons[0]]['ansible_hostname'] }} ceph osd unset {{ item }} --cluster {{ cluster }}
+      with_items:
+        - noout
+        - noscrub
+        - nodeep-scrub
+      delegate_to: "{{ groups.mons[0] }}"
+      when: osd_containerized_deployment


 - name: upgrade ceph mdss cluster
@ -283,13 +340,6 @@
  become: True

  pre_tasks:
-    - include_vars: roles/ceph-common/defaults/main.yml
-    - include_vars: roles/ceph-mds/defaults/main.yml
-    - include_vars: group_vars/all
-      failed_when: false
-    - include_vars: group_vars/{{ mds_group_name }}
-      failed_when: false
-
    - name: stop ceph mdss with upstart
      service:
        name: ceph-mds
@ -309,20 +359,14 @@
        name: ceph-mds@{{ ansible_hostname }}
        state: stopped
        enabled: yes
-      when: is_systemd
+      when:
+        - is_systemd
+        - not mds_containerized_deployment

  roles:
-    - ceph-common
    - ceph-mds

  post_tasks:
-    - include_vars: roles/ceph-common/defaults/main.yml
-    - include_vars: roles/ceph-mds/defaults/main.yml
-    - include_vars: group_vars/all
-      failed_when: false
-    - include_vars: group_vars/{{ mds_group_name }}
-      failed_when: false
-
    - name: start ceph mdss with upstart
      service:
        name: ceph-mds
@ -342,7 +386,18 @@
        name: ceph-mds@{{ ansible_hostname }}
        state: started
        enabled: yes
-      when: is_systemd
+      when:
+        - is_systemd
+        - not mds_containerized_deployment
+
+    - name: restart containerized ceph mdss with systemd
+      service:
+        name: ceph-mds@{{ ansible_hostname }}
+        state: restarted
+        enabled: yes
+      when:
+        - is_systemd
+        - mds_containerized_deployment  # containerized MDS only; the non-containerized path uses the stop/start tasks


 - name: upgrade ceph rgws cluster
@ -358,19 +413,11 @@
  become: True

  pre_tasks:
-    - include_vars: roles/ceph-common/defaults/main.yml
-    - include_vars: roles/ceph-rgw/defaults/main.yml
-    - include_vars: group_vars/all
-      failed_when: false
-    - include_vars: group_vars/{{ rgw_group_name }}
-      failed_when: false
-
-    - name: stop ceph rgws with systemd
+    - name: stop ceph rgws with upstart
      service:
-        name: ceph-radosgw@rgw.{{ ansible_hostname }}
+        name: ceph-radosgw
        state: stopped
-        enabled: yes
-      when: is_systemd
+      when: is_upstart.stat.exists == True

    - name: stop ceph rgws with sysvinit
      service:
@ -378,30 +425,24 @@
        state: stopped
      when: is_sysvinit.stat.exists == True

-    - name: stop ceph rgws with upstart
+    - name: stop ceph rgws with systemd
      service:
-        name: ceph-radosgw
+        name: ceph-radosgw@rgw.{{ ansible_hostname }}
        state: stopped
-      when: is_upstart.stat.exists == True
+        enabled: yes
+      when:
+        - is_systemd
+        - not rgw_containerized_deployment

  roles:
-    - ceph-common
    - ceph-rgw

  post_tasks:
-    - include_vars: roles/ceph-common/defaults/main.yml
-    - include_vars: roles/ceph-rgw/defaults/main.yml
-    - include_vars: group_vars/all
-      failed_when: false
-    - include_vars: group_vars/{{ rgw_group_name }}
-      failed_when: false
-
-    - name: start ceph rgws with systemd
+    - name: start ceph rgws with upstart
      service:
-        name: ceph-radosgw@rgw.{{ ansible_hostname }}
+        name: ceph-radosgw
        state: started
-        enabled: yes
-      when: is_systemd
+      when: is_upstart.stat.exists == True

    - name: start ceph rgws with sysvinit
      service:
@ -409,8 +450,20 @@
        state: started
      when: is_sysvinit.stat.exists == True

-    - name: start ceph rgws with upstart
+    - name: start ceph rgws with systemd
      service:
-        name: ceph-radosgw
+        name: ceph-radosgw@rgw.{{ ansible_hostname }}
        state: started
-      when: is_upstart.stat.exists == True
+        enabled: yes
+      when:
+        - is_systemd
+        - not rgw_containerized_deployment
+
+    - name: restart containerized ceph rgws with systemd
+      service:
+        name: ceph-rgw@{{ ansible_hostname }}
+        state: restarted
+        enabled: yes
+      when:
+        - is_systemd
+        - rgw_containerized_deployment
--- a/roles/ceph-mds/tasks/docker/main.yml
+++ b/roles/ceph-mds/tasks/docker/main.yml
@ -11,12 +11,15 @@

 - name: set fact for using Atomic host
  set_fact:
-      is_atomic='{{ stat_ostree.stat.exists }}'
+    is_atomic: '{{ stat_ostree.stat.exists }}'

 - include: checks.yml
-  when: ceph_health.rc != 0
+  when:
+    - ceph_health.rc != 0
+    - not rolling_update | default(false) | bool  # bare expression: a quoted "{{ }}" is a non-empty string, so `not` was always false

 - include: pre_requisite.yml
+  when: not is_atomic

 - include: "{{ playbook_dir }}/roles/ceph-common/tasks/misc/ntp_atomic.yml"
  when:
--- a/roles/ceph-mon/tasks/docker/main.yml
+++ b/roles/ceph-mon/tasks/docker/main.yml
@ -17,8 +17,10 @@
  when:
    - ceph_health.rc != 0
    - not mon_containerized_deployment_with_kv
+    - not rolling_update | default(false) | bool  # bare expression: a quoted "{{ }}" is a non-empty string, so `not` was always false

 - include: pre_requisite.yml
+  when: not is_atomic

 - include: "{{ playbook_dir }}/roles/ceph-common/tasks/misc/ntp_atomic.yml"
  when:
@ -58,6 +60,14 @@

 - include: start_docker_monitor.yml

+# NOTE: without this wait, the configs are copied to the ansible host before
+# the client.admin keyring exists, which prevents later daemons (e.g.
+# ceph-mds) from properly retrieving the key.
+- name: wait for client.admin key exists
+  wait_for:
+    path: /etc/ceph/{{ cluster }}.client.admin.keyring
+  when: cephx
+
 - include: copy_configs.yml
  when: not mon_containerized_deployment_with_kv

--- a/roles/ceph-mon/tasks/docker/start_docker_monitor.yml
+++ b/roles/ceph-mon/tasks/docker/start_docker_monitor.yml
@ -72,11 +72,6 @@
  changed_when: false
  when: ansible_os_family == 'RedHat' or ansible_os_family == 'CoreOS'

- name: wait for ceph.conf exists
-  wait_for:
-      path: "/etc/ceph/{{ cluster }}.conf"
-  when: ansible_os_family == 'RedHat'
-
 - name: run the ceph monitor docker image
  docker:
    image: "{{ ceph_mon_docker_username }}/{{ ceph_mon_docker_imagename }}:{{ ceph_mon_docker_image_tag }}"
--- a/roles/ceph-nfs/tasks/docker/main.yml
+++ b/roles/ceph-nfs/tasks/docker/main.yml
@ -19,6 +19,7 @@
    not mon_containerized_deployment_with_kv

 - include: pre_requisite.yml
+  when: not is_atomic

 - include: "{{ playbook_dir }}/roles/ceph-common/tasks/misc/ntp_atomic.yml"
  when:
--- a/roles/ceph-osd/tasks/docker/main.yml
+++ b/roles/ceph-osd/tasks/docker/main.yml
@ -9,6 +9,7 @@
  when:
    - ceph_health.rc != 0
    - not osd_containerized_deployment_with_kv
+    - not rolling_update | default(false) | bool  # bare expression: a quoted "{{ }}" is a non-empty string, so `not` was always false

 - name: check if it is Atomic host
  stat: path=/run/ostree-booted
@ -16,9 +17,10 @@

 - name: set fact for using Atomic host
  set_fact:
-      is_atomic: '{{ stat_ostree.stat.exists }}'
+    is_atomic: '{{ stat_ostree.stat.exists }}'

 - include: pre_requisite.yml
+  when: not is_atomic

 - include: "{{ playbook_dir }}/roles/ceph-common/tasks/misc/ntp_atomic.yml"
  when:
--- a/roles/ceph-rbd-mirror/tasks/docker/main.yml
+++ b/roles/ceph-rbd-mirror/tasks/docker/main.yml
@ -17,6 +17,7 @@
  when: ceph_health.rc != 0

 - include: pre_requisite.yml
+  when: not is_atomic

 - include: "{{ playbook_dir }}/roles/ceph-common/tasks/misc/ntp_atomic.yml"
  when:
--- a/roles/ceph-restapi/tasks/docker/main.yml
+++ b/roles/ceph-restapi/tasks/docker/main.yml
@ -8,6 +8,7 @@
      is_atomic: '{{ stat_ostree.stat.exists }}'

 - include: pre_requisite.yml
+  when: not is_atomic

 - include: "{{ playbook_dir }}/roles/ceph-common/tasks/misc/ntp_atomic.yml"
  when:
--- a/roles/ceph-rgw/tasks/docker/main.yml
+++ b/roles/ceph-rgw/tasks/docker/main.yml
@ -11,12 +11,15 @@

 - name: set fact for using Atomic host
  set_fact:
-      is_atomic='{{ stat_ostree.stat.exists }}'
+    is_atomic: '{{ stat_ostree.stat.exists }}'

 - include: checks.yml
-  when: ceph_health.rc != 0
+  when:
+    - ceph_health.rc != 0
+    - not rolling_update | default(false) | bool  # bare expression: a quoted "{{ }}" is a non-empty string, so `not` was always false

 - include: pre_requisite.yml
+  when: not is_atomic

 - include: "{{ playbook_dir }}/roles/ceph-common/tasks/misc/ntp_atomic.yml"
  when: