---
# This playbook is used to recover Ceph OSDs after an SSD journal failure.
# You will also realize that it is really simple to bring your
# OSDs back to life after replacing the faulty SSD with a new one.
#
# You must define the `dev_ssds` variable for the host whose SSD was
# replaced after the failure.
#
# For example, in host_vars/hostname1.yml:
#
#  dev_ssds:
#    - device_name: sdd
#      partitions:
#        - index: 1
#          size: 10G
#          osd_id: 0
#        - index: 2
#          size: 10G
#          osd_id: 1
#    - device_name: sdf
#      partitions:
#        - index: 1
#          size: 10G
#          osd_id: 2
#
# @param device_name: The device name of the new SSD
# @param partitions: The custom partition layout of the new SSD
# @param index: The index of this partition
# @param size: The size of this partition
# @param osd_id: The OSD whose journal this partition is for
#
# ansible-playbook recover-osds-after-ssd-journal-failure.yml
#     Prompts for the host to recover. Defaults to null, in which case no
#     host is selected for recovery. Enter the hostname of the host whose
#     OSDs should be recovered after the SSD journal failure.
#
# ansible-playbook -e target_host=hostname \
#     recover-osds-after-ssd-journal-failure.yml
#     Overrides the prompt with the -e option. Can be used in
#     automation scripts to avoid the interactive prompt.
- hosts: localhost
  gather_facts: no
  vars_prompt:
    - name: target_host
      prompt: please enter the target hostname on which to recover the osds after the ssd journal failure
      private: no
  tasks:
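    # Add the selected host to a temporary group so the next play can target it.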
    - add_host:
        name: "{{ target_host }}"
        groups: dynamically_created_hosts

- hosts: dynamically_created_hosts
  vars:
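    # GPT partition type GUID that ceph-disk assigns to Ceph journal partitions.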
    journal_typecode: 45b0969e-9b03-4f30-b4c6-b4b80ceff106
    dev_ssds: []
  tasks:
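    # The cluster name is the basename of the *.conf file under /etc/ceph/
    # that contains the fsid, e.g. /etc/ceph/ceph.conf -> "ceph".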
    - name: get the name of the existing ceph cluster
      shell: |
        basename $(grep -R fsid /etc/ceph/ | egrep -o '^[^.]*')
      changed_when: false
      register: cluster_name
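    # Each OSD data directory stores the GUID of its journal partition in a
    # journal_uuid file; its presence confirms the OSD really lives on this host.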
    - name: check if osd(s) journal_uuid file exists
      stat:
        path: "/var/lib/ceph/osd/{{ cluster_name.stdout }}-{{ item.1.osd_id }}/journal_uuid"
      register: osds_dir_stat
      with_subelements:
        - "{{ dev_ssds }}"
        - partitions
      when: dev_ssds is defined
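    # Abort the play if any of the requested OSD ids is not present on this host.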
    - name: exit playbook, osd(s) not on this host
      fail:
        msg: exit playbook, osd(s) not on this host
      with_items: "{{ osds_dir_stat.results }}"
      when:
        - osds_dir_stat is defined
        - not item.stat.exists
    - name: install sgdisk(gdisk)
      package:
        name: gdisk
        state: present
      when: dev_ssds is defined
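    # Read back the journal partition GUID each OSD expects, so the new
    # partition can be created with exactly the same GUID.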
    - name: get osd(s) journal uuid
      shell: cat "/var/lib/ceph/osd/{{ cluster_name.stdout }}-{{ item.1.osd_id }}/journal_uuid"
      register: osds_uuid
      with_subelements:
        - "{{ dev_ssds }}"
        - partitions
      when: dev_ssds is defined
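    # sgdisk creates each journal partition at the requested index and size,
    # names it "ceph journal", tags it with the journal typecode, and reuses
    # the saved GUID so the OSD's journal symlink (by-partuuid) resolves again.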
    - name: make partitions on new ssd
      shell: >
        sgdisk --new={{ item.item[1].index }}:0:+{{ item.item[1].size }}
        "--change-name={{ item.item[1].index }}:ceph journal"
        --typecode={{ item.item[1].index }}:{{ journal_typecode }}
        --partition-guid={{ item.item[1].index }}:{{ item.stdout }}
        --mbrtogpt -- /dev/{{ item.item[0].device_name }}
      with_items:
        - "{{ osds_uuid.results }}"
      when: dev_ssds is defined
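    # The OSD must be stopped before its journal can be recreated.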
    - name: stop osd(s) service
      service:
        name: "ceph-osd@{{ item.item[1].osd_id }}"
        state: stopped
        enabled: no
      with_items:
        - "{{ osds_uuid.results }}"
      when: dev_ssds is defined
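    # ceph-osd --mkjournal initializes a fresh journal for the OSD on the
    # newly created partition.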
    - name: reinitialize osd(s) journal in new ssd
      shell: >
        ceph-osd -i {{ item.item[1].osd_id }} --mkjournal --cluster {{ cluster_name.stdout }}
      with_items:
        - "{{ osds_uuid.results }}"
      when: dev_ssds is defined and cluster_name is defined
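    # Bring the OSD back up and re-enable it now that its journal is valid again.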
    - name: start osd(s) service
      service:
        name: "ceph-osd@{{ item.item[1].osd_id }}"
        state: started
        enabled: yes
      with_items:
        - "{{ osds_uuid.results }}"
      when: dev_ssds is defined