---
# This playbook is used to recover Ceph OSDs after an SSD journal failure.
# You will also realise that it's really simple to bring your
# OSDs back to life after replacing your faulty SSD with a new one.
#
# You must define the `dev_ssds` variable for each host whose SSDs were
# replaced after the failure.
#
# For example, in host_vars/hostname1.yml:
#
# dev_ssds:
#   - device_name: sdd
#     partitions:
#       - index: 1
#         size: 10G
#         osd_id: 0
#       - index: 2
#         size: 10G
#         osd_id: 1
#   - device_name: sdf
#     partitions:
#       - index: 1
#         size: 10G
#         osd_id: 2
#
# @param device_name: The device name of the new SSD.
# @param partitions: The custom partition layout of the new SSD.
# @param index: The index of this partition.
# @param size: The size of this partition.
# @param osd_id: The ID of the OSD whose journal lives on this partition.
#
# ansible-playbook recover-osds-after-ssd-journal-failure.yml
#     Prompts for the hostname of the host whose OSDs should be
#     recovered after the SSD journal failure. Defaults to null,
#     in which case no host is selected for recovery.
#
# ansible-playbook -e target_host=hostname \
#     recover-osds-after-ssd-journal-failure.yml
#     Overrides the prompt with the -e option. Useful in automation
#     scripts to avoid the interactive prompt.

- hosts: localhost
  gather_facts: no
  vars_prompt:
    - name: target_host
      prompt: please enter the target hostname on which to recover osds after ssd journal failure
      private: no
  tasks:
    - add_host:
        name: "{{ target_host }}"
        groups: dynamically_created_hosts

- hosts: dynamically_created_hosts
  vars:
    journal_typecode: 45b0969e-9b03-4f30-b4c6-b4b80ceff106
    dev_ssds: []
  tasks:
    - name: stat osd(s) journal_uuid file
      stat:
        path: "/var/lib/ceph/osd/ceph-{{ item.1.osd_id }}/journal_uuid"
      register: osds_dir_stat
      with_subelements:
        - "{{ dev_ssds }}"
        - partitions
      when: dev_ssds is defined

    - name: exit playbook if osd(s) are not on this host
      fail:
        msg: exiting playbook, osd(s) are not on this host
      with_items: "{{ osds_dir_stat.results }}"
      when:
        - osds_dir_stat is defined
        - not item.stat.exists

    - name: install sgdisk (gdisk)
      package:
        name: gdisk
        state: present
      when: dev_ssds is defined

    - name: get osd(s) journal uuid
      shell: cat "/var/lib/ceph/osd/ceph-{{ item.1.osd_id }}/journal_uuid"
      register: osds_uuid
      with_subelements:
        - "{{ dev_ssds }}"
        - partitions
      when: dev_ssds is defined

    - name: make partitions on the new ssd
      shell: >
        sgdisk --new={{ item.item[1].index }}:0:+{{ item.item[1].size }}
        "--change-name={{ item.item[1].index }}:ceph journal"
        --typecode={{ item.item[1].index }}:{{ journal_typecode }}
        --partition-guid={{ item.item[1].index }}:{{ item.stdout }}
        --mbrtogpt -- /dev/{{ item.item[0].device_name }}
      with_items: "{{ osds_uuid.results }}"
      when: dev_ssds is defined

    - name: stop osd(s) service
      service:
        name: "ceph-osd@{{ item.item[1].osd_id }}"
        state: stopped
        enabled: no
      with_items: "{{ osds_uuid.results }}"
      when: dev_ssds is defined

    - name: reinitialize osd(s) journal on the new ssd
      shell: ceph-osd -i {{ item.item[1].osd_id }} --mkjournal
      with_items: "{{ osds_uuid.results }}"
      when: dev_ssds is defined

    - name: start osd(s) service
      service:
        name: "ceph-osd@{{ item.item[1].osd_id }}"
        state: started
        enabled: yes
      with_items: "{{ osds_uuid.results }}"
      when: dev_ssds is defined
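
# For reference (a sketch derived from the task template above, not part
# of the playbook logic): with the example host_vars at the top of this
# file, the "make partitions on the new ssd" task renders a command along
# these lines for osd_id 0, where <journal_uuid> stands for the value
# read from /var/lib/ceph/osd/ceph-0/journal_uuid:
#
#   sgdisk --new=1:0:+10G "--change-name=1:ceph journal" \
#     --typecode=1:45b0969e-9b03-4f30-b4c6-b4b80ceff106 \
#     --partition-guid=1:<journal_uuid> --mbrtogpt -- /dev/sdd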
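
# A quick post-recovery sanity check (a suggested operator step, not part
# of the original playbook): each recovered OSD's journal symlink should
# resolve to a partition on the new SSD, and the cluster should converge
# back to HEALTH_OK, e.g. for osd_id 0:
#
#   ls -l /var/lib/ceph/osd/ceph-0/journal
#   ceph -s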