---
# This playbook is used to recover Ceph OSDs after an SSD journal failure.
# You will also realise that it is really simple to bring your
# OSDs back to life after replacing your faulty SSD with a new one.
#
# You must define the `dev_ssds` variable for each host whose SSDs were
# replaced after the failure.
#
# For example, in host_vars/hostname1.yml:
#
# dev_ssds:
#   - device_name: /dev/sdd
#     partitions:
#       - index: 1
#         size: 10G
#         osd_id: 0
#       - index: 2
#         size: 10G
#         osd_id: 1
#   - device_name: /dev/sdf
#     partitions:
#       - index: 1
#         size: 10G
#         osd_id: 2
#
# @param device_name: The full device path of the new SSD
# @param partitions:  The custom partition layout of the new SSD
# @param index:       The index of this partition
# @param size:        The size of this partition
# @param osd_id:      The OSD whose journal this partition is for
#
# ansible-playbook recover-osds-after-ssd-journal-failure.yml
#     Prompts for the host to recover. The prompt defaults to null,
#     in which case no host is selected. Enter the hostname of the host
#     whose OSDs should be recovered after the SSD journal failure.
#
# ansible-playbook -e target_host=hostname \
#     recover-osds-after-ssd-journal-failure.yml
#     Overrides the prompt with the -e option. Useful in automation
#     scripts to avoid the interactive prompt.

- hosts: localhost
  gather_facts: no
  vars_prompt:
    - name: target_host
      prompt: please enter the target hostname on which to recover osds after ssd journal failure
      private: no

  tasks:
    - add_host:
        name: "{{ target_host }}"
        groups: dynamically_created_hosts

- hosts: dynamically_created_hosts
  vars:
    journal_typecode: 45b0969e-9b03-4f30-b4c6-b4b80ceff106
    dev_ssds: []

  tasks:
    - name: fail when dev_ssds is not defined
      fail:
        msg: please define the dev_ssds variable
      when: dev_ssds | length == 0

    - name: stat osd(s) journal_uuid file
      stat:
        path: "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid"
      register: osds_dir_stat
      with_subelements:
        - "{{ dev_ssds }}"
        - partitions

    - name: exit playbook, osd(s) is not on this host
      fail:
        msg: exit playbook, osd(s) is not on this host
      with_items: "{{ osds_dir_stat.results }}"
      when:
        - osds_dir_stat is defined
        - not item.stat.exists

    - name: install sgdisk (gdisk)
      package:
        name: gdisk
        state: present

    - name: get osd(s) journal uuid
      command: cat "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid"
      register: osds_uuid
      with_subelements:
        - "{{ dev_ssds }}"
        - partitions

    - name: make partitions on the new ssd
      shell: >
        sgdisk --new={{ item.item[1].index }}:0:+{{ item.item[1].size }}
        "--change-name={{ item.item[1].index }}:ceph journal"
        --typecode={{ item.item[1].index }}:{{ journal_typecode }}
        --partition-guid={{ item.item[1].index }}:{{ item.stdout }}
        --mbrtogpt -- {{ item.item[0].device_name }}
      with_items:
        - "{{ osds_uuid.results }}"

    - name: stop osd(s) service
      service:
        name: "ceph-osd@{{ item.item[1].osd_id }}"
        state: stopped
      with_items:
        - "{{ osds_uuid.results }}"

    - name: reinitialize osd(s) journal on the new ssd
      command: ceph-osd -i {{ item.item[1].osd_id }} --mkjournal --cluster {{ cluster }}
      with_items:
        - "{{ osds_uuid.results }}"

    - name: start osd(s) service
      service:
        name: "ceph-osd@{{ item.item[1].osd_id }}"
        state: started
      with_items:
        - "{{ osds_uuid.results }}"
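
# For illustration only (not executed by this playbook): with the example
# host_vars shown in the header, the "make partitions on the new ssd" task
# would expand to roughly the following sgdisk call for osd_id 0 on /dev/sdd.
# Here <journal_uuid> is a placeholder for the UUID read from
# /var/lib/ceph/osd/{{ cluster }}-0/journal_uuid, and the typecode is the
# journal_typecode defined above.
#
#   sgdisk --new=1:0:+10G "--change-name=1:ceph journal" \
#     --typecode=1:45b0969e-9b03-4f30-b4c6-b4b80ceff106 \
#     --partition-guid=1:<journal_uuid> \
#     --mbrtogpt -- /dev/sdd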