Add retry_stagger var for failed download/pushes.
* Add the retry_stagger var to tweak push and retry time strategies. * Add large deployments related docs. Signed-off-by: Bogdan Dobrelya <bdobrelia@mirantis.com>pull/488/head
parent
9926395e5b
commit
390764c2b4
|
@ -0,0 +1,19 @@
|
||||||
|
Large deployments of K8s
|
||||||
|
========================
|
||||||
|
|
||||||
|
For large-scale deployments, consider the following configuration changes:
|
||||||
|
|
||||||
|
* Tune [ansible settings](http://docs.ansible.com/ansible/intro_configuration.html)
|
||||||
|
for `forks` and `timeout` vars to fit large numbers of nodes being deployed.
|
||||||
|
|
||||||
|
* Override containers' `foo_image_repo` vars to point to an intranet registry.
|
||||||
|
|
||||||
|
* Set ``download_run_once: true`` to download binaries and container
|
||||||
|
images only once and then push them to nodes in batches.
|
||||||
|
|
||||||
|
* Adjust the `retry_stagger` global var as appropriate. It should provide sane
|
||||||
|
load on a delegate (the first K8s master node) when retrying failed
|
||||||
|
push or download operations.
|
||||||
|
|
||||||
|
For example, when deploying 200 nodes, you may want to run ansible with
|
||||||
|
``--forks=50``, ``--timeout=600`` and define ``retry_stagger: 60``.
|
|
@ -7,6 +7,8 @@ bin_dir: /usr/local/bin
|
||||||
# Where the binaries will be downloaded.
|
# Where the binaries will be downloaded.
|
||||||
# Note: ensure that you've enough disk space (about 1G)
|
# Note: ensure that you've enough disk space (about 1G)
|
||||||
local_release_dir: "/tmp/releases"
|
local_release_dir: "/tmp/releases"
|
||||||
|
# Upper bound for the random part of the delay (in seconds) between retries of failed ops like pushing/downloading
|
||||||
|
retry_stagger: 5
|
||||||
|
|
||||||
# Uncomment this line for CoreOS only.
|
# Uncomment this line for CoreOS only.
|
||||||
# Directory where python binary is installed
|
# Directory where python binary is installed
|
||||||
|
|
|
@ -30,7 +30,7 @@
|
||||||
register: keyserver_task_result
|
register: keyserver_task_result
|
||||||
until: keyserver_task_result|success
|
until: keyserver_task_result|success
|
||||||
retries: 4
|
retries: 4
|
||||||
delay: "{{ 20 | random + 3 }}"
|
delay: "{{ retry_stagger | random + 3 }}"
|
||||||
with_items: "{{ docker_repo_key_info.repo_keys }}"
|
with_items: "{{ docker_repo_key_info.repo_keys }}"
|
||||||
when: ansible_os_family != "CoreOS"
|
when: ansible_os_family != "CoreOS"
|
||||||
|
|
||||||
|
@ -58,7 +58,7 @@
|
||||||
register: docker_task_result
|
register: docker_task_result
|
||||||
until: docker_task_result|success
|
until: docker_task_result|success
|
||||||
retries: 4
|
retries: 4
|
||||||
delay: "{{ 20 | random + 3 }}"
|
delay: "{{ retry_stagger | random + 3 }}"
|
||||||
with_items: "{{ docker_package_info.pkgs }}"
|
with_items: "{{ docker_package_info.pkgs }}"
|
||||||
when: (ansible_os_family != "CoreOS") and (docker_package_info.pkgs|length > 0)
|
when: (ansible_os_family != "CoreOS") and (docker_package_info.pkgs|length > 0)
|
||||||
|
|
||||||
|
|
|
@ -22,7 +22,7 @@
|
||||||
register: get_url_result
|
register: get_url_result
|
||||||
until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
|
until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
|
||||||
retries: 4
|
retries: 4
|
||||||
delay: "{{ 20 | random + 3 }}"
|
delay: "{{ retry_stagger | random + 3 }}"
|
||||||
when: "{{ download.enabled|bool and not download.container|bool }}"
|
when: "{{ download.enabled|bool and not download.container|bool }}"
|
||||||
delegate_to: "{{ groups['kube-master'][0] if download_run_once|bool else omit }}"
|
delegate_to: "{{ groups['kube-master'][0] if download_run_once|bool else omit }}"
|
||||||
run_once: "{{ download_run_once|bool }}"
|
run_once: "{{ download_run_once|bool }}"
|
||||||
|
@ -63,7 +63,7 @@
|
||||||
register: pull_task_result
|
register: pull_task_result
|
||||||
until: pull_task_result.rc == 0
|
until: pull_task_result.rc == 0
|
||||||
retries: 4
|
retries: 4
|
||||||
delay: "{{ 20 | random + 3 }}"
|
delay: "{{ retry_stagger | random + 3 }}"
|
||||||
when: "{{ download.enabled|bool and download.container|bool }}"
|
when: "{{ download.enabled|bool and download.container|bool }}"
|
||||||
delegate_to: "{{ groups['kube-master'][0] if download_run_once|bool else omit }}"
|
delegate_to: "{{ groups['kube-master'][0] if download_run_once|bool else omit }}"
|
||||||
run_once: "{{ download_run_once|bool }}"
|
run_once: "{{ download_run_once|bool }}"
|
||||||
|
@ -85,7 +85,7 @@
|
||||||
register: get_task
|
register: get_task
|
||||||
until: get_task|success
|
until: get_task|success
|
||||||
retries: 4
|
retries: 4
|
||||||
delay: "{{ 20 | random + 3 }}"
|
delay: "{{ retry_stagger | random + 3 }}"
|
||||||
when: ansible_os_family != "CoreOS" and inventory_hostname != groups['kube-master'][0] and download_run_once|bool
|
when: ansible_os_family != "CoreOS" and inventory_hostname != groups['kube-master'][0] and download_run_once|bool
|
||||||
|
|
||||||
- name: Download | load container images
|
- name: Download | load container images
|
||||||
|
|
|
@ -20,7 +20,7 @@
|
||||||
register: etcd_task_result
|
register: etcd_task_result
|
||||||
until: etcd_task_result.rc == 0
|
until: etcd_task_result.rc == 0
|
||||||
retries: 4
|
retries: 4
|
||||||
delay: "{{ 20 | random + 3 }}"
|
delay: "{{ retry_stagger | random + 3 }}"
|
||||||
changed_when: false
|
changed_when: false
|
||||||
|
|
||||||
#Plan B: looks nicer, but requires docker-py on all hosts:
|
#Plan B: looks nicer, but requires docker-py on all hosts:
|
||||||
|
|
|
@ -12,7 +12,7 @@
|
||||||
register: kube_task_result
|
register: kube_task_result
|
||||||
until: kube_task_result.rc == 0
|
until: kube_task_result.rc == 0
|
||||||
retries: 4
|
retries: 4
|
||||||
delay: "{{ 20 | random + 3 }}"
|
delay: "{{ retry_stagger | random + 3 }}"
|
||||||
changed_when: false
|
changed_when: false
|
||||||
|
|
||||||
- name: Write kube-apiserver manifest
|
- name: Write kube-apiserver manifest
|
||||||
|
|
|
@ -104,7 +104,7 @@
|
||||||
register: pkgs_task_result
|
register: pkgs_task_result
|
||||||
until: pkgs_task_result|success
|
until: pkgs_task_result|success
|
||||||
retries: 4
|
retries: 4
|
||||||
delay: "{{ 20 | random + 3 }}"
|
delay: "{{ retry_stagger | random + 3 }}"
|
||||||
with_items: "{{required_pkgs | default([]) | union(common_required_pkgs|default([]))}}"
|
with_items: "{{required_pkgs | default([]) | union(common_required_pkgs|default([]))}}"
|
||||||
when: ansible_os_family != "CoreOS"
|
when: ansible_os_family != "CoreOS"
|
||||||
|
|
||||||
|
|
|
@ -48,7 +48,7 @@
|
||||||
register: cni_task_result
|
register: cni_task_result
|
||||||
until: cni_task_result.rc == 0
|
until: cni_task_result.rc == 0
|
||||||
retries: 4
|
retries: 4
|
||||||
delay: "{{ 20 | random + 3 }}"
|
delay: "{{ retry_stagger | random + 3 }}"
|
||||||
changed_when: false
|
changed_when: false
|
||||||
when: use_hyperkube_cni
|
when: use_hyperkube_cni
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue