From c1b9660ec85eeeb01ef6f3279bdc83f8923a3a06 Mon Sep 17 00:00:00 2001 From: Matthew Mosesohn Date: Fri, 17 Mar 2017 18:21:00 +0300 Subject: [PATCH 01/16] Move graceful upgrade test to debian canal HA, adjust drain Graceful upgrades require 3 nodes Drain now has a command timeout of 40s --- .gitlab-ci.yml | 2 +- roles/upgrade/pre-upgrade/tasks/main.yml | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 316c2428d..7397a450b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -267,6 +267,7 @@ before_script: CLOUD_REGION: us-east1-b UPGRADE_TEST: "basic" CLUSTER_MODE: ha + UPGRADE_TEST: "graceful" .rhel7_weave_variables: &rhel7_weave_variables # stage: deploy-gce-part1 @@ -288,7 +289,6 @@ before_script: CLOUD_IMAGE: debian-8-kubespray CLOUD_REGION: us-central1-b CLUSTER_MODE: default - UPGRADE_TEST: "graceful" .coreos_canal_variables: &coreos_canal_variables # stage: deploy-gce-part2 diff --git a/roles/upgrade/pre-upgrade/tasks/main.yml b/roles/upgrade/pre-upgrade/tasks/main.yml index 90b535d2e..9b47c845b 100644 --- a/roles/upgrade/pre-upgrade/tasks/main.yml +++ b/roles/upgrade/pre-upgrade/tasks/main.yml @@ -5,7 +5,13 @@ delegate_to: "{{ groups['kube-master'][0] }}" - name: Drain node - command: "{{ bin_dir }}/kubectl drain --force --ignore-daemonsets --grace-period 30 --delete-local-data {{ ansible_hostname }}" + command: >- + {{ bin_dir }}/kubectl drain + --force + --ignore-daemonsets + --grace-period {{ drain_grace_period }} + --timeout {{ drain_timeout }} + --delete-local-data {{ ansible_hostname }} delegate_to: "{{ groups['kube-master'][0] }}" - name: Sleep for grace period for draining From 5ed03ce7f0a18b38a4d0d9c60a4e83e21ca1294d Mon Sep 17 00:00:00 2001 From: Matthew Mosesohn Date: Wed, 22 Mar 2017 13:02:39 +0300 Subject: [PATCH 02/16] Use checksum of dnsmasq config to trigger updates of dnsmasq Allows config changes made by Ansible to restart dnsmasq deployment --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7397a450b..5f8c08279 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -264,7 +264,7 @@ before_script: # stage: deploy-gce-part1 KUBE_NETWORK_PLUGIN: canal CLOUD_IMAGE: debian-8-kubespray - CLOUD_REGION: us-east1-b + CLOUD_REGION: europe-west1-b UPGRADE_TEST: "basic" CLUSTER_MODE: ha UPGRADE_TEST: "graceful" From 44d851d5bbb8201547b0a8e3bde3123bd7f2aab4 Mon Sep 17 00:00:00 2001 From: Matthew Mosesohn Date: Wed, 22 Mar 2017 16:19:27 +0300 Subject: [PATCH 03/16] Only cordon Ready nodes --- roles/kubernetes/master/tasks/pre-upgrade.yml | 2 ++ roles/upgrade/post-upgrade/tasks/main.yml | 1 + roles/upgrade/pre-upgrade/tasks/main.yml | 17 ++++++++++++++--- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/roles/kubernetes/master/tasks/pre-upgrade.yml b/roles/kubernetes/master/tasks/pre-upgrade.yml index 244c8b13e..adafe80ae 100644 --- a/roles/kubernetes/master/tasks/pre-upgrade.yml +++ b/roles/kubernetes/master/tasks/pre-upgrade.yml @@ -93,3 +93,5 @@ delegate_to: "{{item}}" with_items: "{{groups['etcd']}}" when: needs_etcd_migration|bool + notify: "Master | reload kubelet" + diff --git a/roles/upgrade/post-upgrade/tasks/main.yml b/roles/upgrade/post-upgrade/tasks/main.yml index d8243d04d..bff9983ff 100644 --- a/roles/upgrade/post-upgrade/tasks/main.yml +++ b/roles/upgrade/post-upgrade/tasks/main.yml @@ -3,3 +3,4 @@ - name: Uncordon node command: "{{ bin_dir }}/kubectl uncordon {{ ansible_hostname }}" delegate_to: "{{ groups['kube-master'][0] }}" + when: needs_cordoning|default(false) diff --git a/roles/upgrade/pre-upgrade/tasks/main.yml b/roles/upgrade/pre-upgrade/tasks/main.yml index 9b47c845b..e0e5ded81 100644 --- a/roles/upgrade/pre-upgrade/tasks/main.yml +++ b/roles/upgrade/pre-upgrade/tasks/main.yml @@ -1,8 +1,21 @@ --- +- name: See if node is in ready state + command: "kubectl get nodes | grep {{ inventory_hostname }}" + register: kubectl_nodes + ignore_errors: true + +- set_fact: + needs_cordoning: >- + {% if " Ready" in kubectl_nodes.stdout %} + true + {% else %} + false + {% endif %} - name: Cordon node command: "{{ bin_dir }}/kubectl cordon {{ ansible_hostname }}" delegate_to: "{{ groups['kube-master'][0] }}" + when: needs_cordoning - name: Drain node command: >- @@ -13,6 +26,4 @@ --timeout {{ drain_timeout }} --delete-local-data {{ ansible_hostname }} delegate_to: "{{ groups['kube-master'][0] }}" - -- name: Sleep for grace period for draining - pause: seconds=30 + when: needs_cordoning From e9a294fd9c6f94cbbf3cd75ac9bb020226a4438d Mon Sep 17 00:00:00 2001 From: Matthew Mosesohn Date: Thu, 23 Mar 2017 12:11:30 +0300 Subject: [PATCH 04/16] Significantly reduce memory requirements Canal runs more pods and upgrades need a bit of extra room to load new pods in and get the old ones out. --- roles/dnsmasq/defaults/main.yml | 2 +- roles/kubernetes/master/defaults/main.yml | 6 +++--- roles/kubernetes/master/tasks/pre-upgrade.yml | 2 +- roles/kubernetes/node/defaults/main.yml | 2 +- roles/network_plugin/canal/defaults/main.yml | 6 +++--- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/roles/dnsmasq/defaults/main.yml b/roles/dnsmasq/defaults/main.yml index 21d051ded..58b1b7f1d 100644 --- a/roles/dnsmasq/defaults/main.yml +++ b/roles/dnsmasq/defaults/main.yml @@ -24,7 +24,7 @@ dnsmasq_image_tag: "{{ dnsmasq_version }}" # Limits for dnsmasq/kubedns apps dns_cpu_limit: 100m dns_memory_limit: 170Mi -dns_cpu_requests: 70m +dns_cpu_requests: 40m dns_memory_requests: 50Mi # Autoscaler parameters diff --git a/roles/kubernetes/master/defaults/main.yml b/roles/kubernetes/master/defaults/main.yml index 2fd307801..9b6e3eff4 100644 --- a/roles/kubernetes/master/defaults/main.yml +++ b/roles/kubernetes/master/defaults/main.yml @@ -19,7 +19,7 @@ kube_apiserver_storage_backend: etcd3 # Limits for kube components kube_controller_memory_limit: 512M kube_controller_cpu_limit: 250m -kube_controller_memory_requests: 170M +kube_controller_memory_requests: 100M kube_controller_cpu_requests: 100m kube_controller_node_monitor_grace_period: 40s kube_controller_node_monitor_period: 5s @@ -27,11 +27,11 @@ kube_controller_pod_eviction_timeout: 5m0s kube_scheduler_memory_limit: 512M kube_scheduler_cpu_limit: 250m kube_scheduler_memory_requests: 170M -kube_scheduler_cpu_requests: 100m +kube_scheduler_cpu_requests: 80m kube_apiserver_memory_limit: 2000M kube_apiserver_cpu_limit: 800m kube_apiserver_memory_requests: 256M -kube_apiserver_cpu_requests: 300m +kube_apiserver_cpu_requests: 100m ## Enable/Disable Kube API Server Authentication Methods diff --git a/roles/kubernetes/master/tasks/pre-upgrade.yml b/roles/kubernetes/master/tasks/pre-upgrade.yml index adafe80ae..948b944c5 100644 --- a/roles/kubernetes/master/tasks/pre-upgrade.yml +++ b/roles/kubernetes/master/tasks/pre-upgrade.yml @@ -38,7 +38,7 @@ environment: ETCDCTL_API: 2 register: old_data_exists - delegate_to: "{{groups['kube-master'][0]}}" + delegate_to: "{{groups['etcd'][0]}}" when: kube_apiserver_storage_backend == "etcd3" failed_when: false diff --git a/roles/kubernetes/node/defaults/main.yml b/roles/kubernetes/node/defaults/main.yml index e29847a39..b4ca13d12 100644 --- a/roles/kubernetes/node/defaults/main.yml +++ b/roles/kubernetes/node/defaults/main.yml @@ -21,7 +21,7 @@ kube_proxy_cpu_requests: 150m nginx_memory_limit: 512M nginx_cpu_limit: 300m nginx_memory_requests: 32M -nginx_cpu_requests: 50m +nginx_cpu_requests: 25m # kube_api_runtime_config: # - extensions/v1beta1/daemonsets=true diff --git a/roles/network_plugin/canal/defaults/main.yml b/roles/network_plugin/canal/defaults/main.yml index f82ff831e..d4018db4d 100644 --- a/roles/network_plugin/canal/defaults/main.yml +++ b/roles/network_plugin/canal/defaults/main.yml @@ -21,13 +21,13 @@ canal_policy_dir: /etc/kubernetes/policy calico_node_memory_limit: 500M calico_node_cpu_limit: 200m calico_node_memory_requests: 64M -calico_node_cpu_requests: 100m +calico_node_cpu_requests: 50m flannel_memory_limit: 500M flannel_cpu_limit: 200m flannel_memory_requests: 64M -flannel_cpu_requests: 100m +flannel_cpu_requests: 50m calicoctl_memory_limit: 170M calicoctl_cpu_limit: 100m calicoctl_memory_requests: 32M -calicoctl_cpu_requests: 50m +calicoctl_cpu_requests: 25m From 6e505c0c3fb470a0ae3860ffae7f44d8e7546656 Mon Sep 17 00:00:00 2001 From: Matthew Mosesohn Date: Thu, 23 Mar 2017 13:10:42 +0300 Subject: [PATCH 05/16] Fix delegate tasks for kubectl and etcdctl --- roles/kubernetes/master/tasks/post-upgrade.yml | 1 + roles/upgrade/pre-upgrade/tasks/main.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/roles/kubernetes/master/tasks/post-upgrade.yml b/roles/kubernetes/master/tasks/post-upgrade.yml index 07fc57b96..391646272 100644 --- a/roles/kubernetes/master/tasks/post-upgrade.yml +++ b/roles/kubernetes/master/tasks/post-upgrade.yml @@ -3,4 +3,5 @@ command: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} rm -r /registry" environment: ETCDCTL_API: 2 + delegate_to: "{{groups['etcd'][0]}}" when: kube_apiserver_storage_backend == "etcd3" and needs_etcd_migration|bool|default(false) diff --git a/roles/upgrade/pre-upgrade/tasks/main.yml b/roles/upgrade/pre-upgrade/tasks/main.yml index e0e5ded81..fbcd1cf85 100644 --- a/roles/upgrade/pre-upgrade/tasks/main.yml +++ b/roles/upgrade/pre-upgrade/tasks/main.yml @@ -2,6 +2,7 @@ - name: See if node is in ready state command: "kubectl get nodes | grep {{ inventory_hostname }}" register: kubectl_nodes + delegate_to: "{{ groups['kube-master'][0] }}" ignore_errors: true - set_fact: From 21a9dea99f82f73612dedcfc455808b4bf5382ac Mon Sep 17 00:00:00 2001 From: Matthew Mosesohn Date: Thu, 23 Mar 2017 13:43:01 +0300 Subject: [PATCH 06/16] move kubernetes-apps/network-plugin back to master role --- upgrade-cluster.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upgrade-cluster.yml b/upgrade-cluster.yml index f4f48d543..1f0479200 100644 --- a/upgrade-cluster.yml +++ b/upgrade-cluster.yml @@ -68,6 +68,7 @@ - { role: kubernetes/master, tags: master } - { role: network_plugin, tags: network } - { role: upgrade/post-upgrade, tags: post-upgrade } + - { role: kubernetes-apps/network_plugin, tags: network } #Finally handle worker upgrades, based on given batch size - hosts: kube-node:!kube-master @@ -79,7 +80,6 @@ - { role: kubernetes/node, tags: node } - { role: network_plugin, tags: network } - { role: upgrade/post-upgrade, tags: post-upgrade } - - { role: kubernetes-apps/network_plugin, tags: network } - hosts: calico-rr any_errors_fatal: true From 49e4d344da009e3a2bdd71c7b365aaba36f13e4d Mon Sep 17 00:00:00 2001 From: Matthew Mosesohn Date: Thu, 23 Mar 2017 17:18:51 +0300 Subject: [PATCH 07/16] move network plugins out of grouped upgrades --- roles/kubernetes/master/tasks/pre-upgrade.yml | 4 +++- upgrade-cluster.yml | 9 ++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/roles/kubernetes/master/tasks/pre-upgrade.yml b/roles/kubernetes/master/tasks/pre-upgrade.yml index 948b944c5..10093a08f 100644 --- a/roles/kubernetes/master/tasks/pre-upgrade.yml +++ b/roles/kubernetes/master/tasks/pre-upgrade.yml @@ -55,11 +55,13 @@ set_fact: needs_etcd_migration: "{{ kube_apiserver_storage_backend == 'etcd3' and data_migrated.stdout_lines|length == 0 and old_data_exists.rc == 0 }}" -- name: "Pre-upgrade | Write invalid image to kube-apiserver manifest if necessary" +- name: "Pre-upgrade | Write invalid image to kube-apiserver manifest if necessary on all kube-masters" replace: dest: /etc/kubernetes/manifests/kube-apiserver.manifest regexp: '(\s+)image:\s+.*?$' replace: '\1image: kill.apiserver.using.fake.image.in:manifest' + delegate_to: "{{item}}" + with_items: "{{groups['kube-master']}}" register: kube_apiserver_manifest_replaced when: (secret_changed|default(false) or etcd_secret_changed|default(false) or needs_etcd_migration|bool) and kube_apiserver_manifest.stat.exists diff --git a/upgrade-cluster.yml b/upgrade-cluster.yml index 1f0479200..4ecc660f9 100644 --- a/upgrade-cluster.yml +++ b/upgrade-cluster.yml @@ -68,7 +68,6 @@ - { role: kubernetes/master, tags: master } - { role: network_plugin, tags: network } - { role: upgrade/post-upgrade, tags: post-upgrade } - - { role: kubernetes-apps/network_plugin, tags: network } #Finally handle worker upgrades, based on given batch size - hosts: kube-node:!kube-master @@ -80,6 +79,14 @@ - { role: kubernetes/node, tags: node } - { role: network_plugin, tags: network } - { role: upgrade/post-upgrade, tags: post-upgrade } + - { role: kargo-defaults} + +- hosts: kube-master + any_errors_fatal: true + roles: + - { role: kargo-defaults} + - { role: kubernetes-apps/network_plugin, tags: network } + - { role: kubernetes-apps/policy_controller, tags: policy-controller } - hosts: calico-rr any_errors_fatal: true From 0794a866a7e9fac802be07db001d0719984d9e0a Mon Sep 17 00:00:00 2001 From: Matthew Mosesohn Date: Thu, 23 Mar 2017 18:55:37 +0300 Subject: [PATCH 08/16] switch debian8-canal-ha to ubuntu --- .gitlab-ci.yml | 12 ++++++------ roles/kubernetes/master/tasks/main.yml | 2 ++ roles/kubernetes/master/tasks/pre-upgrade.yml | 10 +++++++++- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5f8c08279..186251328 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -260,10 +260,10 @@ before_script: BOOTSTRAP_OS: coreos RESOLVCONF_MODE: host_resolvconf # This is required as long as the CoreOS stable channel uses docker < 1.12 -.debian8_canal_ha_variables: &debian8_canal_ha_variables +.ubuntu_canal_ha_variables: &ubuntu_canal_ha_variables # stage: deploy-gce-part1 KUBE_NETWORK_PLUGIN: canal - CLOUD_IMAGE: debian-8-kubespray + CLOUD_IMAGE: ubuntu-1604-xenial CLOUD_REGION: europe-west1-b UPGRADE_TEST: "basic" CLUSTER_MODE: ha @@ -416,24 +416,24 @@ ubuntu-weave-sep-triggers: only: ['triggers'] # More builds for PRs/merges (manual) and triggers (auto) -debian8-canal-ha: +ubuntu-canal-ha: stage: deploy-gce-part1 <<: *job <<: *gce variables: <<: *gce_variables - <<: *debian8_canal_ha_variables + <<: *ubuntu_canal_ha_variables when: manual except: ['triggers'] only: ['master', /^pr-.*$/] -debian8-canal-ha-triggers: +ubuntu-canal-ha-triggers: stage: deploy-gce-part1 <<: *job <<: *gce variables: <<: *gce_variables - <<: *debian8_canal_ha_variables + <<: *ubuntu_canal_ha_variables when: on_success only: ['triggers'] diff --git a/roles/kubernetes/master/tasks/main.yml b/roles/kubernetes/master/tasks/main.yml index 2bd9758bf..f0e944bd9 100644 --- a/roles/kubernetes/master/tasks/main.yml +++ b/roles/kubernetes/master/tasks/main.yml @@ -52,6 +52,8 @@ - name: Create kube system namespace command: "{{ bin_dir }}/kubectl create -f {{kube_config_dir}}/{{system_namespace}}-ns.yml" + retries: 4 + delay: "{{ retry_stagger | random + 3 }}" changed_when: False when: kubesystem|failed and inventory_hostname == groups['kube-master'][0] tags: apps diff --git a/roles/kubernetes/master/tasks/pre-upgrade.yml b/roles/kubernetes/master/tasks/pre-upgrade.yml index 10093a08f..7776b9703 100644 --- a/roles/kubernetes/master/tasks/pre-upgrade.yml +++ b/roles/kubernetes/master/tasks/pre-upgrade.yml @@ -95,5 +95,13 @@ delegate_to: "{{item}}" with_items: "{{groups['etcd']}}" when: needs_etcd_migration|bool - notify: "Master | reload kubelet" + +- name: "Pre-upgrade | restart kubelet on all masters" + service: + name: kubelet + state: restarted + delegate_to: "{{item}}" + with_items: "{{groups['kube-master']}}" + register: kube_apiserver_manifest_replaced + when: needs_etcd_migration|bool From 57ee3042603bdc92737496dd51672c9f79a49819 Mon Sep 17 00:00:00 2001 From: Matthew Mosesohn Date: Thu, 23 Mar 2017 19:19:51 +0300 Subject: [PATCH 09/16] ensure post-upgrade purge ones only once --- roles/kubernetes/master/tasks/post-upgrade.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/roles/kubernetes/master/tasks/post-upgrade.yml b/roles/kubernetes/master/tasks/post-upgrade.yml index 391646272..e68526493 100644 --- a/roles/kubernetes/master/tasks/post-upgrade.yml +++ b/roles/kubernetes/master/tasks/post-upgrade.yml @@ -4,4 +4,5 @@ environment: ETCDCTL_API: 2 delegate_to: "{{groups['etcd'][0]}}" + run_once: true when: kube_apiserver_storage_backend == "etcd3" and needs_etcd_migration|bool|default(false) From a3f568fc643ad1041756546b4f17e3736333aa00 Mon Sep 17 00:00:00 2001 From: Matthew Mosesohn Date: Fri, 24 Mar 2017 18:29:28 +0300 Subject: [PATCH 10/16] restart scheduler and controller-manager too --- roles/kubernetes/master/tasks/main.yml | 2 ++ .../kubernetes/master/tasks/post-upgrade.yml | 9 +++++++++ roles/kubernetes/master/tasks/pre-upgrade.yml | 20 ++++++------------- .../kubernetes/secrets/tasks/check-tokens.yml | 4 ++-- roles/upgrade/pre-upgrade/defaults/main.yml | 3 +++ roles/upgrade/pre-upgrade/tasks/main.yml | 4 ++-- 6 files changed, 24 insertions(+), 18 deletions(-) create mode 100644 roles/upgrade/pre-upgrade/defaults/main.yml diff --git a/roles/kubernetes/master/tasks/main.yml b/roles/kubernetes/master/tasks/main.yml index f0e944bd9..2c669c46d 100644 --- a/roles/kubernetes/master/tasks/main.yml +++ b/roles/kubernetes/master/tasks/main.yml @@ -54,6 +54,8 @@ command: "{{ bin_dir }}/kubectl create -f {{kube_config_dir}}/{{system_namespace}}-ns.yml" retries: 4 delay: "{{ retry_stagger | random + 3 }}" + register: create_system_ns + until: create_system_ns.rc == 0 changed_when: False when: kubesystem|failed and inventory_hostname == groups['kube-master'][0] tags: apps diff --git a/roles/kubernetes/master/tasks/post-upgrade.yml b/roles/kubernetes/master/tasks/post-upgrade.yml index e68526493..d157311de 100644 --- a/roles/kubernetes/master/tasks/post-upgrade.yml +++ b/roles/kubernetes/master/tasks/post-upgrade.yml @@ -1,4 +1,13 @@ --- +- name: "Post-upgrade | restart kubelet on all masters" + service: + name: kubelet + state: restarted + delegate_to: "{{item}}" + with_items: "{{groups['kube-master']}}" + register: kube_apiserver_manifest_replaced + when: needs_etcd_migration|bool + - name: "Post-upgrade | etcd3 upgrade | purge etcd2 k8s data" command: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} rm -r /registry" environment: diff --git a/roles/kubernetes/master/tasks/pre-upgrade.yml b/roles/kubernetes/master/tasks/pre-upgrade.yml index 7776b9703..12b270421 100644 --- a/roles/kubernetes/master/tasks/pre-upgrade.yml +++ b/roles/kubernetes/master/tasks/pre-upgrade.yml @@ -55,13 +55,15 @@ set_fact: needs_etcd_migration: "{{ kube_apiserver_storage_backend == 'etcd3' and data_migrated.stdout_lines|length == 0 and old_data_exists.rc == 0 }}" -- name: "Pre-upgrade | Write invalid image to kube-apiserver manifest if necessary on all kube-masters" +- name: "Pre-upgrade | Write invalid image to master manifests on all kube-masters" replace: - dest: /etc/kubernetes/manifests/kube-apiserver.manifest + dest: "/etc/kubernetes/manifests/{{item[1]}}.manifest" regexp: '(\s+)image:\s+.*?$' replace: '\1image: kill.apiserver.using.fake.image.in:manifest' - delegate_to: "{{item}}" - with_items: "{{groups['kube-master']}}" + delegate_to: "{{item[0]}}" + with_nested: + - "{{groups['kube-master']}}" + - ["kube-apiserver", "kube-controller-manager", "kube-scheduler"] register: kube_apiserver_manifest_replaced when: (secret_changed|default(false) or etcd_secret_changed|default(false) or needs_etcd_migration|bool) and kube_apiserver_manifest.stat.exists @@ -95,13 +97,3 @@ delegate_to: "{{item}}" with_items: "{{groups['etcd']}}" when: needs_etcd_migration|bool - -- name: "Pre-upgrade | restart kubelet on all masters" - service: - name: kubelet - state: restarted - delegate_to: "{{item}}" - with_items: "{{groups['kube-master']}}" - register: kube_apiserver_manifest_replaced - when: needs_etcd_migration|bool - diff --git a/roles/kubernetes/secrets/tasks/check-tokens.yml b/roles/kubernetes/secrets/tasks/check-tokens.yml index 497bc7caf..616664b93 100644 --- a/roles/kubernetes/secrets/tasks/check-tokens.yml +++ b/roles/kubernetes/secrets/tasks/check-tokens.yml @@ -27,9 +27,9 @@ sync_tokens: true when: >- {%- set tokens = {'sync': False} -%} - {%- for server in groups['kube-master'] | intersect(ansible_play_hosts) + {%- for server in groups['kube-master'] | intersect(ansible_play_batch) if (not hostvars[server].known_tokens.stat.exists) or - (hostvars[server].known_tokens.stat.checksum != known_tokens_master.stat.checksum|default('')) -%} + (hostvars[server].known_tokens.stat.checksum|default('') != known_tokens_master.stat.checksum|default('')) -%} {%- set _ = tokens.update({'sync': True}) -%} {%- endfor -%} {{ tokens.sync }} diff --git a/roles/upgrade/pre-upgrade/defaults/main.yml b/roles/upgrade/pre-upgrade/defaults/main.yml new file mode 100644 index 000000000..5980360fc --- /dev/null +++ b/roles/upgrade/pre-upgrade/defaults/main.yml @@ -0,0 +1,3 @@ +drain_grace_period: 30 +drain_timeout: 40s + diff --git a/roles/upgrade/pre-upgrade/tasks/main.yml b/roles/upgrade/pre-upgrade/tasks/main.yml index fbcd1cf85..f2251375b 100644 --- a/roles/upgrade/pre-upgrade/tasks/main.yml +++ b/roles/upgrade/pre-upgrade/tasks/main.yml @@ -1,9 +1,9 @@ --- - name: See if node is in ready state - command: "kubectl get nodes | grep {{ inventory_hostname }}" + shell: "kubectl get nodes | grep {{ inventory_hostname }}" register: kubectl_nodes delegate_to: "{{ groups['kube-master'][0] }}" - ignore_errors: true + failed_when: false - set_fact: needs_cordoning: >- From 48beef25fa6ca412357c9ef484b8ddeb5917e88e Mon Sep 17 00:00:00 2001 From: Matthew Mosesohn Date: Mon, 27 Mar 2017 14:44:21 +0300 Subject: [PATCH 11/16] delete master containers forcefully --- roles/kubernetes/master/handlers/main.yml | 2 +- roles/kubernetes/master/tasks/pre-upgrade.yml | 22 ++++++++++--------- tests/templates/inventory-gce.j2 | 2 ++ 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/roles/kubernetes/master/handlers/main.yml b/roles/kubernetes/master/handlers/main.yml index 38edeeb1f..1eb5a4886 100644 --- a/roles/kubernetes/master/handlers/main.yml +++ b/roles/kubernetes/master/handlers/main.yml @@ -26,7 +26,7 @@ url: http://localhost:10251/healthz register: scheduler_result until: scheduler_result.status == 200 - retries: 15 + retries: 60 delay: 5 - name: Master | wait for kube-controller-manager diff --git a/roles/kubernetes/master/tasks/pre-upgrade.yml b/roles/kubernetes/master/tasks/pre-upgrade.yml index 12b270421..35d573e97 100644 --- a/roles/kubernetes/master/tasks/pre-upgrade.yml +++ b/roles/kubernetes/master/tasks/pre-upgrade.yml @@ -55,11 +55,10 @@ set_fact: needs_etcd_migration: "{{ kube_apiserver_storage_backend == 'etcd3' and data_migrated.stdout_lines|length == 0 and old_data_exists.rc == 0 }}" -- name: "Pre-upgrade | Write invalid image to master manifests on all kube-masters" - replace: - dest: "/etc/kubernetes/manifests/{{item[1]}}.manifest" - regexp: '(\s+)image:\s+.*?$' - replace: '\1image: kill.apiserver.using.fake.image.in:manifest' +- name: "Pre-upgrade | Delete master manifests on all kube-masters" + file: + path: "/etc/kubernetes/manifests/{{item[1]}}.manifest" + state: absent delegate_to: "{{item[0]}}" with_nested: - "{{groups['kube-master']}}" @@ -67,11 +66,14 @@ register: kube_apiserver_manifest_replaced when: (secret_changed|default(false) or etcd_secret_changed|default(false) or needs_etcd_migration|bool) and kube_apiserver_manifest.stat.exists -- name: "Pre-upgrade | Pause while waiting for kubelet to delete kube-apiserver pod" - pause: - seconds: 20 - when: kube_apiserver_manifest_replaced.changed - tags: kube-apiserver +- name: "Pre-upgrade | Delete master containers forcefully on all kube-masters" + shell: "docker ps -f name=k8s-{{item}}* -q | xargs --no-run-if-empty docker rm -f" + delegate_to: "{{item[0]}}" + with_nested: + - "{{groups['kube-master']}}" + - ["kube-apiserver", "kube-controller-manager", "kube-scheduler"] + register: kube_apiserver_manifest_replaced + when: (secret_changed|default(false) or etcd_secret_changed|default(false) or needs_etcd_migration|bool) and kube_apiserver_manifest.stat.exists - name: "Pre-upgrade | etcd3 upgrade | stop etcd" service: diff --git a/tests/templates/inventory-gce.j2 b/tests/templates/inventory-gce.j2 index d58f6896c..c77556f0b 100644 --- a/tests/templates/inventory-gce.j2 +++ b/tests/templates/inventory-gce.j2 @@ -27,10 +27,12 @@ {{node3}} [etcd] +{{node1}} {{node2}} {{node3}} [vault] +{{node1}} {{node2}} {{node3}} {% else %} From fb467df47c383ff130968d8031b8c4acfe25a181 Mon Sep 17 00:00:00 2001 From: Matthew Mosesohn Date: Wed, 29 Mar 2017 23:22:49 +0400 Subject: [PATCH 12/16] fix etcd restart --- roles/etcd/templates/etcd-docker.service.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/etcd/templates/etcd-docker.service.j2 b/roles/etcd/templates/etcd-docker.service.j2 index c5cae99cf..18deee1d9 100644 --- a/roles/etcd/templates/etcd-docker.service.j2 +++ b/roles/etcd/templates/etcd-docker.service.j2 @@ -9,10 +9,10 @@ PermissionsStartOnly=true EnvironmentFile=/etc/etcd.env ExecStart={{ bin_dir }}/etcd ExecStartPre=-{{ docker_bin_dir }}/docker rm -f {{ etcd_member_name | default("etcd") }} -ExecReload={{ docker_bin_dir }}/docker restart {{ etcd_member_name | default("etcd") }} ExecStop={{ docker_bin_dir }}/docker stop {{ etcd_member_name | default("etcd") }} Restart=always RestartSec=15s +TimeoutStartSec=30s [Install] WantedBy=multi-user.target From d42e4f23448cd0c89b1d5defe5156ead975b8d32 Mon Sep 17 00:00:00 2001 From: Matthew Mosesohn Date: Wed, 29 Mar 2017 23:28:05 +0300 Subject: [PATCH 13/16] Update .gitlab-ci.yml --- .gitlab-ci.yml | 3 +++ roles/kubernetes/master/tasks/post-upgrade.yml | 18 +++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 186251328..2a0106162 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -47,6 +47,7 @@ before_script: PRIVATE_KEY: $GCE_PRIVATE_KEY GS_ACCESS_KEY_ID: $GS_KEY GS_SECRET_ACCESS_KEY: $GS_SECRET + CLOUD_MACHINE_TYPE: "g1-small" ANSIBLE_KEEP_REMOTE_FILES: "1" ANSIBLE_CONFIG: ./tests/ansible.cfg BOOTSTRAP_OS: none @@ -97,6 +98,7 @@ before_script: -e gce_credentials_file=${HOME}/.ssh/gce.json -e gce_project_id=${GCE_PROJECT_ID} -e gce_service_account_email=${GCE_ACCOUNT} + -e cloud_machine_type=${CLOUD_MACHINE_TYPE} -e inventory_path=${PWD}/inventory/inventory.ini -e kube_network_plugin=${KUBE_NETWORK_PLUGIN} -e mode=${CLUSTER_MODE} @@ -265,6 +267,7 @@ before_script: KUBE_NETWORK_PLUGIN: canal CLOUD_IMAGE: ubuntu-1604-xenial CLOUD_REGION: europe-west1-b + CLOUD_MACHINE_TYPE: "n1-standard-2" UPGRADE_TEST: "basic" CLUSTER_MODE: ha UPGRADE_TEST: "graceful" diff --git a/roles/kubernetes/master/tasks/post-upgrade.yml b/roles/kubernetes/master/tasks/post-upgrade.yml index d157311de..e99912976 100644 --- a/roles/kubernetes/master/tasks/post-upgrade.yml +++ b/roles/kubernetes/master/tasks/post-upgrade.yml @@ -1,11 +1,23 @@ --- -- name: "Post-upgrade | restart kubelet on all masters" +- name: "Post-upgrade | stop kubelet on all masters" service: name: kubelet - state: restarted + state: stopped + delegate_to: "{{item}}" + with_items: "{{groups['kube-master']}}" + when: needs_etcd_migration|bool + +- name: "Post-upgrade | Pause for kubelet stop" + pause: + seconds: 10 + when: needs_etcd_migration|bool + +- name: "Post-upgrade | stop kubelet on all masters" + service: + name: kubelet + state: started delegate_to: "{{item}}" with_items: "{{groups['kube-master']}}" - register: kube_apiserver_manifest_replaced when: needs_etcd_migration|bool - name: "Post-upgrade | etcd3 upgrade | purge etcd2 k8s data" From 80828a7c77def9faa032476ef3b6eae8972d89ce Mon Sep 17 00:00:00 2001 From: Matthew Mosesohn Date: Thu, 30 Mar 2017 17:08:13 +0400 Subject: [PATCH 14/16] use etcd2 when upgrading unless forced --- roles/kubernetes/master/defaults/main.yml | 3 +++ roles/kubernetes/master/handlers/main.yml | 2 +- roles/kubernetes/master/tasks/pre-upgrade.yml | 7 ++++++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/roles/kubernetes/master/defaults/main.yml b/roles/kubernetes/master/defaults/main.yml index 9b6e3eff4..016df0c64 100644 --- a/roles/kubernetes/master/defaults/main.yml +++ b/roles/kubernetes/master/defaults/main.yml @@ -16,6 +16,9 @@ etcd_cert_dir: "{{ etcd_config_dir }}/ssl" # ETCD backend for k8s data kube_apiserver_storage_backend: etcd3 +# By default, force back to etcd2. Set to true to force etcd3 (experimental!) +force_etcd3: false + # Limits for kube components kube_controller_memory_limit: 512M kube_controller_cpu_limit: 250m diff --git a/roles/kubernetes/master/handlers/main.yml b/roles/kubernetes/master/handlers/main.yml index 1eb5a4886..94cec7d1b 100644 --- a/roles/kubernetes/master/handlers/main.yml +++ b/roles/kubernetes/master/handlers/main.yml @@ -42,5 +42,5 @@ url: http://localhost:8080/healthz register: result until: result.status == 200 - retries: 10 + retries: 20 delay: 6 diff --git a/roles/kubernetes/master/tasks/pre-upgrade.yml b/roles/kubernetes/master/tasks/pre-upgrade.yml index 35d573e97..b42cd06d1 100644 --- a/roles/kubernetes/master/tasks/pre-upgrade.yml +++ b/roles/kubernetes/master/tasks/pre-upgrade.yml @@ -42,6 +42,11 @@ when: kube_apiserver_storage_backend == "etcd3" failed_when: false +- name: "Pre-upgrade | etcd3 upgrade | use etcd2 unless forced to etc3" + set_fact: + kube_apiserver_storage_backend: "etcd2" + when: old_data_exists.rc == 0 and not force_etcd3|bool + - name: "Pre-upgrade | etcd3 upgrade | see if data was already migrated" command: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} get --limit=1 --prefix=true /registry/minions" environment: @@ -53,7 +58,7 @@ - name: "Pre-upgrade | etcd3 upgrade | set needs_etcd_migration" set_fact: - needs_etcd_migration: "{{ kube_apiserver_storage_backend == 'etcd3' and data_migrated.stdout_lines|length == 0 and old_data_exists.rc == 0 }}" + needs_etcd_migration: "{{ force_etcd3|default(false) and kube_apiserver_storage_backend == 'etcd3' and data_migrated.stdout_lines|length == 0 and old_data_exists.rc == 0 }}" - name: "Pre-upgrade | Delete master manifests on all kube-masters" file: From b4d06ff8dda9e3d65b25be1bef393a4030f495ea Mon Sep 17 00:00:00 2001 From: Matthew Mosesohn Date: Mon, 3 Apr 2017 16:50:17 +0300 Subject: [PATCH 15/16] Add /var/lib/cni to kubelet Necessary to persist this directory for host-local IPAM used by Canal Add pre-upgrade task to copy /var/lib/cni out of old kubelet. --- roles/kubernetes/node/tasks/main.yml | 3 +++ roles/kubernetes/node/tasks/pre_upgrade.yml | 6 ++++++ roles/kubernetes/node/templates/kubelet-container.j2 | 1 + roles/kubernetes/node/templates/kubelet.rkt.service.j2 | 2 ++ 4 files changed, 12 insertions(+) create mode 100644 roles/kubernetes/node/tasks/pre_upgrade.yml diff --git a/roles/kubernetes/node/tasks/main.yml b/roles/kubernetes/node/tasks/main.yml index a6a9c16f2..324e38867 100644 --- a/roles/kubernetes/node/tasks/main.yml +++ b/roles/kubernetes/node/tasks/main.yml @@ -4,6 +4,9 @@ {%- if inventory_hostname in groups['kube-master'] and inventory_hostname not in groups['kube-node'] -%}true{%- else -%}false{%- endif -%} tags: facts +- include: pre_upgrade.yml + tags: kubelet + - include: install.yml tags: kubelet diff --git a/roles/kubernetes/node/tasks/pre_upgrade.yml b/roles/kubernetes/node/tasks/pre_upgrade.yml new file mode 100644 index 000000000..612dd3e6f --- /dev/null +++ b/roles/kubernetes/node/tasks/pre_upgrade.yml @@ -0,0 +1,6 @@ +--- +- name: "Pre-upgrade | copy /var/lib/cni from kubelet" + command: docker cp kubelet:/var/lib/cni /var/lib/cni + args: + creates: "/var/lib/cni" + failed_when: false diff --git a/roles/kubernetes/node/templates/kubelet-container.j2 b/roles/kubernetes/node/templates/kubelet-container.j2 index ffea01cec..70c317b78 100644 --- a/roles/kubernetes/node/templates/kubelet-container.j2 +++ b/roles/kubernetes/node/templates/kubelet-container.j2 @@ -21,6 +21,7 @@ -v {{ docker_daemon_graph }}:/var/lib/docker:rw \ -v /var/log:/var/log:rw \ -v /var/lib/kubelet:/var/lib/kubelet:shared \ + -v /var/lib/cni:/var/lib/cni:shared \ -v /var/run:/var/run:rw \ -v {{kube_config_dir}}:{{kube_config_dir}}:ro \ {{ hyperkube_image_repo }}:{{ hyperkube_image_tag}} \ diff --git a/roles/kubernetes/node/templates/kubelet.rkt.service.j2 b/roles/kubernetes/node/templates/kubelet.rkt.service.j2 index cc5af31ec..be8a13dbf 100644 --- a/roles/kubernetes/node/templates/kubelet.rkt.service.j2 +++ b/roles/kubernetes/node/templates/kubelet.rkt.service.j2 @@ -34,8 +34,10 @@ ExecStart=/usr/bin/rkt run \ {% if kube_network_plugin in ["calico", "weave", "canal"] %} --volume etc-cni,kind=host,source=/etc/cni,readOnly=true \ --volume opt-cni,kind=host,source=/opt/cni,readOnly=true \ + --volume var-lib-cni,kind=host,source=/var/lib/cni,readOnly=false \ --mount volume=etc-cni,target=/etc/cni \ --mount volume=opt-cni,target=/opt/cni \ + --mount volume=var-lib-cni,target=/var/lib/cni \ {% endif %} --mount volume=dns,target=/etc/resolv.conf \ --mount volume=etc-kubernetes,target={{ kube_config_dir }} \ From fd20e0de9028dd36215be4004d8b8d3a7d45df21 Mon Sep 17 00:00:00 2001 From: Matthew Mosesohn Date: Tue, 4 Apr 2017 13:12:24 +0300 Subject: [PATCH 16/16] Wait for container creation in check network test --- tests/testcases/030_check-network.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/testcases/030_check-network.yml b/tests/testcases/030_check-network.yml index c15a7539e..ee5f60785 100644 --- a/tests/testcases/030_check-network.yml +++ b/tests/testcases/030_check-network.yml @@ -21,6 +21,9 @@ - name: Get pod names shell: "{{bin_dir}}/kubectl get pods -o json" register: pods + until: '"ContainerCreating" not in pods.stdout' + retries: 60 + delay: 2 no_log: true - name: Get hostnet pods