mirror of https://github.com/easzlab/kubeasz.git
Revert "在k8s中,配置GPU节点"
parent
77fe605711
commit
cf7917aa1c

@ -6,4 +6,3 @@ hosts
 *.crt
 *.pem
 roles/prepare/files/ca*
-.idea

@ -1,3 +0,0 @@
-- hosts: gpu-node
-  roles:
-  - gpu

@ -1,46 +0,0 @@
-## 08-配置GPU-node节点.md
-
-Recommended reading: the [official GPU scheduling documentation](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/)
-
-### 1. Allow the k8s components to use Device Plugins
-
-How it is done: by modifying the kube-apiserver, kubelet and kube-proxy config file templates
-
-Since GPU nodes are a subset of kube nodes, running `90.setup.yml` as usual to set up k8s already covers this
-
-### 2. Configure the GPU nodes
-Download the driver matching your OS and GPU from the [official driver page](http://www.nvidia.com/Download/index.aspx?lang=en-uk),
-rename it to nvidia-diag-driver-local-repo.deb,
-and place it in the bin folder under the {{ base_dir }} path defined in the inventory file
-
-Run the command: `ansible-playbook 21.gpunode.yml`
-
-This is equivalent to performing the following tasks:
-
-#### 1). Install the nvidia driver on the GPU nodes
-It is recommended to install only the driver on the node and not the CUDA packages; put all CUDA packages into the container images instead, otherwise version mismatches are easy to run into.
-
-
-
-#### 2). Install nvidia docker 2.0 on the GPU nodes
-This step may fail, usually because the docker version on the system does not match the docker version that nvidia docker 2.0 depends on;
-see the [FAQ](https://github.com/NVIDIA/nvidia-docker/wiki/Frequently-Asked-Questions) for details
-
-
-Use `apt-cache madison nvidia-docker2 nvidia-container-runtime` to list the installable nvidia docker 2.0 builds and
-the docker versions they pair with
-
-Use `dpkg -l | grep docker` to check the docker version actually installed
-
-If the nvidia docker install fails, you can [manually upgrade docker](https://docs.docker.com/engine/installation/linux/docker-ce/ubuntu/#upgrade-docker-ce),
-then re-run the installation playbook
-
-
-#### 3). Configure nvidia-container-runtime as the docker default runtime on the GPU nodes
-
-Done by modifying `/etc/docker/daemon.json`
-
-### 3. Configure the Nvidia device plugin
-Run the command `kubectl create -f manifests/gpu-device-plugin/v1.9/nvidia-device-plugin.yml`
-
-Check whether the GPU nodes are configured successfully with `kubectl describe nodes | grep nvidia.com/gpu`
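
Section 2 of the removed document above says to rename the downloaded driver package and drop it into `{{ base_dir }}/bin`. A minimal sketch of that step, assuming the usual kubeasz base_dir of /etc/ansible and a made-up driver file name (use whatever file you actually downloaded):

```sh
# Stage the downloaded NVIDIA driver where roles/gpu expects it.
# The source file name below is only an example; /etc/ansible is the common
# kubeasz base_dir -- check your inventory if it differs.
cp ~/Downloads/nvidia-diag-driver-local-repo-ubuntu1604_384.66-1_amd64.deb \
   /etc/ansible/bin/nvidia-diag-driver-local-repo.deb
```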
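
Beyond the `kubectl describe nodes | grep nvidia.com/gpu` check in section 3, a quick smoke test is to schedule a pod that requests one GPU and runs nvidia-smi. The pod name and CUDA image tag below are only examples; pick an image compatible with the driver you installed:

```sh
cat <<EOF | kubectl create -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test
spec:
  restartPolicy: Never
  containers:
  - name: cuda
    image: nvidia/cuda:9.0-base        # example tag; must match the node driver
    command: ["nvidia-smi"]
    resources:
      limits:
        nvidia.com/gpu: 1              # resource exposed by the nvidia device plugin
EOF
kubectl logs gpu-smoke-test            # prints the nvidia-smi table once the pod has run
```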

@ -14,11 +14,6 @@
 [kube-node]
 192.168.1.1 NODE_ID=node1 NODE_IP="192.168.1.1"
 
-#gpu-node is a subset of kube-node
-[gpu-node]
-192.168.1.1 NODE_IP="192.168.1.1"
-
-
 [kube-cluster:children]
 kube-node
 kube-master

@ -30,12 +30,6 @@ MASTER_PORT="8443" # api-server service port
 192.168.1.3 NODE_ID=node2 NODE_IP="192.168.1.3"
 192.168.1.4 NODE_ID=node3 NODE_IP="192.168.1.4"
 
-#gpu-node is a subset of kube-node
-[gpu-node]
-192.168.1.2 NODE_IP="192.168.1.2"
-192.168.1.3 NODE_IP="192.168.1.3"
-
-
 [kube-cluster:children]
 kube-node
 kube-master

@ -18,12 +18,6 @@
 192.168.1.2 NODE_ID=node2 NODE_IP="192.168.1.2"
 192.168.1.3 NODE_ID=node3 NODE_IP="192.168.1.3"
 
-#gpu-node is a subset of kube-node
-[gpu-node]
-192.168.1.2 NODE_IP="192.168.1.2"
-192.168.1.3 NODE_IP="192.168.1.3"
-
-
 [kube-cluster:children]
 kube-node
 kube-master

@ -1,24 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: DaemonSet
-metadata:
-  name: nvidia-device-plugin-daemonset
-spec:
-  template:
-    metadata:
-      labels:
-        name: nvidia-device-plugin-ds
-    spec:
-      containers:
-      - image: nvidia/k8s-device-plugin:1.8
-        name: nvidia-device-plugin-ctr
-        securityContext:
-          allowPrivilegeEscalation: false
-          capabilities:
-            drop: ["ALL"]
-        volumeMounts:
-          - name: device-plugin
-            mountPath: /var/lib/kubelet/device-plugins
-      volumes:
-        - name: device-plugin
-          hostPath:
-            path: /var/lib/kubelet/device-plugins

@ -1,25 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: DaemonSet
-metadata:
-  name: nvidia-device-plugin-daemonset
-  namespace: kube-system
-spec:
-  template:
-    metadata:
-      labels:
-        name: nvidia-device-plugin-ds
-    spec:
-      containers:
-      - image: nvidia/k8s-device-plugin:1.9
-        name: nvidia-device-plugin-ctr
-        securityContext:
-          allowPrivilegeEscalation: false
-          capabilities:
-            drop: ["ALL"]
-        volumeMounts:
-          - name: device-plugin
-            mountPath: /var/lib/kubelet/device-plugins
-      volumes:
-        - name: device-plugin
-          hostPath:
-            path: /var/lib/kubelet/device-plugins

@ -1,12 +0,0 @@
-{
-  "registry-mirrors": ["https://registry.docker-cn.com"],
-  "max-concurrent-downloads": 6,
-  "default-runtime": "nvidia",
-  "runtimes": {
-    "nvidia": {
-      "path": "/usr/bin/nvidia-container-runtime",
-      "runtimeArgs": []
-    }
-  }
-}
-
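
The daemon.json removed above is what made nvidia the default docker runtime. After docker restarts, the effect can be confirmed with `docker info`; the commented lines show the expected shape of the output, not a capture from a real node:

```sh
docker info 2>/dev/null | grep -i runtime
#  Runtimes: nvidia runc
#  Default Runtime: nvidia
```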

@ -1,30 +0,0 @@
-- name: Download the nvidia driver package
-  copy: src={{ base_dir }}/bin/nvidia-diag-driver-local-repo.deb dest=/tmp/nvidia-diag-driver-local-repo.deb
-  when: ansible_distribution == "Ubuntu"
-
-- name: Install the nvidia driver
-  apt:
-    deb: /tmp/nvidia-diag-driver-local-repo.deb
-  when: ansible_distribution == "Ubuntu"
-
-- name: Add the package repositories 1
-  shell: curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | apt-key add -
-  when: ansible_distribution == "Ubuntu"
-
-- name: Add the package repositories 2
-  shell: curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu16.04/amd64/nvidia-docker.list | tee /etc/apt/sources.list.d/nvidia-docker.list
-  when: ansible_distribution == "Ubuntu"
-
-- name: Update apt-get and Install nvidia docker 2
-  apt:
-    name: nvidia-docker2
-    update_cache: yes
-  when: ansible_distribution == "Ubuntu"
-
-- name: Configure the nvidia docker runtime
-  copy: src=daemon.json dest=/etc/docker/daemon.json
-
-- name: restart docker
-  service:
-    name: docker
-    state: restarted
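
The role above installs whatever nvidia-docker2 apt resolves by default, which is exactly where the docker-version mismatch mentioned in the removed document tends to appear. A hedged manual fallback is to pin matching builds explicitly; the version strings below are placeholders only, pick ones whose docker dependency matches your installed docker-ce:

```sh
# See which builds are available and which docker versions they pair with
apt-cache madison nvidia-docker2 nvidia-container-runtime
dpkg -l | grep docker                      # docker version actually installed

# Pin a matching pair explicitly (placeholder versions shown)
apt-get install -y nvidia-docker2=2.0.3+docker18.06.1-1 \
                   nvidia-container-runtime=2.0.0+docker18.06.1-1
```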

@ -31,7 +31,6 @@ ExecStart={{ bin_dir }}/kube-apiserver \
   --audit-log-maxsize=100 \
   --audit-log-path=/var/lib/audit.log \
   --event-ttl=1h \
-  --feature-gates="DevicePlugins=true" \
   --v=2
 Restart=on-failure
 RestartSec=5

@ -12,7 +12,6 @@ ExecStart={{ bin_dir }}/kube-proxy \
   --hostname-override={{ NODE_IP }} \
   --kubeconfig=/etc/kubernetes/kube-proxy.kubeconfig \
   --logtostderr=true \
-  --feature-gates="DevicePlugins=true" \
   --v=2
 Restart=on-failure
 RestartSec=5

@ -23,7 +23,6 @@ ExecStart={{ bin_dir }}/kubelet \
   --allow-privileged=true \
   --fail-swap-on=false \
   --logtostderr=true \
-  --feature-gates="DevicePlugins=true" \
   --v=2
 #kubelet cAdvisor listens on port 4194 on all interfaces by default; the iptables rule below restricts access to the internal network
 ExecStartPost=/sbin/iptables -A INPUT -s 10.0.0.0/8 -p tcp --dport 4194 -j ACCEPT
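
The three hunks above remove `--feature-gates="DevicePlugins=true"` from the kube-apiserver, kube-proxy and kubelet unit templates. To see what an already-deployed node is running, a minimal check, assuming the rendered unit files live under /etc/systemd/system as in a default kubeasz install:

```sh
# Show whether the DevicePlugins feature gate is present in the rendered unit files
grep -n 'feature-gates' /etc/systemd/system/kubelet.service \
                        /etc/systemd/system/kube-proxy.service
systemctl daemon-reload && systemctl restart kubelet kube-proxy   # re-apply after changing the units
```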