From a0f9f1cbe01897a560dc1c5dcd16c455cc74bf7f Mon Sep 17 00:00:00 2001
From: roc
Date: Sat, 27 Apr 2024 14:34:13 +0800
Subject: [PATCH] update at 2024-04-27 14:34:13

---
 codeblock/llama/ollama-nodeselector.yaml | 45 ++++++++++++++++++++++++
 content/cases/llama3.md                  | 19 ++++++++--
 2 files changed, 62 insertions(+), 2 deletions(-)
 create mode 100644 codeblock/llama/ollama-nodeselector.yaml

diff --git a/codeblock/llama/ollama-nodeselector.yaml b/codeblock/llama/ollama-nodeselector.yaml
new file mode 100644
index 0000000..ed9b296
--- /dev/null
+++ b/codeblock/llama/ollama-nodeselector.yaml
@@ -0,0 +1,45 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: ollama
+  namespace: llama
+spec:
+  serviceName: "ollama"
+  replicas: 1
+  selector:
+    matchLabels:
+      app: ollama
+  template:
+    metadata:
+      labels:
+        app: ollama
+    spec:
+      # highlight-start
+      nodeSelector:
+        gpu: v100  # must match the label applied to the target GPU nodes
+      # highlight-end
+      containers:
+        - name: ollama
+          image: ollama/ollama:latest
+          ports:
+            - containerPort: 11434
+          resources:
+            requests:
+              cpu: "2000m"
+              memory: "2Gi"
+            limits:
+              cpu: "4000m"
+              memory: "4Gi"
+              nvidia.com/gpu: "4"  # extended resources must be set in limits; the request defaults to it
+          volumeMounts:
+            - name: ollama-volume
+              mountPath: /root/.ollama
+          tty: true
+  volumeClaimTemplates:
+    - metadata:
+        name: ollama-volume
+      spec:
+        accessModes: ["ReadWriteOnce"]
+        resources:
+          requests:
+            storage: 200Gi
diff --git a/content/cases/llama3.md b/content/cases/llama3.md
index ebb9353..01d1c19 100644
--- a/content/cases/llama3.md
+++ b/content/cases/llama3.md
@@ -103,7 +103,9 @@ ollama pull llama3:70b
 
 ## 开始对话
 
-打开 `OpenWebUI` 页面,
+打开 `OpenWebUI` 页面,选择模型,然后就可以在对话框中开始对话了。
+
+![](https://image-host-1251893006.cos.ap-chengdu.myqcloud.com/2024%2F04%2F27%2F20240427135707.png)
 
 ## 小技巧
 
@@ -111,7 +113,20 @@ ollama pull llama3:70b
 
 对于像 `70b` 这样的模型,需要较好的 GPU 才能跑起来,如果集群内有多种 GPU 节点,需要加下调度策略,避免分配到较差的 GPU。
 
-比如要调度到显卡型号为 `Nvdia Tesla V100` 的节点,
+比如要调度到显卡型号为 `NVIDIA Tesla V100` 的节点,可以给节点打上 label:
+
+```bash
+kubectl label node <node-name> gpu=v100
+```
+
+然后配置下调度策略(高亮部分):
+
+
+
+### 省钱小妙招
+
+* 如果使用云厂商托管的 Kubernetes 集群,且不需要大模型高可用,可以购买竞价实例(Spot),会便宜很多。
+* 如果只在部分时间段使用,可以使用定时伸缩,在不需要的时间段将 Ollama 和 OpenWebUI 的副本数自动缩到 0 以停止计费,比如 [使用 KEDA 的 Cron 触发器实现定时伸缩](../best-practices/autoscaling/keda/cron)。
 
 ## 常见问题