From ad3ea3eab5fc335858ea5863cf59c5936f6a6361 Mon Sep 17 00:00:00 2001
From: gjmzj
Date: Sat, 2 Feb 2019 17:35:50 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20prometheus=20=E5=91=8A?=
 =?UTF-8?q?=E8=AD=A6=E5=8F=91=E9=80=81=E9=92=89=E9=92=89=E9=85=8D=E7=BD=AE?=
 =?UTF-8?q?=E5=92=8C=E6=96=87=E6=A1=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Note: this patch updates the Prometheus guide and manifests so that
Alertmanager can send alerts to DingTalk through a webhook plugin. It adds
a Deployment and Service (manifests/prometheus/dingtalk-webhook.yaml) and
switches the Alertmanager route to a new "dingtalk" receiver.

 docs/guide/prometheus.md                     | 49 ++++++++++++++++----
 manifests/prometheus/dingtalk-webhook.yaml   | 42 +++++++++++++++++
 manifests/prometheus/prom-alertsmanager.yaml |  8 +++-
 3 files changed, 89 insertions(+), 10 deletions(-)
 create mode 100644 manifests/prometheus/dingtalk-webhook.yaml

diff --git a/docs/guide/prometheus.md b/docs/guide/prometheus.md
index 8fa84ea..ef8f783 100644
--- a/docs/guide/prometheus.md
+++ b/docs/guide/prometheus.md
@@ -98,18 +98,49 @@ $ helm del --tls grafana --purge
 ## 验证告警
 
 - 修改`prom-alertsmanager.yaml`文件中邮件告警为有效的配置内容,并使用 helm upgrade更新安装
-- 查看`prom-alertrules.yaml`文件,确认文件中设置了内存使用超过90%的告警规则
-- 部署测试应用,并压力测试使其内存超过90%,看是否触发告警并发送告警邮件
+- 手动临时关闭 master 节点的 kubelet 服务,等待几分钟看是否有告警邮件发送
+
 ``` bash
-# 创建deploy和service
-$ kubectl run nginx1 --image=nginx --port=80 --expose --limits='cpu=500m,memory=4Mi'
+# 在 master 节点运行
+$ systemctl stop kubelet
+```
 
-# 增加负载(可用Ctrl + C 停止)
-$ kubectl run --rm -it load-generator --image=busybox /bin/sh
-Hit enter for command prompt
-$ while true; do wget -q -O- http://nginx1; done;
+## [可选] 配置钉钉告警
 
-# 等待约几分钟查看是否有告警
+- 创建钉钉群,获取群机器人 webhook 地址
+
+使用钉钉创建群聊以后可以方便设置群机器人,【群设置】-【群机器人】-【添加】-【自定义】-【添加】,然后按提示操作即可,参考 https://open-doc.dingtalk.com/docs/doc.htm?spm=a219a.7629140.0.0.666d4a97eCG7XA&treeId=257&articleId=105735&docType=1
+
+上述配置好群机器人,获得这个机器人对应的Webhook地址,记录下来,后续配置钉钉告警插件要用,格式如下
+
+```
+https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxx
+```
+
+- 创建钉钉告警插件,参考 http://theo.im/blog/2017/10/16/release-prometheus-alertmanager-webhook-for-dingtalk/
+
+``` bash
+# 编辑修改文件中 access_token=xxxxxx 为上一步你获得的机器人认证 token
+$ vi /etc/ansible/manifests/prometheus/dingtalk-webhook.yaml
+# 运行插件
+$ kubectl apply -f /etc/ansible/manifests/prometheus/dingtalk-webhook.yaml
+```
+
+- 修改 alertsmanager 告警配置后,更新 helm prometheus 部署,成功后如上节测试告警发送
+
+``` bash
+# 修改 alertsmanager 告警配置
+$ cd /etc/ansible/manifests/prometheus
+$ vi prom-alertsmanager.yaml
+# 增加 receiver dingtalk,然后在 route 配置使用 receiver: dingtalk
+    receivers:
+    - name: dingtalk
+      webhook_configs:
+      - send_resolved: false
+        url: http://webhook-dingtalk.monitoring.svc.cluster.local:8060/dingtalk/webhook1/send
+# ...
+# 更新 helm prometheus 部署
+$ helm upgrade --tls monitor -f prom-settings.yaml -f prom-alertsmanager.yaml -f prom-alertrules.yaml prometheus
 ```
 
 ## 下一步
diff --git a/manifests/prometheus/dingtalk-webhook.yaml b/manifests/prometheus/dingtalk-webhook.yaml
new file mode 100644
index 0000000..ce60f17
--- /dev/null
+++ b/manifests/prometheus/dingtalk-webhook.yaml
@@ -0,0 +1,42 @@
+---
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  labels:
+    run: dingtalk
+  name: webhook-dingtalk
+  namespace: monitoring
+spec:
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        run: dingtalk
+    spec:
+      containers:
+      - name: dingtalk
+        image: timonwong/prometheus-webhook-dingtalk:v0.3.0
+        imagePullPolicy: IfNotPresent
+        # 设置钉钉群聊自定义机器人后,使用实际 access_token 替换下面 xxxxxx部分
+        args:
+        - --ding.profile=webhook1=https://oapi.dingtalk.com/robot/send?access_token=xxxxxx
+        ports:
+        - containerPort: 8060
+          protocol: TCP
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    run: dingtalk
+  name: webhook-dingtalk
+  namespace: monitoring
+spec:
+  ports:
+  - port: 8060
+    protocol: TCP
+    targetPort: 8060
+  selector:
+    run: dingtalk
+  sessionAffinity: None
diff --git a/manifests/prometheus/prom-alertsmanager.yaml b/manifests/prometheus/prom-alertsmanager.yaml
index 6c8d43b..9ca7897 100644
--- a/manifests/prometheus/prom-alertsmanager.yaml
+++ b/manifests/prometheus/prom-alertsmanager.yaml
@@ -11,11 +11,17 @@ alertmanagerFiles:
       group_by: ['alertname', 'pod_name']
       group_wait: 10s
       group_interval: 5m
-      receiver: AlertMail
+      #receiver: AlertMail
+      receiver: dingtalk
       repeat_interval: 3h
 
     receivers:
     - name: 'AlertMail'
       email_configs:
       - to: 'xxxx@163.com'
+    - name: dingtalk
+      webhook_configs:
+      - send_resolved: false
+        # 需要运行插件 dingtalk-webhook.yaml,详情阅读 docs/guide/prometheus.md
+        url: http://webhook-dingtalk.monitoring.svc.cluster.local:8060/dingtalk/webhook1/send
 