prometheus策略文件编写样例

mac2026-02-10  14

前端将内容转换为json

// Collect one plain object per threshold row and serialise the list for the back end.
// Only `id` is shown here; presumably the other fields of each entry are filled the same way.
var _metricValues = [];
// The sample left the loop header empty (`for(){`), which does not parse.
// The body addresses rows with `.eq(j)`, so iterate over the matched rows by index.
for (var j = 0; j < $threshold.length; j++) {
  var _jsonObj = {};
  _jsonObj.id = $threshold.eq(j).attr('data-ruleId');
  _metricValues.push(_jsonObj);
}
// JSON text that is posted to the server.
var _jsonDatas = JSON.stringify(_metricValues);

json示例

前台传给后台的参数,yaml文件就是根据这个json生成的

[ { "id":"0c030a8bac537214e071ffec745077f6", "expr":"107,117,98,101,95,110,111,100,101,95,115,116,97,116,117,115,95,99,111,110,100,105,116,105,111,110,123,115,116,97,116,117,115,61,34,116,114,117,101,34,44,99,111,110,100,105,116,105,111,110,61,34,82,101,97,100,121,34,125", "summary":"{{ $labels.node}},{{ $value }},node,{{ $labels.nodeip }}", "metricName":"节点-健康情况", "measurementId":"Node-Healthy", "ynState":"1", "orignalStatus":"ANY_VALUE", "ynHealthy":"1", "comparator":"NEQ", "threshold":"0", "priority":"3", "trembleTimes":"30s", "alertContent":"节点[ {{ $labels.pod_name}} ]状态异常" }, { "id":"463cc6e522b502d71da6866475dd5422", "expr":"107,117,98,101,95,100,101,112,108,111,121,109,101,110,116,95,115,116,97,116,117,115,95,114,101,112,108,105,99,97,115,95,117,110,97,118,97,105,108,97,98,108,101", "summary":"{{ $labels.deployment}},{{ $value }},deployment,{{ $labels.nodeip}}", "metricName":"控制器-部署-健康情况", "measurementId":"Deployment-Healthy", "ynState":"1", "orignalStatus":"ANY_VALUE", "ynHealthy":"0", "comparator":"NEQ", "threshold":"0", "priority":"3", "trembleTimes":"30s", "alertContent":"命名空间[ {{ $labels.namespace}} ] 下,控制器-部署[ {{ $labels.deployment}} ]状态异常" }, { "id":"76f070b09c73007e4bfe3438d024f703", "expr":"107,117,98,101,95,100,97,101,109,111,110,115,101,116,95,115,116,97,116,117,115,95,110,117,109,98,101,114,95,117,110,97,118,97,105,108,97,98,108,101", "summary":"{{ $labels.daemonset}},{{ $value }},daemonset,{{ $labels.nodeip }}", "metricName":"控制器-守护进程集-健康情况", "measurementId":"Daemonset-Healthy", "ynState":"1", "orignalStatus":"ANY_VALUE", "ynHealthy":"0", "comparator":"NEQ", "threshold":"0", "priority":"3", "trembleTimes":"30s", "alertContent":"命名空间[ {{ $labels.namespace}} ] 下,控制器-守护进程集[ {{ $labels.daemonset}} ]状态异常" }, { "id":"e4a9dfa674cdf31cab1f127376aab2a3", 
"expr":"107,117,98,101,95,115,116,97,116,101,102,117,108,115,101,116,95,114,101,112,108,105,99,97,115,45,107,117,98,101,95,115,116,97,116,101,102,117,108,115,101,116,95,115,116,97,116,117,115,95,114,101,112,108,105,99,97,115,95,114,101,97,100,121", "summary":"{{ $labels.statefulset}},{{ $value }},statefulset,{{ $labels.nodeip}}", "metricName":"控制器-有状态副本集-健康情况", "measurementId":"Statefulset-Healthy", "ynState":"1", "orignalStatus":"ANY_VALUE", "ynHealthy":"0", "comparator":"NEQ", "threshold":"0", "priority":"3", "trembleTimes":"30s", "alertContent":"命名空间[ {{ $labels.namespace}} ] 下,控制器-有状态副本集[ {{ $labels.statefulset}} ]状态异常" }, { "id":"41ff54ace66c4fa8916c0d494204fd66", "expr":"107,117,98,101,95,112,111,100,95,115,116,97,116,117,115,95,112,104,97,115,101,123,112,104,97,115,101,61,34,82,117,110,110,105,110,103,34,125", "summary":"{{ $labels.pod}},{{ $value }},pod,{{ $labels.nodeip }}", "metricName":"容器组-健康情况", "measurementId":"Pod-Healthy", "ynState":"1", "orignalStatus":"ANY_VALUE", "ynHealthy":"0", "comparator":"EQ", "threshold":"0", "priority":"2", "trembleTimes":"30s", "alertContent":"命名空间[ {{ $labels.namespace}} ] 下,容器组[ {{ $labels.pod}} ]状态异常" }, { "id":"088a967da99f4c741e97ed0b4ee7c56a", "expr":"107,117,98,101,95,112,111,100,95,99,111,110,116,97,105,110,101,114,95,115,116,97,116,117,115,95,114,117,110,110,105,110,103", "summary":"{{ $labels.container}},{{ $value }},container,{{ $labels.nodeip }}", "metricName":"容器-健康情况", "measurementId":"Container-Healthy", "ynState":"1", "orignalStatus":"ANY_VALUE", "ynHealthy":"0", "comparator":"EQ", "threshold":"0", "priority":"2", "trembleTimes":"30s", "alertContent":"容器[ {{ $labels.container}} ]状态异常" }, { "id":"cf0041618371e8153edd10dca7595841", 
"expr":"115,117,109,32,98,121,40,112,111,100,95,110,97,109,101,44,110,97,109,101,115,112,97,99,101,44,110,111,100,101,105,112,41,32,40,114,97,116,101,40,99,111,110,116,97,105,110,101,114,95,99,112,117,95,117,115,97,103,101,95,115,101,99,111,110,100,115,95,116,111,116,97,108,123,105,109,97,103,101,33,61,34,34,44,110,97,109,101,115,112,97,99,101,33,61,34,34,125,91,49,109,93,41,41", "summary":"{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip }}", "metricName":"容器组-CPU使用情况", "measurementId":"Pod-CPU", "thresholdState":"static", "ynState":"0", "comparator":"GT", "threshold":"0.3", "priority":"1", "unit":"Core", "trembleTimes":"30s", "alertContent":"命名空间[ {{ $labels.namespace}} ] 下,容器组[ {{ $labels.pod_name}} ]CPU使用情况为{{$value}}{{comparator}}{{threshold}}" }, { "id":"e8ab18f5d9b541c2bcf443163069997d", "expr":"115,117,109,32,98,121,40,112,111,100,95,110,97,109,101,44,110,97,109,101,115,112,97,99,101,44,110,111,100,101,105,112,41,32,40,114,97,116,101,40,99,111,110,116,97,105,110,101,114,95,99,112,117,95,117,115,97,103,101,95,115,101,99,111,110,100,115,95,116,111,116,97,108,123,105,109,97,103,101,33,61,34,34,44,110,97,109,101,115,112,97,99,101,33,61,34,34,125,91,49,109,93,41,41", "summary":"{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}", "metricName":"容器组-CPU使用情况", "measurementId":"Pod-CPU", "thresholdState":"static", "ynState":"0", "comparator":"GT", "threshold":"0.5", "priority":"2", "unit":"Core", "trembleTimes":"30s", "alertContent":"命名空间[ {{ $labels.namespace}} ] 下,容器组[ {{ $labels.pod_name}} ]CPU使用情况为{{$value}}{{comparator}}{{threshold}}" }, { "id":"0b0c7ab033646ece5dc47c39854466a2", 
"expr":"115,117,109,32,98,121,40,112,111,100,95,110,97,109,101,44,110,97,109,101,115,112,97,99,101,44,110,111,100,101,105,112,41,32,40,114,97,116,101,40,99,111,110,116,97,105,110,101,114,95,99,112,117,95,117,115,97,103,101,95,115,101,99,111,110,100,115,95,116,111,116,97,108,123,105,109,97,103,101,33,61,34,34,44,110,97,109,101,115,112,97,99,101,33,61,34,34,125,91,49,109,93,41,41", "summary":"{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}", "metricName":"容器组-CPU使用情况", "measurementId":"Pod-CPU", "thresholdState":"static", "ynState":"0", "comparator":"GT", "threshold":"0.8", "priority":"3", "unit":"Core", "trembleTimes":"30s", "alertContent":"命名空间[ {{ $labels.namespace}} ] 下,容器组[ {{ $labels.pod_name}} ]CPU使用情况为{{$value}}{{comparator}}{{threshold}}" }, { "id":"2e143bdec6869764e15fc988cd920428", "expr":"115,117,109,32,98,121,40,112,111,100,95,110,97,109,101,44,110,97,109,101,115,112,97,99,101,44,110,111,100,101,105,112,41,32,40,99,111,110,116,97,105,110,101,114,95,109,101,109,111,114,121,95,117,115,97,103,101,95,98,121,116,101,115,123,105,109,97,103,101,33,61,34,34,44,110,97,109,101,115,112,97,99,101,33,61,34,34,125,41,47,49,48,50,52,47,49,48,50,52", "summary":"{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}", "metricName":"容器组-内存使用情况", "measurementId":"Pod-Memory", "thresholdState":"static", "ynState":"0", "comparator":"GT", "threshold":"500", "priority":"1", "unit":"MB", "trembleTimes":"30s", "alertContent":"命名空间[ {{ $labels.namespace}} ] 下,容器组[ {{ $labels.pod_name}} ]内存使用状态为{{$value}}{{comparator}}{{threshold}}" }, { "id":"1c694c321a21719dab5cb12201abecaf", 
"expr":"115,117,109,32,98,121,40,112,111,100,95,110,97,109,101,44,110,97,109,101,115,112,97,99,101,44,110,111,100,101,105,112,41,32,40,114,97,116,101,40,99,111,110,116,97,105,110,101,114,95,110,101,116,119,111,114,107,95,114,101,99,101,105,118,101,95,98,121,116,101,115,95,116,111,116,97,108,123,105,109,97,103,101,33,61,34,34,44,110,97,109,101,115,112,97,99,101,33,61,34,34,125,91,49,109,93,41,41,47,49,48,50,52", "summary":"{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}", "metricName":"容器组-流入流量", "measurementId":"Pod-NetReceive", "thresholdState":"static", "ynState":"0", "comparator":"GT", "threshold":"300", "priority":"1", "unit":"Kb", "trembleTimes":"30s", "alertContent":"命名空间[ {{ $labels.namespace}} ] 下,容器组[ {{ $labels.pod_name}} ]流入流量为{{$value}}{{comparator}}{{threshold}}" }, { "id":"ea0ef6e277a6d83ff65fa9312adb66bb", "expr":"115,117,109,32,98,121,40,112,111,100,95,110,97,109,101,44,110,97,109,101,115,112,97,99,101,44,110,111,100,101,105,112,41,32,40,114,97,116,101,40,99,111,110,116,97,105,110,101,114,95,110,101,116,119,111,114,107,95,116,114,97,110,115,109,105,116,95,98,121,116,101,115,95,116,111,116,97,108,123,105,109,97,103,101,33,61,34,34,44,110,97,109,101,115,112,97,99,101,33,61,34,34,125,91,49,109,93,41,41,47,49,48,50,52", "summary":"{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}", "metricName":"容器组-流出流量", "measurementId":"Pod-NetTransmit", "thresholdState":"static", "ynState":"0", "comparator":"GT", "threshold":"300", "priority":"1", "unit":"Kb", "trembleTimes":"30s", "alertContent":"命名空间[ {{ $labels.namespace}} ] 下,容器组[ {{ $labels.pod_name}} ]流出流量为{{$value}}{{comparator}}{{threshold}}" }, { "id":"180c4123a8c09b9554f911cf249a4e3c", 
"expr":"115,117,109,32,98,121,40,110,97,109,101,44,110,97,109,101,115,112,97,99,101,44,110,111,100,101,105,112,41,32,40,114,97,116,101,40,99,111,110,116,97,105,110,101,114,95,99,112,117,95,117,115,97,103,101,95,115,101,99,111,110,100,115,95,116,111,116,97,108,123,105,109,97,103,101,33,61,34,34,125,91,49,109,93,41,41", "summary":"{{ $labels.name}},{{ $value }},name,{{ $labels.nodeip }}", "metricName":"容器-CPU使用情况", "measurementId":"Container-CPU", "thresholdState":"static", "ynState":"0", "comparator":"GT", "threshold":"0.3", "priority":"1", "unit":"Core", "trembleTimes":"30s", "alertContent":"容器[ {{ $labels.name}} ]CPU使用情况为{{$value}}{{comparator}}{{threshold}}" } ]

json字段注释

orignalStatus #框架需要 comparator #状态类指标为EQ或NEQ,性能类指标为GT(大于)或LT(小于) threshold #阈值 priority #告警优先级 trembleTimes #对应rules文件的for(持续时间) alertContent #告警内容(对应rules文件annotations中的description)

prometheus策略文件

1.alertType毫无用处,可以去掉 groups: - name: MonitotingRules rules: - alert: Node-Healthy annotations: {description: '33410,28857,91,32,123,123,32,36,108,97,98,101,108,115,46,112,111,100,95,110,97,109,101,125,125,32,93,29366,24577,24322,24120', summary: '{{ $labels.node}},{{ $value }},node,{{ $labels.nodeip }}'} expr: kube_node_status_condition{status="true",condition="Ready"}==0 for: 30s labels: {alertType: Healthy, bindresource: '', comparator: EQ, expr: 'kube_node_status_condition{status="true",condition="Ready"}==0', forTime: 30s, kubernetesIP: '', metricName: 'kube_node_status_condition{status="true",condition="Ready"}', resourceType: Node, severity: emergency, threshold: '0', unit: null, ynHealthy: '0', ynState: '1'} - alert: Deployment-Healthy annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,25511,21046,22120,45,37096,32626,91,32,123,123,32,36,108,97,98,101,108,115,46,100,101,112,108,111,121,109,101,110,116,125,125,32,93,29366,24577,24322,24120', summary: '{{ $labels.deployment}},{{ $value }},deployment,{{ $labels.nodeip}}'} expr: kube_deployment_status_replicas_unavailable!=0 for: 30s labels: {alertType: Healthy, bindresource: '', comparator: NEQ, expr: 'kube_deployment_status_replicas_unavailable!=0', forTime: 30s, kubernetesIP: '', metricName: kube_deployment_status_replicas_unavailable, resourceType: Deployment, severity: emergency, threshold: '0', unit: null, ynHealthy: '0', ynState: '1'} - alert: Daemonset-Healthy annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,25511,21046,22120,45,23432,25252,36827,31243,38598,91,32,123,123,32,36,108,97,98,101,108,115,46,100,97,101,109,111,110,115,101,116,125,125,32,93,29366,24577,24322,24120', summary: '{{ $labels.daemonset}},{{ $value }},daemonset,{{ $labels.nodeip }}'} expr: 
kube_daemonset_status_number_unavailable!=0 for: 30s labels: {alertType: Healthy, bindresource: '', comparator: NEQ, expr: 'kube_daemonset_status_number_unavailable!=0', forTime: 30s, kubernetesIP: '', metricName: kube_daemonset_status_number_unavailable, resourceType: Daemonset, severity: emergency, threshold: '0', unit: null, ynHealthy: '0', ynState: '1'} - alert: Statefulset-Healthy annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,25511,21046,22120,45,26377,29366,24577,21103,26412,38598,91,32,123,123,32,36,108,97,98,101,108,115,46,115,116,97,116,101,102,117,108,115,101,116,125,125,32,93,29366,24577,24322,24120', summary: '{{ $labels.statefulset}},{{ $value }},statefulset,{{ $labels.nodeip}}'} expr: kube_statefulset_replicas-kube_statefulset_status_replicas_ready!=0 for: 30s labels: {alertType: Healthy, bindresource: '', comparator: NEQ, expr: 'kube_statefulset_replicas-kube_statefulset_status_replicas_ready!=0', forTime: 30s, kubernetesIP: '', metricName: kube_statefulset_replicas-kube_statefulset_status_replicas_ready, resourceType: Statefulset, severity: emergency, threshold: '0', unit: null, ynHealthy: '0', ynState: '1'} - alert: Pod-Healthy annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,23481,22120,32452,91,32,123,123,32,36,108,97,98,101,108,115,46,112,111,100,125,125,32,93,29366,24577,24322,24120', summary: '{{ $labels.pod}},{{ $value }},pod,{{ $labels.nodeip }}'} expr: kube_pod_status_phase{phase="Running"}==0 for: 30s labels: {alertType: Healthy, bindresource: '', comparator: EQ, expr: 'kube_pod_status_phase{phase="Running"}==0', forTime: 30s, kubernetesIP: '', metricName: 'kube_pod_status_phase{phase="Running"}', resourceType: Pod, severity: critical, threshold: '0', unit: null, ynHealthy: '0', ynState: '1'} - alert: Container-Healthy 
annotations: {description: '23481,22120,91,32,123,123,32,36,108,97,98,101,108,115,46,99,111,110,116,97,105,110,101,114,125,125,32,93,29366,24577,24322,24120', summary: '{{ $labels.container}},{{ $value }},container,{{ $labels.nodeip }}'} expr: kube_pod_container_status_running==0 for: 30s labels: {alertType: Healthy, bindresource: '', comparator: EQ, expr: 'kube_pod_container_status_running==0', forTime: 30s, kubernetesIP: '', metricName: kube_pod_container_status_running, resourceType: Container, severity: critical, threshold: '0', unit: null, ynHealthy: '0', ynState: '1'} - alert: Pod-CPU annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,23481,22120,32452,91,32,123,123,32,36,108,97,98,101,108,115,46,112,111,100,95,110,97,109,101,125,125,32,93,67,80,85,20351,29992,24773,20917,20026,123,123,36,118,97,108,117,101,125,125,123,123,99,111,109,112,97,114,97,116,111,114,125,125,123,123,116,104,114,101,115,104,111,108,100,125,125', summary: '{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip }}'} expr: sum by(pod_name,namespace,nodeip) (rate(container_cpu_usage_seconds_total{image!="",namespace!=""}[1m]))>0.3 for: 30s labels: {alertType: CPU, bindresource: '', comparator: GT, expr: 'sum by(pod_name,namespace,nodeip) (rate(container_cpu_usage_seconds_total{image!="",namespace!=""}[1m]))>0.3', forTime: 30s, kubernetesIP: '', metricName: 'sum by(pod_name,namespace,nodeip) (rate(container_cpu_usage_seconds_total{image!="",namespace!=""}[1m]))', resourceType: Pod, severity: warning, threshold: '0.3', unit: Core, ynHealthy: null, ynState: '0'} - alert: Pod-CPU annotations: {description: 
'21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,23481,22120,32452,91,32,123,123,32,36,108,97,98,101,108,115,46,112,111,100,95,110,97,109,101,125,125,32,93,67,80,85,20351,29992,24773,20917,20026,123,123,36,118,97,108,117,101,125,125,123,123,99,111,109,112,97,114,97,116,111,114,125,125,123,123,116,104,114,101,115,104,111,108,100,125,125', summary: '{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}'} expr: sum by(pod_name,namespace,nodeip) (rate(container_cpu_usage_seconds_total{image!="",namespace!=""}[1m]))>0.5 for: 30s labels: {alertType: CPU, bindresource: '', comparator: GT, expr: 'sum by(pod_name,namespace,nodeip) (rate(container_cpu_usage_seconds_total{image!="",namespace!=""}[1m]))>0.5', forTime: 30s, kubernetesIP: '', metricName: 'sum by(pod_name,namespace,nodeip) (rate(container_cpu_usage_seconds_total{image!="",namespace!=""}[1m]))', resourceType: Pod, severity: critical, threshold: '0.5', unit: Core, ynHealthy: null, ynState: '0'} - alert: Pod-CPU annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,23481,22120,32452,91,32,123,123,32,36,108,97,98,101,108,115,46,112,111,100,95,110,97,109,101,125,125,32,93,67,80,85,20351,29992,24773,20917,20026,123,123,36,118,97,108,117,101,125,125,123,123,99,111,109,112,97,114,97,116,111,114,125,125,123,123,116,104,114,101,115,104,111,108,100,125,125', summary: '{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}'} expr: sum by(pod_name,namespace,nodeip) (rate(container_cpu_usage_seconds_total{image!="",namespace!=""}[1m]))>0.8 for: 30s labels: {alertType: CPU, bindresource: '', comparator: GT, expr: 'sum by(pod_name,namespace,nodeip) (rate(container_cpu_usage_seconds_total{image!="",namespace!=""}[1m]))>0.8', forTime: 30s, kubernetesIP: '', metricName: 'sum by(pod_name,namespace,nodeip) 
(rate(container_cpu_usage_seconds_total{image!="",namespace!=""}[1m]))', resourceType: Pod, severity: emergency, threshold: '0.8', unit: Core, ynHealthy: null, ynState: '0'} - alert: Pod-Memory annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,23481,22120,32452,91,32,123,123,32,36,108,97,98,101,108,115,46,112,111,100,95,110,97,109,101,125,125,32,93,20869,23384,20351,29992,29366,24577,20026,123,123,36,118,97,108,117,101,125,125,123,123,99,111,109,112,97,114,97,116,111,114,125,125,123,123,116,104,114,101,115,104,111,108,100,125,125', summary: '{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}'} expr: sum by(pod_name,namespace,nodeip) (container_memory_usage_bytes{image!="",namespace!=""})/1024/1024>500 for: 30s labels: {alertType: Memory, bindresource: '', comparator: GT, expr: 'sum by(pod_name,namespace,nodeip) (container_memory_usage_bytes{image!="",namespace!=""})/1024/1024>500', forTime: 30s, kubernetesIP: '', metricName: 'sum by(pod_name,namespace,nodeip) (container_memory_usage_bytes{image!="",namespace!=""})/1024/1024', resourceType: Pod, severity: warning, threshold: '500', unit: MB, ynHealthy: null, ynState: '0'} - alert: Pod-NetReceive annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,23481,22120,32452,91,32,123,123,32,36,108,97,98,101,108,115,46,112,111,100,95,110,97,109,101,125,125,32,93,27969,20837,27969,37327,20026,123,123,36,118,97,108,117,101,125,125,123,123,99,111,109,112,97,114,97,116,111,114,125,125,123,123,116,104,114,101,115,104,111,108,100,125,125', summary: '{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}'} expr: sum by(pod_name,namespace,nodeip) (rate(container_network_receive_bytes_total{image!="",namespace!=""}[1m]))/1024>200 for: 30s labels: {alertType: NetReceive, bindresource: '', comparator: GT, 
expr: 'sum by(pod_name,namespace,nodeip) (rate(container_network_receive_bytes_total{image!="",namespace!=""}[1m]))/1024>200', forTime: 30s, kubernetesIP: '', metricName: 'sum by(pod_name,namespace,nodeip) (rate(container_network_receive_bytes_total{image!="",namespace!=""}[1m]))/1024', resourceType: Pod, severity: warning, threshold: '200', unit: Kb, ynHealthy: null, ynState: '0'} - alert: Pod-NetTransmit annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,23481,22120,32452,91,32,123,123,32,36,108,97,98,101,108,115,46,112,111,100,95,110,97,109,101,125,125,32,93,27969,20986,27969,37327,20026,123,123,36,118,97,108,117,101,125,125,123,123,99,111,109,112,97,114,97,116,111,114,125,125,123,123,116,104,114,101,115,104,111,108,100,125,125', summary: '{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}'} expr: sum by(pod_name,namespace,nodeip) (rate(container_network_transmit_bytes_total{image!="",namespace!=""}[1m]))/1024>200 for: 30s labels: {alertType: NetTransmit, bindresource: '', comparator: GT, expr: 'sum by(pod_name,namespace,nodeip) (rate(container_network_transmit_bytes_total{image!="",namespace!=""}[1m]))/1024>200', forTime: 30s, kubernetesIP: '', metricName: 'sum by(pod_name,namespace,nodeip) (rate(container_network_transmit_bytes_total{image!="",namespace!=""}[1m]))/1024', resourceType: Pod, severity: warning, threshold: '200', unit: Kb, ynHealthy: null, ynState: '0'} - alert: Container-CPU annotations: {description: '23481,22120,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,125,125,32,93,67,80,85,20351,29992,24773,20917,20026,123,123,36,118,97,108,117,101,125,125,123,123,99,111,109,112,97,114,97,116,111,114,125,125,123,123,116,104,114,101,115,104,111,108,100,125,125', summary: '{{ $labels.name}},{{ $value }},name,{{ $labels.nodeip }}'} expr: sum by(name,namespace,nodeip) (rate(container_cpu_usage_seconds_total{image!=""}[1m]))>0.3 
for: 30s labels: {alertType: CPU, bindresource: '', comparator: GT, expr: 'sum by(name,namespace,nodeip) (rate(container_cpu_usage_seconds_total{image!=""}[1m]))>0.3', forTime: 30s, kubernetesIP: '', metricName: 'sum by(name,namespace,nodeip) (rate(container_cpu_usage_seconds_total{image!=""}[1m]))', resourceType: Container, severity: warning, threshold: '0.3', unit: Core, ynHealthy: null, ynState: '0'} kind: ConfigMap metadata: name: all-prometheus-rules namespace: monitoring

prometheus相关策略直观展示(yaml)

alert: Node-Healthy expr: kube_node_status_condition{condition="Ready",status="true"} != 0 for: 30s labels: alertType: Healthy bindresource: "" comparator: NEQ expr: kube_node_status_condition{status="true",condition="Ready"}!=0 forTime: 30s kubernetesIP: 10.4.41.63 metricName: kube_node_status_condition{status="true",condition="Ready"} resourceType: Node severity: emergency threshold: "0" unit: "" ynHealthy: "1" ynState: "1" annotations: description: 33410,28857,91,32,123,123,32,36,108,97,98,101,108,115,46,112,111,100,95,110,97,109,101,125,125,32,93,29366,24577,24322,24120 summary: '{{ $labels.node}},{{ $value }},node,{{ $labels.nodeip }}'

相关公式

#节点健康 kube_node_status_condition{condition="Ready",status="true"}!= 0 #节点不健康 kube_node_status_condition{condition="Ready",status="true"}== 0

来源注释

rule中各变量的来源 #alertName 来源:前台json String alertName = jo.getString("measurementId"); #expr 来源:参数拼凑 String expr = metricName+trans2Comparator(comparator)+threshold; #metricName 来源:如果在prometheus-rules中有这条告警策略,则从这条策略的labels中读取;如果没有,说明这条策略之前被删除了,则从all-prometheus-rules中读取。这个属性在一开始就写好在策略文件里了 String metricName = rule.getLabels().getMetricName(); #comparator 来源:前台json String comparator = jo.getString("comparator"); #threshold 来源:前台json String threshold = jo.getString("threshold"); #for 来源:前台json String trembleTimes = jo.getString("trembleTimes"); labels #alertType 来源:alertName拆分(取"-"之后的部分) String alertType = alertName.split("-")[1]; #bindresource 来源:如果在prometheus-rules中有这条告警策略,则从这条策略的labels中读取;如果没有,则置为空 //第一次保存策略时将ip注入到策略文件 //没有保存操作时kubernetesIP 一直为空 #kubernetesIP 来源:根据传入的resourceId确定 #resourceType 来源:alertName拆分(取"-"之前的部分) String resourceType = alertName.split("-")[0]; #severity 来源:前台json String priority = trans2Priority(jo.getString("priority")); #threshold 来源:前台json String threshold = jo.getString("threshold"); #unit 来源:前台json String unit = jo.getString("unit"); #ynHealthy 来源:前台json String ynHealthy = jo.getString("ynHealthy"); #ynState 来源:前台json String ynState = jo.getString("ynState"); annotations #description 来源:前台json中的alertContent字段逐字符转换成以逗号分隔的字符编码(Unicode码点) String description = sbDescription.toString(); #summary 来源:前台json String summary = jo.getString("summary");

Java代码

// ---- Save flow: rebuild the "prometheus-rules" ConfigMap from the JSON posted by the front end ----
// Request parameters: the target resource id and the rule list as a JSON array.
String resourceId = ServletActionContext.getRequest().getParameter("resourceId");
String jsonData = ServletActionContext.getRequest().getParameter("jsonData");
JSONArray resultArray = JSONArray.parseArray(jsonData);
PromRules pr = new PromRules();
// Declared up front: the original sample used `yaml` inside the loop below
// but only declared it later, in the write-back part.
Yaml yaml = new Yaml();
// Fill PromRules step by step, one JSON entry per iteration.
for (int i = 0; i < resultArray.size(); i++) {
    JSONObject jo = resultArray.getJSONObject(i);
    // Read alertName and the other per-rule values from the JSON entry.
    // (Reading comparator, threshold, trembleTimes, priority, summary, ynState, unit,
    // description and ynHealthy, and resolving masterHostIp, is elided in this sample.)
    String alertName = jo.getString("measurementId");
    Map<String, String> map = getConfigMap(resourceId, "prometheus-rules");
    String metricName = "";
    String bindresource = "";
    for (String fileName : map.keySet()) {
        // yamlContent is the complete rules document in YAML form (it starts at "groups:").
        String yamlContent = map.get(fileName);
        // Convert the YAML text into our PromRules object.
        PromRules prReadMap = yaml.loadAs(yamlContent, PromRules.class);
        // Take the rules of the first group.
        List<Rule> rules = prReadMap.getGroups().get(0).getRules();
        for (int j = 0; j < rules.size(); j++) {
            Rule rule = rules.get(j);
            if (rule.getAlert().equals(alertName)) {
                metricName = rule.getLabels().getMetricName();
                bindresource = rule.getLabels().getBindresource();
                break;
            }
        }
        // A rule that was deleted earlier is no longer in prometheus-rules when it is added
        // again, so fall back to the full catalogue in all-prometheus-rules.
        if (metricName.equals("")) {
            Map<String, String> allRulesMap = getConfigMap(resourceId, "all-prometheus-rules");
            Yaml allRulesYaml = new Yaml();
            // The locals below carry an "all" prefix: the original sample re-declared
            // fileName/yamlContent/prReadMap/rules here, which Java rejects because the
            // enclosing declarations are still in scope.
            for (String allFileName : allRulesMap.keySet()) {
                String allYamlContent = allRulesMap.get(allFileName);
                PromRules allPromRules = allRulesYaml.loadAs(allYamlContent, PromRules.class);
                List<Rule> allRules = allPromRules.getGroups().get(0).getRules();
                for (int j = 0; j < allRules.size(); j++) {
                    Rule rule = allRules.get(j);
                    if (rule.getAlert().equals(alertName)) {
                        metricName = rule.getLabels().getMetricName();
                        // A rule restored from the catalogue has no bound resource yet.
                        bindresource = "";
                        break;
                    }
                }
            }
        }
        // masterHostIp is derived from resourceId.
        // alertName, comparator, threshold, trembleTimes, priority, summary, ynState, unit,
        // description and ynHealthy come from the JSON parameter.
        // metricName, e.g. kube_node_status_condition{status="true",condition="Ready"}, is looked up
        // in the rules file by alertName (it is hard-coded in the labels of the matching rule).
        // bindresource is looked up in the rules file by alertName (stored in the labels of the
        // matching rule and changed every time a resource is bound).
        pr = generateRules(pr, masterHostIp, alertName, metricName, comparator, threshold, trembleTimes, priority, summary, ynState, unit, bindresource, description, ynHealthy);
    }
}

// ---- Write the regenerated rules back to the cluster ----
KubernetesClient client = null;
client = getK8sClient(client, masterHostIp, masterHostPort, username, password);
ConfigMap cm = client.configMaps().inNamespace("monitoring").withName("prometheus-rules").get();
// The map key is prometheusRules.yml, the value is the content of the rules file.
Map<String, String> map = cm.getData();
for (String fileName : map.keySet()) {
    String yamlContent = map.get(fileName);
    PromRules prRead = yaml.loadAs(yamlContent, PromRules.class);
    // pr holds the newly generated groups, prRead the existing ones; the new groups overwrite the old.
    prRead.setGroups(pr.getGroups());
    StringWriter sw = new StringWriter();
    // After yaml.dump the writer holds the serialised rules.
    // NOTE(review): prRead is updated above but pr is what gets dumped — confirm which one is intended.
    yaml.dump(pr, sw);
    Map<String, String> maptest = new HashMap<String, String>();
    maptest.put("prometheusRules.yml", sw.toString());
    cm.setData(maptest);
    client.configMaps().inNamespace("monitoring").withName("prometheus-rules").replace(cm);
    // Run the reload command on a separate thread (elided in this sample).
}

/**
 * Reads the data section of a ConfigMap in the "monitoring" namespace.
 *
 * @param resourceId id of the target resource (not used by the lookup shown here)
 * @param name       ConfigMap name, e.g. "prometheus-rules" or "all-prometheus-rules"
 * @return file name mapped to file content; an empty map when the ConfigMap or its data is absent
 */
public Map<String, String> getConfigMap(String resourceId, String name) {
    client = getK8sClient(client, hostIP, hostPort, userName, password);
    ConfigMap cm = client.configMaps().inNamespace("monitoring").withName(name).get();
    // Guard against a missing ConfigMap so callers can iterate the result without a null check.
    if (cm == null || cm.getData() == null) {
        return new HashMap<String, String>();
    }
    return cm.getData();
}

/**
 * Translates the comparator code sent by the front end into the operator used in the rule expression.
 *
 * @param comp one of "GT", "LT", "EQ", "NEQ"
 * @return ">", "<", "==" or "!=" respectively; an empty string for any other value, including null
 */
public String trans2Comparator(String comp) {
    // Constant-first equals keeps a null argument from throwing.
    if ("GT".equals(comp)) {
        return ">";
    }
    if ("LT".equals(comp)) {
        return "<";
    }
    if ("EQ".equals(comp)) {
        return "==";
    }
    if ("NEQ".equals(comp)) {
        return "!=";
    }
    return "";
}
最新回复(0)