前端将内容转换为json
// Collect one object per threshold row and serialize the list as the JSON
// string that is posted to the back end.
var _metricValues = [];
// The original loop header was left empty (`for(){`), which is a syntax error
// and never defines the index `j` used below. Iterate by index over the
// collection. Assumes $threshold is a jQuery collection (it is used with
// .eq()/.attr()) - TODO confirm.
for (var j = 0; j < $threshold.length; j++) {
    var _jsonObj = {};
    // Only `id` is populated in this excerpt; the sample payload further down
    // carries more fields, presumably filled in the same way.
    _jsonObj.id = $threshold.eq(j).attr('data-ruleId');
    _metricValues.push(_jsonObj);
}
var _jsonDatas = JSON.stringify(_metricValues);
json示例
前台传给后台的参数,yaml文件就是根据这个json生成的
[
{
"id":"0c030a8bac537214e071ffec745077f6",
"expr":"107,117,98,101,95,110,111,100,101,95,115,116,97,116,117,115,95,99,111,110,100,105,116,105,111,110,123,115,116,97,116,117,115,61,34,116,114,117,101,34,44,99,111,110,100,105,116,105,111,110,61,34,82,101,97,100,121,34,125",
"summary":"{{ $labels.node}},{{ $value }},node,{{ $labels.nodeip }}",
"metricName":"节点-健康情况",
"measurementId":"Node-Healthy",
"ynState":"1",
"orignalStatus":"ANY_VALUE",
"ynHealthy":"1",
"comparator":"NEQ",
"threshold":"0",
"priority":"3",
"trembleTimes":"30s",
"alertContent":"节点[ {{ $labels.pod_name}} ]状态异常"
},
{
"id":"463cc6e522b502d71da6866475dd5422",
"expr":"107,117,98,101,95,100,101,112,108,111,121,109,101,110,116,95,115,116,97,116,117,115,95,114,101,112,108,105,99,97,115,95,117,110,97,118,97,105,108,97,98,108,101",
"summary":"{{ $labels.deployment}},{{ $value }},deployment,{{ $labels.nodeip}}",
"metricName":"控制器-部署-健康情况",
"measurementId":"Deployment-Healthy",
"ynState":"1",
"orignalStatus":"ANY_VALUE",
"ynHealthy":"0",
"comparator":"NEQ",
"threshold":"0",
"priority":"3",
"trembleTimes":"30s",
"alertContent":"命名空间[ {{ $labels.namespace}} ] 下,控制器-部署[ {{ $labels.deployment}} ]状态异常"
},
{
"id":"76f070b09c73007e4bfe3438d024f703",
"expr":"107,117,98,101,95,100,97,101,109,111,110,115,101,116,95,115,116,97,116,117,115,95,110,117,109,98,101,114,95,117,110,97,118,97,105,108,97,98,108,101",
"summary":"{{ $labels.daemonset}},{{ $value }},daemonset,{{ $labels.nodeip }}",
"metricName":"控制器-守护进程集-健康情况",
"measurementId":"Daemonset-Healthy",
"ynState":"1",
"orignalStatus":"ANY_VALUE",
"ynHealthy":"0",
"comparator":"NEQ",
"threshold":"0",
"priority":"3",
"trembleTimes":"30s",
"alertContent":"命名空间[ {{ $labels.namespace}} ] 下,控制器-守护进程集[ {{ $labels.daemonset}} ]状态异常"
},
{
"id":"e4a9dfa674cdf31cab1f127376aab2a3",
"expr":"107,117,98,101,95,115,116,97,116,101,102,117,108,115,101,116,95,114,101,112,108,105,99,97,115,45,107,117,98,101,95,115,116,97,116,101,102,117,108,115,101,116,95,115,116,97,116,117,115,95,114,101,112,108,105,99,97,115,95,114,101,97,100,121",
"summary":"{{ $labels.statefulset}},{{ $value }},statefulset,{{ $labels.nodeip}}",
"metricName":"控制器-有状态副本集-健康情况",
"measurementId":"Statefulset-Healthy",
"ynState":"1",
"orignalStatus":"ANY_VALUE",
"ynHealthy":"0",
"comparator":"NEQ",
"threshold":"0",
"priority":"3",
"trembleTimes":"30s",
"alertContent":"命名空间[ {{ $labels.namespace}} ] 下,控制器-有状态副本集[ {{ $labels.statefulset}} ]状态异常"
},
{
"id":"41ff54ace66c4fa8916c0d494204fd66",
"expr":"107,117,98,101,95,112,111,100,95,115,116,97,116,117,115,95,112,104,97,115,101,123,112,104,97,115,101,61,34,82,117,110,110,105,110,103,34,125",
"summary":"{{ $labels.pod}},{{ $value }},pod,{{ $labels.nodeip }}",
"metricName":"容器组-健康情况",
"measurementId":"Pod-Healthy",
"ynState":"1",
"orignalStatus":"ANY_VALUE",
"ynHealthy":"0",
"comparator":"EQ",
"threshold":"0",
"priority":"2",
"trembleTimes":"30s",
"alertContent":"命名空间[ {{ $labels.namespace}} ] 下,容器组[ {{ $labels.pod}} ]状态异常"
},
{
"id":"088a967da99f4c741e97ed0b4ee7c56a",
"expr":"107,117,98,101,95,112,111,100,95,99,111,110,116,97,105,110,101,114,95,115,116,97,116,117,115,95,114,117,110,110,105,110,103",
"summary":"{{ $labels.container}},{{ $value }},container,{{ $labels.nodeip }}",
"metricName":"容器-健康情况",
"measurementId":"Container-Healthy",
"ynState":"1",
"orignalStatus":"ANY_VALUE",
"ynHealthy":"0",
"comparator":"EQ",
"threshold":"0",
"priority":"2",
"trembleTimes":"30s",
"alertContent":"容器[ {{ $labels.container}} ]状态异常"
},
{
"id":"cf0041618371e8153edd10dca7595841",
"expr":"115,117,109,32,98,121,40,112,111,100,95,110,97,109,101,44,110,97,109,101,115,112,97,99,101,44,110,111,100,101,105,112,41,32,40,114,97,116,101,40,99,111,110,116,97,105,110,101,114,95,99,112,117,95,117,115,97,103,101,95,115,101,99,111,110,100,115,95,116,111,116,97,108,123,105,109,97,103,101,33,61,34,34,44,110,97,109,101,115,112,97,99,101,33,61,34,34,125,91,49,109,93,41,41",
"summary":"{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip }}",
"metricName":"容器组-CPU使用情况",
"measurementId":"Pod-CPU",
"thresholdState":"static",
"ynState":"0",
"comparator":"GT",
"threshold":"0.3",
"priority":"1",
"unit":"Core",
"trembleTimes":"30s",
"alertContent":"命名空间[ {{ $labels.namespace}} ] 下,容器组[ {{ $labels.pod_name}} ]CPU使用情况为{{$value}}{{comparator}}{{threshold}}"
},
{
"id":"e8ab18f5d9b541c2bcf443163069997d",
"expr":"115,117,109,32,98,121,40,112,111,100,95,110,97,109,101,44,110,97,109,101,115,112,97,99,101,44,110,111,100,101,105,112,41,32,40,114,97,116,101,40,99,111,110,116,97,105,110,101,114,95,99,112,117,95,117,115,97,103,101,95,115,101,99,111,110,100,115,95,116,111,116,97,108,123,105,109,97,103,101,33,61,34,34,44,110,97,109,101,115,112,97,99,101,33,61,34,34,125,91,49,109,93,41,41",
"summary":"{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}",
"metricName":"容器组-CPU使用情况",
"measurementId":"Pod-CPU",
"thresholdState":"static",
"ynState":"0",
"comparator":"GT",
"threshold":"0.5",
"priority":"2",
"unit":"Core",
"trembleTimes":"30s",
"alertContent":"命名空间[ {{ $labels.namespace}} ] 下,容器组[ {{ $labels.pod_name}} ]CPU使用情况为{{$value}}{{comparator}}{{threshold}}"
},
{
"id":"0b0c7ab033646ece5dc47c39854466a2",
"expr":"115,117,109,32,98,121,40,112,111,100,95,110,97,109,101,44,110,97,109,101,115,112,97,99,101,44,110,111,100,101,105,112,41,32,40,114,97,116,101,40,99,111,110,116,97,105,110,101,114,95,99,112,117,95,117,115,97,103,101,95,115,101,99,111,110,100,115,95,116,111,116,97,108,123,105,109,97,103,101,33,61,34,34,44,110,97,109,101,115,112,97,99,101,33,61,34,34,125,91,49,109,93,41,41",
"summary":"{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}",
"metricName":"容器组-CPU使用情况",
"measurementId":"Pod-CPU",
"thresholdState":"static",
"ynState":"0",
"comparator":"GT",
"threshold":"0.8",
"priority":"3",
"unit":"Core",
"trembleTimes":"30s",
"alertContent":"命名空间[ {{ $labels.namespace}} ] 下,容器组[ {{ $labels.pod_name}} ]CPU使用情况为{{$value}}{{comparator}}{{threshold}}"
},
{
"id":"2e143bdec6869764e15fc988cd920428",
"expr":"115,117,109,32,98,121,40,112,111,100,95,110,97,109,101,44,110,97,109,101,115,112,97,99,101,44,110,111,100,101,105,112,41,32,40,99,111,110,116,97,105,110,101,114,95,109,101,109,111,114,121,95,117,115,97,103,101,95,98,121,116,101,115,123,105,109,97,103,101,33,61,34,34,44,110,97,109,101,115,112,97,99,101,33,61,34,34,125,41,47,49,48,50,52,47,49,48,50,52",
"summary":"{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}",
"metricName":"容器组-内存使用情况",
"measurementId":"Pod-Memory",
"thresholdState":"static",
"ynState":"0",
"comparator":"GT",
"threshold":"500",
"priority":"1",
"unit":"MB",
"trembleTimes":"30s",
"alertContent":"命名空间[ {{ $labels.namespace}} ] 下,容器组[ {{ $labels.pod_name}} ]内存使用状态为{{$value}}{{comparator}}{{threshold}}"
},
{
"id":"1c694c321a21719dab5cb12201abecaf",
"expr":"115,117,109,32,98,121,40,112,111,100,95,110,97,109,101,44,110,97,109,101,115,112,97,99,101,44,110,111,100,101,105,112,41,32,40,114,97,116,101,40,99,111,110,116,97,105,110,101,114,95,110,101,116,119,111,114,107,95,114,101,99,101,105,118,101,95,98,121,116,101,115,95,116,111,116,97,108,123,105,109,97,103,101,33,61,34,34,44,110,97,109,101,115,112,97,99,101,33,61,34,34,125,91,49,109,93,41,41,47,49,48,50,52",
"summary":"{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}",
"metricName":"容器组-流入流量",
"measurementId":"Pod-NetReceive",
"thresholdState":"static",
"ynState":"0",
"comparator":"GT",
"threshold":"300",
"priority":"1",
"unit":"Kb",
"trembleTimes":"30s",
"alertContent":"命名空间[ {{ $labels.namespace}} ] 下,容器组[ {{ $labels.pod_name}} ]流入流量为{{$value}}{{comparator}}{{threshold}}"
},
{
"id":"ea0ef6e277a6d83ff65fa9312adb66bb",
"expr":"115,117,109,32,98,121,40,112,111,100,95,110,97,109,101,44,110,97,109,101,115,112,97,99,101,44,110,111,100,101,105,112,41,32,40,114,97,116,101,40,99,111,110,116,97,105,110,101,114,95,110,101,116,119,111,114,107,95,116,114,97,110,115,109,105,116,95,98,121,116,101,115,95,116,111,116,97,108,123,105,109,97,103,101,33,61,34,34,44,110,97,109,101,115,112,97,99,101,33,61,34,34,125,91,49,109,93,41,41,47,49,48,50,52",
"summary":"{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}",
"metricName":"容器组-流出流量",
"measurementId":"Pod-NetTransmit",
"thresholdState":"static",
"ynState":"0",
"comparator":"GT",
"threshold":"300",
"priority":"1",
"unit":"Kb",
"trembleTimes":"30s",
"alertContent":"命名空间[ {{ $labels.namespace}} ] 下,容器组[ {{ $labels.pod_name}} ]流出流量为{{$value}}{{comparator}}{{threshold}}"
},
{
"id":"180c4123a8c09b9554f911cf249a4e3c",
"expr":"115,117,109,32,98,121,40,110,97,109,101,44,110,97,109,101,115,112,97,99,101,44,110,111,100,101,105,112,41,32,40,114,97,116,101,40,99,111,110,116,97,105,110,101,114,95,99,112,117,95,117,115,97,103,101,95,115,101,99,111,110,100,115,95,116,111,116,97,108,123,105,109,97,103,101,33,61,34,34,125,91,49,109,93,41,41",
"summary":"{{ $labels.name}},{{ $value }},name,{{ $labels.nodeip }}",
"metricName":"容器-CPU使用情况",
"measurementId":"Container-CPU",
"thresholdState":"static",
"ynState":"0",
"comparator":"GT",
"threshold":"0.3",
"priority":"1",
"unit":"Core",
"trembleTimes":"30s",
"alertContent":"容器[ {{ $labels.name}} ]CPU使用情况为{{$value}}{{comparator}}{{threshold}}"
}
]
json字段注释
orignalStatus #框架需要
comparator #状态类指标为EQ或NEQ,性能类指标分大于(GT)小于(LT)
threshold #阈值
priority #告警优先级
trembleTimes #对应rules文件的for
alertContent #告警内容
prometheus策略文件
1.alertType毫无用处,可以去掉
groups:
- name: MonitotingRules
rules:
- alert: Node
-Healthy
annotations: {description: '33410,28857,91,32,123,123,32,36,108,97,98,101,108,115,46,112,111,100,95,110,97,109,101,125,125,32,93,29366,24577,24322,24120',
summary: '{{ $labels.node}},{{ $value }},node,{{ $labels.nodeip }}'}
expr: kube_node_status_condition
{status="true"
,condition="Ready"
}==0
for: 30s
labels: {alertType: Healthy
, bindresource: '', comparator: EQ
, expr: 'kube_node_status_condition{status="true",condition="Ready"}==0',
forTime: 30s
, kubernetesIP: '', metricName: 'kube_node_status_condition{status="true",condition="Ready"}',
resourceType: Node
, severity: emergency
, threshold: '0', unit: null, ynHealthy: '0',
ynState: '1'}
- alert: Deployment
-Healthy
annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,25511,21046,22120,45,37096,32626,91,32,123,123,32,36,108,97,98,101,108,115,46,100,101,112,108,111,121,109,101,110,116,125,125,32,93,29366,24577,24322,24120',
summary: '{{ $labels.deployment}},{{ $value }},deployment,{{ $labels.nodeip}}'}
expr: kube_deployment_status_replicas_unavailable
!=0
for: 30s
labels: {alertType: Healthy
, bindresource: '', comparator: NEQ
, expr: 'kube_deployment_status_replicas_unavailable!=0',
forTime: 30s
, kubernetesIP: '', metricName: kube_deployment_status_replicas_unavailable
,
resourceType: Deployment
, severity: emergency
, threshold: '0', unit: null, ynHealthy: '0',
ynState: '1'}
- alert: Daemonset
-Healthy
annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,25511,21046,22120,45,23432,25252,36827,31243,38598,91,32,123,123,32,36,108,97,98,101,108,115,46,100,97,101,109,111,110,115,101,116,125,125,32,93,29366,24577,24322,24120',
summary: '{{ $labels.daemonset}},{{ $value }},daemonset,{{ $labels.nodeip }}'}
expr: kube_daemonset_status_number_unavailable
!=0
for: 30s
labels: {alertType: Healthy
, bindresource: '', comparator: NEQ
, expr: 'kube_daemonset_status_number_unavailable!=0',
forTime: 30s
, kubernetesIP: '', metricName: kube_daemonset_status_number_unavailable
,
resourceType: Daemonset
, severity: emergency
, threshold: '0', unit: null, ynHealthy: '0',
ynState: '1'}
- alert: Statefulset
-Healthy
annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,25511,21046,22120,45,26377,29366,24577,21103,26412,38598,91,32,123,123,32,36,108,97,98,101,108,115,46,115,116,97,116,101,102,117,108,115,101,116,125,125,32,93,29366,24577,24322,24120',
summary: '{{ $labels.statefulset}},{{ $value }},statefulset,{{ $labels.nodeip}}'}
expr: kube_statefulset_replicas
-kube_statefulset_status_replicas_ready
!=0
for: 30s
labels: {alertType: Healthy
, bindresource: '', comparator: NEQ
, expr: 'kube_statefulset_replicas-kube_statefulset_status_replicas_ready!=0',
forTime: 30s
, kubernetesIP: '', metricName: kube_statefulset_replicas
-kube_statefulset_status_replicas_ready
,
resourceType: Statefulset
, severity: emergency
, threshold: '0', unit: null, ynHealthy: '0',
ynState: '1'}
- alert: Pod
-Healthy
annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,23481,22120,32452,91,32,123,123,32,36,108,97,98,101,108,115,46,112,111,100,125,125,32,93,29366,24577,24322,24120',
summary: '{{ $labels.pod}},{{ $value }},pod,{{ $labels.nodeip }}'}
expr: kube_pod_status_phase
{phase="Running"
}==0
for: 30s
labels: {alertType: Healthy
, bindresource: '', comparator: EQ
, expr: 'kube_pod_status_phase{phase="Running"}==0',
forTime: 30s
, kubernetesIP: '', metricName: 'kube_pod_status_phase{phase="Running"}',
resourceType: Pod
, severity: critical
, threshold: '0', unit: null, ynHealthy: '0',
ynState: '1'}
- alert: Container
-Healthy
annotations: {description: '23481,22120,91,32,123,123,32,36,108,97,98,101,108,115,46,99,111,110,116,97,105,110,101,114,125,125,32,93,29366,24577,24322,24120',
summary: '{{ $labels.container}},{{ $value }},container,{{ $labels.nodeip }}'}
expr: kube_pod_container_status_running==0
for: 30s
labels: {alertType: Healthy
, bindresource: '', comparator: EQ
, expr: 'kube_pod_container_status_running==0',
forTime: 30s
, kubernetesIP: '', metricName: kube_pod_container_status_running
,
resourceType: Container
, severity: critical
, threshold: '0', unit: null, ynHealthy: '0',
ynState: '1'}
- alert: Pod
-CPU
annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,23481,22120,32452,91,32,123,123,32,36,108,97,98,101,108,115,46,112,111,100,95,110,97,109,101,125,125,32,93,67,80,85,20351,29992,24773,20917,20026,123,123,36,118,97,108,117,101,125,125,123,123,99,111,109,112,97,114,97,116,111,114,125,125,123,123,116,104,114,101,115,104,111,108,100,125,125',
summary: '{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip }}'}
expr: sum by(pod_name
,namespace
,nodeip) (rate(container_cpu_usage_seconds_total
{image
!="",namespace!=""}[1m]))>0.3
for: 30s
labels: {alertType: CPU
, bindresource: '', comparator: GT
, expr: 'sum by(pod_name
,namespace
,nodeip)
(rate(container_cpu_usage_seconds_total
{image
!="",namespace!=""}[1m]))>0.3',
forTime: 30s
, kubernetesIP: '', metricName: 'sum by(pod_name
,namespace
,nodeip)
(rate(container_cpu_usage_seconds_total
{image
!="",namespace!=""}[1m]))', resourceType: Pod
,
severity: warning
, threshold: '0.3', unit: Core
, ynHealthy: null, ynState: '0'}
- alert: Pod
-CPU
annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,23481,22120,32452,91,32,123,123,32,36,108,97,98,101,108,115,46,112,111,100,95,110,97,109,101,125,125,32,93,67,80,85,20351,29992,24773,20917,20026,123,123,36,118,97,108,117,101,125,125,123,123,99,111,109,112,97,114,97,116,111,114,125,125,123,123,116,104,114,101,115,104,111,108,100,125,125',
summary: '{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}'}
expr: sum by(pod_name
,namespace
,nodeip) (rate(container_cpu_usage_seconds_total
{image
!="",namespace!=""}[1m]))>0.5
for: 30s
labels: {alertType: CPU
, bindresource: '', comparator: GT
, expr: 'sum by(pod_name
,namespace
,nodeip)
(rate(container_cpu_usage_seconds_total
{image
!="",namespace!=""}[1m]))>0.5',
forTime: 30s
, kubernetesIP: '', metricName: 'sum by(pod_name
,namespace
,nodeip)
(rate(container_cpu_usage_seconds_total
{image
!="",namespace!=""}[1m]))', resourceType: Pod
,
severity: critical
, threshold: '0.5', unit: Core
, ynHealthy: null, ynState: '0'}
- alert: Pod
-CPU
annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,23481,22120,32452,91,32,123,123,32,36,108,97,98,101,108,115,46,112,111,100,95,110,97,109,101,125,125,32,93,67,80,85,20351,29992,24773,20917,20026,123,123,36,118,97,108,117,101,125,125,123,123,99,111,109,112,97,114,97,116,111,114,125,125,123,123,116,104,114,101,115,104,111,108,100,125,125',
summary: '{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}'}
expr: sum by(pod_name
,namespace
,nodeip) (rate(container_cpu_usage_seconds_total
{image
!="",namespace!=""}[1m]))>0.8
for: 30s
labels: {alertType: CPU
, bindresource: '', comparator: GT
, expr: 'sum by(pod_name
,namespace
,nodeip)
(rate(container_cpu_usage_seconds_total
{image
!="",namespace!=""}[1m]))>0.8',
forTime: 30s
, kubernetesIP: '', metricName: 'sum by(pod_name
,namespace
,nodeip)
(rate(container_cpu_usage_seconds_total
{image
!="",namespace!=""}[1m]))', resourceType: Pod
,
severity: emergency
, threshold: '0.8', unit: Core
, ynHealthy: null, ynState: '0'}
- alert: Pod
-Memory
annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,23481,22120,32452,91,32,123,123,32,36,108,97,98,101,108,115,46,112,111,100,95,110,97,109,101,125,125,32,93,20869,23384,20351,29992,29366,24577,20026,123,123,36,118,97,108,117,101,125,125,123,123,99,111,109,112,97,114,97,116,111,114,125,125,123,123,116,104,114,101,115,104,111,108,100,125,125',
summary: '{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}'}
expr: sum by(pod_name
,namespace
,nodeip) (container_memory_usage_bytes
{image
!="",namespace!=""})/1024/1024>500
for: 30s
labels: {alertType: Memory
, bindresource: '', comparator: GT
, expr: 'sum by(pod_name
,namespace
,nodeip)
(container_memory_usage_bytes
{image
!="",namespace!=""})/1024/1024>500', forTime: 30s
,
kubernetesIP: '', metricName: 'sum by(pod_name,namespace,nodeip) (container_memory_usage_bytes{image!="",namespace!=""})/1024/1024',
resourceType: Pod
, severity: warning
, threshold: '500', unit: MB
, ynHealthy: null,
ynState: '0'}
- alert: Pod
-NetReceive
annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,23481,22120,32452,91,32,123,123,32,36,108,97,98,101,108,115,46,112,111,100,95,110,97,109,101,125,125,32,93,27969,20837,27969,37327,20026,123,123,36,118,97,108,117,101,125,125,123,123,99,111,109,112,97,114,97,116,111,114,125,125,123,123,116,104,114,101,115,104,111,108,100,125,125',
summary: '{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}'}
expr: sum by(pod_name
,namespace
,nodeip) (rate(container_network_receive_bytes_total
{image
!="",namespace!=""}[1m]))/1024>200
for: 30s
labels: {alertType: NetReceive
, bindresource: '', comparator: GT
, expr: 'sum by(pod_name
,namespace
,nodeip)
(rate(container_network_receive_bytes_total
{image
!="",namespace!=""}[1m]))/1024>200',
forTime: 30s
, kubernetesIP: '', metricName: 'sum by(pod_name
,namespace
,nodeip)
(rate(container_network_receive_bytes_total
{image
!="",namespace!=""}[1m]))/1024',
resourceType: Pod
, severity: warning
, threshold: '200', unit: Kb
, ynHealthy: null,
ynState: '0'}
- alert: Pod
-NetTransmit
annotations: {description: '21629,21517,31354,38388,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,115,112,97,99,101,125,125,32,93,32,19979,44,23481,22120,32452,91,32,123,123,32,36,108,97,98,101,108,115,46,112,111,100,95,110,97,109,101,125,125,32,93,27969,20986,27969,37327,20026,123,123,36,118,97,108,117,101,125,125,123,123,99,111,109,112,97,114,97,116,111,114,125,125,123,123,116,104,114,101,115,104,111,108,100,125,125',
summary: '{{ $labels.pod_name}},{{ $value }},pod_name,{{ $labels.nodeip}}'}
expr: sum by(pod_name
,namespace
,nodeip) (rate(container_network_transmit_bytes_total
{image
!="",namespace!=""}[1m]))/1024>200
for: 30s
labels: {alertType: NetTransmit
, bindresource: '', comparator: GT
, expr: 'sum
by(pod_name
,namespace
,nodeip) (rate(container_network_transmit_bytes_total
{image
!="",namespace!=""}[1m]))/1024>200',
forTime: 30s
, kubernetesIP: '', metricName: 'sum by(pod_name
,namespace
,nodeip)
(rate(container_network_transmit_bytes_total
{image
!="",namespace!=""}[1m]))/1024',
resourceType: Pod
, severity: warning
, threshold: '200', unit: Kb
, ynHealthy: null,
ynState: '0'}
- alert: Container
-CPU
annotations: {description: '23481,22120,91,32,123,123,32,36,108,97,98,101,108,115,46,110,97,109,101,125,125,32,93,67,80,85,20351,29992,24773,20917,20026,123,123,36,118,97,108,117,101,125,125,123,123,99,111,109,112,97,114,97,116,111,114,125,125,123,123,116,104,114,101,115,104,111,108,100,125,125',
summary: '{{ $labels.name}},{{ $value }},name,{{ $labels.nodeip }}'}
expr: sum by(name
,namespace
,nodeip) (rate(container_cpu_usage_seconds_total
{image
!=""}[1m]))>0.3
for: 30s
labels: {alertType: CPU
, bindresource: '', comparator: GT
, expr: 'sum by(name
,namespace
,nodeip)
(rate(container_cpu_usage_seconds_total
{image
!=""}[1m]))>0.3', forTime: 30s
,
kubernetesIP: '', metricName: 'sum by(name,namespace,nodeip) (rate(container_cpu_usage_seconds_total{image!=""}[1m]))',
resourceType: Container
, severity: warning
, threshold: '0.3', unit: Core
, ynHealthy: null,
ynState: '0'}
kind: ConfigMap
metadata:
name: all
-prometheus
-rules
namespace: monitoring
prometheus相关策略直观展示(yaml)
alert: Node-Healthy
expr: kube_node_status_condition{condition="Ready",status="true"} != 0
for: 30s
labels:
  alertType: Healthy
  bindresource: ""
  comparator: NEQ
  expr: kube_node_status_condition{status="true",condition="Ready"}!=0
  forTime: 30s
  kubernetesIP: 10.4.41.63
  metricName: kube_node_status_condition{status="true",condition="Ready"}
  resourceType: Node
  severity: emergency
  threshold: "0"
  unit: ""
  ynHealthy: "1"
  ynState: "1"
annotations:
  description: 33410,28857,91,32,123,123,32,36,108,97,98,101,108,115,46,112,111,100,95,110,97,109,101,125,125,32,93,29366,24577,24322,24120
  summary: '{{ $labels.node}},{{ $value }},node,{{ $labels.nodeip }}'
相关公式
#节点健康
kube_node_status_condition{condition="Ready",status="true"} != 0
#节点不健康
kube_node_status_condition{condition="Ready",status="true"} == 0
来源注释
rule中各变量的来源
#alertName 来源:前台json
String alertName
= jo
.getString("measurementId");
#expr 来源:参数拼凑
String expr
= metricName
+trans2Comparator(comparator
)+threshold
#metricName 来源:如果在prometheus-rules中有这条告警策略,则从这条策略的labels中读取;如果没有,说明这条策略之前被删除了,则从all-prometheus-rules中读取。这个属性在一开始就写好在策略文件里了
String metricName
= rule
.getLabels().getMetricName();
#comparator 来源:前台json
String comparator
= jo
.getString("comparator");
#threshold 来源:前台json
String threshold
= jo
.getString("threshold");
#for 来源:前台json
String trembleTimes
= jo
.getString("trembleTimes");
labels
#alertType 来源:alertName拆分
String alertType
= alertName
.split("-")[1];
#bindresource 来源:如果在prometheus-rules中有这条告警策略,则从这条策略的labels中读取;如果没有则置为空
#kubernetesIP 来源:根据传入的resId确定
#resourceType 来源:alertName拆分
String resourceType
= alertName
.split("-")[0];
#severity 来源:前台json
String priority
= trans2Priority(jo
.getString("priority"));
#threshold 来源:前台json
String threshold
= jo
.getString("threshold");
#unit 来源:前台json
String unit
= jo
.getString("unit");
#ynHealthy 来源:前台json
String ynHealthy
= jo
.getString("ynHealthy");
#ynState 来源:前台json
String ynState
= jo
.getString("ynState");
annotations
#description 来源:前台json中的alertContent字段转换成ascii码
String description
= sbDescription
.toString();
#summary 来源:前台json
String summary
= jo
.getString("summary");
Java代码
String resourceId
= ServletActionContext
.getRequest().getParameter("resourceId");
String jsonData
= ServletActionContext
.getRequest().getParameter("jsonData");
JSONArray resultArray
= JSONArray
.parseArray(jsonData
);
PromRules pr
= new PromRules();
for(int i
= 0; i
< resultArray
.size(); i
++){
JSONObject jo
= resultArray
.getJSONObject(i
);
String alertName
= jo
.getString("measurementId");
Map
<String,String> map
= getConfigMap(resourceId
,"prometheus-rules");
String metricName
= "";
String bindresource
= "";
for(String fileName
:map
.keySet()){
String yamlContent
= map
.get(fileName
);
PromRules prReadMap
= yaml
.loadAs(yamlContent
, PromRules
.class);
List
<Rule> rules
= prReadMap
.getGroups().get(0).getRules();
for(int j
= 0; j
< rules
.size(); j
++){
Rule rule
= rules
.get(j
);
if(rule
.getAlert().equals(alertName
)){
metricName
= rule
.getLabels().getMetricName();
bindresource
= rule
.getLabels().getBindresource();
break;
}
}
if(metricName
.equals("")){
Map
<String,String> allRulesMap
= getConfigMap(resourceId
,"all-prometheus-rules");
Yaml allRulesYaml
= new Yaml();
for(String fileName
:allRulesMap
.keySet()){
String yamlContent
= allRulesMap
.get(fileName
);
PromRules prReadMap
= allRulesYaml
.loadAs(yamlContent
, PromRules
.class);
List
<Rule> rules
= prReadMap
.getGroups().get(0).getRules();
for(int j
= 0; j
< rules
.size(); j
++){
Rule rule
= rules
.get(j
);
if(rule
.getAlert().equals(alertName
)){
metricName
= rule
.getLabels().getMetricName();
bindresource
= "";
break;
}
}
}
}
pr
= generateRules(pr
,masterHostIp
,alertName
,metricName
,comparator
,threshold
,trembleTimes
,priority
,summary
,ynState
,unit
,bindresource
,description
,ynHealthy
);
}
}
KubernetesClient client
=null
;
client
= getK8sClient(client
,masterHostIp
,masterHostPort
,username
,password
);
ConfigMap cm
= client
.configMaps().inNamespace("monitoring").withName("prometheus-rules").get();
Map
<String,String> map
= cm
.getData();
Yaml yaml
= new Yaml();
for(String fileName
:map
.keySet()){
String yamlContent
= map
.get(fileName
);
PromRules prRead
= yaml
.loadAs(yamlContent
, PromRules
.class);
prRead
.setGroups(pr
.getGroups());
StringWriter sw
= new StringWriter();
yaml
.dump(pr
, sw
);
Map
<String,String> maptest
= new HashMap<String,String>();
maptest
.put("prometheusRules.yml", sw
.toString());
cm
.setData(maptest
);
client
.configMaps().inNamespace("monitoring").withName("prometheus-rules").replace(cm
);
}
/**
 * Reads the data section of a ConfigMap in the {@code monitoring} namespace.
 *
 * @param resourceId not used by the visible body
 * @param name       name of the ConfigMap to read
 * @return the ConfigMap's data entries
 */
public Map<String,String> getConfigMap(String resourceId,String name){
    // Refreshes the shared client before every read.
    client = getK8sClient(client,hostIP,hostPort,userName,password);
    // NOTE(review): if get() yields null for a missing ConfigMap this line
    // throws a NullPointerException - confirm against the client library docs.
    return client.configMaps()
            .inNamespace("monitoring")
            .withName(name)
            .get()
            .getData();
}
/**
 * Translates a comparator code into the operator text used when building a
 * rule expression.
 *
 * @param comp comparator code: "GT", "LT", "EQ" or "NEQ"
 * @return ">", "<", "==" or "!=" respectively; an empty string for any other code
 */
public String trans2Comparator(String comp){
    switch(comp){
        case "GT":
            return ">";
        case "LT":
            return "<";
        case "EQ":
            return "==";
        case "NEQ":
            return "!=";
        default:
            // Unrecognized codes map to an empty operator.
            return "";
    }
}