K8s cluster monitoring: cAdvisor/exporter + Prometheus + Grafana


K8s monitoring setup

1. Installing cAdvisor/exporter + Prometheus + Grafana

1.1 Configure and install NFS

```shell
# NFS server (Ubuntu)
apt-get install nfs-kernel-server
# Create /data/pvdata as the shared directory
mkdir /data/pvdata
# CentOS:
chown nfsnobody:nfsnobody /data/pvdata
# Ubuntu:
chown nobody:nogroup /data/pvdata
vim /etc/exports
# Fill in the IP range you want to allow access from
/data/pvdata xxx.xxx.xxx.0/24(rw,async,all_squash)
exportfs -rv
# Should print: exporting xxx.xxx.xxx.0/24:/data/pvdata

# NFS client (must be installed on the node where the Prometheus service will run)
apt-get update
apt-get install nfs-common

# Test NFS from another node by mounting it:
mkdir /kubernetes
mount <nfs-server-ip>:/data/pvdata /kubernetes
```
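As a quick sanity check, you can also list the exports from a client before mounting (a minimal sketch; replace the placeholder with your NFS server's address):

```shell
# List the exports published by the NFS server (requires nfs-common)
showmount -e <nfs-server-ip>
# Expected output includes: /data/pvdata xxx.xxx.xxx.0/24

# Unmount the test mount when done
umount /kubernetes
```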

1.2 Prometheus configuration

```shell
mkdir -p /data/k8s/yaml/kube-system/prometheus
cd /data/k8s/yaml/kube-system/prometheus/
# Download the yaml deployment files from the kubernetes GitHub repo
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/prometheus-rbac.yaml
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/prometheus-configmap.yaml
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/prometheus-service.yaml
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/prometheus-statefulset.yaml
```

1.2.1 Edit prometheus-statefulset.yaml

```yaml
# Delete the last 10 lines:
volumeClaimTemplates:
- metadata:
    name: prometheus-data
  spec:
    storageClassName: standard
    accessModes:
      - ReadWriteOnce
    resources:
      requests:
        storage: "16Gi"

# Add these 3 lines in their place, under volumes:, to use your own NFS-backed storage:
- name: prometheus-data
  persistentVolumeClaim:
    claimName: prometheus-data
```

1.2.2 Create the PV/PVC yaml file

```shell
# Create the Prometheus data storage directory
mkdir /data/pvdata/prometheus
# Note: on Ubuntu, use nobody instead of nfsnobody
chown nfsnobody. /data/pvdata/prometheus

cat > prometheus-pvc-data.yaml << EFO
apiVersion: v1
kind: PersistentVolume
metadata:
  name: prometheus-data
spec:
  storageClassName: prometheus-data
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Recycle
  nfs:
    path: /data/pvdata/prometheus
    server: nfs-server-ip
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-data
  namespace: kube-system
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi
  storageClassName: prometheus-data
EFO
```
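After applying this file (step 1.2.4), it is worth confirming that the claim actually binds to the volume; an unbound PVC is a common reason for the Prometheus pod sticking in Pending:

```shell
# Both objects should show STATUS Bound
kubectl get pv prometheus-data
kubectl get pvc -n kube-system prometheus-data
```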

1.2.3 Change the Service type to NodePort and update the Prometheus image version

```shell
# In prometheus-service.yaml, change the service type:
type: NodePort

# In prometheus-statefulset.yaml:
# - note the default cpu/mem requests are sized for roughly 10 nodes / 30 pods
# - update the prometheus image to a newer version such as v2.13.0
# - add to the container args:
--storage.tsdb.retention.time=<how long to keep data>
```
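For reference, a minimal sketch of what the edited container spec in prometheus-statefulset.yaml might look like; the surrounding fields and the 15d value are illustrative assumptions, not copied from the upstream file:

```yaml
containers:
  - name: prometheus-server
    image: prom/prometheus:v2.13.0
    args:
      - --config.file=/etc/config/prometheus.yml
      - --storage.tsdb.path=/data
      - --storage.tsdb.retention.time=15d   # keep 15 days of metrics; adjust as needed
```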

1.2.4 Install and configure Prometheus

```shell
# Apply the yaml files
kubectl apply -f prometheus-rbac.yaml
kubectl apply -f prometheus-configmap.yaml
kubectl apply -f prometheus-pvc-data.yaml
kubectl apply -f prometheus-service.yaml
kubectl apply -f prometheus-statefulset.yaml

# Check that the installation succeeded
kubectl get pods -n kube-system | grep prometheus
# Find the NODE the prometheus pod is running on
kubectl get pods -n kube-system -o wide | grep prometheus
# Get the NodePort of the prometheus service
kubectl get service -n kube-system
# prometheus NodePort xxx.xxx.xxx.xxx <none> 9090:32809/TCP 5d20h
```

Access the Prometheus UI at NodeIP:NodePort (32809 in this example).
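Prometheus also exposes simple health endpoints, so you can verify the server is up from the command line before opening the UI (substitute your node IP and NodePort):

```shell
# Liveness check; should report healthy
curl http://<node-ip>:32809/-/healthy
# Readiness check; should report ready
curl http://<node-ip>:32809/-/ready
```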

1.3 Install node-exporter

```shell
# Download the node-exporter yaml files
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/node-exporter-ds.yml
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/node-exporter-service.yaml
# Apply node-exporter
kubectl apply -f node-exporter-service.yaml
kubectl apply -f node-exporter-ds.yml
```
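To confirm the DaemonSet is exporting metrics, you can curl an exporter directly; this sketch assumes node-exporter listens on its default port 9100, which is worth confirming in node-exporter-ds.yml:

```shell
# One node-exporter pod should be Running per node
kubectl get pods -n kube-system -o wide | grep node-exporter
# Sample a few metrics from any node
curl -s http://<node-ip>:9100/metrics | head
```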

1.4 Deploy kube-state-metrics

```shell
# Download the yaml files
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/kube-state-metrics-service.yaml
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/kube-state-metrics-rbac.yaml
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/kube-state-metrics-deployment.yaml
# Apply the yaml files
kubectl apply -f kube-state-metrics-service.yaml
kubectl apply -f kube-state-metrics-rbac.yaml
kubectl apply -f kube-state-metrics-deployment.yaml
```
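kube-state-metrics is cluster-internal (ClusterIP), so a quick way to spot-check it is a port-forward; this sketch assumes the service exposes its usual 8080 metrics port, which you should confirm in kube-state-metrics-service.yaml:

```shell
kubectl get pods -n kube-system | grep kube-state-metrics
# Forward the metrics port locally, then sample it
kubectl port-forward -n kube-system svc/kube-state-metrics 8080:8080 &
curl -s http://localhost:8080/metrics | head
```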

1.5 Deploy Grafana

1.5.1 Create the data storage directory

```shell
mkdir /data/pvdata/prometheus-grafana
# Ubuntu:
chown nobody. /data/pvdata/prometheus-grafana
# CentOS:
chown nfsnobody. /data/pvdata/prometheus-grafana
```

1.5.2 Create the Grafana PVC

```shell
cat > grafana-pvc.yaml << EFO
apiVersion: v1
kind: PersistentVolume
metadata:
  name: prometheus-grafana
spec:
  storageClassName: prometheus-grafana
  capacity:
    storage: 1Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Recycle
  nfs:
    path: /data/pvdata/prometheus-grafana
    server: nfs-server-ip
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-grafana
  namespace: kube-system
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
  storageClassName: prometheus-grafana
EFO
```

1.5.3 grafana-deployment.yaml

```shell
# The cpu/memory for this service can be adjusted to your needs
cat > grafana-deployment.yaml << EFO
apiVersion: apps/v1   # extensions/v1beta1 was removed in k8s 1.16; apps/v1 requires the selector below
kind: Deployment
metadata:
  name: grafana
  namespace: kube-system
  labels:
    app: grafana
spec:
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
        component: prometheus
    spec:
      #nodeSelector:
      #  kubernetes.io/hostname: <node>   # optionally pin grafana to a specific node
      containers:
      - name: grafana
        env:
        - name: GF_SECURITY_ADMIN_USER
          value: admin
        - name: GF_SECURITY_ADMIN_PASSWORD
          value: admin
        image: grafana/grafana:6.4.3
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 3000
          name: grafana
        readinessProbe:
          failureThreshold: 10
          httpGet:
            path: /api/health
            port: 3000
            scheme: HTTP
          initialDelaySeconds: 30
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 30
        livenessProbe:
          failureThreshold: 3
          httpGet:
            path: /api/health
            port: 3000
            scheme: HTTP
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 1
        resources:
          limits:
            cpu: 100m
            memory: 256Mi
          requests:
            cpu: 100m
            memory: 256Mi
        volumeMounts:
        - mountPath: /var/lib/grafana
          subPath: grafana
          name: grafana-volumes
      volumes:
      - name: grafana-volumes
        persistentVolumeClaim:
          claimName: prometheus-grafana
---
# ------------------- APP Service ------------------- #
kind: Service
apiVersion: v1
metadata:
  labels:
    app: grafana
  name: grafana
  namespace: kube-system
spec:
  #type: ClusterIP
  type: NodePort
  ports:
  - port: 80
    targetPort: 3000
  selector:
    app: grafana
EFO
```

1.5.4 Apply the files and find the service IP and port

```shell
kubectl apply -f grafana-pvc.yaml
kubectl apply -f grafana-deployment.yaml
# Check the service and port
kubectl get service -n kube-system
# grafana NodePort xxx.xxx.xxx.xxx <none> 80:31920/TCP 3d23h
kubectl get pods -n kube-system -o wide
```

Find the node the grafana pod is running on; that node's IP plus the mapped port above (31920 here) gives you the Grafana UI.

The default login is admin / admin (set by the GF_SECURITY_ADMIN_* env vars above). After logging in, add a data source: choose Prometheus and fill in the Prometheus service IP and port under the Data Sources tab. Then import a dashboard (template ID 10000 works well) and the monitoring pages will show data.
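The data source can also be created through Grafana's HTTP API instead of the UI; a minimal sketch, assuming the admin/admin credentials above and illustrative addresses and ports:

```shell
# Create a Prometheus data source via the Grafana API (placeholder addresses)
curl -X POST http://admin:admin@<grafana-node-ip>:31920/api/datasources \
  -H "Content-Type: application/json" \
  -d '{
        "name": "prometheus",
        "type": "prometheus",
        "url": "http://<prometheus-node-ip>:32809",
        "access": "proxy",
        "isDefault": true
      }'
```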

1.6 K8s alerting

1.6.1 Download the alertmanager yaml files

```shell
# Best scheduled on the same node as prometheus, to avoid errors from a missing NFS client
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/alertmanager-pvc.yaml
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/alertmanager-service.yaml
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/alertmanager-deployment.yaml
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/alertmanager-configmap.yaml
```

1.6.2 Create the directory for alert data

```shell
mkdir /data/pvdata/prometheus-alertmanager
# On Ubuntu, use nobody instead of nfsnobody
chown nfsnobody. /data/pvdata/prometheus-alertmanager
```

1.6.3 Create alertmanager-pvc.yaml

```shell
cat > alertmanager-pvc.yaml << EFO
apiVersion: v1
kind: PersistentVolume
metadata:
  name: prometheus-alertmanager
spec:
  storageClassName: prometheus-alertmanager
  capacity:
    storage: 1Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Recycle
  nfs:
    path: /data/pvdata/prometheus-alertmanager
    server: nfs-server-ip
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-alertmanager
  namespace: kube-system
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
  storageClassName: prometheus-alertmanager
EFO
```

1.6.4 Edit alertmanager-deployment.yaml

```yaml
# Change claimName at the end of the file to the volume created above
- name: storage-volume
  persistentVolumeClaim:
    claimName: prometheus-alertmanager
```

1.6.5 Edit alertmanager-service.yaml

```yaml
# Change spec.type to NodePort so the service is reachable via a node IP and mapped port
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/name: "Alertmanager"
spec:
  ports:
    - name: http
      port: 80
      protocol: TCP
      targetPort: 9093
  selector:
    k8s-app: alertmanager
  type: NodePort
```

1.6.6 Deploy alertmanager

```shell
kubectl apply -f alertmanager-pvc.yaml
kubectl apply -f alertmanager-configmap.yaml
kubectl apply -f alertmanager-service.yaml
kubectl apply -f alertmanager-deployment.yaml
```
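As with Prometheus, confirm the pod is running and note the NodePort the service was given; the alert-rule config in the next step needs Alertmanager's address:

```shell
kubectl get pods -n kube-system | grep alertmanager
# The PORT(S) column shows the mapping, e.g. 80:3XXXX/TCP
kubectl get service -n kube-system alertmanager
```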

1.6.7 Create alert rules

```shell
kubectl edit configmaps prometheus-config -n kube-system
```

```yaml
# Add under prometheus.yml: |
global:
  scrape_interval: 5s        # how often to scrape targets
  evaluation_interval: 5s    # how often to evaluate alert rules
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["<alertmanager-service-ip>:<port>"]
rule_files:
  - "/etc/config/rules.yml"

# Add the alert rule at the bottom of the ConfigMap
rules.yml: |
  groups:
    - name: monitor
      rules:
        - alert: InstanceDown
          expr: up == 0
          for: 1m
          labels:
            team: kube-system
          annotations:
            summary: "Instance {{ $labels.instance }} down"
            description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
```

```shell
# Reload the config
curl -X POST http://<prometheus-node-ip>:<port>/-/reload
```
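After the reload, you can check that Prometheus actually picked up the rule group through its HTTP API (placeholder address):

```shell
# Should list the "monitor" group with the InstanceDown rule
curl -s http://<prometheus-node-ip>:<port>/api/v1/rules
```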

1.6.8 Configure email alerts

```shell
# Edit the alertmanager-configmap.yaml file
cat > alertmanager-configmap.yaml << EFO
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: EnsureExists
data:
  alertmanager.yml: |
    global:
      resolve_timeout: 3m                # how long before an alert is declared resolved
      smtp_smarthost: 'smtp.163.com:25'
      smtp_from: 'USERNAME@163.com'
      smtp_auth_username: 'USERNAME@163.com'
      smtp_auth_password: 'PASSWORD'
      smtp_require_tls: false
    route:
      group_by: ['example']
      group_wait: 60s
      group_interval: 60s
      repeat_interval: 12h
      receiver: 'webhook'
    receivers:
      - name: 'webhook'
        webhook_configs:
          - url: 'web_hook_url'          # fill in your webhook URL
            send_resolved: false         # whether to notify when an alert resolves
        email_configs:
          - to: 'xxxx@qq.com'
            send_resolved: false
EFO

kubectl delete configmaps -n kube-system alertmanager-config
kubectl apply -f alertmanager-configmap.yaml
```
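To exercise the pipeline without waiting for a real outage, you can push a synthetic alert straight to Alertmanager's v2 API; a minimal sketch with placeholder addresses and a hypothetical alert name:

```shell
# Fire a synthetic alert at Alertmanager to test routing and email delivery
curl -X POST http://<alertmanager-node-ip>:<nodeport>/api/v2/alerts \
  -H "Content-Type: application/json" \
  -d '[{"labels": {"alertname": "TestAlert", "severity": "warning"},
        "annotations": {"summary": "manual test alert"}}]'
```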

Reference: https://www.jianshu.com/p/e76053b6f3f5
