Prometheus monitoring for Kubernetes
These manifests target a Kubernetes cluster installed from binaries. For clusters deployed with kubeadm, there is no need to create the etcd secret, and the etcd-related parts of the Prometheus deployment manifest should be removed.
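Each snippet below is meant to be saved as its own YAML file and applied with kubectl; the file names here are only placeholders, for example:

kubectl apply -f 00-namespace.yaml
kubectl apply -f .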
Create the namespace
apiVersion: v1
kind: Namespace
metadata:
name: monitor
labels:
name: monitor

Create the etcd secret
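The certificate paths in the command below assume a binary-installed cluster that keeps its etcd certificates under /etc/kubernetes/ssl; adjust them to your own layout. A quick sanity check that the files are the expected certs before creating the secret:

openssl x509 -in /etc/kubernetes/ssl/ca.pem -noout -subject
openssl x509 -in /etc/kubernetes/ssl/etcd.pem -noout -subject -enddate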
kubectl create secret generic etcd-certs --from-file=ca.pem=/etc/kubernetes/ssl/ca.pem --from-file=etcd.pem=/etc/kubernetes/ssl/etcd.pem --from-file=etcd-key.pem=/etc/kubernetes/ssl/etcd-key.pem -n monitor

Deploy kube-state-metrics
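After applying the manifest below, a quick way to confirm kube-state-metrics is serving metrics (names taken from the manifest; the temporary curl pod is just an illustration):

kubectl -n monitor rollout status deploy/kube-state-metrics
kubectl -n monitor run curl-test --rm -i --image=curlimages/curl --restart=Never -- curl -s http://kube-state-metrics:8080/metrics | head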
apiVersion: apps/v1
kind: Deployment
metadata:
name: kube-state-metrics
namespace: monitor
labels:
k8s-app: kube-state-metrics
spec:
selector:
matchLabels:
k8s-app: kube-state-metrics
version: v2.4.2
replicas: 1
template:
metadata:
labels:
k8s-app: kube-state-metrics
version: v2.4.2
spec:
serviceAccountName: kube-state-metrics
containers:
- name: kube-state-metrics
image: bitnami/kube-state-metrics:2.4.2
imagePullPolicy: IfNotPresent
ports:
- name: http-metrics
containerPort: 8080
- name: telemetry
containerPort: 8081
readinessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5
- name: addon-resizer
image: ibmcom/addon-resizer:2.1
imagePullPolicy: IfNotPresent
resources:
limits:
cpu: 100m
memory: 200Mi
requests:
cpu: 100m
memory: 200Mi
env:
- name: MY_POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: MY_POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
volumeMounts:
- name: config-volume
mountPath: /etc/config
command:
- /pod_nanny
# - --config-dir=/etc/config
- --container=kube-state-metrics
- --cpu=100m
- --extra-cpu=1m
- --memory=100Mi
- --extra-memory=2Mi
- --threshold=5
- --deployment=kube-state-metrics
volumes:
- name: config-volume
configMap:
name: kube-state-metrics-config
---
apiVersion: v1
kind: ConfigMap
metadata:
name: kube-state-metrics-config
namespace: monitor
data:
NannyConfiguration: |-
apiVersion: nannyconfig/v1alpha1
kind: NannyConfiguration
---
apiVersion: v1
kind: Service
metadata:
name: kube-state-metrics
namespace: monitor
annotations:
prometheus.io/scrape: 'true'
spec:
ports:
- name: http-metrics
port: 8080
targetPort: http-metrics
protocol: TCP
- name: telemetry
port: 8081
targetPort: telemetry
protocol: TCP
selector:
k8s-app: kube-state-metrics
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: kube-state-metrics
namespace: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: kube-state-metrics
rules:
- apiGroups: [""]
resources:
- configmaps
- secrets
- nodes
- pods
- services
- resourcequotas
- replicationcontrollers
- limitranges
- persistentvolumeclaims
- persistentvolumes
- namespaces
- endpoints
verbs: ["list", "watch"]
- apiGroups: ["apps"]
resources:
- statefulsets
- daemonsets
- deployments
- replicasets
verbs: ["list", "watch"]
- apiGroups: ["batch"]
resources:
- cronjobs
- jobs
verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
resources:
- horizontalpodautoscalers
verbs: ["list", "watch"]
- apiGroups: ["networking.k8s.io", "extensions"]
resources:
- ingresses
verbs: ["list", "watch"]
- apiGroups: ["storage.k8s.io"]
resources:
- storageclasses
verbs: ["list", "watch"]
- apiGroups: ["certificates.k8s.io"]
resources:
- certificatesigningrequests
verbs: ["list", "watch"]
- apiGroups: ["policy"]
resources:
- poddisruptionbudgets
verbs: ["list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: kube-state-metrics-resizer
namespace: monitor
rules:
- apiGroups: [""]
resources:
- pods
verbs: ["get"]
- apiGroups: ["extensions","apps"]
resources:
- deployments
resourceNames: ["kube-state-metrics"]
verbs: ["get", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kube-state-metrics
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: kube-state-metrics
namespace: monitor
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: kube-state-metrics-resizer
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: monitor

Create Prometheus authentication
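The bcrypt hash below corresponds to the user UnJRAXyk and the password iQU~tBU9,Z that prometheus.yml uses later. If you change the credentials, regenerate the hash, for example with htpasswd from apache2-utils:

htpasswd -nbBC 10 UnJRAXyk 'iQU~tBU9,Z'
# prints user:hash; paste the hash part under basic_auth_users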
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-auth
namespace: monitor
data:
webconfig.yml: |
basic_auth_users:
UnJRAXyk: $2y$10$Xdj.b5xW70/rdRm0BSPmAuaeuwAsoA16hZoEyKNZLmblNanm7JtvO

Create the Prometheus ConfigMap
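Before loading it into the cluster, the scrape config below can be validated locally with promtool (shipped in the prom/prometheus image); the file name is whatever you save the prometheus.yml key as:

promtool check config prometheus.yml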
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: monitor
data:
prometheus.yml: |
rule_files:
- /etc/config/rules/*.rules
global:
scrape_interval: 30s # scrape targets every 30s by default
scrape_timeout: 25s # default per-scrape timeout of 25s
evaluation_interval: 30s # evaluate rules every 30s
scrape_configs:
- job_name: prometheus
basic_auth:
username: UnJRAXyk
password: iQU~tBU9,Z
static_configs:
- targets:
- localhost:9090
- job_name: kubernetes-apiservers
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- action: keep
regex: default;kubernetes;https
source_labels:
- __meta_kubernetes_namespace
- __meta_kubernetes_service_name
- __meta_kubernetes_endpoint_port_name
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
- job_name: kubernetes-nodes-kubelet
kubernetes_sd_configs:
- role: node # discover the nodes in the cluster
relabel_configs:
# use each node label (.*) as a new label name, keeping its value
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
- job_name: kubernetes-nodes-cadvisor
kubernetes_sd_configs:
- role: node
relabel_configs:
# use each node label (.*) as a new label name, keeping its value
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __metrics_path__
replacement: /metrics/cadvisor
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
metric_relabel_configs:
- source_labels: [instance]
separator: ;
regex: (.+)
target_label: node
replacement: $1
action: replace
- job_name: kubernetes-service-endpoints
kubernetes_sd_configs:
- role: endpoints # discover Pods as targets via the Endpoints of each Service
basic_auth:
username: UnJRAXyk
password: iQU~tBU9,Z
relabel_configs:
# skip Services that do not set the prometheus.io/scrape annotation
- action: keep
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape
# rewrite the scrape scheme
- action: replace
regex: (https?)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scheme
target_label: __scheme__
# rewrite the metrics URL path
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_path
target_label: __metrics_path__
# rewrite the scrape address
- action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
- __meta_kubernetes_service_annotation_prometheus_io_port
target_label: __address__
# use each K8s Service label (.*) as a new label name, keeping its value
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
# add a namespace label
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: kubernetes_namespace
# add a Service name label
- action: replace
source_labels:
- __meta_kubernetes_service_name
target_label: kubernetes_name
- job_name: etcd
scheme: https
tls_config:
ca_file: /etc/prometheus/secrets/etcd-certs/ca.pem
cert_file: /etc/prometheus/secrets/etcd-certs/etcd.pem
key_file: /etc/prometheus/secrets/etcd-certs/etcd-key.pem
insecure_skip_verify: false
static_configs:
- targets:
- 172.18.60.219:2379
- job_name: blackbox-http-2xx
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
- targets:
- 'https://googgle.vip'
- 'https://i09.net'
- 'https://i09.vip'
- 'https://userapi.jackpotparty.cc/api/public/maintain'
- 'https://jackpp.cc'
- 'https://jackpp.vip'
- 'https://admin-phl.labubugame.cc'
- 'https://data-phl.labubugame.cc'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 'blackbox-exporter:9115'
- job_name: kubernetes-pods
kubernetes_sd_configs:
- role: pod # discover all Pods as targets
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape
# rewrite the metrics URL path
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_path
target_label: __metrics_path__
# rewrite the scrape address
- action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
- __meta_kubernetes_pod_annotation_prometheus_io_port
target_label: __address__
# use each K8s Pod label (.*) as a new label name, keeping its value
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
# add a namespace label
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: kubernetes_namespace
# add a Pod name label
- action: replace
source_labels:
- __meta_kubernetes_pod_name
target_label: kubernetes_pod_name
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager:80"]创建Prometheus rule
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-rules
namespace: monitor
data:
general.rules: |
groups:
- name: general.rules
rules:
- alert: K8sResourceDown
expr: up == 0
for: 5m
labels:
severity: error
annotations:
summary: "k8s target {{ $labels.job }}/{{ $labels.instance }} has stopped working"
node.rules: |
groups:
- name: node.rules
rules:
- alert: NodeFilesystemUsage
expr: |
100 - (node_filesystem_free_bytes / node_filesystem_size_bytes) * 100 > 75
for: 1m
labels:
severity: critical
annotations:
summary: "Partition {{ $labels.device }} on node {{ $labels.instance }} is over 75% used, current value {{ $value }}%"
- alert: NodeMemoryUsage
expr: |
100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 90
for: 1m
labels:
severity: critical
annotations:
summary: "Memory usage on node {{ $labels.instance }} is over 90%, current value {{ $value }}%"
- alert: NodeCPUUsage
expr: |
100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 90
for: 1m
labels:
severity: critical
annotations:
summary: "CPU usage on node {{ $labels.instance }} is over 90%, current value {{ $value }}%"
- alert: NodeNotReady
expr: |
kube_node_status_condition{condition="Ready",status="true"} == 0
for: 1m
labels:
severity: error
annotations:
summary: "Node {{ $labels.node }} has not been Ready for more than 1 minute, current value {{ $value }}"
- alert: NodeThreadCount
expr: |
sum(node_processes_threads) by(instance) > 240000
for: 1m
labels:
severity: critical
annotations:
summary: "Thread count on node {{ $labels.instance }} is close to the maximum, current value {{ $value }}"
pod.rules: |
groups:
- name: pod.rules
rules:
- alert: PodCPU
expr: |
sum(rate(container_cpu_usage_seconds_total{container!="", pod!=""}[5m])) by (pod, namespace) / sum(kube_pod_container_resource_limits{resource="cpu", container!=""}) by (pod, namespace) * 100 > 80
for: 3m
labels:
severity: critical
annotations:
summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} CPU usage above 80%, current value {{ $value }}%"
- alert: PodMemory
expr: |
sum(container_memory_working_set_bytes{pod!=""}) by(pod, namespace) / sum(container_spec_memory_limit_bytes{pod!=""}) by(pod, namespace) * 100 != +inf > 80
for: 3m
labels:
severity: critical
annotations:
summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} memory usage above 80%, current value {{ $value }}%"
- alert: PodNetworkReceive
expr: |
sum(rate(container_network_receive_bytes_total{image!="",name=~"^k8s_.*"}[5m])) by (pod,namespace,node) > 800000000
for: 5m
labels:
severity: warning
annotations:
summary: "Node: {{ $labels.node }} | namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} ingress traffic above 800MB/s, current value {{ $value }}B/s"
- alert: PodNetworkTransmit
expr: |
sum(rate(container_network_transmit_bytes_total{image!="",name=~"^k8s_.*"}[5m])) by (pod,namespace,node) > 800000000
for: 5m
labels:
severity: warning
annotations:
summary: "Node: {{ $labels.node }} | namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} egress traffic above 800MB/s, current value {{ $value }}B/s"
- alert: PodNetworkErrors
expr: |
sum by (pod, namespace) (rate(container_network_receive_errors_total[5m]) + rate(container_network_transmit_errors_total[5m])) > 10
for: 5m
labels:
severity: critical
annotations:
summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} network error rate above 10/s over the last 5 minutes, current value {{ $value }}"
- alert: PodRestart
expr: |
increase(kube_pod_container_status_restarts_total[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} restarted within the last minute, current value {{ $value }}"
- alert: PodStatusFailed
expr: |
sum(kube_pod_status_phase{phase="Failed"}) by (pod,namespace) > 0
for: 0m
labels:
severity: critical
annotations:
summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in Failed phase"
- alert: PodStatusPending
expr: |
sum(kube_pod_status_phase{phase="Pending"}) by (pod,namespace) > 0
for: 1m
labels:
severity: warning
annotations:
summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in Pending phase"
- alert: PodImagePullError
expr: |
sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="ErrImagePull"}) == 1
for: 1m
labels:
severity: warning
annotations:
summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} image pull failed with ErrImagePull"
- alert: PodImagePullBackOff
expr: |
sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="ImagePullBackOff"}) == 1
for: 1m
labels:
severity: warning
annotations:
summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} image pull failed with ImagePullBackOff"
- alert: PodCrashLoopBackOff
expr: |
sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}) == 1
for: 0m
labels:
severity: critical
annotations:
summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in CrashLoopBackOff"
- alert: PodInvalidImageName
expr: |
sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="InvalidImageName"}) == 1
for: 0m
labels:
severity: critical
annotations:
summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in InvalidImageName state"
- alert: PodCreateContainerConfigError
expr: |
sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="CreateContainerConfigError"}) == 1
for: 0m
labels:
severity: critical
annotations:
summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in CreateContainerConfigError state"
- alert: DefaultNamespacePodThreads
expr: |
sum by (pod, namespace) (container_threads{container!="POD",container!="",namespace="default",pod!~"redis.*"}) > 3000
for: 10m
labels:
severity: critical
annotations:
summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} thread count above 3000, current value {{ $value }}"
- alert: PodThreads
expr: |
sum by (pod, namespace) (container_threads{container!="POD",container!="",pod=~"redis.*|haproxy.*"}) > 10000
for: 10m
labels:
severity: critical
annotations:
summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} thread count above 10000, current value {{ $value }}"
- alert: KubeApiserverThreads
expr: |
go_threads{job="kubernetes-apiservers"} > 1500
for: 10m
labels:
severity: critical
annotations:
summary: "kube-apiserver on {{ $labels.instance }} has more than 1500 threads, current value {{ $value }}"
- alert: NodeContainerThreads
expr: |
sum(container_threads{pod!=""}) by(instance) > 202363
for: 30m
labels:
severity: critical
annotations:
summary: "Container thread count on k8s node {{ $labels.instance }} is approaching the system limit, current value {{ $value }}"
- alert: PodOOMKilled
expr: |
kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} == 1 and ignoring(reason) kube_pod_container_status_terminated == 1
for: 0m
labels:
severity: critical
annotations:
summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} was OOMKilled"
volume.rules: |
groups:
- name: volume.rules
rules:
- alert: PVCStatusLost
expr: |
sum by(namespace, persistentvolumeclaim) (kube_persistentvolumeclaim_status_phase{phase="Lost"}) == 1
for: 1m
labels:
severity: critical
annotations:
summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is Lost"
- alert: PVCStatusPending
expr: |
sum by(namespace, persistentvolumeclaim) (kube_persistentvolumeclaim_status_phase{phase="Pending"}) == 1
for: 1m
labels:
severity: critical
annotations:
summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is Pending"
- alert: PVStatusFailed
expr: |
sum(kube_persistentvolume_status_phase{phase="Failed",job="kubernetes-service-endpoints"}) by (persistentvolume) == 1
for: 2m
labels:
severity: critical
annotations:
summary: "PV {{ $labels.persistentvolume }} is Failed"
blackbox.rules: |
groups:
- name: blackbox
rules:
- alert: HTTPSSiteDown
expr: probe_success{job="blackbox-http-2xx"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "HTTPS site {{ $labels.instance }} is unreachable"
- alert: SiteResponseTime
expr: avg_over_time(probe_http_duration_seconds[1m]) > 10
for: 1m
labels:
severity: critical
annotations:
summary: "Site {{ $labels.instance }} took more than 10s to respond, current value {{ $value }}"
domain.rules: |
groups:
- name: domain-expiry-alerts
rules:
- alert: DomainExpiry
expr: domain_expiry_days < 30
for: 10m
labels:
severity: warning
annotations:
summary: "Domain {{ $labels.domain }} expires in {{ $value }} days!"
ssl.rules: |
groups:
- name: ssl-certificate-expiry
rules:
- alert: SSLCertificateExpiry
expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "The SSL certificate for {{ $labels.instance }} expires within 10 days, current value {{ $value }} days"
redis.rules: |
groups:
- name: redis
rules:
- alert: RedisMemoryUsage
expr: 100 * (redis_memory_used_bytes / redis_memory_max_bytes) > 80
for: 5m
labels:
severity: critical
annotations:
summary: "Redis at {{ $labels.instance }} is using more than 80% of its memory, current value {{ $value }}%"
- alert: RedisConnections
expr: redis_connected_clients > 50000
for: 5m
labels:
severity: critical
annotations:
summary: "Redis at {{ $labels.instance }} has more than 50000 connections, current value {{ $value }}"
- alert: RedisMemFragmentation
expr: redis_mem_fragmentation_ratio > 2
for: 5m
labels:
severity: warning
annotations:
summary: "Redis at {{ $labels.instance }} has a memory fragmentation ratio above 2, current value {{ $value }}"
- alert: RedisPersistenceFailure
expr: redis_rdb_last_bgsave_status == 0 or redis_aof_last_write_status == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Redis persistence (RDB or AOF) failed at {{ $labels.instance }}"
- alert: RedisReplicationLag
expr: redis_master_repl_offset - redis_slave_repl_offset > 10485760 # 10MB
for: 5m
labels:
severity: critical
annotations:
summary: "Redis master/replica replication lag at {{ $labels.instance }} is too high"

Create the Prometheus PVC
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: prometheus-pv-claim
namespace: monitor
labels:
app: prometheus
spec:
storageClassName: nfs
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 100Gi

Deploy Prometheus
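The deployment below enables --web.enable-lifecycle behind basic auth, so besides the configmap-reload sidecar you can also trigger a config reload manually from inside the cluster, roughly:

curl -X POST -u 'UnJRAXyk:iQU~tBU9,Z' http://prometheus.monitor:9090/-/reload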
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus
namespace: monitor
labels:
k8s-app: prometheus
spec:
replicas: 1
selector:
matchLabels: ## the template labels below must match these
k8s-app: prometheus
template:
metadata:
labels:
k8s-app: prometheus
spec:
# affinity:
# podAntiAffinity: ## pod anti-affinity
# requiredDuringSchedulingIgnoredDuringExecution: ## hard requirement: do not schedule if unsatisfied
# - topologyKey: 'kubernetes.io/hostname' ## use the hostname as the topology domain
# labelSelector: ## match pods whose k8s-app label equals prometheus
# matchExpressions:
# - key: k8s-app
# operator: In
# values:
# - prometheus
# nodeAffinity: ## node affinity; for simple cases a custom label plus nodeSelector is easier
# requiredDuringSchedulingIgnoredDuringExecution: ## hard requirement: do not schedule if unsatisfied
# nodeSelectorTerms:
# - matchExpressions: ## match nodes whose hostname is xxx1 or xxx2
# - key: kubernetes.io/hostname
# operator: In
# values:
# - xxx1
# - xxx2
serviceAccountName: prometheus
initContainers:
- name: "init-chown-data"
image: busybox:latest
imagePullPolicy: "IfNotPresent"
command: ["chown", "-R", "65534:65534", "/data"]
volumeMounts:
- name: prometheus-data
mountPath: /data
subPath: ""
containers:
- name: prometheus-server-configmap-reload
image: jimmidyson/configmap-reload:v0.7.1
imagePullPolicy: "IfNotPresent"
args:
- --volume-dir=/etc/config
- --webhook-url=http://localhost:9090/-/reload
volumeMounts:
- name: config-volume
mountPath: /etc/config
readOnly: true
- mountPath: /etc/localtime
name: timezone
- name: prometheus-server
image: prom/prometheus:v2.45.1
imagePullPolicy: "IfNotPresent"
args:
- --config.file=/etc/config/prometheus.yml
- --storage.tsdb.path=/data
- --storage.tsdb.retention.time=30d # keep 30 days of data
- --web.console.libraries=/etc/prometheus/console_libraries
- --web.console.templates=/etc/prometheus/consoles
- --web.enable-lifecycle
- --web.config.file=/etc/prometheus/basicauth/webconfig.yml
ports:
- containerPort: 9090
readinessProbe:
httpGet:
httpHeaders:
- name: Authorization
value: Basic VW5KUkFYeWs6aVFVfnRCVTksWg==
path: /-/ready
port: 9090
initialDelaySeconds: 30
timeoutSeconds: 30
livenessProbe:
httpGet:
httpHeaders:
- name: Authorization
value: Basic VW5KUkFYeWs6aVFVfnRCVTksWg==
path: /-/healthy
port: 9090
initialDelaySeconds: 30
timeoutSeconds: 30
volumeMounts:
- name: config-volume
mountPath: /etc/config
- name: prometheus-data
mountPath: /data
subPath: ""
- name: prometheus-rules
mountPath: /etc/config/rules
- mountPath: /etc/localtime
name: timezone
- mountPath: /etc/prometheus/basicauth
name: basic-auth
- name: etcd-certs
mountPath: /etc/prometheus/secrets/etcd-certs
volumes:
- name: config-volume
configMap:
name: prometheus-config
- name: prometheus-rules
configMap:
name: prometheus-rules
- name: prometheus-data
persistentVolumeClaim:
claimName: prometheus-pv-claim
- name: timezone
hostPath:
path: /usr/share/zoneinfo/Asia/Shanghai
- name: basic-auth
configMap:
name: prometheus-auth
- name: etcd-certs
secret:
secretName: etcd-certs
---
apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: monitor
spec:
type: NodePort
ports:
- name: http
port: 9090
protocol: TCP
targetPort: 9090
nodePort: 30089
selector:
k8s-app: prometheus
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups:
- ""
resources:
- nodes
- nodes/metrics
- services
- endpoints
- pods
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- configmaps
verbs:
- get
- nonResourceURLs:
- "/metrics"
verbs:
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: monitor

Create node_exporter authentication
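This reuses the same bcrypt hash as the Prometheus web config. With a recent apache2-utils you can double-check that a hash matches the intended password; a small sketch:

echo 'UnJRAXyk:$2y$10$Xdj.b5xW70/rdRm0BSPmAuaeuwAsoA16hZoEyKNZLmblNanm7JtvO' > /tmp/htpasswd
htpasswd -vb /tmp/htpasswd UnJRAXyk 'iQU~tBU9,Z'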
apiVersion: v1
kind: ConfigMap
metadata:
name: node-exporter-auth
namespace: monitor
data:
config.yml: |
basic_auth_users:
UnJRAXyk: $2y$10$Xdj.b5xW70/rdRm0BSPmAuaeuwAsoA16hZoEyKNZLmblNanm7JtvO

Deploy node_exporter
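Because the DaemonSet below runs with hostNetwork and hostPort 9100, every node exposes the exporter directly; once it is running you can check both the endpoint and the basic auth from any machine that can reach a node (replace <node-ip>):

curl -s -u 'UnJRAXyk:iQU~tBU9,Z' http://<node-ip>:9100/metrics | head
# the same request without credentials should return 401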
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-exporter
namespace: monitor
labels:
k8s-app: node-exporter
spec:
selector:
matchLabels:
version: v1.7.0
template:
metadata:
labels:
k8s-app: node-exporter
version: v1.7.0
spec:
tolerations:
- operator: "Exists"
containers:
- name: prometheus-node-exporter
image: prom/node-exporter:v1.7.0
imagePullPolicy: IfNotPresent
ports:
- name: metrics
containerPort: 9100
hostPort: 9100
resources:
limits:
cpu: 250m
memory: 180Mi
requests:
cpu: 102m
memory: 180Mi
volumeMounts:
- mountPath: /host/proc
name: proc
- mountPath: /host/sys
name: sys
- mountPath: /host
name: rootfs
- mountPath: /etc/node/basicauth
name: basic-auth
args:
- --collector.processes
- --collector.systemd
- --path.procfs=/host/proc
- --path.sysfs=/host/sys
- --path.rootfs=/host
- --web.config.file=/etc/node/basicauth/config.yml
hostNetwork: true
hostPID: true
securityContext:
runAsNonRoot: true
runAsUser: 65534
volumes:
- name: proc
hostPath:
path: /proc
- name: sys
hostPath:
path: /sys
- name: rootfs
hostPath:
path: /
- name: basic-auth
configMap:
name: node-exporter-auth
---
apiVersion: v1
kind: Service
metadata:
name: node-exporter
namespace: monitor
annotations:
prometheus.io/scrape: "true"
spec:
clusterIP: None
ports:
- name: metrics
port: 9100
protocol: TCP
targetPort: 9100
selector:
k8s-app: node-exporter

Create the Alertmanager ConfigMap
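The bot_token and chat_id below can be sanity-checked with a direct Telegram API call before wiring them into Alertmanager (substitute your own values):

curl -s 'https://api.telegram.org/bot<bot_token>/sendMessage' -d 'chat_id=<chat_id>' -d 'text=alertmanager test'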
apiVersion: v1
kind: ConfigMap
metadata:
name: alertmanager-config
namespace: monitor
data:
alertmanager.yml: |-
global:
resolve_timeout: 5m
route:
group_by: ['alertname', 'cluster']
group_wait: 10s
group_interval: 10s
repeat_interval: 5m
receiver: 'telegram'
templates:
- '/etc/alertmanager/template/telegram.tmpl'
receivers:
- name: 'telegram'
telegram_configs:
- bot_token: 7808485242:AAHSIl3F7XSzvnDC1h4jYZtzYD9JkahZVHg
api_url: https://api.telegram.org
chat_id: -4695150894
parse_mode: HTML
message: '{{ template "telegram.message" . }}'
send_resolved: true

Create the Alertmanager alert template
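With alertmanager.yml and this template saved side by side (and the templates path adjusted to match), the whole configuration, template included, can be validated with amtool from the prom/alertmanager image:

amtool check-config alertmanager.yml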
# custom alert template
apiVersion: v1
kind: ConfigMap
metadata:
name: alertmanager-template-volume
namespace: monitor
data:
telegram.tmpl: |
{{ define "telegram.message" }}
{{ if gt (len .Alerts.Firing) 0 }}
🔥🔥🔥🔥🔥🔥 Firing alerts ({{ .Alerts.Firing | len }})
{{ range .Alerts.Firing }}
---
🔴 Alert: {{ .Labels.alertname }}
🔴 Severity: {{ .Labels.severity }}
🔴 Summary: {{ .Annotations.summary }}
🔴 Started: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
✅✅✅✅✅✅ Resolved alerts ({{ .Alerts.Resolved | len }})
{{ range .Alerts.Resolved }}
---
🟢 Alert: {{ .Labels.alertname }}
🟢 Severity: {{ .Labels.severity }}
🟢 Summary: {{ .Annotations.summary }}
🟢 Started: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
🟢 Resolved: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}
{{ end }}
{{ end }}

Create the Alertmanager PVC
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: alertmanager-pv-claim
namespace: monitor
labels:
app: alertmanager
spec:
storageClassName: nfs
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 20Gi

Deploy Alertmanager
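Once the deployment below is up, a synthetic alert pushed through the v2 API confirms the Telegram route end to end (service name and port from the Service manifest):

curl -X POST http://alertmanager.monitor/api/v2/alerts -H 'Content-Type: application/json' -d '[{"labels":{"alertname":"TestAlert","severity":"warning"},"annotations":{"summary":"test alert from curl"}}]'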
apiVersion: apps/v1
kind: Deployment
metadata:
name: alertmanager
namespace: monitor
spec:
replicas: 1
selector:
matchLabels:
k8s-app: alertmanager
version: v0.26.0
template:
metadata:
labels:
k8s-app: alertmanager
version: v0.26.0
spec:
containers:
- name: prometheus-alertmanager
image: prom/alertmanager:v0.26.0
imagePullPolicy: "IfNotPresent"
args:
- --config.file=/etc/alertmanager/alertmanager.yml
- --storage.path=/alertmanager
- --cluster.advertise-address=0.0.0.0:9093
ports:
- containerPort: 9093
readinessProbe:
httpGet:
path: /#/status
port: 9093
initialDelaySeconds: 30
timeoutSeconds: 30
volumeMounts:
- name: config-volume
mountPath: /etc/alertmanager
- name: config-template-volume
mountPath: /etc/alertmanager/template
- name: storage-volume
mountPath: "/alertmanager"
subPath: ""
- mountPath: /etc/localtime
name: timezone
resources:
limits:
cpu: 40m
memory: 200Mi
requests:
cpu: 40m
memory: 100Mi
volumes:
- name: config-volume
configMap:
name: alertmanager-config
- name: config-template-volume
configMap:
name: alertmanager-template-volume
- name: storage-volume
persistentVolumeClaim:
claimName: alertmanager-pv-claim
- name: timezone
hostPath:
path: /usr/share/zoneinfo/Asia/Shanghai
---
apiVersion: v1
kind: Service
metadata:
name: alertmanager
namespace: monitor
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
kubernetes.io/name: "Alertmanager"
spec:
type: "ClusterIP"
ports:
- name: http
port: 80
protocol: TCP
targetPort: 9093
selector:
k8s-app: alertmanager

Create the blackbox-exporter ConfigMap
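Recent blackbox_exporter releases can validate this module file before it is mounted; using the same image as the deployment below:

blackbox_exporter --config.check --config.file=blackbox.yml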
apiVersion: v1
kind: ConfigMap
metadata:
name: blackbox
namespace: monitor
data:
blackbox.yml: |-
modules:
http_2xx:
prober: http
timeout: 10s
http:
valid_status_codes: [200,204,301,302,304]
method: GET
follow_redirects: true
preferred_ip_protocol: "ip4"
http_post_2xx:
prober: http
timeout: 10s
http:
preferred_ip_protocol: "ip4"
method: POST
valid_status_codes: [200,204,301,302,304]
tcp_connect:
prober: tcp
timeout: 10s
ping:
prober: icmp
timeout: 10s
icmp:
preferred_ip_protocol: "ip4"部署blackbox-exporter
apiVersion: v1
kind: Service
metadata:
name: blackbox-exporter
namespace: monitor
labels:
app: blackbox-exporter
spec:
type: ClusterIP
ports:
- name: http
port: 9115
targetPort: 9115
selector:
app: blackbox-exporter
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: blackbox-exporter
namespace: monitor
spec:
replicas: 1
selector:
matchLabels:
app: blackbox-exporter
template:
metadata:
labels:
app: blackbox-exporter
spec:
containers:
- name: blackbox-exporter
image: prom/blackbox-exporter:v0.25.0
ports:
- name: http
containerPort: 9115
args:
- --config.file=/etc/blackbox_exporter/blackbox.yml
readinessProbe:
httpGet:
path: /-/ready
port: 9115
initialDelaySeconds: 5
timeoutSeconds: 5
livenessProbe:
httpGet:
path: /-/healthy
port: 9115
initialDelaySeconds: 10
timeoutSeconds: 5
volumeMounts:
- name: config-volume
mountPath: /etc/blackbox_exporter
volumes:
- name: config-volume
configMap:
name: blackbox

The related deployment YAML files can be downloaded below.
Download link