Prometheus
target
[root@vm11 prometheus]# cat prometheus.yml
global:
  scrape_interval: 60s
  evaluation_interval: 65s
  scrape_timeout: 15s

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["localhost:3xxx"]

rule_files:
  - "rule.yml"

scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["192.168.1x4.xx:3xx"]
        labels:
          appservice: "prometheus"
          city: "苏州"
          instancehost: "192.1x.1x4.xx"
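Before reloading, the file can be validated offline; a quick check, assuming the promtool binary shipped with the Prometheus distribution is on the PATH (it also validates the rule files referenced above):
promtool check config prometheus.yml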
rule
[root@vm11 prometheus]# cat rule.yml
groups:
- name: Hosts.rules
  rules:
  - alert: HostDown
    expr: up{job=~"node-exporter|prometheus|grafana|alertmanager"} == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      description: "Host 【{{ $labels.instance }}】 service is down -- this is a test."
      summary: "Host 【{{ $labels.instance }}】 service is resolved"
  - alert: HostCpuLoadAverage
    expr: sum(node_load5) by (instance) > 10
    for: 1m
    annotations:
      title: "5-minute CPU load too high"
      description: "Host 【{{ $labels.instance }}】 5-minute CPU load average is above 10 (current value: {{ $value }})"
    labels:
      severity: 'warning'
  - alert: HostCpuUsage
    expr: (1-((sum(increase(node_cpu_seconds_total{mode="idle"}[5m])) by (instance))/ (sum(increase(node_cpu_seconds_total[5m])) by (instance))))*100 > 80
    for: 1m
    annotations:
      title: "CPU usage too high"
      description: "Host 【{{ $labels.instance }}】 CPU usage over the last 5 minutes is above 80% (current value: {{ $value }})"
    labels:
      severity: 'warning'
  - alert: HostMemoryUsage
    expr: (1-((node_memory_Buffers_bytes + node_memory_Cached_bytes + node_memory_MemFree_bytes)/node_memory_MemTotal_bytes))*100 > 80
    for: 1m
    annotations:
      title: "Host memory usage above 80%"
      description: "Host 【{{ $labels.instance }}】 memory usage is above 80% (current usage: {{ $value }}%)"
    labels:
      severity: 'warning'
  - alert: HostIOWait
    expr: ((sum(increase(node_cpu_seconds_total{mode="iowait"}[5m])) by (instance))/(sum(increase(node_cpu_seconds_total[5m])) by (instance)))*100 > 10
    for: 1m
    annotations:
      title: "Disk I/O wait too high"
      description: "Host 【{{ $labels.instance }}】 iowait over the last 5 minutes is above 10% (current value: {{ $value }})"
    labels:
      severity: 'warning'
  - alert: HostFileSystemUsage
    expr: (1-(node_filesystem_free_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" }/node_filesystem_size_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" }))*100 > 70
    for: 1m
    annotations:
      title: "Low free disk space"
      description: "Host 【{{ $labels.instance }}】 partition {{ $labels.mountpoint }} usage is above 70% (current usage: {{ $value }}%)"
    labels:
      severity: 'warning'
  - alert: HostSwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "Host swap partition running low"
      description: "Host 【{{ $labels.instance }}】 swap usage is above 80% (current usage: {{ $value }}%)"
  - alert: HostNetworkConnection_ESTABLISHED
    expr: sum(node_netstat_Tcp_CurrEstab) by (instance) > 1000
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "Too many ESTABLISHED connections"
      description: "Host 【{{ $labels.instance }}】 has more than 1000 ESTABLISHED connections (current count: {{ $value }})"
  - alert: HostNetworkConnection_TIME_WAIT
    expr: sum(node_sockstat_TCP_tw) by (instance) > 1000
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "Too many TIME_WAIT connections"
      description: "Host 【{{ $labels.instance }}】 has more than 1000 TIME_WAIT connections (current count: {{ $value }})"
  - alert: HostUnusualNetworkThroughputIn
    expr: sum by (instance, device) (rate(node_network_receive_bytes_total{device=~"ens.*"}[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "Host NIC inbound traffic too high"
      description: "Host 【{{ $labels.instance }}】 NIC {{ $labels.device }} inbound traffic is above 100 MB/s (current value: {{ $value }})"
  - alert: HostUnusualNetworkThroughputOut
    expr: sum by (instance, device) (rate(node_network_transmit_bytes_total{device=~"ens.*"}[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "Host NIC outbound traffic too high"
      description: "Host 【{{ $labels.instance }}】 NIC {{ $labels.device }} outbound traffic is above 100 MB/s (current value: {{ $value }})"
  - alert: HostUnusualDiskReadRate
    expr: sum by (instance, device) (rate(node_disk_read_bytes_total{device=~"sd.*"}[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "Host disk read rate too high"
      description: "Host 【{{ $labels.instance }}】 disk {{ $labels.device }} read rate is above 50 MB/s (current value: {{ $value }})"
  - alert: HostUnusualDiskWriteRate
    expr: sum by (instance, device) (rate(node_disk_written_bytes_total{device=~"sd.*"}[2m])) / 1024 / 1024 > 50
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "Host disk write rate too high"
      description: "Host 【{{ $labels.instance }}】 disk {{ $labels.device }} write rate is above 50 MB/s (current value: {{ $value }})"
  - alert: HostOutOfInodes
    expr: node_filesystem_files_free{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" } / node_filesystem_files{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" } * 100 < 10
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "Host partition running out of inodes"
      description: "Host 【{{ $labels.instance }}】 partition {{ $labels.mountpoint }} is low on inodes (free inodes: {{ $value }}%)"
  - alert: HostUnusualDiskReadLatency
    expr: rate(node_disk_read_time_seconds_total{device=~"sd.*"}[1m]) / rate(node_disk_reads_completed_total{device=~"sd.*"}[1m]) > 0.1 and rate(node_disk_reads_completed_total{device=~"sd.*"}[1m]) > 0
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "Host disk read latency too high"
      description: "Host 【{{ $labels.instance }}】 disk {{ $labels.device }} read latency is above 100ms (current value: {{ $value }}s)"
  - alert: HostUnusualDiskWriteLatency
    expr: rate(node_disk_write_time_seconds_total{device=~"sd.*"}[1m]) / rate(node_disk_writes_completed_total{device=~"sd.*"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device=~"sd.*"}[1m]) > 0
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "Host disk write latency too high"
      description: "Host 【{{ $labels.instance }}】 disk {{ $labels.device }} write latency is above 100ms (current value: {{ $value }}s)"
alertmanager
[root@vm11 alertmanager]# cat alertmanager.yml
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 3600s
  receiver: 'Warning'
  routes:
  # label matching is case-sensitive; rule.yml emits lowercase severities
  - receiver: 'Information'
    repeat_interval: 3600s
    match:
      severity: information
  - receiver: 'Warning'
    repeat_interval: 3600s
    match:
      severity: warning

receivers:
- name: 'Warning'
  webhook_configs:
  - url: 'http://192.1x.1xx.xx:3xx/alertmanager/warning'
    send_resolved: true
- name: 'Information'
  webhook_configs:
  - url: 'http://192.168.1x.1x:38xx/alertmanager/information'
    send_resolved: true

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
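The routing tree and the webhook receiver can be exercised without waiting for a real outage by pushing a synthetic alert into Alertmanager's v2 API. A sketch, assuming Alertmanager listens on localhost:9093; the labels mirror what the rules above produce:

package main

import (
	"bytes"
	"fmt"
	"net/http"
	"time"
)

func main() {
	now := time.Now().UTC().Format(time.RFC3339)
	// One synthetic alert shaped like the output of the HostDown rule.
	payload := fmt.Sprintf(`[{
	  "labels": {"alertname": "HostDown", "severity": "warning", "instance": "test:9100"},
	  "annotations": {"description": "synthetic test alert"},
	  "startsAt": %q
	}]`, now)

	// /api/v2/alerts accepts a JSON array of alerts; the address is an assumption.
	resp, err := http.Post("http://localhost:9093/api/v2/alerts",
		"application/json", bytes.NewBufferString(payload))
	if err != nil {
		fmt.Println("post failed:", err)
		return
	}
	defer resp.Body.Close()
	fmt.Println("status:", resp.Status) // 200 OK means the alert was accepted
}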
go code
- main.go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"io/ioutil"
	"net/http"
	"strings"

	"github.com/gin-gonic/gin"

	"prometheus_alarm/alarmtool"
	"prometheus_alarm/prometheus"
)

// JSONDecode decodes a JSON stream from r into obj.
func JSONDecode(r io.Reader, obj interface{}) error {
	if err := json.NewDecoder(r).Decode(obj); err != nil {
		return err
	}
	return nil
}

func main() {
	gin.SetMode(gin.ReleaseMode)
	r := gin.Default()

	r.GET("/health", func(c *gin.Context) {
		// Send the response to the caller as JSON.
		c.JSON(http.StatusOK, gin.H{
			"status": 200,
		})
	})

	r.POST("/alertmanager/warning", func(c *gin.Context) {
		// Dump the raw body for debugging:
		//data, _ := ioutil.ReadAll(c.Request.Body)
		//fmt.Printf("req.body=%s\n, content-type=%v\n", data, c.ContentType())
		var reqInfo prometheus.AlertMsgRet
		data, err := c.GetRawData()
		if err != nil {
			fmt.Println(err.Error())
		}
		fmt.Println("### data:", string(data))
		// Put the bytes back into the body so it can be decoded again.
		c.Request.Body = ioutil.NopCloser(bytes.NewBuffer(data))
		err2 := JSONDecode(c.Request.Body, &reqInfo)
		if err2 != nil {
			fmt.Println("decode err:", err2)
		}
		// Inspect individual fields of the decoded body if needed:
		//fmt.Println(reqInfo.Status)
		//fmt.Println(reqInfo.Alerts[0].Labels)
		//fmt.Println(reqInfo.Alerts[0].Labels.Severity)
		//fmt.Println(reqInfo.Alerts[0].Annotations.Description)
		userList := []string{"18xxxxxxxxx"}
		url := "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=axxxxxxxxx"
		//alarmtool.SendMsgChatGroupPerson(url, userList, "## hallo kugou")
		for _, i := range reqInfo.Alerts {
			if i.Status == "firing" {
				// Fill the firing template by replacing its placeholders.
				content := prometheus.FiringTemp
				replacements := []struct {
					Old string
					New string
				}{
					{Old: "alertname17", New: i.Labels.AlertName},
					{Old: "appservice17", New: i.Labels.AppService},
					{Old: "startsAt17", New: alarmtool.TimeTransform(i.StartsAt)},
					{Old: "city17", New: i.Labels.City},
					{Old: "instance17", New: i.Labels.Instance},
					{Old: "severity17", New: i.Labels.Severity},
					{Old: "description17", New: i.Annotations.Description},
				}
				for _, repl := range replacements {
					content = strings.Replace(content, repl.Old, repl.New, -1)
				}
				alarmtool.SendMsgChatGroupPerson(url, userList, content)
			} else if i.Status == "resolved" {
				// Fill the resolved template the same way.
				content := prometheus.ResolvedTemp
				replacements := []struct {
					Old string
					New string
				}{
					{Old: "alertname17", New: i.Labels.AlertName},
					{Old: "appservice17", New: i.Labels.AppService},
					{Old: "city17", New: i.Labels.City},
					{Old: "startsAt17", New: alarmtool.TimeTransform(i.StartsAt)},
					{Old: "endsAt17", New: alarmtool.TimeTransform(i.EndsAt)},
					{Old: "instance17", New: i.Labels.Instance},
					{Old: "severity17", New: i.Labels.Severity},
					{Old: "summary17", New: i.Annotations.Summary},
				}
				for _, repl := range replacements {
					content = strings.Replace(content, repl.Old, repl.New, -1)
				}
				alarmtool.SendMsgChatGroupPerson(url, userList, content)
			}
		}
		c.JSON(200, "success")
	})

	r.Run("0.0.0.0:38060") // listen on port 38060 on all interfaces
}
- prometheus/prometheus.go
package prometheus

type AlertMsgRet struct {
	Receiver string `json:"receiver"`
	Status   string `json:"status"`
	Alerts   []struct {
		Status string `json:"status"`
		Labels struct {
			AlertName    string `json:"alertname"`
			AppService   string `json:"appservice"`
			City         string `json:"city"`
			Instance     string `json:"instance"`
			InstanceHost string `json:"instancehost"`
			Job          string `json:"job"`
			Severity     string `json:"severity"`
		} `json:"labels"`
		Annotations struct {
			Description string `json:"description"`
			Summary     string `json:"summary"`
		} `json:"annotations"`
		StartsAt     string `json:"startsAt"`
		EndsAt       string `json:"endsAt"`
		GeneratorURL string `json:"generatorURL"`
		Fingerprint  string `json:"fingerprint"`
	} `json:"alerts"`
	GroupLabels struct {
		AlertName string `json:"alertname"`
	} `json:"groupLabels"`
	CommonLabels struct {
		AlertName    string `json:"alertname"`
		AppService   string `json:"appservice"`
		City         string `json:"city"`
		Instance     string `json:"instance"`
		InstanceHost string `json:"instancehost"`
		Job          string `json:"job"`
		Severity     string `json:"severity"`
	} `json:"commonLabels"`
	CommonAnnotations struct {
		Description string `json:"description"`
		Summary     string `json:"summary"`
	} `json:"commonAnnotations"`
	ExternalURL     string `json:"externalURL"`
	Version         string `json:"version"`
	GroupKey        string `json:"groupKey"`
	TruncatedAlerts int    `json:"truncatedAlerts"`
}
const (
	FiringTemp   = "#### [Prometheus Alert] \n> <font color=\"#FF0000\">Alert name</font>: alertname17\n> <font color=\"#FF0000\">Severity</font>: severity17 \n> <font color=\"#FF0000\">Application</font>: appservice17\n> <font color=\"#FF0000\">Start time</font>: startsAt17 \n> <font color=\"#FF0000\">Host</font>: instance17 \n> <font color=\"#FF0000\">City</font>: city17 \n> <font color=\"#FF0000\">Failure details</font>:\n** messages: description17 ** "
	ResolvedTemp = "#### [Prometheus Resolved] \n> <font color=\"#00FF00\">Alert name</font>: alertname17\n> <font color=\"#00FF00\">Severity</font>: severity17 \n> <font color=\"#00FF00\">Application</font>: appservice17\n> <font color=\"#00FF00\">Start time</font>: startsAt17 \n> <font color=\"#00FF00\">Resolved time</font>: endsAt17 \n> <font color=\"#00FF00\">Host</font>: instance17 \n> <font color=\"#00FF00\">City</font>: city17 \n> <font color=\"#00FF00\">Recovery details</font>:\n** messages: summary17 ** "
)
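The placeholder scheme relies on sequential strings.Replace calls in main.go; the standard library's strings.NewReplacer performs the same substitutions in one pass. A sketch of the idea with made-up values, not the handler's actual code:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Old/new pairs, equivalent to the replacement loop in main.go.
	r := strings.NewReplacer(
		"alertname17", "HostDown",
		"instance17", "192.168.1.10:9100",
		"severity17", "critical",
	)
	template := "Alert: alertname17 on instance17 (severity17)"
	fmt.Println(r.Replace(template)) // Alert: HostDown on 192.168.1.10:9100 (critical)
}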
json
{ "receiver": "Warning", "status": "resolved", "alerts": [{ "status": "resolved", "labels": { "alertname": "HostDown", "appservice": "prometheus", "city": "苏州", "instance": "192.168.1xx.x1:3xx0", "instancehost": "192.1xx.1xx.xx", "job": "prometheus", "severity": "critical" }, "annotations": { "description": "主机: 【192.168.1xx.x:3xxx】has been down for more than 1 minute", "summary": "主机: 【192.168.1xx.xx:3xxx】 service is resolved" }, "startsAt": "2023-07-06T01:16:20.991Z", "endsAt": "2023-07-06T01:29:20.991Z", "generatorURL": "http://vm11:39090/graph?g0.expr=up%7Bjob%3D~%22node-exporter%7Cprometheus%7Cgrafana%7Calertmanager%22%7D+%3D%3D+0\u0026g0.tab=1", "fingerprint": "1becfcf3e2ebd5f7" }], "groupLabels": { "alertname": "HostDown" }, "commonLabels": { "alertname": "HostDown", "appservice": "prometheus", "city": "苏州", "instance": "192.168.1x.xx:3xx", "instancehost": "192.1x8.1x4.x", "job": "prometheus", "severity": "critical" }, "commonAnnotations": { "description": "主机: 【192.16x.xx.x1:3xxx】has been down for more than 1 minute", "summary": "主机: 【192.168.1xx.xx:39xx】 service is resolved" }, "externalURL": "http://vm11:39xxx", "version": "4", "groupKey": "{}:{alertname=\"HostDown\"}", "truncatedAlerts": 0 }
- alarmtool/timetransform.go
package alarmtool

import (
	"fmt"
	"time"
)

// TimeTransform converts an RFC3339 timestamp (as sent by Alertmanager,
// e.g. "2023-07-06T01:16:20.991Z") into UTC+8 local time.
func TimeTransform(inputTime string) string {
	// Parse the string into a time object.
	t, err := time.Parse(time.RFC3339Nano, inputTime)
	if err != nil {
		fmt.Println("Failed to parse input time:", err)
		return "error inputTime"
	}
	// Load the UTC+8 (Asia/Shanghai) time zone.
	loc, err := time.LoadLocation("Asia/Shanghai")
	if err != nil {
		fmt.Println("Failed to load time zone:", err)
		return "time zone conversion error"
	}
	// Convert to the target time zone.
	localTime := t.In(loc)
	// Format the time with the desired output layout.
	outputTime := localTime.Format("2006-01-02 15:04:05")
	fmt.Println(outputTime)
	return outputTime
}
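For the sample payload above the conversion simply adds eight hours; a minimal check:

package main

import (
	"fmt"

	"prometheus_alarm/alarmtool"
)

func main() {
	// 01:16 UTC becomes 09:16 in Asia/Shanghai (UTC+8).
	fmt.Println(alarmtool.TimeTransform("2023-07-06T01:16:20.991Z")) // 2023-07-06 09:16:20
}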
- alarmtool/wecom.go
package alarmtool

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
)

// ChatGroupJsonRes is the request body for a WeCom group-bot markdown message.
type ChatGroupJsonRes struct {
	MsgType  string `json:"msgtype"`
	Markdown struct {
		Content interface{} `json:"content"`
	} `json:"markdown"`
}

// T4 is the WeCom webhook response.
type T4 struct {
	Errcode int    `json:"errcode"`
	Errmsg  string `json:"errmsg"`
}

// SendMsgChatGroupPerson posts a markdown message to a WeCom group webhook,
// @-mentioning each user in person.
func SendMsgChatGroupPerson(webHook string, person []string, comment string) *T4 {
	var j ChatGroupJsonRes
	var personComment string
	for _, p := range person {
		// <@userid> renders as a mention in WeCom markdown messages.
		personComment = personComment + fmt.Sprintf("<@%s>", p)
	}
	j.MsgType = "markdown"
	j.Markdown.Content = comment + "\n" + personComment

	data, err := json.Marshal(j)
	if err != nil {
		fmt.Printf("err was %v\n", err)
	}
	fmt.Println(string(data))

	reader := bytes.NewReader(data)
	client := &http.Client{}
	req, err := http.NewRequest("POST", webHook, reader)
	if err != nil {
		log.Fatal(err)
	}
	req.Header.Add("Content-Type", "application/json")
	req.Header.Add("Accept", "application/json")
	resp, err := client.Do(req)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("####send msg##", string(body))

	var assetList T4
	if err := json.Unmarshal(body, &assetList); err != nil {
		fmt.Println("unmarshal err:", err)
	}
	return &assetList
}
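Called on its own the sender looks like this; the webhook key and phone number are placeholders, as in main.go:

package main

import (
	"fmt"

	"prometheus_alarm/alarmtool"
)

func main() {
	url := "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=axxxxxxxxx"
	res := alarmtool.SendMsgChatGroupPerson(url, []string{"18xxxxxxxxx"}, "## test message")
	// errcode 0 means WeCom accepted the message.
	fmt.Println(res.Errcode, res.Errmsg)
}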
- Dockerfile
FROM golang:1.19 as builder
ENV HOME /app
ENV CGO_ENABLED 0
ENV GOOS linux
ENV GOPROXY https://goproxy.cn,direct
ENV TIME_ZONE Asia/Shanghai
WORKDIR /app
COPY . .
RUN go mod download
RUN go build -v -a -installsuffix cgo -o demo main.go

FROM alpine:latest
# tzdata is required so time.LoadLocation("Asia/Shanghai") works in the final image
RUN apk --no-cache add ca-certificates tzdata
# set the working directory
WORKDIR /bin/
COPY --from=builder /app/demo .
ENTRYPOINT ["/bin/demo"]
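An alternative to installing tzdata in the final image is Go's embedded zone database (available since Go 1.15): a blank import compiles the IANA data into the binary, so time.LoadLocation works even on a bare alpine or scratch image. A sketch:

package main

import (
	"fmt"
	"time"

	// Embeds the IANA time zone database into the binary; it is used
	// as a fallback when the OS has no tzdata installed.
	_ "time/tzdata"
)

func main() {
	loc, err := time.LoadLocation("Asia/Shanghai")
	fmt.Println(loc, err) // Asia/Shanghai <nil>, even without system tzdata
}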
- .gitlab-ci.yml
before_script:
  - echo "before script!"

variables:
  PG_NAME: 'prometheus_alarm'
  PG_PORT: '38060'

stages:
  - build
  - deploy

build_job:
  stage: build
  script:
    - pwd
    - date
    - docker build -t hub.gz-yykfz.com:31443/common/prometheusalarm:${CI_COMMIT_SHORT_SHA} .
  only:
    - main
  tags:
    - vm5-docker

deploy_prometheusAlarm:
  stage: deploy
  script:
    - lsof -i :38060 || echo "port not listening, or first-time deploy"
    - docker rm -f prometheus-alarm || echo "container not found, or first-time deploy"
    - sleep 5
    - docker run -d --name prometheus-alarm -p 38060:38060 hub.gz-yykfz.com:31443/common/prometheusalarm:${CI_COMMIT_SHORT_SHA}
    - lsof -i :${PG_PORT} || (echo "######deploy error ########" && exit 1)
  when: manual
  only:
    - main
  tags:
    - vm5-docker

pro_deploy_prometheusAlarm:
  stage: deploy
  script:
    - lsof -i :38060 || echo "port not listening, or first-time deploy"
    - docker rm -f prometheus-alarm || echo "container not found, or first-time deploy"
    - sleep 5
    - docker run -d --name prometheus-alarm -p 38060:38060 hub.gz-yykfz.com:31443/common/prometheusalarm:${CI_COMMIT_SHORT_SHA}
    - lsof -i :${PG_PORT} || (echo "######deploy error ########" && exit 1)
  when: manual
  only:
    - master
  tags:
    - vm5-docker
test
docker build -t prometheusalarm:test1 .
docker run -d -p 38060:38060 prometheusalarm:test1
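Once the container is up, the endpoints can be checked by hand (sample.json stands for the payload from the json section above):
curl -s http://localhost:38060/health
curl -s -X POST -H "Content-Type: application/json" -d @sample.json http://localhost:38060/alertmanager/warning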