头图

Prometheus

target

[root@vm11 prometheus]# cat prometheus.yml 
# Defaults applied to every scrape job and rule group.
global:
  scrape_interval: 60s
  evaluation_interval: 65s
  scrape_timeout: 15s

# Alertmanager instances that receive the alerts raised by the rules.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'localhost:3xxx'

# Files holding the alerting rules.
rule_files:
  - 'rule.yml'

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets:
          - '192.168.1x4.xx:3xx'
        # Extra labels attached to every series of this target; the webhook
        # receiver decodes appservice, city and instancehost from alerts.
        labels:
          appservice: 'prometheus'
          city: '苏州'
          instancehost: '192.1x.1x4.xx'


rule


[root@vm11 prometheus]# cat rule.yml 

# Host-level alerting rules, all in one rule group.
groups:
- name: Hosts.rules
  rules:
  # Fires when the `up` series of any listed job reports 0.
  - alert: HostDown
    expr: up{job=~"node-exporter|prometheus|grafana|alertmanager"} == 0
    # 0m: no pending period, the alert fires on the first failing evaluation.
    for: 0m
    labels:
      severity: critical
    annotations:
      # description is inserted into the webhook's "firing" message template.
      description: "主机: 【{{ $labels.instance }}】 service is down-- Das ist ein Test."
      # summary is inserted into the webhook's "resolved" message template.
      summary: "主机: 【{{ $labels.instance }}】 service is resolved"

  # 5-minute load average, summed per instance, above 10 for one minute.
  # NOTE(review): the alert name is misspelled ("Avage") but is kept as-is
  # because routes, silences or dashboards may reference it.
  - alert: HostCpuLoadAvage
    expr: sum(node_load5) by (instance) > 10
    for: 1m
    annotations:
      title: "5分钟内CPU负载过高"
      description: "主机: 【{{ $labels.instance }}】 5分钟内CPU负载超过10 (当前值:{{ $value }})"
    labels:
      severity: 'warning'

  # Share of non-idle CPU time over the last 5 minutes, per instance, above
  # 80% for one minute. The expression yields a percentage, so the message
  # prints the value with a % sign.
  - alert: HostCpuUsage
    expr: (1-((sum(increase(node_cpu_seconds_total{mode="idle"}[5m])) by (instance))/ (sum(increase(node_cpu_seconds_total[5m])) by (instance))))*100 > 80
    for: 1m
    annotations:
      title: "CPU使用率过高"
      description: "主机: 【{{ $labels.instance }}】 5分钟内CPU使用率超过80% (当前值:{{ $value }}%)"
    labels:
      severity: 'warning'

  # Memory in use (total minus buffers, cache and free memory) above 80% of
  # total memory for one minute.
  - alert: HostMemoryUsage
    expr: (1-((node_memory_Buffers_bytes + node_memory_Cached_bytes + node_memory_MemFree_bytes)/node_memory_MemTotal_bytes))*100 > 80
    for: 1m
    labels:
      severity: warning
    annotations:
      title: '主机内存使用率超过80%'
      description: '主机: 【{{ $labels.instance }}】 内存使用率超过80% (当前使用率:{{ $value }}%)'

  # Share of CPU time spent in iowait over the last 5 minutes, per instance,
  # above 10% for one minute.
  - alert: HostIOWait
    expr: ((sum(increase(node_cpu_seconds_total{mode="iowait"}[5m])) by (instance))/(sum(increase(node_cpu_seconds_total[5m])) by (instance)))*100 > 10
    for: 1m
    annotations:
      title: "磁盘负载过高"
      description: "主机: 【{{ $labels.instance }}】 5分钟内磁盘负载过高 (当前负载值:{{ $value }})"
    labels:
      severity: 'warning'

  # Used space on ext4/xfs filesystems above 70% for one minute; mount points
  # matching .*tmp or .*boot are excluded.
  - alert: HostFileSystemUsage
    expr: (1-(node_filesystem_free_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" }/node_filesystem_size_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" }))*100 > 70
    for: 1m
    labels:
      severity: warning
    annotations:
      title: '磁盘空间剩余不足'
      description: '主机: 【{{ $labels.instance }}】 {{ $labels.mountpoint }}分区使用率超过70%, 当前值使用率:{{ $value }}%'

  # Swap usage (1 - free/total) above 80% for two minutes.
  - alert: HostSwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      title: '主机swap分区不足'
      description: '主机: 【{{ $labels.instance }}】 swap分区使用超过 (>80%), 当前值使用率: {{ $value }}%'

  # More than 1000 established TCP connections on an instance for 5 minutes.
  - alert: HostNetworkConnection-ESTABLISHED
    expr: sum(node_netstat_Tcp_CurrEstab) by (instance) > 1000
    for: 5m
    labels:
      severity: warning
    annotations:
      title: '主机ESTABLISHED连接数过高'
      description: '主机: 【{{ $labels.instance }}】 ESTABLISHED连接数超过1000, 当前ESTABLISHED连接数: {{ $value }}'

  # More than 1000 sockets in TIME_WAIT on an instance for 5 minutes.
  - alert: HostNetworkConnection-TIME_WAIT
    expr: sum(node_sockstat_TCP_tw) by (instance) > 1000
    for: 5m
    labels:
      severity: warning
    annotations:
      title: '主机TIME_WAIT连接数过高'
      description: '主机: 【{{ $labels.instance }}】 TIME_WAIT连接数超过1000, 当前TIME_WAIT连接数: {{ $value }}'

  # Receive rate per instance and ens* interface, averaged over 2 minutes and
  # divided by 1024 twice, above 100 for 5 minutes.
  - alert: HostUnusualNetworkThroughputIn
    expr: sum by (instance, device) (rate(node_network_receive_bytes_total{device=~"ens.*"}[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      title: '主机网卡入口流量过高'
      description: '主机: 【{{ $labels.instance }}】, 网卡: {{ $labels.device }} 入口流量超过 (> 100 MB/s), 当前值: {{ $value }}'

  # Same check for the transmit direction.
  - alert: HostUnusualNetworkThroughputOut
    expr: sum by (instance, device) (rate(node_network_transmit_bytes_total{device=~"ens.*"}[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      title: '主机网卡出口流量过高'
      description: '主机: 【{{ $labels.instance }}】, 网卡: {{ $labels.device }} 出口流量超过 (> 100 MB/s), 当前值: {{ $value }}'

  # Read rate per instance and sd* device, averaged over 2 minutes and divided
  # by 1024 twice, above 50 for 5 minutes.
  - alert: HostUnusualDiskReadRate
    expr: sum by (instance, device) (rate(node_disk_read_bytes_total{device=~"sd.*"}[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      title: '主机磁盘读取速率过高'
      description: '主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} 读取速度超过(50 MB/s), 当前值: {{ $value }}'

  # Same check for writes; note the shorter 2-minute pending period.
  - alert: HostUnusualDiskWriteRate
    expr: sum by (instance, device) (rate(node_disk_written_bytes_total{device=~"sd.*"}[2m])) / 1024 / 1024 > 50
    for: 2m
    labels:
      severity: warning
    annotations:
      title: '主机磁盘写入速率过高'
      description: '主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} 写入速度超过(50 MB/s), 当前值: {{ $value }}'

  # Free inodes on ext4/xfs filesystems below 10% for two minutes; mount
  # points matching .*tmp or .*boot are excluded. $value is the current free
  # percentage, so the message states the fixed 10% threshold separately
  # instead of presenting the measured value as the threshold.
  - alert: HostOutOfInodes
    expr: node_filesystem_files_free{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" } / node_filesystem_files{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" } * 100 < 10
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "主机分区Inode节点不足"
      description: "主机: 【{{ $labels.instance }}】 {{ $labels.mountpoint }}分区inode节点不足 (可用inode低于10%, 当前可用: {{ $value }}%)"

  # Average time per completed read on sd* devices over the last minute above
  # 0.1 s, only while reads are actually completing. The expression divides
  # seconds by operations, so $value is in seconds; humanizeDuration renders
  # it with a proper unit instead of mislabelling the raw number as "ms".
  - alert: HostUnusualDiskReadLatency
    expr: rate(node_disk_read_time_seconds_total{device=~"sd.*"}[1m]) / rate(node_disk_reads_completed_total{device=~"sd.*"}[1m]) > 0.1 and rate(node_disk_reads_completed_total{device=~"sd.*"}[1m]) > 0
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "主机磁盘Read延迟过高"
      description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} Read延迟过高 (read operations > 100ms), 当前延迟值: {{ $value | humanizeDuration }}"

  # Same check for writes.
  - alert: HostUnusualDiskWriteLatency
    expr: rate(node_disk_write_time_seconds_total{device=~"sd.*"}[1m]) / rate(node_disk_writes_completed_total{device=~"sd.*"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device=~"sd.*"}[1m]) > 0
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "主机磁盘Write延迟过高"
      description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} Write延迟过高 (write operations > 100ms), 当前延迟值: {{ $value | humanizeDuration }}"

alertmanager

[root@vm11 alertmanager]# cat alertmanager.yml 
# Routing tree. Alerts are grouped by alert name; an alert that matches no
# child route is delivered to the top-level receiver ('Warning').
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 3600s
  receiver: 'Warning'
  routes:
    # Label values are compared case-sensitively and rule.yml emits lowercase
    # severities ('warning', 'critical'), so the matchers must be lowercase
    # too; the capitalised values never matched and every alert silently fell
    # through to the default receiver.
    - receiver: 'Information'
      repeat_interval: 3600s
      match:
        severity: information

    - receiver: 'Warning'
      repeat_interval: 3600s
      match:
        severity: warning
    # 'critical' alerts have no dedicated route and use the default receiver.

receivers:
  - name: 'Warning'
    webhook_configs:
      - url: 'http://192.1x.1xx.xx:3xx/alertmanager/warning'
        send_resolved: true

  - name: 'Information'
    webhook_configs:
      - url: 'http://192.168.1x.1x:38xx/alertmanager/information'
        send_resolved: true

# Mute 'warning' alerts while a 'critical' alert with identical alertname,
# dev and instance labels is firing.
# NOTE(review): none of the rules shown in rule.yml set a 'dev' label, and
# requiring an equal alertname limits this to same-named alerts - confirm
# that this is the intended scope.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']

go code

image.png

  • main.go
package main

import (
    "bytes"
    "encoding/json"
    "fmt"
    "github.com/gin-gonic/gin"
    "io"
    "io/ioutil"
    "net/http"
    "prometheus_alarm/alarmtool"
    "prometheus_alarm/prometheus"
    "strings"
)

// 关键
func JSONDecode(r io.Reader, obj interface{}) error {
    if err := json.NewDecoder(r).Decode(obj); err != nil {
        return err
    }
    return nil
}

// fillTemplate returns tmpl with every placeholder replaced by its value.
// pairs is a flat list of placeholder/value pairs (old1, new1, old2, new2, ...)
// and must have even length, as strings.NewReplacer requires. The substitution
// is done in a single pass, so placeholder-like text inside an inserted value
// (for example a description that happens to contain "instance17") is not
// substituted again.
func fillTemplate(tmpl string, pairs ...string) string {
    return strings.NewReplacer(pairs...).Replace(tmpl)
}

// main serves two endpoints on 0.0.0.0:38060:
//   - GET  /health               answers 200 with {"status": 200};
//   - POST /alertmanager/warning decodes an Alertmanager webhook payload and
//     sends one markdown message per firing or resolved alert to the
//     group-chat webhook.
func main() {
    gin.SetMode(gin.ReleaseMode)
    r := gin.Default()

    r.GET("/health", func(c *gin.Context) {
        // Reply with a small JSON document so callers can probe liveness.
        c.JSON(http.StatusOK, gin.H{
            "status": 200,
        })
    })

    r.POST("/alertmanager/warning", func(c *gin.Context) {
        var reqInfo prometheus.AlertMsgRet
        data, err := c.GetRawData()
        if err != nil {
            // Without a body there is nothing to forward; report the failure
            // instead of answering "success".
            fmt.Println(err.Error())
            c.JSON(http.StatusBadRequest, gin.H{"error": "cannot read request body"})
            return
        }
        fmt.Println("### data:", string(data))
        // GetRawData consumed the body; wrap the bytes again so it can be decoded.
        c.Request.Body = ioutil.NopCloser(bytes.NewBuffer(data))
        if err := JSONDecode(c.Request.Body, &reqInfo); err != nil {
            fmt.Println("decode err:", err)
            c.JSON(http.StatusBadRequest, gin.H{"error": "invalid alert payload"})
            return
        }

        // NOTE(review): the recipient list and the webhook key are hard-coded;
        // consider moving them to configuration so the key stays out of the
        // source tree.
        userList := []string{"18xxxxxxxxx"}
        url := "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=axxxxxxxxx"

        for _, alert := range reqInfo.Alerts {
            switch alert.Status {
            case "firing":
                content := fillTemplate(prometheus.FiringTemp,
                    "alertname17", alert.Labels.AlertName,
                    "appservice17", alert.Labels.AppService,
                    "startsAt17", alarmtool.TimeTransform(alert.StartsAt),
                    "city17", alert.Labels.City,
                    "instance17", alert.Labels.Instance,
                    "severity17", alert.Labels.Severity,
                    "description17", alert.Annotations.Description,
                )
                alarmtool.SendMsgChatGroupPerson(url, userList, content)
            case "resolved":
                content := fillTemplate(prometheus.ResolvedTemp,
                    "alertname17", alert.Labels.AlertName,
                    "appservice17", alert.Labels.AppService,
                    "city17", alert.Labels.City,
                    "startsAt17", alarmtool.TimeTransform(alert.StartsAt),
                    "endsAt17", alarmtool.TimeTransform(alert.EndsAt),
                    "instance17", alert.Labels.Instance,
                    "severity17", alert.Labels.Severity,
                    "summary17", alert.Annotations.Summary,
                )
                alarmtool.SendMsgChatGroupPerson(url, userList, content)
            }
            // Alerts with any other status are ignored.
        }

        c.JSON(200, "success")
    })

    // Listen on every interface, port 38060. Run only returns on failure
    // (for example when the port is already taken), so surface the error
    // instead of exiting silently.
    if err := r.Run("0.0.0.0:38060"); err != nil {
        panic(err)
    }
}


  • prometheus/promethues.go

package prometheus


type AlertMsgRet struct {
    Receiver string `json:"receiver"`
    Status string `json:"status"`
    Alerts []struct {
        Status string `json:"status"`
        Labels struct {
            AlertName string `json:"alertname"`
            AppService string `json:"appservice"`
            City string `json:"city"`
            Instance string `json:"instance"`
            InstanceHost string `json:"instancehost"`
            Job string `json:"job"`
            Severity string `json:"severity"`
        } `json:"labels"`
        Annotations struct {
            Description string `json:"description"`
            Summary string `json:"summary"`
        } `json:"annotations"`
        StartsAt string `json:"startsAt"`
        EndsAt string `json:"endsAt"`
        GeneratorURL string `json:"generatorURL"`
        Fingerprint string `json:"fingerprint"`
    } `json:"alerts"`
    GroupLabels struct {
        AlertName string `json:"alertname"`
    } `json:"groupLabels"`
    CommonLabels struct {
        AlertName string `json:"alertname"`
        AppService string `json:"appservice"`
        City string `json:"city"`
        Instance string `json:"instance"`
        InstanceHost string `json:"instancehost"`
        Job string `json:"job"`
        Severity string `json:"severity"`
    } `json:"commonLabels"`
    CommonAnnotations struct {
        Description string `json:"description"`
        Summary string `json:"summary"`
    } `json:"commonAnnotations"`
    ExternalURL string `json:"externalURL"`
    Version string `json:"version"`
    GroupKey string `json:"groupKey"`
    TruncatedAlerts int `json:"truncatedAlerts"`
}

// Markdown message templates. The tokens ending in "17" (alertname17,
// severity17, appservice17, startsAt17, endsAt17, instance17, city17,
// description17, summary17) are placeholders that the webhook handler in
// main.go replaces with the values of each alert.
const(
    // FiringTemp is the message sent for an alert whose status is "firing";
    // headings are rendered in red (#FF0000).
    FiringTemp = "#### [Prometheus 告警信息] \n> <font color=\"#FF0000\">告警名称</font>:alertname17\n> <font color=\"#FF0000\">告警级别</font>:severity17 \n> <font color=\"#FF0000\">应用名称</font>:appservice17\n> <font color=\"#FF0000\">开始时间</font>:startsAt17 \n> <font color=\"#FF0000\">主机地址</font>:instance17 \n> <font color=\"#FF0000\">City</font>:city17 \n> <font color=\"#FF0000\">故障信息</font>:\n** messages: description17 ** "

    // ResolvedTemp is the message sent for an alert whose status is
    // "resolved"; headings are rendered in green (#00FF00) and the recovery
    // time (endsAt17) is included.
    ResolvedTemp = "#### [Prometheus 恢复信息] \n> <font color=\"#00FF00\">告警名称</font>:alertname17\n> <font color=\"#00FF00\">告警级别</font>:severity17 \n> <font color=\"#00FF00\">应用名称</font>:appservice17\n> <font color=\"#00FF00\">开始时间</font>:startsAt17 \n> <font color=\"#00FF00\">恢复时间</font>:endsAt17 \n> <font color=\"#00FF00\">主机地址</font>:instance17 \n> <font color=\"#00FF00\">City</font>:city17 \n> <font color=\"#00FF00\">恢复信息</font>:\n** messages: summary17 ** "
)


  • json

    {
      "receiver": "Warning",
      "status": "resolved",
      "alerts": [{
          "status": "resolved",
          "labels": {
              "alertname": "HostDown",
              "appservice": "prometheus",
              "city": "苏州",
              "instance": "192.168.1xx.x1:3xx0",
              "instancehost": "192.1xx.1xx.xx",
              "job": "prometheus",
              "severity": "critical"
          },
          "annotations": {
              "description": "主机: 【192.168.1xx.x:3xxx】has been down for more than 1 minute",
              "summary": "主机: 【192.168.1xx.xx:3xxx】 service is resolved"
          },
          "startsAt": "2023-07-06T01:16:20.991Z",
          "endsAt": "2023-07-06T01:29:20.991Z",
          "generatorURL": "http://vm11:39090/graph?g0.expr=up%7Bjob%3D~%22node-exporter%7Cprometheus%7Cgrafana%7Calertmanager%22%7D+%3D%3D+0\u0026g0.tab=1",
          "fingerprint": "1becfcf3e2ebd5f7"
      }],
      "groupLabels": {
          "alertname": "HostDown"
      },
      "commonLabels": {
          "alertname": "HostDown",
          "appservice": "prometheus",
          "city": "苏州",
          "instance": "192.168.1x.xx:3xx",
          "instancehost": "192.1x8.1x4.x",
          "job": "prometheus",
          "severity": "critical"
      },
      "commonAnnotations": {
          "description": "主机: 【192.16x.xx.x1:3xxx】has been down for more than 1 minute",
          "summary": "主机: 【192.168.1xx.xx:39xx】 service is resolved"
      },
      "externalURL": "http://vm11:39xxx",
      "version": "4",
      "groupKey": "{}:{alertname=\"HostDown\"}",
      "truncatedAlerts": 0
    }
    
  • alarmtool/timetransform.go
package alarmtool

import (
    "fmt"
    "time"
)

// TimeTransform converts an RFC 3339 timestamp such as
// "2023-07-06T01:16:20.991Z" into "2006-01-02 15:04:05" form in China
// Standard Time (UTC+8).
//
// It returns the literal string "error inputTime" when inputTime cannot be
// parsed.
func TimeTransform(inputTime string) string {
    t, err := time.Parse(time.RFC3339Nano, inputTime)
    if err != nil {
        fmt.Println("Failed to parse input time:", err)
        return "error inputTime"
    }

    loc, err := time.LoadLocation("Asia/Shanghai")
    if err != nil {
        // The zone database is missing on minimal images (the alpine runtime
        // image of this project installs only ca-certificates), which used to
        // turn every timestamp into an error string. Asia/Shanghai currently
        // has a constant +08:00 offset, so a fixed zone gives the same result
        // for alert timestamps.
        loc = time.FixedZone("CST", 8*60*60)
    }

    return t.In(loc).Format("2006-01-02 15:04:05")
}
  • alarmtool/wecome

package alarmtool

import (
    "bytes"
    "encoding/json"
    "fmt"
    "io/ioutil"
    "log"
    "net/http"
)

type ChatGroupJsonRes struct {
    MsgType  string `json:"msgtype"`
    Markdown struct {
        Content interface{} `json:"content"`
    } `json:"markdown"`
}

type T4 struct {
    Errcode int    `json:"errcode"`
    Errmsg  string `json:"errmsg"`
}

func SendMsgChatGroupPerson(webHook string, person []string, comment string) (*T4) {
    var j ChatGroupJsonRes
    var perSionComment string
    for i, p := range person {
        fmt.Println(i)
        perSionComment = perSionComment + fmt.Sprintf("<@%s>", p)
    }
    j.MsgType = "markdown"
    j.Markdown.Content = comment + "\n" + perSionComment

    data, err := json.Marshal(j)
    if err != nil {
        fmt.Println("err was %v", err)
    }
    fmt.Println(string(data))
    reader := bytes.NewReader(data)
    client := &http.Client{}
    req, err := http.NewRequest("POST", webHook, reader)
    req.Header.Add("Content-Type", "application/json")
    req.Header.Add("Accept", "application/json")
    resp, err := client.Do(req)
    if err != nil {
        log.Fatal(err)
    }
    defer resp.Body.Close()
    body, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println("####send msg##", string(body))
    var assetList T4
    err = json.Unmarshal(body, &assetList)
    return &assetList
}




  • Dockerfile
# Build stage: compile a static Linux binary.
FROM golang:1.19 AS builder

ENV HOME=/app \
    CGO_ENABLED=0 \
    GOOS=linux \
    GOPROXY=https://goproxy.cn,direct \
    TIME_ZONE=Asia/Shanghai

WORKDIR /app
COPY . .
RUN go mod download

RUN go build -v -a -installsuffix cgo -o demo main.go

# Runtime stage: minimal image containing only the binary.
FROM alpine:latest

# ca-certificates: needed for the HTTPS call to the chat webhook.
# tzdata: the service loads the "Asia/Shanghai" zone at runtime to format
# alert timestamps; alpine ships no zone database by default.
RUN apk --no-cache add ca-certificates tzdata

# Working directory of the runtime image.
WORKDIR /bin/

COPY --from=builder /app/demo .

# Port the service listens on (see r.Run in main.go).
EXPOSE 38060

ENTRYPOINT ["/bin/demo"]
  • .gitlab-ci.yml
before_script:
  - echo "before script!"

variables:
  PG_NAME: 'prometheus_alarm'
  # Host port the service is published on. The deploy jobs reference it as
  # ${PG_PORT}; it was previously undefined, which made the post-deploy
  # check meaningless.
  PG_PORT: '38060'

stages:
  - build
  - deploy

# Build the image on the docker runner, tagged with the short commit SHA.
build_job:
  stage: build
  script:
    - pwd
    - date
    - docker build -t hub.gz-yykfz.com:31443/common/prometheusalarm:${CI_COMMIT_SHORT_SHA} .
  only:
    - main
  tags:
    - vm5-docker

# Manual deploy from the main branch: replace the running container and
# verify that the published port is listening afterwards.
deploy_prometheusAlarm:
  stage: deploy
  script:
    - lsof -i :${PG_PORT} || echo "端口没有启动,或者first time deploy"
    - docker rm -f prometheus-alarm || echo "docker没有,或者first time deploy"
    - sleep 5
    # The container always listens on 38060 (hard-coded in main.go).
    - docker run -d --name prometheus-alarm -p ${PG_PORT}:38060 hub.gz-yykfz.com:31443/common/prometheusalarm:${CI_COMMIT_SHORT_SHA}
    # Quoted on purpose: in an unquoted YAML scalar the " #" inside the echo
    # text starts a comment and cuts the command off after "error".
    - 'lsof -i :${PG_PORT} || (echo "######deploy error ########" && exit 1)'
  when: manual
  only:
    - main
  tags:
    - vm5-docker

# Same deployment, triggered manually from the master branch.
# NOTE(review): build_job only runs on main, so the image for a master commit
# must already exist on the runner - confirm this is intended.
pro_deploy_prometheusAlarm:
  stage: deploy
  script:
    - lsof -i :${PG_PORT} || echo "端口没有启动,或者first time deploy"
    - docker rm -f prometheus-alarm || echo "docker没有,或者first time deploy"
    - sleep 5
    - docker run -d --name prometheus-alarm -p ${PG_PORT}:38060 hub.gz-yykfz.com:31443/common/prometheusalarm:${CI_COMMIT_SHORT_SHA}
    - 'lsof -i :${PG_PORT} || (echo "######deploy error ########" && exit 1)'
  when: manual
  only:
    - master
  tags:
    - vm5-docker

test

docker build -t prometheusalarm:test1 .
docker run -d -p 38060:38060 prometheusalarm:test1

image.png


锅包肉
89 声望17 粉丝

这个人很懒,没有什么说的。