前情与实验目的 

背景:服务器晚上负载经常突然暴涨,只有node服务,但是双机另一台无问题

解决方法:

通过 Python FastAPI 实现 load、pm2 信息、CPU 等接口,再用自定义 zabbix 脚本监控报警

实现:FastAPI 实现 load、pm2 信息、CPU 等接口


#!/usr/bin/python  
# -*- coding:utf-8 -*-

import subprocess  
import urllib.request

from fastapi import FastAPI  
import platform  
import socket,requests  
from ansible2 import *  
import ansible_runner  
import os, sys, json, datetime, time  
import urllib.request  
from fastapi.responses import HTMLResponse

from fastapi import FastAPI  
from starlette.requests import Request  
from starlette.responses import Response  
from fastapi import FastAPI, Form  
from fastapi import Cookie  
from starlette.templating import Jinja2Templates  
from starlette.staticfiles import StaticFiles  
#from utils import sqlhelper  
import pymysql
# ASGI application object; routes are registered on it with @app.get and it is
# handed to uvicorn.run() when this file is executed as a script.
app = FastAPI()
# Run a shell command on one host through Ansible and return the first line of its output.
def ansible_linux_command(hosts1, cmd1):
    """Run ``cmd1`` on ``hosts1`` with the Ansible ``shell`` module.

    Args:
        hosts1: Host handed to Ansible as the play's ``hosts``; the same
            string is used as the key to look up that host's entry in the
            collected results.
        cmd1: Shell command line to execute.

    Returns:
        The first line of the command's stdout ("" when stdout is empty).

    Raises:
        RuntimeError: If ``hosts1`` has no entry under "success" in the
            collected results (for example the task failed or the host
            was unreachable).
    """
    ansible3 = MyAnsiable2(inventory='/data/ansible/host/hosts', connection='smart')
    ansible3.run(hosts=hosts1, module="shell", args=cmd1)
    stdout_dict = json.loads(ansible3.get_result())

    host_result = stdout_dict['success'].get(hosts1)
    if host_result is None:
        # Report what Ansible recorded for the host instead of letting an
        # opaque KeyError escape from the dictionary lookup.
        details = (stdout_dict.get('failed', {}).get(hosts1)
                   or stdout_dict.get('unreachable', {}).get(hosts1))
        raise RuntimeError(
            "ansible command did not succeed on {0!r}: {1!r}".format(hosts1, details))

    # Only the first output line is of interest to the callers.
    return host_result['stdout'].split("\n")[0]

def ansible_load(hosts1, threshold=10.0):
    """Report the load average of ``hosts1``, adding process details when it is high.

    Args:
        hosts1: Host to query through ``ansible_linux_command``.
        threshold: Load value above which the pm2 process list and the
            top CPU consumers are collected as well. Defaults to 10.0.

    Returns:
        ``{"load": <str>}`` when the load is at or below ``threshold``;
        otherwise the same dict with additional ``"pm2"`` and ``"cpu"``
        string entries (command output with newlines replaced by spaces).

    Raises:
        ValueError: If the remote command output is not a number.
    """
    # `uptime` prints a variable number of fields before the load averages
    # ("up 3 days, 2:10," vs. "up 12 min," ...), so a fixed position such as
    # $11 does not always land on the same value. Counting from the end is
    # stable: $(NF-1) is the 5-minute average, which is the field $11 selects
    # for the usual "up N days, H:MM, N users" layout.
    load_text = ansible_linux_command(hosts1, "uptime | awk '{print $(NF-1)}' |tr -d ','")
    load_value = float(load_text)

    # Reuse the single reading so the reported load is the one that was
    # actually compared against the threshold.
    stdout_list2 = {"load": load_text}
    if load_value > threshold:
        stdout_list2["pm2"] = ansible_linux_command(hosts1, 'pm2 ls|tr "\n" " "')
        stdout_list2["cpu"] = ansible_linux_command(hosts1, 'ps aux|grep -v PID|sort -rn -k +3|head|tr "\n" " "')

    return stdout_list2

@app.get("/load/{hosts1}")
def read_load(hosts1: str):
    """HTTP GET handler: return the load report for the host named in the URL path.

    Args:
        hosts1: Host name or address taken from the ``/load/{hosts1}`` path.

    Returns:
        The dict produced by ``ansible_load(hosts1)``.
    """
    print(hosts1, '#######################hosts')
    # Query the host once and reuse the result: a second call would repeat
    # every remote command and could return a different sample than the one
    # that was logged.
    load_info = ansible_load(hosts1)
    print(load_info)
    return load_info

if __name__ == '__main__':
    # Script entry point: serve the FastAPI app with uvicorn.
    import uvicorn

    # Listen address, port and worker count of the HTTP service.
    server_options = {
        "host": "192.168.0.215",
        "port": 9999,
        "workers": 1,
    }
    uvicorn.run(app=app, **server_options)


ansible 模块

[root@dev-technology-215l fastapi_websocket_logs]# cat ansible2.py 
import json
import shutil
from ansible.module_utils.common.collections import ImmutableDict
from ansible.parsing.dataloader import DataLoader
from ansible.vars.manager import VariableManager
from ansible.inventory.manager import InventoryManager
from ansible.playbook.play import Play
from ansible.executor.task_queue_manager import TaskQueueManager
from ansible.plugins.callback import CallbackBase
from ansible import context
import ansible.constants as C


class ResultCallback(CallbackBase):
    """Callback that overrides a few ``CallbackBase`` hooks to keep task results.

    Each hook stores the result object it receives in a dictionary keyed by
    host name, so a later result for the same host replaces the earlier one.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # One dictionary per outcome, keyed by host name.
        self.host_ok = {}
        self.host_failed = {}
        self.host_unreachable = {}
        # Initialised here but not written by any hook in this class.
        self.task_ok = {}

    def v2_runner_on_ok(self, result, **kwargs):
        # Task finished successfully on the host.
        hostname = result._host.get_name()
        self.host_ok[hostname] = result

    def v2_runner_on_failed(self, result, **kwargs):
        # Task failed on the host.
        hostname = result._host.get_name()
        self.host_failed[hostname] = result

    def v2_runner_on_unreachable(self, result):
        # Host could not be reached.
        hostname = result._host.get_name()
        self.host_unreachable[hostname] = result

class MyAnsiable2():
    """Small wrapper around the Ansible Python API.

    ``run`` executes a single ad-hoc task, ``playbook`` executes playbooks,
    and ``get_result`` returns what the result callback collected as JSON.
    """

    def __init__(self,
        connection='local',  # connection type: 'local' for local execution, 'smart' for ssh
        remote_user=None,    # ssh user
        remote_password=None,  # ssh password; should be a dict whose key must be conn_pass
        private_key_file=None,  # path to a custom private key
        sudo=None, sudo_user=None, ask_sudo_pass=None,
        module_path=None,    # module path; may point at a custom module directory
        become=None,         # whether to escalate privileges
        become_method=None,  # escalation method: sudo by default, su is also possible
        become_user=None,  # user to become after escalation (not the login user)
        check=False, diff=False,
        listhosts=None, listtasks=None,listtags=None,
        verbosity=3,
        syntax=None,
        start_at_task=None,
        inventory=None):
        """Set the global Ansible options and build the API helper objects.

        Every argument has a default so callers only override what they
        need. ``check`` and ``diff`` are accepted but are not forwarded to
        ``context.CLIARGS``.
        """
        cli_options = dict(
            connection=connection,
            remote_user=remote_user,
            private_key_file=private_key_file,
            sudo=sudo,
            sudo_user=sudo_user,
            ask_sudo_pass=ask_sudo_pass,
            module_path=module_path,
            become=become,
            become_method=become_method,
            become_user=become_user,
            verbosity=verbosity,
            listhosts=listhosts,
            listtasks=listtasks,
            listtags=listtags,
            syntax=syntax,
            start_at_task=start_at_task,
        )
        # Note: this assigns the process-wide option object of the ansible package.
        context.CLIARGS = ImmutableDict(**cli_options)

        # Inventory source: an inventory file, or a comma-separated host
        # string (handy for testing), e.g. "1.1.1.1," (a single address
        # needs the trailing comma) or "1.1.1.1, 2.2.2.2".
        # Falls back to "localhost," when nothing was passed.
        self.inventory = inventory or "localhost,"

        # Data loader used by the inventory, variable manager and plays.
        self.loader = DataLoader()

        # Inventory object built from the chosen source.
        self.inv_obj = InventoryManager(loader=self.loader, sources=self.inventory)

        # Passwords handed to the executors.
        self.passwords = remote_password

        # Callback object that collects per-host results.
        self.results_callback = ResultCallback()

        # Variable manager bound to the loader and the inventory.
        self.variable_manager = VariableManager(self.loader, self.inv_obj)

    def run(self, hosts='localhost', gether_facts="no", module="ping", args='', task_time=0):
        """Execute one ad-hoc task (``module`` with ``args``) against ``hosts``.

        Arguments:
        task_time -- seconds to wait for an asynchronous task; per the
                     original note it must be greater than 0 for async
                     execution, 0 (the default) means async is not used,
                     and it is best set to the task's real duration.
        """
        # The play consists of exactly one task; it is kept simple here
        # rather than being accepted as a parameter.
        adhoc_task = {
            "action": {"module": module, "args": args},
            "async": task_time,
            "poll": 0,
        }
        play_source = {
            "name": "Ad-hoc",
            "hosts": hosts,
            "gather_facts": gether_facts,
            "tasks": [adhoc_task],
        }

        play = Play().load(play_source, variable_manager=self.variable_manager, loader=self.loader)

        tqm = None
        try:
            tqm = TaskQueueManager(
                inventory=self.inv_obj,
                variable_manager=self.variable_manager,
                loader=self.loader,
                passwords=self.passwords,
                stdout_callback=self.results_callback,
            )
            tqm.run(play)
        finally:
            # Always release the queue manager (if it was created) and
            # remove Ansible's local temporary directory.
            if tqm is not None:
                tqm.cleanup()
            shutil.rmtree(C.DEFAULT_LOCAL_TMP, True)

    def playbook(self,playbooks):
        """Execute playbooks with the shared inventory, variables and callback.

        Keyword arguments:
        playbooks --  must be a list
        """
        from ansible.executor.playbook_executor import PlaybookExecutor

        executor = PlaybookExecutor(
            playbooks=playbooks,
            inventory=self.inv_obj,
            variable_manager=self.variable_manager,
            loader=self.loader,
            passwords=self.passwords,
        )

        # Route results into our own callback object.
        executor._tqm._stdout_callback = self.results_callback

        executor.run()

    def get_result(self):
        """Print the collected results and return them as a JSON string.

        The JSON object has the keys 'success', 'failed' and 'unreachable';
        each maps host names to the ``_result`` of the stored result object.
        """
        callback = self.results_callback
        buckets = (
            ('success', callback.host_ok),
            ('failed', callback.host_failed),
            ('unreachable', callback.host_unreachable),
        )
        result_raw = {
            label: {host: outcome._result for host, outcome in per_host.items()}
            for label, per_host in buckets
        }

        # Pretty-printed copy goes to stdout; the compact form is returned.
        print(json.dumps(result_raw, indent=4))

        return json.dumps(result_raw)


测试

[root@dev-technology-215l fastapi_websocket_logs]# curl -s http://192.168.0.215:9999/load/172.16.19.43
{"load":"9.57"}

[root@dev-technology-215l fastapi_websocket_logs]# pwd
/data/shell/fastapi_websocket_logs

{"load":"2.33","pm2":" >>>> In-memory PM2 is out-of-date, do: >>>> $ pm2 update In memory PM2 version: 4.4.0 Local PM2 version: 3.5.0  ┌───────────────────────────────────┬────┬─────────┬──────┬───────┬────────┬─────────┬────────┬────────┬────────────┬──────┬──────────┐ │ App name                          │ id │ version │ mode │ pid   │ status │ restart │ uptime │ cpu    │ mem        │ user │ watching │ ├───────────────────────────────────┼────┼─────────┼──────┼───────┼────────┼─────────┼────────┼────────┼────────────┼──────┼──────────┤ │ service-sk_platform_center_sit    │ 4  │ N/A     │ fork │ 26277 │ online │ 0       │ 11s    │ 34.9%  │ 60.1 MB    │ root │ disabled │ │ service-sk_platform_develop_sit   │ 6  │ N/A     │ fork │ 26305 │ online │ 0       │ 11s    │ 30.2%  │ 39.1 MB    │ root │ disabled │ │ service-sk_platform_furniture_sit │ 7  │ N/A     │ fork │ 26317 │ online │ 0       │ 11s    │ 156.9% │ 319.6 MB   │ root │ disabled │ │ service-sk_platform_system_sit    │ 5  │ N/A     │ fork │ 26289 │ online │ 0       │ 11s    │ 34.9%  │ 57.2 MB    │ root │ disabled │ │ sk_service_calculate_center_sit   │ 2  │ N/A     │ fork │ 26249 │ online │ 0       │ 11s    │ 35.6%  │ 51.3 MB    │ root │ disabled │ │ sk_service_calculate_convert_sit  │ 8  │ N/A     │ fork │ 26323 │ online │ 0       │ 10s    │ 212.5% │ 314.7 MB   │ root │ disabled │ │ sk_service_logger_sit             │ 1  │ N/A     │ fork │ 26241 │ online │ 0       │ 11s    │ 44.5%  │ 54.5 MB    │ root │ disabled │ │ sk_service_schedule_sit           │ 3  │ N/A     │ fork │ 26265 │ online │ 0       │ 11s    │ 40.9%  │ 59.1 MB    │ root │ disabled │ │ sk_service_test_sit               │ 0  │ N/A     │ fork │ 26233 │ online │ 0       │ 11s    │ 34.9%  │ 74.6 MB    │ root │ disabled │ └───────────────────────────────────┴────┴─────────┴──────┴───────┴────────┴─────────┴────────┴────────┴────────────┴──────┴──────────┘  Use `pm2 show <id|name>` to get more details about an app ","cpu":"root     26323  128  1.7 
24220264 322276 ?     Ssl  16:58   0:15 node /data/source/sk_service_calculate_convert_sit/publish/service/server.js root     26317  123  1.7 24099844 327476 ?     Ssl  16:58   0:14 node /data/source/service-sk_platform_furniture_sit/publish/service/server.js root      5222  104  0.9 12583248 168828 ?     Rl   8月26 117128:49 /usr/local/bin/node --max-old-space-size=32768 /data/source/sk_service_calculate_convert_sit/publish/service/process.js MCRCR1M3BR1BMHYBR6692IG3CGGPHYLL root      3686  104  0.5 12583172 109060 ?     Rl   8月26 117353:48 /usr/local/bin/node --max-old-space-size=32768 /data/source/sk_service_calculate_convert_sit/publish/service/process.js 5VQ19E387BT1I3K1FNLAVZXTSF36MHWF root     26233 42.0  0.4 22752072 79276 ?      Dsl  16:58   0:05 node /data/source/sk_service_test_sit/publish/service/server.js root     26277 41.5  0.3 12234288 61980 ?      Rsl  16:58   0:04 node /data/source/service-sk_platform_center_sit/publish/service/server.js root     26241 41.3  0.3 714392 56456 ?        Rsl  16:58   0:04 node /data/source/sk_service_logger_sit/publish/service/server.js root     26289 41.1  0.3 12236212 59688 ?      Dsl  16:58   0:04 node /data/source/service-sk_platform_system_sit/publish/service/server.js root     26265 40.4  0.3 12236540 60660 ?      Dsl  16:58   0:04 node /data/source/sk_service_schedule_sit/publish/service/server.js root     26249 39.9  0.3 12236896 59472 ?      Rsl  16:58   0:04 node /data/source/sk_service_calculate_center_sit/publish/service/server.js "}

zabbix自定义监控脚本

编写load_monitor.py 

[root@sit-cdpapp-162l zabbix]# cat load_monitor.py 
#!/usr/bin/python
# -*- coding:utf-8 -*-

import subprocess
import os,sys,json,datetime,time
import locale
import re
import requests


# The host to query is the first command-line argument.
if len(sys.argv) < 2:
    # sys.exit with a string writes it to stderr and exits with status 1,
    # instead of dying with an IndexError traceback.
    sys.exit("usage: load_monitor.py <host>")

host2 = sys.argv[1]

try:
    r = requests.get('http://192.168.0.215:9999/load/{thost}'.format(thost=host2), timeout=10)
except requests.RequestException as exc:
    # Connection errors and timeouts end the script with a one-line
    # message on stderr and a non-zero status rather than a traceback.
    sys.exit("load_monitor: request for {0} failed: {1}".format(host2, exc))

# Emit the service's response body unchanged.
print(r.text)
修改 /etc/zabbix/zabbix_agentd.conf 文件



[root@sit-cdpapp-162l zabbix]# grep  -v  "#"  /etc/zabbix/zabbix_agentd.conf

PidFile=/var/run/zabbix/zabbix_agentd.pid

LogFile=/var/log/zabbix/zabbix_agentd.log
LogFileSize=0

Server=192.168.0.12
ServerActive=192.168.0.12
Hostname=sit-spring-app162
Timeout=10
Include=/etc/zabbix/zabbix_agentd.d/

UnsafeUserParameters=1

UserParameter=process.all[*],/etc/zabbix/processstatus.sh $1 $2
UserParameter=java_monitor[*],/etc/zabbix/java_monitor.py $1
UserParameter=cdp-java_monitor[*],/etc/zabbix/cdp-java_monitor.py $1
UserParameter=node_monitor[*],/etc/zabbix/node_monitor.py $1 $2
UserParameter=load_monitor[*],/etc/zabbix/load_monitor.py $1
UserParameter=pro_elk_port[*],/etc/zabbix/elk_socket_port.py $1 $2
UserParameter=node_monitor2[*],/etc/zabbix/node_monitor-nodomain.py $1 $2
UserParameter=nginx_check_upstream[*],/etc/zabbix/nginx_check_upstream.py $1 $2

image.png

image.png


锅包肉
89 声望17 粉丝

这个人很懒,没有什么说的。