前情与实验目的
背景:服务器晚上负载经常突然暴涨,机器上只有 node 服务,但双机中的另一台没有问题。
解决方法:通过 python fastapi 实现 load、pm2 信息、cpu 等接口,并用自定义 zabbix 脚本监控报警。
实现:用 fastapi 实现 load、pm2 信息、cpu 等接口
#!/usr/bin/python
# -*- coding:utf-8 -*-
import subprocess
import urllib.request
from fastapi import FastAPI
import platform
import socket,requests
from ansible2 import *
import ansible_runner
import os, sys, json, datetime, time
import urllib.request
from fastapi.responses import HTMLResponse
from fastapi import FastAPI
from starlette.requests import Request
from starlette.responses import Response
from fastapi import FastAPI, Form
from fastapi import Cookie
from starlette.templating import Jinja2Templates
from starlette.staticfiles import StaticFiles
#from utils import sqlhelper
import pymysql
app = FastAPI()
# Run a shell command on one host through the Ansible API and return its output.
def ansible_linux_command(hosts1, cmd1):
    """Run ``cmd1`` on ``hosts1`` with the Ansible ``shell`` module.

    Args:
        hosts1: Host to target; it is also the key used to look up the
            result, so it must match the host name Ansible reports.
        cmd1: Shell command line to execute on the host.

    Returns:
        The first line of the command's stdout.

    Raises:
        KeyError: If the host has no successful result (the task failed or
            the host was unreachable). The message carries whatever Ansible
            reported for the host so the cause is visible to the caller.
    """
    runner = MyAnsiable2(inventory='/data/ansible/host/hosts', connection='smart')
    runner.run(hosts=hosts1, module="shell", args=cmd1)
    outcome = json.loads(runner.get_result())
    print(outcome, type(outcome))

    succeeded = outcome['success']
    if hosts1 not in succeeded:
        # Surface the failure details instead of a bare KeyError(host).
        problem = outcome['failed'].get(hosts1) or outcome['unreachable'].get(hosts1)
        raise KeyError(
            "no successful result for host {0!r}: {1}".format(hosts1, problem)
        )

    stdout_text = succeeded[hosts1]['stdout']
    print(stdout_text)
    return stdout_text.split("\n")[0]
def ansible_load(hosts1, threshold=10.00):
    """Collect the load average of a host, plus diagnostics when it is high.

    Args:
        hosts1: Host to query through ``ansible_linux_command``.
        threshold: Load value above which the pm2 process list and the top
            CPU consumers are collected as well. Defaults to 10.

    Returns:
        ``{"load": <str>}`` when the load is at or below ``threshold``;
        otherwise the same dict extended with ``"pm2"`` and ``"cpu"`` entries
        holding the flattened command output.

    Raises:
        ValueError: If the remote command does not print a number.
    """
    # Take the second-to-last field of `uptime` (the middle one of the three
    # load averages) instead of a fixed field index: the number of fields in
    # front of "load average:" depends on how the uptime duration is printed,
    # so a fixed index does not always land on the same value.
    load_cmd = "uptime | awk {tprint} |tr -d ','".format(tprint="'{print $(NF-1)}'")

    # Query the load once and reuse it, so the value that is compared is the
    # value that is reported, and no redundant remote command is issued.
    load_text = ansible_linux_command(hosts1, load_cmd)
    load_value = float(load_text)
    print(load_value, type(load_value))

    if load_value > threshold:
        return {
            "load": load_text,
            "pm2": ansible_linux_command(hosts1, 'pm2 ls|tr "\n" " "'),
            "cpu": ansible_linux_command(hosts1, 'ps aux|grep -v PID|sort -rn -k +3|head|tr "\n" " "'),
        }
    return {"load": load_text}
@app.get("/load/{hosts1}")
def read_load(hosts1: str):
    """HTTP endpoint returning the load report of ``hosts1``.

    The report is whatever ``ansible_load`` returns for the host and is
    serialised by FastAPI as the JSON response body.

    NOTE(review): ``hosts1`` comes straight from the URL and is handed to
    Ansible as the target host; consider validating it against the inventory.
    """
    print(hosts1, '#######################hosts')
    # Query once and reuse the result: every ansible_load call runs remote
    # commands, so calling it a second time just for logging doubles the work.
    load_info = ansible_load(hosts1)
    print(load_info)
    return load_info
if __name__ == '__main__':
    # uvicorn is imported here because it is only needed when this module is
    # started directly as a script.
    import uvicorn

    # Listen address and worker count of the load API.
    server_options = {
        "host": "192.168.0.215",
        "port": 9999,
        "workers": 1,
    }
    uvicorn.run(app=app, **server_options)
ansible 模块
[root@dev-technology-215l fastapi_websocket_logs]# cat ansible2.py
import json
import shutil
from ansible.module_utils.common.collections import ImmutableDict
from ansible.parsing.dataloader import DataLoader
from ansible.vars.manager import VariableManager
from ansible.inventory.manager import InventoryManager
from ansible.playbook.play import Play
from ansible.executor.task_queue_manager import TaskQueueManager
from ansible.plugins.callback import CallbackBase
from ansible import context
import ansible.constants as C
class ResultCallback(CallbackBase):
    """Callback that overrides part of CallbackBase to store task results.

    Each handler stores the result object under the name of the host it
    belongs to, in the dict matching the outcome. A later result for the same
    host and outcome replaces the earlier one.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # One dict per outcome, keyed by host name.
        self.host_ok = {}
        self.host_unreachable = {}
        self.host_failed = {}
        # Initialised here but not written by any handler in this class.
        self.task_ok = {}

    def v2_runner_on_ok(self, result, **kwargs):
        """Remember a successful task result for its host."""
        host_name = result._host.get_name()
        self.host_ok[host_name] = result

    def v2_runner_on_failed(self, result, **kwargs):
        """Remember a failed task result for its host."""
        host_name = result._host.get_name()
        self.host_failed[host_name] = result

    def v2_runner_on_unreachable(self, result):
        """Remember the result reported for a host that could not be reached."""
        host_name = result._host.get_name()
        self.host_unreachable[host_name] = result
class MyAnsiable2:
    """Small wrapper around the Ansible Python API.

    It can run a single ad-hoc task (``run``) or a list of playbooks
    (``playbook``). Results are gathered per host by a ``ResultCallback`` and
    returned as a JSON string by ``get_result``.
    """

    def __init__(self,
                 connection='local',  # connection type: 'local', or 'smart' for ssh
                 remote_user=None,  # ssh user
                 remote_password=None,  # ssh password; should be a dict whose key must be conn_pass
                 private_key_file=None,  # path of a custom private key
                 sudo=None, sudo_user=None, ask_sudo_pass=None,
                 module_path=None,  # path of a custom module directory
                 become=None,  # whether to escalate privileges
                 become_method=None,  # escalation method: sudo by default, may be su
                 become_user=None,  # user to become after escalation (not the login user)
                 check=False, diff=False,
                 listhosts=None, listtasks=None, listtags=None,
                 verbosity=3,
                 syntax=None,
                 start_at_task=None,
                 inventory=None):
        """Set the global Ansible options and build the API helper objects.

        Every argument has a default and can be passed to override it. All
        arguments except ``remote_password`` and ``inventory`` are stored in
        ``context.CLIARGS``; ``remote_password`` is handed to the executors as
        the passwords mapping and ``inventory`` is the inventory source.
        """
        context.CLIARGS = ImmutableDict(
            connection=connection,
            remote_user=remote_user,
            private_key_file=private_key_file,
            sudo=sudo,
            sudo_user=sudo_user,
            ask_sudo_pass=ask_sudo_pass,
            module_path=module_path,
            become=become,
            become_method=become_method,
            become_user=become_user,
            # check/diff were accepted by the signature but never forwarded,
            # so a requested dry run was silently ignored.
            check=check,
            diff=diff,
            verbosity=verbosity,
            listhosts=listhosts,
            listtasks=listtasks,
            listtags=listtags,
            syntax=syntax,
            start_at_task=start_at_task,
        )

        # Fall back to "localhost," when no inventory is given.
        # The value may be an inventory file, or a comma-separated host string
        # (suitable for testing only), e.g. "1.1.1.1," or "1.1.1.1, 2.2.2.2".
        # A single host must keep the trailing comma.
        self.inventory = inventory if inventory else "localhost,"

        # Data loader used to parse inventory and play data.
        self.loader = DataLoader()

        # Inventory object built from the configured source.
        self.inv_obj = InventoryManager(loader=self.loader, sources=self.inventory)

        # Passwords handed to the executors.
        self.passwords = remote_password

        # Callback that collects per-host results.
        self.results_callback = ResultCallback()

        # Variable manager bound to the loader and inventory.
        self.variable_manager = VariableManager(self.loader, self.inv_obj)

    def run(self, hosts='localhost', gether_facts="no", module="ping", args='', task_time=0):
        """Run one ad-hoc task against ``hosts``.

        Args:
            hosts: Host pattern the play targets.
            gether_facts: Value for the play's ``gather_facts`` setting.
            module: Name of the module to execute.
            args: Arguments passed to the module.
            task_time: Seconds to wait for an asynchronous task. Must be
                greater than 0 for async execution; 0 (the default) means no
                async. Ideally it equals the real duration of the task.

        The outcome is not returned; read it with ``get_result``.
        """
        play_source = dict(
            name="Ad-hoc",
            hosts=hosts,
            gather_facts=gether_facts,
            tasks=[
                # Each task is one nested dict in this list; only a single
                # task is built here to keep things simple.
                {"action": {"module": module, "args": args}, "async": task_time, "poll": 0},
            ],
        )

        play = Play().load(play_source, variable_manager=self.variable_manager, loader=self.loader)

        tqm = None
        try:
            tqm = TaskQueueManager(
                inventory=self.inv_obj,
                variable_manager=self.variable_manager,
                loader=self.loader,
                passwords=self.passwords,
                stdout_callback=self.results_callback)
            tqm.run(play)
        finally:
            # Clean up the task queue manager if it was created, then remove
            # Ansible's local temporary directory (errors ignored).
            if tqm is not None:
                tqm.cleanup()
            shutil.rmtree(C.DEFAULT_LOCAL_TMP, True)

    def playbook(self, playbooks):
        """Run playbooks and collect their results through the callback.

        Keyword arguments:
        playbooks -- must be a list
        """
        from ansible.executor.playbook_executor import PlaybookExecutor

        executor = PlaybookExecutor(playbooks=playbooks,
                                    inventory=self.inv_obj,
                                    variable_manager=self.variable_manager,
                                    loader=self.loader,
                                    passwords=self.passwords)

        # Route task results to our callback.
        executor._tqm._stdout_callback = self.results_callback

        executor.run()

    def get_result(self):
        """Return the collected results as a JSON string.

        The JSON object has the keys ``success``, ``failed`` and
        ``unreachable``; each maps host names to the raw result dict of that
        host. An indented copy is also printed to stdout.
        """
        callback = self.results_callback
        result_raw = {
            'success': {host: res._result for host, res in callback.host_ok.items()},
            'failed': {host: res._result for host, res in callback.host_failed.items()},
            'unreachable': {host: res._result for host, res in callback.host_unreachable.items()},
        }

        # Print the final result, pretty-printed as JSON.
        print(json.dumps(result_raw, indent=4))
        return json.dumps(result_raw)
测试
[root@dev-technology-215l fastapi_websocket_logs]# curl -s http://192.168.0.215:9999/load/172.16.19.43
{"load":"9.57"}
[root@dev-technology-215l fastapi_websocket_logs]# pwd
/data/shell/fastapi_websocket_logs
{"load":"2.33","pm2":" >>>> In-memory PM2 is out-of-date, do: >>>> $ pm2 update In memory PM2 version: 4.4.0 Local PM2 version: 3.5.0 ┌───────────────────────────────────┬────┬─────────┬──────┬───────┬────────┬─────────┬────────┬────────┬────────────┬──────┬──────────┐ │ App name │ id │ version │ mode │ pid │ status │ restart │ uptime │ cpu │ mem │ user │ watching │ ├───────────────────────────────────┼────┼─────────┼──────┼───────┼────────┼─────────┼────────┼────────┼────────────┼──────┼──────────┤ │ service-sk_platform_center_sit │ 4 │ N/A │ fork │ 26277 │ online │ 0 │ 11s │ 34.9% │ 60.1 MB │ root │ disabled │ │ service-sk_platform_develop_sit │ 6 │ N/A │ fork │ 26305 │ online │ 0 │ 11s │ 30.2% │ 39.1 MB │ root │ disabled │ │ service-sk_platform_furniture_sit │ 7 │ N/A │ fork │ 26317 │ online │ 0 │ 11s │ 156.9% │ 319.6 MB │ root │ disabled │ │ service-sk_platform_system_sit │ 5 │ N/A │ fork │ 26289 │ online │ 0 │ 11s │ 34.9% │ 57.2 MB │ root │ disabled │ │ sk_service_calculate_center_sit │ 2 │ N/A │ fork │ 26249 │ online │ 0 │ 11s │ 35.6% │ 51.3 MB │ root │ disabled │ │ sk_service_calculate_convert_sit │ 8 │ N/A │ fork │ 26323 │ online │ 0 │ 10s │ 212.5% │ 314.7 MB │ root │ disabled │ │ sk_service_logger_sit │ 1 │ N/A │ fork │ 26241 │ online │ 0 │ 11s │ 44.5% │ 54.5 MB │ root │ disabled │ │ sk_service_schedule_sit │ 3 │ N/A │ fork │ 26265 │ online │ 0 │ 11s │ 40.9% │ 59.1 MB │ root │ disabled │ │ sk_service_test_sit │ 0 │ N/A │ fork │ 26233 │ online │ 0 │ 11s │ 34.9% │ 74.6 MB │ root │ disabled │ └───────────────────────────────────┴────┴─────────┴──────┴───────┴────────┴─────────┴────────┴────────┴────────────┴──────┴──────────┘ Use `pm2 show <id|name>` to get more details about an app ","cpu":"root 26323 128 1.7 24220264 322276 ? Ssl 16:58 0:15 node /data/source/sk_service_calculate_convert_sit/publish/service/server.js root 26317 123 1.7 24099844 327476 ? 
Ssl 16:58 0:14 node /data/source/service-sk_platform_furniture_sit/publish/service/server.js root 5222 104 0.9 12583248 168828 ? Rl 8月26 117128:49 /usr/local/bin/node --max-old-space-size=32768 /data/source/sk_service_calculate_convert_sit/publish/service/process.js MCRCR1M3BR1BMHYBR6692IG3CGGPHYLL root 3686 104 0.5 12583172 109060 ? Rl 8月26 117353:48 /usr/local/bin/node --max-old-space-size=32768 /data/source/sk_service_calculate_convert_sit/publish/service/process.js 5VQ19E387BT1I3K1FNLAVZXTSF36MHWF root 26233 42.0 0.4 22752072 79276 ? Dsl 16:58 0:05 node /data/source/sk_service_test_sit/publish/service/server.js root 26277 41.5 0.3 12234288 61980 ? Rsl 16:58 0:04 node /data/source/service-sk_platform_center_sit/publish/service/server.js root 26241 41.3 0.3 714392 56456 ? Rsl 16:58 0:04 node /data/source/sk_service_logger_sit/publish/service/server.js root 26289 41.1 0.3 12236212 59688 ? Dsl 16:58 0:04 node /data/source/service-sk_platform_system_sit/publish/service/server.js root 26265 40.4 0.3 12236540 60660 ? Dsl 16:58 0:04 node /data/source/sk_service_schedule_sit/publish/service/server.js root 26249 39.9 0.3 12236896 59472 ? Rsl 16:58 0:04 node /data/source/sk_service_calculate_center_sit/publish/service/server.js "}
zabbix自定义监控脚本
编写load_monitor.py
[root@sit-cdpapp-162l zabbix]# cat load_monitor.py
#!/usr/bin/python
# -*- coding:utf-8 -*-
import subprocess
import os,sys,json,datetime,time
import locale
import re
import requests
def main():
    """Fetch the load report for the host given as the first argument.

    On success the response body is printed to stdout and 0 is returned.
    When the argument is missing, the request fails, or the service answers
    with an HTTP error status, a one-line message is written to stderr and 1
    is returned, so an error page is never printed as if it were data.
    """
    if len(sys.argv) < 2:
        sys.stderr.write('usage: load_monitor.py <host>\n')
        return 1

    host2 = sys.argv[1]
    try:
        r = requests.get('http://192.168.0.215:9999/load/{thost}'.format(thost=host2), timeout=10)
        r.raise_for_status()
    except requests.RequestException as exc:
        sys.stderr.write('load_monitor: request for {0} failed: {1}\n'.format(host2, exc))
        return 1

    print(r.text)
    return 0


if __name__ == '__main__':
    sys.exit(main())
修改 /etc/zabbix/zabbix_agentd.conf 文件
[root@sit-cdpapp-162l zabbix]# grep -v "#" /etc/zabbix/zabbix_agentd.conf
PidFile=/var/run/zabbix/zabbix_agentd.pid
LogFile=/var/log/zabbix/zabbix_agentd.log
LogFileSize=0
Server=192.168.0.12
ServerActive=192.168.0.12
Hostname=sit-spring-app162
Timeout=10
Include=/etc/zabbix/zabbix_agentd.d/
UnsafeUserParameters=1
UserParameter=process.all[*],/etc/zabbix/processstatus.sh $1 $2
UserParameter=java_monitor[*],/etc/zabbix/java_monitor.py $1
UserParameter=cdp-java_monitor[*],/etc/zabbix/cdp-java_monitor.py $1
UserParameter=node_monitor[*],/etc/zabbix/node_monitor.py $1 $2
UserParameter=load_monitor[*],/etc/zabbix/load_monitor.py $1
UserParameter=pro_elk_port[*],/etc/zabbix/elk_socket_port.py $1 $2
UserParameter=node_monitor2[*],/etc/zabbix/node_monitor-nodomain.py $1 $2
UserParameter=nginx_check_upstream[*],/etc/zabbix/nginx_check_upstream.py $1 $2
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。