需求
系统报ERROR错误时,能实时做到消息通知。
思路
当前项目比较小,不想过多的依赖额外的第三方组件。
项目在ERROR时,都会打印ERROR日志,所以可以在log4j接收到ERROR日志请求时,发送通知消息。
实践
Filter
是log4j2的扩展点,从图中(图片来自如何编写Log4j2脱敏插件)流程可以看到,Filter
分别可以在全局
、Logger
、Appender
三个地方做过滤。
三个地方对应的log4j.xml
配置地方如下:
<Configuration>
<MyFilter /><!-- 全局 -->
<Appenders>
<RollingFile name="ErrorRollingFile">
<Filters>
<ThresholdFilter level="ERROR" onMatch="NEUTRAL" />
<MyFilter /> <!-- Logger -->
</Filters>
</RollingFile>
</Appenders>
<Loggers>
<Root level="INFO">
<MyFilter /> <!-- Appender -->
<AppenderRef ref="ErrorRollingFile" />
</Root>
</Loggers>
</Configuration>
log4j提供了过滤器的基类AbstractFilter
:
全局过滤器
入口方法是filter(Logger logger, Level level, Marker marker, String msg, Object... params)
,msg
是填充参数之前的内容,params
是参数列表,包含Throwable
对象。Logger
和Appender
入口方式是filter(final LogEvent event)
,通过event.getMessage().getFormattedMessage()
取到填充参数之后的内容,通过event.getThrown()
获取异常对象。
代码
@Plugin(name = "ErrorNotifyFilter", category = Node.CATEGORY, elementType = Filter.ELEMENT_TYPE, printObject = true)
public class ErrorNotifyLog4j2Filter extends AbstractFilter {
// 默认打印异常堆栈长度
public static final int DEFAULT_PRINT_EXCEPTIONSTACK_LENGTH = 3;
// 默认消息最长长度
// 2020-07-09 测试企业微信允许最大长度为4000
public static final int DEFAULT_MAX_MSG_LENGTH = 4000;
private String projectName;
private List<String> rtxReceivers;
private boolean noPrintExceptionStack;
private int printExceptionStackLength;
private int maxMsgLength;
private Object lock = new Object();
private ErrorNotifyLog4j2Filter(String projectName, String rtxReceivers,
boolean noPrintExceptionStack, int printExceptionStackLength, int maxMsgLength) {
super();
this.projectName = projectName;
this.rtxReceivers = Lists.newArrayList(rtxReceivers.split(","));
this.noPrintExceptionStack = noPrintExceptionStack;
this.printExceptionStackLength = printExceptionStackLength <= 0 ? DEFAULT_PRINT_EXCEPTIONSTACK_LENGTH : printExceptionStackLength;
this.maxMsgLength = (maxMsgLength <= 0 || maxMsgLength > DEFAULT_MAX_MSG_LENGTH) ? DEFAULT_MAX_MSG_LENGTH : maxMsgLength;
}
@Override
public Result filter(LogEvent event) {
notify(event.getLevel(), event.getMessage().getFormattedMessage(), event.getThrown());
return super.filter(event);
}
@Override
public Result filter(Logger logger, Level level, Marker marker, Message msg, Throwable t) {
notify(level, msg.getFormattedMessage(), t);
return super.filter(logger, level, marker, msg, t);
}
@Override
public Result filter(Logger logger, Level level, Marker marker, Object msg, Throwable t) {
notify(level, msg == null ? "" : msg.toString(), t);
return super.filter(logger, level, marker, msg, t);
}
@Override
public Result filter(Logger logger, Level level, Marker marker, String msg, Object... params) {
notify(level, msg, getExceptionParam(params));
return super.filter(logger, level, marker, msg, params);
}
/**
* @param level
* @param msg
* @param t
* @author
* @date
*/
private void notify(Level level, String msg, Throwable t) {
try {
if (level == null || level.intLevel() != Level.ERROR.intLevel()) {
return;
}
if (StringUtils.isBlank(msg) && t == null) {
return;
}
Log4j2AsyncExecutor.executorService.submit(() -> {
try {
String notifyMsg = getNotifyMsg(msg, t); MessageUtil.postMessage(Lists.newArrayList(MessageTypeEnum.RTX),
rtxReceivers,
getNotifyTitle(),
notifyMsg);
} catch (Exception ignoreException) {
ignoreException.printStackTrace();
}
});
} catch (Throwable ignoreException) {
ignoreException.printStackTrace();
}
}
/**
* @param params
* @return java.lang.Throwable
* @author
* @date
*/
private Throwable getExceptionParam(Object... params) {
if (params == null || params.length == 0) {
return null;
}
for (Object param : params) {
if (param instanceof Throwable) {
return (Throwable) param;
}
}
return null;
}
/**
* 如果开启堆栈信息,能让告警更清晰,但同样的也就降低了性能
*
* @param msg
* @param t
* @return java.lang.String
* @author
* @date
*/
private String getNotifyMsg(String msg, Throwable t) {
String errorMsg = "信息:" + (msg == null ? "" : msg);
String exceptionMsg = "";
if (t != null) {
exceptionMsg += "\n异常:" + t.toString();
if (noPrintExceptionStack) {
return errorMsg + exceptionMsg;
}
StackTraceElement[] stackTraceElements = t.getStackTrace();
if (stackTraceElements == null || stackTraceElements.length == 0) {
return errorMsg + exceptionMsg;
}
int length = Math.min(stackTraceElements.length, printExceptionStackLength);
for (int i = 0; i < length; i++) {
exceptionMsg += "\n" + stackTraceElements[i];
}
}
String allMsg = errorMsg + exceptionMsg;
return allMsg.length() > maxMsgLength ? allMsg.substring(0, maxMsgLength) : allMsg;
}
/**
* @return java.lang.String
* @author
* @date
*/
private String getNotifyTitle() {
String actualActiveProfiles = getActiveProfiles();
StringBuilder ret = new StringBuilder("项目异常告警 ");
if (StringUtils.isNotBlank(actualActiveProfiles)) {
ret.append("【").append(actualActiveProfiles).append("】");
}
ret.append("【").append(projectName).append("】");
ret.append("【").append(IPUtils.getLocalHostIP()).append("】");
ret.append("【").append(DateUtils.formatDateTime(new Date())).append("】");
return ret.toString();
}
/**
* @return java.lang.String
* @author
* @date
*/
private String getActiveProfiles() {
String[] activeProfiles = SpringContextUtil.getApplicationContext() == null ? null : SpringContextUtil.getActiveProfile();
if (activeProfiles == null || activeProfiles.length == 0) {
return "";
}
List<String> actualActiveProfiles = Arrays.stream(activeProfiles)
// 部分项目的profile中有用到include,include中含有"-",过滤掉include的数据
.filter(str -> !str.contains("-"))
.collect(Collectors.toList());
return StringUtils.join(actualActiveProfiles, ",");
}
/**
* @return com.tscm.purchase.interfacer.log4j2.ErrorNotifyFilter
* @author
* @date 2020-05-14 18:37
*/
@PluginFactory
public static ErrorNotifyLog4j2Filter createFilter(@PluginAttribute("projectName") final String projectName, @PluginAttribute("rtxReceivers") final String rtxReceivers, @PluginAttribute("noPrintExceptionStack") final Boolean noPrintExceptionStack, @PluginAttribute("printExceptionStackLength") final Integer printExceptionStackLength, @PluginAttribute("maxMsgLength") final Integer maxMsgLength) {
return new ErrorNotifyLog4j2Filter(projectName, rtxReceivers,
noPrintExceptionStack == null ? false : noPrintExceptionStack.booleanValue(),
printExceptionStackLength == null ? DEFAULT_PRINT_EXCEPTIONSTACK_LENGTH : printExceptionStackLength,
maxMsgLength == null ? DEFAULT_MAX_MSG_LENGTH : maxMsgLength);
}
}
@Component
public class Log4j2AsyncExecutor {
// 根据自己项目的情况配置线程池信息
public static ExecutorService executorService = new ThreadPoolExecutor(1,
1,
0L,
TimeUnit.MILLISECONDS,
new LinkedBlockingQueue<>(),
new ThreadFactoryBuilder().setNameFormat("log4j2-async-executor-pool-%d").build());
/**
* @author
* @date
*/
@PreDestroy
public synchronized void shutdown() {
if (executorService != null) {
ThreadUtils.shutdown(executorService, 5, TimeUnit.SECONDS);
executorService = null;
}
}
}
这里要特别说明:为什么要有Log4j2AsyncExecutor
这个类?
因为发送告警是异步的,用了线程池,那在进程停止时,肯定要销毁线程池;而AbstractFilter
虽然实现了LifeCycle
接口,有stop
方法,但是实际上stop
方法并不会被调用到;所以这里依赖了Spring的PreDestroy
来销毁。
配置
告警肯定是要取填充参数之后的内容
,所以Filter
放在Logger
或Appender
里会更合适;但代码里同时也兼容放在全局
配置里的过滤器。
<Configuration>
......
<Appenders>
<RollingFile name="ErrorRollingFile">
<Filters>
<ThresholdFilter level="ERROR" onMatch="NEUTRAL" />
<ErrorNotifyFilter projectName="xxxx" rtxReceivers="xxxx" />
</Filters>
......
</RollingFile>
</Appenders>
<Loggers>
<Root level="INFO">
......
<AppenderRef ref="ErrorRollingFile" />
</Root>
</Loggers>
</Configuration>
优化
以上功能虽然实现了基本的日志告警,但是里有两个地方有问题:
- 虽然
filter
在异常部分都catch
了,但是通知类MessageUtil.postMessage
是平台封装好的调用企业微信的功能,里面异常地方用了log.error
。这里假设通知失败,比如服务器网络异常,导致postMessage
方法打印了ERROR
日志,再次进入告警模块,然后又继续失败,继续告警,导致无限循环。 (假如通知模块没有调用log.error
,那这里就不会出问题)。 如果系统在短时间内出现大量异常,那就会发送大量的通知。应该对通知做频次过滤,这里过滤分两种:
- N秒内只能通知M次,这个控制比较简单,这里如果前M次都是相同的异常A,M+1是异常B,会导致B异常不会被通知。
- N秒内,相同的内容只能通知M次,这个控制虽然相对比较复杂,但是可以避免上面的那个问题。
- 有同事在
for循环
里循环遍历操作业务,如果失败就打异常日志,由于每次循环的参数不一定,所以异常日志也不一样,导致在一次几万条
数据的循环里,打了几万次日志,所以要控制单位时间内告警的总次数。
根据内容做频次过滤,同时还能解决第一个问题,因为无限循环告警时,每次告警的内容是一样的,所以第二次告警是就会被过阻断调,频次过滤。
这里频次过滤没必要做的多复杂,N秒内一次即可,这时候可以用基于时间过期的缓存来快速实现。
代码
/**
* @author
* @data 2020-05-14 18:36
* @description
*/
@Plugin(name = "ErrorNotifyFilter", category = Node.CATEGORY, elementType = Filter.ELEMENT_TYPE, printObject = true)
public class ErrorNotifyLog4j2Filter extends AbstractFilter {
// 默认打印异常堆栈长度
public static final int DEFAULT_PRINT_EXCEPTIONSTACK_LENGTH = 10;
// 默认消息最长长度
// 2020-07-09 测试企业微信允许最大长度为4000
public static final int DEFAULT_MAX_MSG_LENGTH = 4000;
private String projectName;
private List<String> rtxReceivers;
private boolean noPrintExceptionStack;
private int printExceptionStackLength;
private int maxMsgLength;
private List<Limiter> limiters;
private ErrorNotifyLog4j2Filter(String projectName, String rtxReceivers, int contentLimitSecond, int cacheMaxSize,
boolean noPrintExceptionStack, int printExceptionStackLength, int maxMsgLength, int timesLimit) {
super();
this.projectName = projectName;
this.rtxReceivers = Lists.newArrayList(rtxReceivers.split(","));
this.noPrintExceptionStack = noPrintExceptionStack;
this.printExceptionStackLength =
printExceptionStackLength <= 0 ? DEFAULT_PRINT_EXCEPTIONSTACK_LENGTH : printExceptionStackLength;
this.maxMsgLength =
(maxMsgLength <= 0 || maxMsgLength > DEFAULT_MAX_MSG_LENGTH) ? DEFAULT_MAX_MSG_LENGTH : maxMsgLength;
limiters = Lists.newArrayList();
limiters.add(new ContentLimiter(contentLimitSecond, cacheMaxSize));
limiters.add(new TimesLimiter(timesLimit));
}
@Override
public Result filter(LogEvent event) {
notify(event.getLevel(), event.getMessage().getFormattedMessage(), event.getThrown());
return super.filter(event);
}
@Override
public Result filter(Logger logger, Level level, Marker marker, Message msg, Throwable t) {
notify(level, msg.getFormattedMessage(), t);
return super.filter(logger, level, marker, msg, t);
}
@Override
public Result filter(Logger logger, Level level, Marker marker, Object msg, Throwable t) {
notify(level, msg == null ? "" : msg.toString(), t);
return super.filter(logger, level, marker, msg, t);
}
@Override
public Result filter(Logger logger, Level level, Marker marker, String msg, Object... params) {
notify(level, msg, getExceptionParam(params));
return super.filter(logger, level, marker, msg, params);
}
/**
* @param level
* @param msg
* @param t
* @author
* @date 2020-05-14 18:44
*/
private void notify(Level level, String msg, Throwable t) {
try {
if (level == null || level.intLevel() != Level.ERROR.intLevel()) {
return;
}
if (StringUtils.isBlank(msg) && t == null) {
return;
}
Log4j2AsyncExecutor.executorService.submit(() -> {
try {
String notifyMsg = getNotifyMsg(msg, t);
if (isLimited(notifyMsg)) {
// 以下两种场景会被限制
// 1. MessageUtil.postMessag方法有问题(比如网络出问题导致调用企业微信失败),里面调用了log.error,会导致无限信号。
// 2. 系统在短时间之内大量异常。
// 打印到console。
System.out.println("notifyMsg[" + notifyMsg + "] is limited");
return;
}
MessageUtil.postMessage(Lists.newArrayList(MessageTypeEnum.RTX),
rtxReceivers,
getNotifyTitle(),
notifyMsg);
} catch (Exception ignoreException) {
ignoreException.printStackTrace();
}
});
} catch (Throwable ignoreException) {
ignoreException.printStackTrace();
}
}
/**
* @param params
* @return java.lang.Throwable
* @author
* @date 2020-05-14 19:25
*/
private Throwable getExceptionParam(Object... params) {
if (params == null || params.length == 0) {
return null;
}
for (Object param : params) {
if (param instanceof Throwable) {
return (Throwable) param;
}
}
return null;
}
/**
* 如果开启堆栈信息,能让告警更清晰,但同样的也就降低了性能
*
* @param msg
* @param t
* @return java.lang.String
* @author
* @date 2020-05-25 12:53
*/
private String getNotifyMsg(String msg, Throwable t) {
String errorMsg = "信息:" + (msg == null ? "" : msg);
String exceptionMsg = "";
if (t != null) {
exceptionMsg += "\n异常:" + t.toString();
if (noPrintExceptionStack) {
return errorMsg + exceptionMsg;
}
StackTraceElement[] stackTraceElements = t.getStackTrace();
if (stackTraceElements == null || stackTraceElements.length == 0) {
return errorMsg + exceptionMsg;
}
int length = Math.min(stackTraceElements.length, printExceptionStackLength);
for (int i = 0; i < length; i++) {
exceptionMsg += "\n" + stackTraceElements[i];
}
}
String allMsg = errorMsg + exceptionMsg;
return allMsg.length() > maxMsgLength ? allMsg.substring(0, maxMsgLength) : allMsg;
}
/**
* @return java.lang.String
* @author
* @date 2020-06-28 10:51
*/
private String getNotifyTitle() {
String actualActiveProfiles = getActiveProfiles();
StringBuilder ret = new StringBuilder("项目异常告警 ");
if (StringUtils.isNotBlank(actualActiveProfiles)) {
ret.append("【").append(actualActiveProfiles).append("】");
}
ret.append("【").append(projectName).append("】");
ret.append("【").append(IPUtils.getLocalHostIP()).append("】");
ret.append("【").append(DateUtils.formatDateTime(new Date())).append("】");
return ret.toString();
}
/**
* 是否被限制
*
* @param notifyMsg
* @return boolean
* @author
* @date 2020-06-02 15:26
*/
private boolean isLimited(String notifyMsg) throws Exception {
for (Limiter limiter : limiters) {
if (limiter.isLimited(notifyMsg)) {
return true;
}
}
return false;
}
/**
* @return java.lang.String
* @author
* @date 2020-05-25 12:57
*/
private String getActiveProfiles() {
String[] activeProfiles =
SpringContextUtil.getApplicationContext() == null ? null : SpringContextUtil.getActiveProfile();
if (activeProfiles == null || activeProfiles.length == 0) {
return "";
}
List<String> actualActiveProfiles = Arrays.stream(activeProfiles)
// 部分项目的profile中有用到include,include中含有"-",过滤掉include的数据
.filter(str -> !str.contains("-"))
.collect(Collectors.toList());
return StringUtils.join(actualActiveProfiles, ",");
}
/**
* @param projectName 项目名,必填
* @param rtxReceivers 告警人,多个之间以","分隔,必填
* @param contentLimitSecond 如果瞬间有大量"相同内容"的异常,在limitSecond秒内只会告警一次
* @param cacheMaxSize 配合limitSecond,缓存当前已告警的数量,指定最多存储多少条,防止无限制存储数据导致内存溢出
* @param noPrintExceptionStack 异常信息内是否不打印堆栈信息
* @param printExceptionStackLength 如果打印堆栈信息,打印多少行,取Math.min(printExceptionStackLength, 堆栈长度)
* @param maxMsgLength 消息内容长度限制,默认4000,当前测试"企业微信接口"超过4000就发送失败
* @param timesLimit 限定时间内(目前暂时是自然小时时间,后续优化为可以自定义滑动窗口时间段)允许告警次数
* @return com.xxx.tscm.common.log4j2.ErrorNotifyLog4j2Filter
* @author
* @date 2020-05-14 18:37
*/
@PluginFactory
public static ErrorNotifyLog4j2Filter createFilter(@PluginAttribute("projectName") final String projectName,
@PluginAttribute("rtxReceivers") final String rtxReceivers,
@PluginAttribute("contentLimitSecond") final Integer contentLimitSecond,
@PluginAttribute("cacheMaxSize") final Integer cacheMaxSize,
@PluginAttribute("noPrintExceptionStack") final Boolean noPrintExceptionStack,
@PluginAttribute("printExceptionStackLength") final Integer printExceptionStackLength,
@PluginAttribute("maxMsgLength") final Integer maxMsgLength,
@PluginAttribute("timesLimit") final Integer timesLimit) {
return new ErrorNotifyLog4j2Filter(projectName, rtxReceivers,
contentLimitSecond == null || contentLimitSecond <= 0 ? ContentLimiter.DEFAULT_CONTENT_LIMIT_SECOND
: contentLimitSecond,
cacheMaxSize == null || cacheMaxSize <= 0 ? ContentLimiter.DEFAULT_CACHE_MAX_SIZE : cacheMaxSize,
noPrintExceptionStack == null ? false : noPrintExceptionStack.booleanValue(),
printExceptionStackLength == null ? DEFAULT_PRINT_EXCEPTIONSTACK_LENGTH : printExceptionStackLength,
maxMsgLength == null ? DEFAULT_MAX_MSG_LENGTH : maxMsgLength,
timesLimit <= 0 ? TimesLimiter.DEFAULT_TIMES_LIMIT : timesLimit);
}
}
/**
* @author
* @date 2020-09-24 11:48
* @description
*/
public interface Limiter {
/**
* @param content
* @return boolean
* @author
* @date 2020-09-24 11:49
*/
boolean isLimited(String content) throws Exception;
}
/**
* 按内容频次过滤
*
* @author
* @data 2020-06-02 10:51
* @description
*/
public class ContentLimiter extends AbstractLocalCache<String, String> implements Limiter {
public static final String DEFAILT_VALUE = "DEFAILT_VALUE";
public static final String NULL_VALUE = "NULL_VALUE";
public static final int DEFAULT_CONTENT_LIMIT_SECOND = 60;
public static final int DEFAULT_CACHE_MAX_SIZE = 1000;
private Object lock = new Object();
public ContentLimiter(long expireAfterWriteDuration, int maximumSize) {
super(CacheBuilder.newBuilder().maximumSize(maximumSize)
.refreshAfterWrite(expireAfterWriteDuration, TimeUnit.SECONDS));
}
@Override
protected String loadData(String key) {
return NULL_VALUE;
}
/**
* 是否被限制
*
* @param notifyMsg
* @return boolean
* @author
* @date 2020-06-02 15:26
*/
@Override
public boolean isLimited(String notifyMsg) throws Exception {
// 根据内容限制
String encodeMsg = DigestUtils.md5Hex(notifyMsg);
String value = getCache(encodeMsg);
if (value == ContentLimiter.DEFAILT_VALUE) {
return true;
}
// 只有新的内容,才会进入锁
// 如果真的短时间出现大量告警,基本上都是相同内容(也可能是相同类别但参数不一样导致内容不一样)的告警
// 因为用了线程池,线程数并不多(目前是单线程)
// 基于以上原因,锁对性能的影响基本可以忽略不计
synchronized (lock) {
value = getCache(encodeMsg);
if (value == ContentLimiter.DEFAILT_VALUE) {
return true;
}
put(encodeMsg, ContentLimiter.DEFAILT_VALUE);
return false;
}
}
}
/**
* 按内容频次过滤
*
* @author
* @data 2020-06-02 10:51
* @description
*/
public class TimesLimiter implements Limiter {
public static final int DEFAULT_TIMES_LIMIT = 100;
private Object lock = new Object();
private int timesLimit;
private String date;
private AtomicInteger counter;
public TimesLimiter(int timesLimit) {
this.timesLimit = timesLimit;
counter = new AtomicInteger();
}
@Override
public boolean isLimited(String content) throws Exception {
String now = DateUtils.format(new Date(), "yyyyMMddHH");
if (!now.equals(date)) {
synchronized (lock) {
if (now.equals(date)) {
return counter.incrementAndGet() > timesLimit;
}
date = now;
counter.set(1);
return false;
}
} else {
return counter.incrementAndGet() > timesLimit;
}
}
}
/**
* @author
* @data 2020-05-14 18:36
* @description
*/
@Component
public class Log4j2AsyncExecutor {
public static final int EXECUTOR_QUEUE_SIZE = 10000;
public static ExecutorService executorService = new ThreadPoolExecutor(1,
1,
0L,
TimeUnit.MILLISECONDS,
new LinkedBlockingQueue<>(EXECUTOR_QUEUE_SIZE),
new ThreadFactoryBuilder().setNameFormat("log4j2-async-executor-pool-%d").build());
/**
* @author
* @date 2020-05-20 15:12
*/
@PreDestroy
public synchronized void shutdown() {
if (executorService != null) {
ThreadUtils.shutdown(executorService, 5, TimeUnit.SECONDS);
executorService = null;
}
}
}
避坑
NEUTRAL
和ACCEPT
及NEUTRAL
的区别是:
- onMatch="ACCEPT" 表示匹配该级别及以上,且不走后续的Filter。
- onMatch="DENY" 表示不匹配该级别及以上,且不走后续的Filter。
- onMatch="NEUTRAL" 表示该级别及以上的,由下一个filter处理,如果当前是最后一个,则表示匹配该级别及以上。
- onMismatch="ACCEPT" 表示匹配该级别以下,且不走后续的Filter。
- onMismatch="NEUTRAL" 表示该级别及以下的,由下一个filter处理,如果当前是最后一个,则不匹配该级别以下的。
- onMismatch="DENY" 表示不匹配该级别以下的,且不走后续的Filter。
onMatch
默认值为NEUTRAL
,onMismatch
默认是DENY
。
备注
本篇文章的做法,是告警跟项目在同一进程内,这样有不少缺点,比如:
- 如果告警功能有问题,比如造成内存泄漏,可能会影响到项目正常运行。
- 升级不方便:功能改进/BUG修复时,业务方需要升级的jar包,这样会给业务方造成困扰;而且无法统一各个业务方的版本,后续版本升级时,还需要考虑各个版本的兼容情况。
综上,本篇文章的做法,比较适合小系统。
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。