Environment
- Java environment
- Hadoop environment
- ZooKeeper and Kafka both installed and running
Component selection
source
Flume 1.7 introduced the TAILDIR source, which tails files for changes in near real time and can resume from where it left off after a restart. It is the mainstream source type.
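The resume capability comes from the position file configured below (taildir_position.json): a JSON array recording, for each tailed file, the inode and the byte offset already consumed. A sample entry (values illustrative):
[{"inode":2496272,"pos":12345,"file":"/usr/local/app/log/print-log-info.log"}]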
channel
The Kafka channel writes events from the source straight into Kafka. Data is persisted to Kafka's on-disk log, which improves reliability, while the transfer path stays efficient. It is the mainstream channel type.
sink
Pick the sink that matches where the data should land; here I use the HDFS sink.
Architecture diagram
Steps
- Remove the .template suffix from /usr/local/apache-flume-1.10.0-bin/conf/flume-env.sh.template
- Configure JAVA_HOME in the resulting flume-env.sh:
export JAVA_HOME=/usr/local/jdk-11.0.15
- Write the interceptors you need (a plain Java project is enough; see the Interceptors section below)
- Write the configuration files (pasted below)
- Start the agent (a quick way to verify delivery follows the command):
nohup ./flume-ng agent --conf-file ../conf/file-flume-kafka.conf --name a1 -Dflume.root.logger=INFO,LOGFILE >/usr/local/apache-flume-1.10.0-bin/log1.txt 2>&1 &
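To confirm events are actually reaching Kafka, you can attach a console consumer to the topic (assuming Kafka's bundled CLI scripts; adjust the path to your installation):
bin/kafka-console-consumer.sh --bootstrap-server hadoop-4:9092 --topic topic_log --from-beginning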
Configuration files
file-flume-kafka.conf (producer-side Flume configuration)
# Define components
a1.sources=r1
a1.channels=c1
# Configure source (TAILDIR)
a1.sources.r1.type=TAILDIR
a1.sources.r1.filegroups=f1
a1.sources.r1.filegroups.f1=/usr/local/app/log/print-log-info.log
a1.sources.r1.positionFile=/usr/local/apache-flume-1.10.0-bin/taildir_position.json
# Configure interceptor (ETL cleaning: drop events whose body is not complete JSON)
a1.sources.r1.interceptors=i1
a1.sources.r1.interceptors.i1.type=com.gms.ETLInterceptor$Builder
# Configure channel (Kafka channel)
a1.channels.c1.type=org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers=hadoop-4:9092,hadoop-5:9092,hadoop-6:9092
a1.channels.c1.kafka.topic=topic_log
# A Flume event has two parts: headers Flume adds and the actual data in the body.
# This setting strips the headers so only the body is written to Kafka
a1.channels.c1.parseAsFlumeEvent=false
# Sink: none (the Kafka channel persists events to Kafka directly, so this agent needs no sink)
# Wire the components together
a1.sources.r1.channels=c1
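You may want to create topic_log up front instead of relying on broker auto-creation; a sketch using Kafka's CLI (partition and replication counts here are illustrative):
bin/kafka-topics.sh --create --bootstrap-server hadoop-4:9092 --topic topic_log --partitions 3 --replication-factor 2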
kafka-flume-hdfs.conf (consumer-side Flume configuration)
# Define components
a1.channels=c1
a1.sinks=k1
# Source: none (the Kafka channel consumes from Kafka directly)
# Configure channel (Kafka channel)
a1.channels.c1.type=org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers=hadoop-4:9092,hadoop-5:9092
a1.channels.c1.kafka.topic=topic_log
a1.channels.c1.kafka.consumer.group.id=flume-consumer
# The messages in Kafka hold only the raw body (no Flume headers),
# so tell the channel not to parse them as Flume events
a1.channels.c1.parseAsFlumeEvent=false
# Configure sink (HDFS sink)
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /origin_data/gms/log/topic_log/%Y-%m-%d
a1.sinks.k1.hdfs.useLocalTimeStamp = true
a1.sinks.k1.hdfs.filePrefix = log-
a1.sinks.k1.hdfs.round = false
# File rolling policy to control small-file generation: roll every 10 s or at 128 MB, never by event count
a1.sinks.k1.hdfs.rollInterval = 10
a1.sinks.k1.hdfs.rollSize = 134217728
a1.sinks.k1.hdfs.rollCount = 0
# Write compressed output files (requires the LZO/lzop codec to be available on the cluster)
a1.sinks.k1.hdfs.fileType = CompressedStream
a1.sinks.k1.hdfs.codeC = lzop
# Wire the components together (this agent has no source, so only the sink binding is needed)
a1.sinks.k1.channel=c1
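The consumer agent is started the same way as the producer, just pointing at this file (the log file name here is illustrative):
nohup ./flume-ng agent --conf-file ../conf/kafka-flume-hdfs.conf --name a1 -Dflume.root.logger=INFO,LOGFILE >/usr/local/apache-flume-1.10.0-bin/log2.txt 2>&1 &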
Interceptors
Create a plain Java Maven project
Add the dependencies
<dependency>
    <groupId>org.apache.flume</groupId>
    <artifactId>flume-ng-core</artifactId>
    <version>1.10.0</version>
    <scope>provided</scope>
</dependency>
<!-- needed by the ETL interceptor below for JSONUtil; version is illustrative -->
<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-all</artifactId>
    <version>5.8.10</version>
</dependency>
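The finished jar must end up on the agent's classpath, typically by copying it into Flume's lib directory; since Flume does not ship Hutool, bundle it into the jar. One way is the Maven shade plugin; a minimal sketch:
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>3.4.1</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>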
Example interceptor that validates the log format
package com.gms;

import cn.hutool.json.JSONUtil;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.List;

public class ETLInterceptor implements Interceptor {

    @Override
    public void initialize() {
    }

    @Override
    public Event intercept(Event event) {
        String log = new String(event.getBody(), StandardCharsets.UTF_8);
        // Keep the event only if the body is well-formed JSON; returning null drops it
        if (JSONUtil.isTypeJSON(log)) {
            return event;
        }
        return null;
    }

    @Override
    public List<Event> intercept(List<Event> events) {
        // Remove every event that fails the single-event check
        Iterator<Event> iterator = events.iterator();
        while (iterator.hasNext()) {
            if (intercept(iterator.next()) == null) {
                iterator.remove();
            }
        }
        return events;
    }

    @Override
    public void close() {
    }

    // Flume instantiates interceptors through this Builder, which is why the
    // config above references com.gms.ETLInterceptor$Builder rather than the class itself
    public static class Builder implements Interceptor.Builder {
        @Override
        public Interceptor build() {
            return new ETLInterceptor();
        }

        @Override
        public void configure(Context context) {
        }
    }
}
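A quick local sanity check for the interceptor (not part of the deployed jar; EventBuilder comes with flume-ng-core, and the sample bodies are made up):
package com.gms;

import org.apache.flume.Event;
import org.apache.flume.event.EventBuilder;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;

public class ETLInterceptorDemo {
    public static void main(String[] args) {
        Interceptor interceptor = new ETLInterceptor.Builder().build();
        Event good = EventBuilder.withBody("{\"ts\":\"1650000000000\"}", StandardCharsets.UTF_8);
        Event bad = EventBuilder.withBody("half a json {", StandardCharsets.UTF_8);
        System.out.println(interceptor.intercept(good) != null); // true: valid JSON is kept
        System.out.println(interceptor.intercept(bad) == null);  // true: malformed body is dropped
    }
}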
Timestamp interceptor that fixes zero-point drift (without it, events logged just before midnight get bucketed into the next day's %Y-%m-%d directory, because the sink would use the write time instead of the log's own timestamp)
package com.gms;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonObject;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;

public class TimestampInterceptor implements Interceptor {

    private static final Gson GSON = new GsonBuilder().create();

    @Override
    public void initialize() {
    }

    @Override
    public Event intercept(Event event) {
        // Copy the log's own generation time (the "ts" field in the body) into
        // the "timestamp" header, which the HDFS sink uses for %Y-%m-%d path
        // escaping when useLocalTimeStamp is false
        Map<String, String> headers = event.getHeaders();
        String body = new String(event.getBody(), StandardCharsets.UTF_8);
        JsonObject jsonObject = GSON.fromJson(body, JsonObject.class);
        String ts = jsonObject.get("ts").getAsString();
        headers.put("timestamp", ts);
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> events) {
        for (Event event : events) {
            intercept(event);
        }
        return events;
    }

    @Override
    public void close() {
    }

    public static class Builder implements Interceptor.Builder {
        @Override
        public Interceptor build() {
            return new TimestampInterceptor();
        }

        @Override
        public void configure(Context context) {
        }
    }
}
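Note that as the consumer config above stands, there is nowhere to attach this interceptor (that agent has no source), and hdfs.useLocalTimeStamp=true makes the sink ignore the timestamp header anyway. A sketch of one way to wire it in, swapping the Kafka channel's direct consumption for a Kafka source (a file or memory channel, not shown, would then sit between source and sink):
# Sketch only: attach the timestamp interceptor to a Kafka source on the consumer agent
a1.sources=r1
a1.sources.r1.type=org.apache.flume.source.kafka.KafkaSource
a1.sources.r1.kafka.bootstrap.servers=hadoop-4:9092,hadoop-5:9092
a1.sources.r1.kafka.topics=topic_log
a1.sources.r1.interceptors=i1
a1.sources.r1.interceptors.i1.type=com.gms.TimestampInterceptor$Builder
# ...and let the sink use the header timestamp instead of local time
a1.sinks.k1.hdfs.useLocalTimeStamp=false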