Environment

Java environment
Hadoop environment
ZooKeeper and Kafka both installed and running

Component Selection

source

The Taildir source, added in Flume 1.7, monitors files for new content in real time and records how far it has read, so it can resume from that position after a restart. It is the mainstream source type.
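The resume capability comes from the position file: a JSON record of the read offset for each tailed file. An illustrative entry (the inode and offset values here are made up) looks like:

[{"inode":394821,"pos":10230,"file":"/usr/local/app/log/print-log-info.log"}]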

channel

The Kafka channel writes events from the source directly into Kafka. The data is persisted to Kafka's disk-backed log, which improves reliability, and transport is efficient because the channel itself hands data to Kafka with no separate sink stage. It is the mainstream channel type.

sink

Choose the sink according to where the data should land; here I use the HDFS sink.

Architecture

Log file → producer Flume (Taildir source → Kafka channel) → Kafka → consumer Flume (Kafka channel → HDFS sink) → HDFS

Steps

  • Rename /usr/local/apache-flume-1.10.0-bin/conf/flume-env.sh.template to drop the .template suffix
  • Configure the Java home in it: export JAVA_HOME=/usr/local/jdk-11.0.15
  • Write the interceptors you need (an ordinary Java project is enough), package them as a jar, and copy the jar into /usr/local/apache-flume-1.10.0-bin/lib so Flume can load the classes
  • Write the configuration files (shown below)
  • Start command: nohup ./flume-ng agent --conf-file ../conf/file-flume-kafka.conf --name a1 -Dflume.root.logger=INFO,LOGFILE >/usr/local/apache-flume-1.10.0-bin/log1.txt 2>&1 & (start the consumer agent the same way, pointing --conf-file at kafka-flume-hdfs.conf)

Configuration Files

file-flume-kafka.conf (producer Flume configuration)

#Define components
a1.sources=r1
a1.channels=c1

#Configure source (Taildir source)
a1.sources.r1.type=TAILDIR
a1.sources.r1.filegroups=f1
a1.sources.r1.filegroups.f1=/usr/local/app/log/print-log-info.log
a1.sources.r1.positionFile=/usr/local/apache-flume-1.10.0-bin/taildir_position.json
#Configure interceptor (ETL cleaning: drop events whose body is not complete JSON)
a1.sources.r1.interceptors=i1
a1.sources.r1.interceptors.i1.type=com.gms.ETLInterceptor$Builder

#配置channel
a1.channels.c1.type=org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers=hadoop-4:9092,hadoop-5:9092,hadoop-6:9092
a1.channels.c1.kafka.topic=topic_log
#A Flume event has two parts: headers added by Flume and the actual data (body). This setting drops the Flume wrapper and writes only the body to Kafka
a1.channels.c1.parseAsFlumeEvent=false

#Configure sink (none; the Kafka channel is the destination)

#Wire the components together
a1.sources.r1.channels=c1
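Taildir can also watch several file groups at once: filegroups takes a space-separated list, and the filename portion of each path may be a regular expression. A minimal sketch (the f2 path is illustrative, not part of this setup):

a1.sources.r1.filegroups=f1 f2
a1.sources.r1.filegroups.f1=/usr/local/app/log/print-log-info.log
a1.sources.r1.filegroups.f2=/usr/local/app/log/app-.*\.log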

kafka-flume-hdfs.conf (consumer Flume configuration)

#Define components
a1.channels=c1
a1.sinks=k1

#Configure source (none; the Kafka channel feeds the sink directly)

#Configure channel
a1.channels.c1.type=org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers=hadoop-4:9092,hadoop-5:9092
a1.channels.c1.kafka.topic=topic_log
a1.channels.c1.kafka.consumer.group.id=flume-consumer
#The records in Kafka are plain bodies (written above with parseAsFlumeEvent=false), so read them as raw data rather than as serialized Flume events
a1.channels.c1.parseAsFlumeEvent=false

#Configure sink (HDFS sink)
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /origin_data/gms/log/topic_log/%Y-%m-%d
a1.sinks.k1.hdfs.useLocalTimeStamp = true
a1.sinks.k1.hdfs.filePrefix = log-
a1.sinks.k1.hdfs.round = false

#Roll policy to keep small files under control: roll every 10 s or when a file reaches 128 MB (134217728 bytes, about one HDFS block), never by event count
a1.sinks.k1.hdfs.rollInterval = 10
a1.sinks.k1.hdfs.rollSize = 134217728
a1.sinks.k1.hdfs.rollCount = 0

## Write output files as LZOP-compressed streams (requires the Hadoop LZO libraries)
a1.sinks.k1.hdfs.fileType = CompressedStream
a1.sinks.k1.hdfs.codeC = lzop

#Wire the components together (this agent has no source, so only the sink is bound)
a1.sinks.k1.channel=c1

Interceptors

Create an ordinary Java Maven project and add the Flume dependency:

<dependency>
    <groupId>org.apache.flume</groupId>
    <artifactId>flume-ng-core</artifactId>
    <version>1.10.0</version>
    <scope>provided</scope>
</dependency>
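The interceptors below also use Hutool's JSONUtil and Gson, so those need to be on the classpath as well (the coordinates are the standard ones, but the versions here are illustrative):

<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-json</artifactId>
    <version>5.8.16</version>
</dependency>
<dependency>
    <groupId>com.google.code.gson</groupId>
    <artifactId>gson</artifactId>
    <version>2.9.0</version>
</dependency>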

Example interceptor that validates the log format

package com.gms;

import cn.hutool.json.JSONUtil;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.List;

public class ETLInterceptor implements Interceptor {
    @Override
    public void initialize() {

    }

    @Override
    public Event intercept(Event event) {
        byte[] body = event.getBody();
        String log = new String(body, StandardCharsets.UTF_8);
        //Keep the event only if the body is well-formed JSON
        if (JSONUtil.isTypeJSON(log)) {
            return event;
        }
        //Returning null drops the malformed event
        return null;
    }

    @Override
    public List<Event> intercept(List<Event> events) {
        //Delegate to the single-event intercept and remove any event it drops
        Iterator<Event> iterator = events.iterator();
        while (iterator.hasNext()) {
            Event next = iterator.next();
            if (intercept(next) == null) {
                iterator.remove();
            }
        }
        return events;
    }

    @Override
    public void close() {

    }

    public static class Builder implements Interceptor.Builder {

        @Override
        public Interceptor build() {
            return new ETLInterceptor();
        }

        @Override
        public void configure(Context context) {

        }
    }
}
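A quick local sanity check of the filter logic (a minimal sketch; the test class and the sample bodies are illustrative, built with Flume's EventBuilder):

package com.gms;

import org.apache.flume.Event;
import org.apache.flume.event.EventBuilder;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

public class ETLInterceptorTest {
    public static void main(String[] args) {
        ETLInterceptor interceptor = new ETLInterceptor();
        List<Event> events = new ArrayList<>();
        //Well-formed JSON: should pass through
        events.add(EventBuilder.withBody("{\"ts\":\"1659974399000\",\"page\":\"home\"}", StandardCharsets.UTF_8));
        //Truncated JSON: should be dropped
        events.add(EventBuilder.withBody("{\"ts\":\"16599", StandardCharsets.UTF_8));
        List<Event> kept = interceptor.intercept(events);
        System.out.println(kept.size()); //expect 1
    }
}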

Timestamp interceptor to fix zero-point drift: an event generated just before midnight can arrive just after midnight, and a sink that buckets by arrival time would write it into the wrong day's directory. This interceptor copies the log's own ts field from the body into the timestamp header, which the HDFS sink uses for %Y-%m-%d path escaping when hdfs.useLocalTimeStamp is false. Note that it is not wired into the configs above; with parseAsFlumeEvent=false the producer's headers are not stored in Kafka, so it would have to run on the consumer side to take effect.

package com.gms;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonObject;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;

public class TimestampInterceptor implements Interceptor {
    @Override
    public void initialize() {

    }

    public static Gson gson = new GsonBuilder().create();

    @Override
    public Event intercept(Event event) {
        //Copy the log's generation time from the body into the timestamp header
        //1. Get the headers
        Map<String, String> headers = event.getHeaders();
        //2. Get the ts field from the body
        byte[] body = event.getBody();
        String s = new String(body, StandardCharsets.UTF_8);

        JsonObject jsonObject = gson.fromJson(s, JsonObject.class);
        String ts = jsonObject.get("ts").getAsString();
        //3. Set ts as the timestamp header (epoch millis, read by the HDFS sink)
        headers.put("timestamp", ts);
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> events) {
        for (Event event : events) {
            intercept(event);
        }
        return events;
    }

    @Override
    public void close() {

    }

    public static class Builder implements Interceptor.Builder{

        @Override
        public Interceptor build() {
            return new TimestampInterceptor();
        }

        @Override
        public void configure(Context context) {

        }
    }

}
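To make the drift concrete, here is a small self-contained sketch (the class name and timestamps are illustrative, and the cluster time zone is assumed to be UTC+8):

package com.gms;

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.TimeZone;

public class ZeroPointDriftDemo {
    public static void main(String[] args) {
        SimpleDateFormat day = new SimpleDateFormat("yyyy-MM-dd");
        day.setTimeZone(TimeZone.getTimeZone("GMT+8")); //assumed cluster time zone
        //Event generated at 2022-08-08 23:59:59 (UTC+8) but seen by the
        //consumer Flume 6 seconds later, i.e. after midnight
        long eventTs = 1659974399000L;
        long arrivalTs = eventTs + 6000L;
        System.out.println("bucket by ts header  : " + day.format(new Date(eventTs)));   //2022-08-08
        System.out.println("bucket by local time : " + day.format(new Date(arrivalTs))); //2022-08-09
    }
}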
