
Requirements

The goal of this job is to read messages from a specified Kafka topic and write them into HBase.

Each message body contains project (the project the message belongs to), table (the target HBase table name), and data (a JSON string).
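
For example, a message body on the topic might look like this (the field values are made up for illustration):

{
  "project": "demo",
  "table": "events",
  "data": "{\"name\": \"foo\", \"age\": \"18\"}"
}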

The plan:

  1. Use the streaming engine and configure a Kafka source
  2. Filter out data that does not match the required format
  3. Adjust the trigger: use a time window, and also fire early when a given count is reached within the window
  4. Attach an HBase sink and write the data in batches

Environment Setup

docker-compose.yml

version: '3'
services:
  mysql:
    image: "docker.io/mysql:5.7"
    environment:
      MYSQL_ROOT_PASSWORD: "123456"
    ports:
      - "3306:3306"
  zookeeper:
    image: harbor.oneitfarm.com/cidata/zookeeper:3.4.14
    environment:
      ZOO_MY_ID: 1
      ZOO_SERVERS: server.1=0.0.0.0:2888:3888
      ENABLE_SASL: "true"
      SUPER_PASSWORD: admin
      USER_KAFKA_PASSWORD: 123456
    ports:
      - "2182:2181"
  kafka_broker:
    image: "harbor.oneitfarm.com/cidata/kafka:2.4.0"
    environment:
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      ZOOKEEPER_SASL_ENABLE: "true"
      ZOOKEEPER_SASL_USER_KAFKA_PASSWORD: 123456
      KAFKA_SASL_ENABLE: "true"
      KAFKA_ADMIN_PASSWORD: 123456
      KAFKA_BROKER_ID: 1
      KAFKA_HEAP_OPTS: "-Xmx512M -Xms256M"
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: INSIDE:SASL_PLAINTEXT,OUTSIDE:SASL_PLAINTEXT
      KAFKA_ADVERTISED_LISTENERS: INSIDE://:9094,OUTSIDE://${HOST_IP}:19092
      KAFKA_LISTENERS: INSIDE://:9094,OUTSIDE://:9092
      KAFKA_INTER_BROKER_LISTENER_NAME: INSIDE
    ports:
      - "19092:9092"

A .env file is required in the same directory

# Change this to the host machine's IP
HOST_IP=192.168.50.187
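
To push a test message into the topic, a minimal producer sketch is shown below (a hypothetical TestProducer class; it assumes the broker advertised at ${HOST_IP}:19092 and the admin / 123456 SASL account defined in the compose file — adjust to your environment). The kafka-clients dependency it needs is already in the pom below.

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.util.Properties;

public class TestProducer {
    public static void main(String[] args) {
        Properties props = new Properties();
        // Assumed broker address and credentials, taken from the compose file above
        props.put("bootstrap.servers", "192.168.50.187:19092");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("security.protocol", "SASL_PLAINTEXT");
        props.put("sasl.mechanism", "SCRAM-SHA-256");
        props.put("sasl.jaas.config",
                "org.apache.kafka.common.security.scram.ScramLoginModule required "
                        + "username=\"admin\" password=\"123456\";");

        KafkaProducer<String, String> producer = new KafkaProducer<>(props);
        // Message body in the project/table/data format expected by the job
        String value = "{\"project\":\"demo\",\"table\":\"events\",\"data\":\"{\\\"name\\\":\\\"foo\\\"}\"}";
        producer.send(new ProducerRecord<>("test", value));
        producer.flush();
        producer.close();
    }
}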

Code

pom.xml

    <properties>
        <mainClass>xxx.flinkjob.kafka.Application</mainClass>
        <flink-version>1.10.0</flink-version>
        <hbase-version>1.3.1</hbase-version>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <maven.compiler.compilerVersion>1.8</maven.compiler.compilerVersion>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.25</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
        </dependency>

        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>2.3.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink-version}</version>
<!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.11</artifactId>
            <version>${flink-version}</version>
<!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-jdbc_2.11</artifactId>
            <version>1.7.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_2.11</artifactId>
            <version>${flink-version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.62</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase-version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>${hbase-version}</version>
        </dependency>
        <dependency>
            <groupId>commons-cli</groupId>
            <artifactId>commons-cli</artifactId>
            <version>1.4</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>6.0.6</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>${mainClass}</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>

                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
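
Since the assembly plugin is bound to the package phase, a plain Maven build produces the fat jar (the exact file name depends on your artifactId and version):

mvn clean package
# output: target/<artifactId>-<version>-jar-with-dependencies.jar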

model

package xxx.flinkjob.kafka.model;

public class HttpDataModel {
    private String project;
    private String table;
    private String data;

    public HttpDataModel() {}

    public HttpDataModel(String project, String table, String data) {
        this.project = project;
        this.table   = table;
        this.data    = data;
    }

    public String getProject() {
        return project;
    }

    public String getTable() {
        return table;
    }

    public String getData() {
        return data;
    }

    public String getFullTable() {
        return project + ":" + table;
    }

    public void setProject(String project) {
        this.project = project;
    }

    public void setTable(String table) {
        this.table = table;
    }

    public void setData(String data) {
        this.data = data;
    }

    @Override
    public String toString() {
        return "HttpDataModel{" +
                "project='" + project + '\'' +
                ", table='" + table + '\'' +
                ", data='" + data + '\'' +
                '}';
    }
}

sink

package xxx.flinkjob.kafka.sink;

import com.alibaba.fastjson.JSONObject;
import xxx.flinkjob.kafka.model.HttpDataModel;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.log4j.Logger;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class HbaseSink extends RichSinkFunction<List<HttpDataModel>> implements Serializable {
    private Logger log;

    private String hbase_zookeeper_host;
    private String hbase_zookeeper_port;

    private Connection connection;
    private Admin admin;

    public HbaseSink(String hbase_zookeeper_host, String hbase_zookeeper_port) {
        this.hbase_zookeeper_host = hbase_zookeeper_host;
        this.hbase_zookeeper_port = hbase_zookeeper_port;
    }

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        log = Logger.getLogger(HbaseSink.class);

        org.apache.hadoop.conf.Configuration configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.property.clientPort", hbase_zookeeper_port);
        configuration.set("hbase.zookeeper.quorum", hbase_zookeeper_host);

        connection = ConnectionFactory.createConnection(configuration);
        admin = connection.getAdmin();
    }

    @Override
    public void invoke(List<HttpDataModel> datas, Context context) throws Exception {
        // Group records by project:table
        Map<String, List<HttpDataModel>> map = new HashMap<String, List<HttpDataModel>>();
        for (HttpDataModel data : datas) {
            if (! map.containsKey(data.getFullTable())) {
                map.put(data.getFullTable(), new ArrayList<HttpDataModel>());
            }
            map.get(data.getFullTable()).add(data);
        }
        // Iterate over the groups
        for (Map.Entry<String, List<HttpDataModel>> entry : map.entrySet()) {
            // Create the table if it does not exist yet
            createTable(entry.getKey());
            // Build the puts for this table
            List<Put> list = new ArrayList<Put>();
            for (HttpDataModel item : entry.getValue()) {
                // Rowkey is the current timestamp in milliseconds; records for the same
                // table arriving in the same millisecond will overwrite each other.
                Put put = new Put(Bytes.toBytes(String.valueOf(System.currentTimeMillis())));

                JSONObject object = JSONObject.parseObject(item.getData());
                for (String key : object.keySet()) {
                    put.addColumn("data".getBytes(), key.getBytes(), object.getString(key).getBytes());
                }
                list.add(put);
            }
            connection.getTable(TableName.valueOf(entry.getKey())).put(list);
        }
    }

    @Override
    public void close() throws Exception {
        // Release the HBase resources when the sink is disposed
        if (admin != null) { admin.close(); }
        if (connection != null) { connection.close(); }
        super.close();
    }

    /**
     * Create the HBase table if it does not already exist
     */
    private void createTable(String tableName) throws Exception {
        createNamespace(tableName.split(":")[0]);
        TableName table = TableName.valueOf(tableName);
        if (! admin.tableExists(table)) {
            HTableDescriptor hTableDescriptor = new HTableDescriptor(table);
            // A single fixed column family: data
            hTableDescriptor.addFamily(new HColumnDescriptor("data"));
            admin.createTable(hTableDescriptor);
        }
    }

    /**
     * Create the namespace if it does not already exist
     */
    private void createNamespace(String namespace) throws Exception {
        try {
            admin.getNamespaceDescriptor(namespace);
        } catch (NamespaceNotFoundException e) {
            admin.createNamespace(NamespaceDescriptor.create(namespace).build());
        }
    }
}
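
Because the rowkey above is just the current timestamp in milliseconds, rows written to the same table within the same millisecond collide. A minimal sketch of a more collision-resistant rowkey (a hypothetical helper, not part of the sink above) could append a short random suffix:

import org.apache.hadoop.hbase.util.Bytes;

import java.util.UUID;

public class RowKeys {
    // Hypothetical helper: timestamp plus a short random suffix to avoid
    // same-millisecond collisions between rows of the same table.
    public static byte[] next() {
        String key = System.currentTimeMillis() + "_" + UUID.randomUUID().toString().substring(0, 8);
        return Bytes.toBytes(key);
    }
}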

trigger

package xxx.flinkjob.kafka.trigger;

import org.apache.flink.streaming.api.windowing.triggers.Trigger;
import org.apache.flink.streaming.api.windowing.triggers.TriggerResult;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;

public class CountTrigger<T> extends Trigger<T, TimeWindow> {

    // Element count for the current window (kept in memory, not checkpointed)
    private int flag = 0;

    // Count threshold that triggers an early fire
    private int threshold = 0;

    public CountTrigger(Integer threshold) {
        this.threshold = threshold;
    }

    /**
     * Called for every element added to the window
     */
    public TriggerResult onElement(Object element, long timestamp, TimeWindow window, TriggerContext ctx) throws Exception {
        // Register a processing-time timer for the window end, so the window
        // also fires on time (the job uses processing-time windows)
        ctx.registerProcessingTimeTimer(window.maxTimestamp());

        flag++;

        if (flag >= threshold) {
            flag = 0;
            ctx.deleteProcessingTimeTimer(window.maxTimestamp());
            return TriggerResult.FIRE_AND_PURGE;
        }

        return TriggerResult.CONTINUE;
    }

    /**
     * Called when a registered processing-time timer fires
     */
    public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
        if (flag > 0) {
            // System.out.println("Window time reached, firing: " + flag);
            flag = 0;
            return TriggerResult.FIRE_AND_PURGE;
        }
        return TriggerResult.CONTINUE;
    }

    /**
     * Called when a registered event-time timer fires
     */
    public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
        if (time >= window.maxTimestamp() && flag > 0) {
            // System.out.println("Window end reached with pending data, firing!");
            flag = 0;
            return TriggerResult.FIRE_AND_PURGE;
        } else if (time >= window.maxTimestamp() && flag == 0) {
            // Purge the window without firing
            return TriggerResult.PURGE;
        }
        return TriggerResult.CONTINUE;
    }

    /**
     * Clean up the timers registered for this window
     */
    public void clear(TimeWindow window, TriggerContext ctx) throws Exception {
        ctx.deleteProcessingTimeTimer(window.maxTimestamp());
        ctx.deleteEventTimeTimer(window.maxTimestamp());
    }
}
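
The counter above lives in a plain field, so it is not part of Flink's checkpointed state and is shared by all windows handled by one trigger instance. For reference, a sketch of the same counting logic using Flink's partitioned trigger state (the pattern Flink's built-in CountTrigger uses) could look like the hypothetical StatefulCountTrigger below; treat it as an alternative sketch, not the job's code:

package xxx.flinkjob.kafka.trigger;

import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.state.ReducingState;
import org.apache.flink.api.common.state.ReducingStateDescriptor;
import org.apache.flink.api.common.typeutils.base.LongSerializer;
import org.apache.flink.streaming.api.windowing.triggers.Trigger;
import org.apache.flink.streaming.api.windowing.triggers.TriggerResult;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;

public class StatefulCountTrigger<T> extends Trigger<T, TimeWindow> {

    private final long threshold;

    // Per-window counter kept in Flink state, so it survives failure/restore
    private final ReducingStateDescriptor<Long> countDesc =
            new ReducingStateDescriptor<>("count", new Sum(), LongSerializer.INSTANCE);

    public StatefulCountTrigger(long threshold) {
        this.threshold = threshold;
    }

    @Override
    public TriggerResult onElement(T element, long timestamp, TimeWindow window, TriggerContext ctx) throws Exception {
        // Also fire at the end of the (processing-time) window
        ctx.registerProcessingTimeTimer(window.maxTimestamp());

        ReducingState<Long> count = ctx.getPartitionedState(countDesc);
        count.add(1L);
        if (count.get() >= threshold) {
            count.clear();
            return TriggerResult.FIRE_AND_PURGE;
        }
        return TriggerResult.CONTINUE;
    }

    @Override
    public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
        ReducingState<Long> count = ctx.getPartitionedState(countDesc);
        Long current = count.get();
        if (current != null && current > 0) {
            count.clear();
            return TriggerResult.FIRE_AND_PURGE;
        }
        return TriggerResult.CONTINUE;
    }

    @Override
    public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
        return TriggerResult.CONTINUE;
    }

    @Override
    public void clear(TimeWindow window, TriggerContext ctx) throws Exception {
        ctx.deleteProcessingTimeTimer(window.maxTimestamp());
        ctx.getPartitionedState(countDesc).clear();
    }

    private static class Sum implements ReduceFunction<Long> {
        @Override
        public Long reduce(Long a, Long b) {
            return a + b;
        }
    }
}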

entry point

package xxx.flinkjob.kafka;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import xxx.flinkjob.kafka.model.HttpDataModel;
import xxx.flinkjob.kafka.sink.HbaseSink;
import xxx.flinkjob.kafka.trigger.CountTrigger;
import org.apache.commons.cli.*;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.AllWindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

public class Application {

    @SuppressWarnings(value={"unchecked"})
    public static void main(String[] args) throws Exception {
        // Kafka parameters (defaults)
        String brokers = "127.0.0.1:9092";
        String username = "admin";
        String password = "123456";
        String topic = "test";
        // HBase parameters (defaults)
        String hbase_zookeeper_host = "hbase";
        String hbase_zookeeper_port = "2181";

        // Parse command-line options to override the defaults
        Options options = new Options();
        options.addOption("kafka_brokers", true, "kafka cluster hosts, such 127.0.0.1:9092");
        options.addOption("kafka_username", true, "kafka cluster username, default: admin");
        options.addOption("kafka_user_password", true, "kafka cluster user password, default: 123456");
        options.addOption("kafka_topic", true, "kafka cluster topic, default: test");

        options.addOption("hbase_zookeeper_host", true, "hbase zookeeper host, default: hbase");
        options.addOption("hbase_zookeeper_port", true, "hbase zookeeper port, default: 2181");

        CommandLineParser parser = new DefaultParser();
        CommandLine line = parser.parse( options, args );

        if ( line.hasOption( "kafka_brokers" ) ) {
            brokers = line.getOptionValue("kafka_brokers");
        } else {
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp( "flink write hbase job", options );
            System.exit(1);
        }

        if ( line.hasOption( "kafka_username" ) ) {
            username = line.getOptionValue("kafka_username");
        }
        if ( line.hasOption( "kafka_user_password" ) ) {
            password = line.getOptionValue("kafka_user_password");
        }
        if ( line.hasOption( "kafka_topic" ) ) {
            topic = line.getOptionValue("kafka_topic");
        }
        if ( line.hasOption( "hbase_zookeeper_host" ) ) {
            hbase_zookeeper_host = line.getOptionValue("hbase_zookeeper_host");
        }
        if ( line.hasOption( "hbase_zookeeper_port" ) ) {
            hbase_zookeeper_port = line.getOptionValue("hbase_zookeeper_port");
        }

        // Run the job
        doExecute(brokers, username, password, topic, hbase_zookeeper_host, hbase_zookeeper_port);
    }

    /**
     * Run the streaming job
     */
    public static void doExecute(String kafka_brokers, String kafka_username, String kafka_password,
                                String topic, String hbase_zookeeper_host, String hbase_zookeeper_port) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Enable checkpointing; the interval here (5000 * 100000 ms, roughly 5.8 days) makes checkpoints effectively rare
        env.enableCheckpointing(5000 * 100000);

        // Set up the Kafka source
        Properties props = getKafkaProperties(kafka_username, kafka_password);
        props.setProperty("bootstrap.servers", kafka_brokers);
        DataStream<String> stream = env.addSource(new FlinkKafkaConsumer<String>(topic, new SimpleStringSchema(), props));

        // Filter out malformed messages, then map them onto the model
        DataStream<HttpDataModel> formated_stream = stream.filter(s -> {
            try {
                JSONObject obj = JSONObject.parseObject(s);
                return obj.containsKey("project") && obj.containsKey("table") && obj.containsKey("data");
            } catch (Exception e) {
                // Drop messages that are not valid JSON instead of failing the job
                return false;
            }
        }).map(s -> JSON.parseObject(s, HttpDataModel.class));

        // Within a 10-second window, fire every 100 records (or at the end of the window) and emit a batch
        DataStream<List<HttpDataModel>> batch_stream = formated_stream
                .timeWindowAll(Time.seconds(10))
                .trigger(new CountTrigger<HttpDataModel>(100))
                .apply(new AllWindowFunction<HttpDataModel, List<HttpDataModel>, TimeWindow>() {
                    public void apply(TimeWindow window, Iterable<HttpDataModel> values, Collector<List<HttpDataModel>> out) throws Exception {
                        List<HttpDataModel> lists = new ArrayList<HttpDataModel>();
                        for (HttpDataModel value : values) {
                            lists.add(value);
                        }
                        out.collect(lists);
                    }
                });

        batch_stream.addSink(new HbaseSink(hbase_zookeeper_host, hbase_zookeeper_port));

        // Console output for debugging
        //batch_stream.print();

        env.execute("integration-http");
    }

    /**
     * Build the default Kafka consumer properties
     */
    public static Properties getKafkaProperties(String username, String password) {
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "127.0.0.1:9092");
        props.setProperty("group.id", "dataworks-integration");
        props.put("enable.auto.commit", "true");
        props.put("auto.offset.reset", "earliest");
        props.put("auto.commit.interval.ms", "1000");
        props.put("session.timeout.ms", "30000");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");

        String jaasTemplate = "org.apache.kafka.common.security.scram.ScramLoginModule required username=\"%s\" password=\"%s\";";
        String jaasCfg = String.format(jaasTemplate, username, password);

        props.put("security.protocol", "SASL_PLAINTEXT");
        props.put("sasl.mechanism", "SCRAM-SHA-256");
        props.put("sasl.jaas.config", jaasCfg);
        return props;
    }
}
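
To submit the job, pass the options defined above on the command line; for example (the jar name, broker address and HBase ZooKeeper host are placeholders for your environment):

flink run -c xxx.flinkjob.kafka.Application target/<artifactId>-<version>-jar-with-dependencies.jar \
  -kafka_brokers 192.168.50.187:19092 \
  -kafka_username admin \
  -kafka_user_password 123456 \
  -kafka_topic test \
  -hbase_zookeeper_host hbase \
  -hbase_zookeeper_port 2181

Because the Flink dependencies are not marked as provided in the pom, the same jar can also be started with java -jar for a quick local test; getExecutionEnvironment() then falls back to a local execution environment.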
