Flume HDFS Sink 防止产生小文件

添加配置

a1.sinks.k1.hdfs.round = false
a1.sinks.k1.hdfs.roundValue = 30
a1.sinks.k1.hdfs.roundUnit = second
(注:round = false 时时间戳不做四舍五入,roundValue/roundUnit 实际不生效;此配置依靠下面的 rollInterval 按时间滚动文件)
a1.sinks.k1.hdfs.rollInterval = 30
a1.sinks.k1.hdfs.rollSize = 0
a1.sinks.k1.hdfs.rollCount = 0

参数 说明
round 表示是否对时间戳四舍五入(true/false)
roundUnit 时间单位(second,minute,hour)
roundValue 时间戳四舍五入的倍数,需小于 roundUnit 对应单位的进制(如单位为 second 或 minute 时应小于 60)
rollInterval 每隔N个时间单位截断一个文件。设置为0表示不会因为时间间隔截断文件(整数N)
rollSize 文件字节数超过N截断一个文件。设置为0就不因为文件大小截断文件(字节数N)
rollCount 每N个event截断一个文件。设置为0就不因为event数量截断文件

Hive 查询分析

ext_startup_logs表结构:

createdatms             bigint                  
appid                   string                  
tenantid                string              
deviceid                string                  
appversion              string                  
appchannel              string                  
appplatform             string                  
ostype                  string                  
devicestyle             string                  
country                 string                  
province                string                  
ipaddress               string                  
network                 string                  
carrier                 string                  
brand                   string                  
screensize              string                  
ym                      string         
day                     string         
hm                      string
  • 查询某个app的用户:
    hive> select count(distinct(deviceid)) from ext_startup_logs where appid='sdk34734'
  • 查询某天的新增用户
    该需求需要定位时间,通过UDF实现

DayEndUDF.java(注:原文此处误标为 "DateUtil.java";以下代码实为 DayEndUDF,与后文重复。其中调用的 DateUtil.getZeroDate 用于获取某天的 0 点 0 分,该工具类的实现未在文中给出)

package com.applogs.udf;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.udf.UDFType;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;


// Hive UDF: end-of-day timestamp (i.e. the NEXT day's midnight) in epoch millis.
// NOTE(review): deterministic = true is questionable for the no-arg overload,
// which depends on the current wall-clock time — confirm intended semantics.
@Description(name = "udf_getEndDay",
        value = "getEndTimeInDay",
        extended = "udf() ; udf('2020/03/27 02:03:04') ; udf('2020-03-27 02-03-04','yyyy-MM-dd HH-mm-ss')")
@UDFType(deterministic = true, stateful = false)
public class DayEndUDF extends UDF {

    // No-arg overload: end of "today" (tomorrow's 00:00), epoch millis.
    public long evaluate(){
        return evaluate(new Date());
    }

    // Returns the END of the given date's day — midnight of the FOLLOWING day —
    // as epoch millis. (NOTE(review): the original comment claimed "start of
    // the day", which contradicts the +1 day added below.)
    public long evaluate(Date date){

        // 00:00 of the given date. DateUtil.getZeroDate is a project helper
        // whose implementation is not shown in this file.
        Date startDate = DateUtil.getZeroDate(date);

        // Use Calendar to advance one day from that midnight.
        Calendar c = Calendar.getInstance();
        c.setTime(startDate);
        // Midnight + 1 day == next day's 00:00 == end of the original day.
        c.add(Calendar.DAY_OF_MONTH, 1);
        return c.getTimeInMillis();
    }

    // String overload using the default pattern "yyyy/MM/dd HH:mm:ss".
    public long evaluate(String dateStr) throws ParseException {
        return evaluate(dateStr, "yyyy/MM/dd HH:mm:ss");
    }

    // Parses dateStr with the given pattern, then delegates to evaluate(Date).
    // On parse failure the exception is only printed and 0 is returned —
    // NOTE(review): this silent fallback to epoch 0 can mask bad input rows.
    public long evaluate(String dateStr, String pattern){
        try {
            SimpleDateFormat sdf = new SimpleDateFormat(pattern);
            Date date = sdf.parse(dateStr);

            return evaluate(date);

        } catch (ParseException e) {
            e.printStackTrace();
        }
        return 0;
    }
}

DayStartUDF.java: 获得某天0时的UDF

package com.applogs.udf;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.udf.UDFType;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

// Hive UDF: start-of-day timestamp (00:00 of the given day) in epoch millis.
// Relies on the project helper DateUtil.getZeroDate (not shown in this file).
@Description(name = "udf_getStartDay",  // fixed typo: was "udf_getStartay"
        value = "getStartInDay",
        extended = "udf() ; udf('2020/03/27 02:03:04') ; udf('2020-03-27 02-03-04','yyyy-MM-dd HH-mm-ss')")
@UDFType(deterministic = true, stateful = false)

public class DayStartUDF extends UDF {
    /** Start of "today" (the current date's 00:00), epoch millis. */
    public long evaluate(){
        return evaluate(new Date());
    }

    /** Start (00:00) of the given date's day, epoch millis. */
    public long evaluate(Date date){
        return DateUtil.getZeroDate(date).getTime();
    }

    /** String overload using the default pattern {@code yyyy/MM/dd HH:mm:ss}. */
    public long evaluate(String dateStr) throws ParseException {
        return evaluate(dateStr, "yyyy/MM/dd HH:mm:ss");
    }

    /**
     * Parses {@code dateStr} with {@code pattern} and returns that day's start.
     * Now delegates to {@link #evaluate(Date)} instead of re-implementing the
     * zero-date lookup (consistency fix: DayEndUDF already delegates this way;
     * behavior is unchanged). Returns 0 on parse failure, preserving the
     * original best-effort fallback.
     */
    public long evaluate(String dateStr, String pattern){
        try {
            SimpleDateFormat sdf = new SimpleDateFormat(pattern);
            Date date = sdf.parse(dateStr);
            return evaluate(date);
        } catch (ParseException e) {
            // Preserved original behavior: print and fall through to 0.
            e.printStackTrace();
        }
        return 0;
    }

}

DayEndUDF.java: 获得某天结束时间的UDF,即获取第二天的0时

package com.applogs.udf;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.udf.UDFType;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;


// Hive UDF: end-of-day timestamp — midnight of the following day — in epoch millis.
@Description(name = "udf_getEndDay",
        value = "getEndTimeInDay",
        extended = "udf() ; udf('2020/03/27 02:03:04') ; udf('2020-03-27 02-03-04','yyyy-MM-dd HH-mm-ss')")
@UDFType(deterministic = true, stateful = false)
public class DayEndUDF extends UDF {

    /** End of "today", i.e. tomorrow's 00:00, as epoch millis. */
    public long evaluate(){
        return evaluate(new Date());
    }

    /**
     * End of the given date's day: the following day's midnight, epoch millis.
     * DateUtil.getZeroDate (project helper, defined elsewhere) supplies the
     * day's 00:00; one calendar day is then added.
     */
    public long evaluate(Date date){
        Calendar nextMidnight = Calendar.getInstance();
        nextMidnight.setTime(DateUtil.getZeroDate(date));
        nextMidnight.add(Calendar.DAY_OF_MONTH, 1);
        return nextMidnight.getTimeInMillis();
    }

    /** String overload using the default pattern {@code yyyy/MM/dd HH:mm:ss}. */
    public long evaluate(String dateStr) throws ParseException {
        return evaluate(dateStr, "yyyy/MM/dd HH:mm:ss");
    }

    /**
     * Parses {@code dateStr} with {@code pattern}; on success delegates to
     * {@link #evaluate(Date)}, on parse failure prints the stack trace and
     * yields 0 (original best-effort behavior preserved).
     */
    public long evaluate(String dateStr, String pattern){
        long endMillis = 0;
        try {
            Date parsed = new SimpleDateFormat(pattern).parse(dateStr);
            endMillis = evaluate(parsed);
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return endMillis;
    }
}
在Hive中注册UDF

1.将UDF的jar包导出并放至hive/lib下
2.启动hive,临时导入jar包

$hive> add jar app-logs-hive-1.0-SNAPSHOT.jar ;

3.注册临时函数

$hive>create temporary function getstartday AS 'com.applogs.udf.DayStartUDF';
$hive>create temporary function getendday AS 'com.applogs.udf.DayEndUDF';


猛男落泪为offer
22 声望5 粉丝

半路出家大数据