1. How to implement a UDF

Hive provides two ways to implement a UDF:

1.1. Extending the UDF class

  • Pros

    • Simple to implement: little code, clear logic, and a quick way to build simple UDFs
    • Supports Hive primitive types, arrays, and maps
    • Supports function overloading (see the sketch after the MyUDF example in section 2)
  • Cons

    • Only suited to simple functions
    • The class is deprecated in current Hive versions; GenericUDF is recommended instead

1.2. Extending the GenericUDF class

  • Pros

    • Supports any number of arguments, of any type
    • Can branch to different logic based on argument count and types
    • Provides hooks for initializing and releasing resources (initialize, close)
  • Cons

    • More involved to implement than extending UDF

Compared with UDF, GenericUDF is more flexible and can express considerably more complex functions.

2. Implementing a UDF by extending the UDF class

pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.journey.udf</groupId>
  <artifactId>hive-udf</artifactId>
  <packaging>jar</packaging>
  <version>1.0-SNAPSHOT</version>
  <name>hive-udf</name>
  <url>http://maven.apache.org</url>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.encoding>UTF-8</maven.compiler.encoding>
    <java.version>1.8</java.version>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-exec</artifactId>
      <version>3.1.0</version>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.69</version>
    </dependency>
  </dependencies>
  <build>
    <plugins>
      <!-- assembly打包插件 -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-assembly-plugin</artifactId>
        <version>3.0.0</version>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <archive>
            <manifest>
            </manifest>
          </archive>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
MyUDF.java

package com.journey.udf;

import org.apache.hadoop.hive.ql.exec.UDF;

/**
 * The UDF class is deprecated; extending GenericUDF is recommended instead.
 */
public class MyUDF extends UDF {

    // Called by Hive once per row; prefixes the input value with "journey_"
    public String evaluate(String value) {
        return "journey_" + value;
    }
}
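Since one of the advantages listed in section 1.1 is overloading, here is a minimal sketch of it. The class name and the two-argument signature are hypothetical, not part of the original project; the point is that the UDF base class resolves a call by matching it against the evaluate() signatures, so one class can serve several argument lists:

MyOverloadedUDF.java

package com.journey.udf;

import org.apache.hadoop.hive.ql.exec.UDF;

/**
 * Hypothetical example: Hive picks the evaluate() overload whose
 * parameter types match the arguments at the call site.
 */
public class MyOverloadedUDF extends UDF {

    // my_udf('zhangsan') -> "journey_zhangsan"
    public String evaluate(String value) {
        return "journey_" + value;
    }

    // my_udf('zhangsan', 'hello_') -> "hello_zhangsan"
    public String evaluate(String value, String prefix) {
        return prefix + value;
    }
}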

3. Implementing a UDF by extending the GenericUDF class

MyHashUDF.java

package com.journey.udf;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.io.Text;

@Description(name = "my_hash", value = "returns hash of the given value")
public class MyHashUDF extends GenericUDF {

    private static final String MD2_HASH_TYPE = "md2";
    private static final String MD5_HASH_TYPE = "md5";

    public static final String UDF_NAME = "my_hash";

    private PrimitiveObjectInspector columnInspector;
    private PrimitiveObjectInspector optionsInspector;

    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        if (arguments.length != 2) {
            throw new UDFArgumentException(UDF_NAME + " requires exactly two arguments");
        }
        // Inspector for the column to hash (argument 0) and for the JSON options string (argument 1)
        this.columnInspector = (PrimitiveObjectInspector) arguments[0];
        this.optionsInspector = (PrimitiveObjectInspector) arguments[1];
        // The returned ObjectInspector declares the type that evaluate() will produce.
        // Here the output type mirrors the input column's primitive category; a fixed
        // return type could be chosen via PrimitiveObjectInspectorFactory instead.
        return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
                columnInspector.getPrimitiveCategory());
    }

    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        Text result = new Text();
        // Argument 1 carries a JSON options string, e.g. {"hashType":"md5"}
        String propertiesJsonString = (String) optionsInspector.getPrimitiveJavaObject(arguments[1].get());
        JSONObject propertiesJson = JSON.parseObject(propertiesJsonString);
        String hashType = propertiesJson.getString("hashType");

        // Argument 0 is the column value to hash
        String colVal = ((StringObjectInspector) columnInspector).getPrimitiveJavaObject(arguments[0].get());

        switch (hashType) {
            case MD2_HASH_TYPE:
                result.set(DigestUtils.md2Hex(colVal));
                break;
            case MD5_HASH_TYPE:
                result.set(DigestUtils.md5Hex(colVal));
                break;
            default:
                throw new UnsupportedOperationException("hash type must be one of [" + MD2_HASH_TYPE + ", " + MD5_HASH_TYPE + "]");
        }
        return result;
    }

    @Override
    public String getDisplayString(String[] children) {
        return getStandardDisplayString(UDF_NAME, children);
    }
}
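The GenericUDF lifecycle can be exercised outside of Hive: initialize() is invoked once with the argument inspectors when the query is compiled, and evaluate() once per row with the arguments wrapped as DeferredObjects. Below is a minimal, hypothetical smoke test (MyHashUDFTest is not part of the original project) that drives MyHashUDF the same way, assuming hive-exec and its transitive dependencies are on the classpath:

MyHashUDFTest.java

package com.journey.udf;

import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

/**
 * Hypothetical local smoke test for MyHashUDF.
 */
public class MyHashUDFTest {

    public static void main(String[] args) throws Exception {
        MyHashUDF udf = new MyHashUDF();
        ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector;

        // Mirrors what Hive does at compile time: hand the UDF one inspector per argument
        udf.initialize(new ObjectInspector[]{stringOI, stringOI});

        // Mirrors the per-row call: arguments arrive wrapped as DeferredObjects
        Object result = udf.evaluate(new DeferredJavaObject[]{
                new DeferredJavaObject("zhangsan"),
                new DeferredJavaObject("{\"hashType\":\"md5\"}")
        });
        System.out.println(result); // prints the md5 hex digest of "zhangsan"
    }
}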

4. Three ways to load a UDF

4.1. Loading UDFs as built-in functions via the SPI mechanism


Hive-side code (in org.apache.hadoop.hive.ql.exec.FunctionRegistry):

private static final Registry system = new Registry(true);

static {
    system.registerGenericUDF("concat", GenericUDFConcat.class);
    system.registerUDF("substr", UDFSubstr.class, false);
    system.registerUDF("substring", UDFSubstr.class, false);
    // ... existing built-in registrations ...
    Map<String, Class<? extends GenericUDF>> edapUDFMap = loadEdapUDF();
    for (Map.Entry<String, Class<? extends GenericUDF>> entry : edapUDFMap.entrySet()) {
        system.registerGenericUDF(entry.getKey(), entry.getValue());
    }
}
/**
 * Dynamically loads custom UDFs via the SPI (ServiceLoader) mechanism.
 *
 * @return a map from function name (taken from each class's UDF_NAME field) to UDF class
 */
private static Map<String, Class<? extends GenericUDF>> loadEdapUDF() {
    Map<String, Class<? extends GenericUDF>> classMap = new HashMap<>();

    ServiceLoader<GenericUDF> loadedUDF = ServiceLoader.load(GenericUDF.class);
    Iterator<GenericUDF> udfIterator = loadedUDF.iterator();

    while (udfIterator.hasNext()) {
        Class<? extends GenericUDF> clazz = udfIterator.next().getClass();
        Field udfNameField = null;

        // UDF_NAME is expected to be a static field on each UDF class
        try {
            udfNameField = clazz.getDeclaredField("UDF_NAME");
        } catch (NoSuchFieldException e) {
            LOG.warn("Class " + clazz.getName() + " doesn't have a UDF_NAME field.");
            continue;
        }

        udfNameField.setAccessible(true);

        try {
            classMap.put(String.valueOf(udfNameField.get(null)), clazz);
        } catch (IllegalAccessException e) {
            LOG.warn("Illegal access to " + clazz.getName() + " UDF_NAME field value.");
        }
    }
    return classMap;
}
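Note that this approach patches Hive itself: loadEdapUDF lives inside FunctionRegistry's static initializer, so the modified hive-exec has to be rebuilt and redeployed. In exchange, the UDFs behave like built-ins, available in every session and every database without any CREATE FUNCTION statement.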

Custom UDF side

For example, to register the MyHashUDF class above, add the standard ServiceLoader descriptor under resources: a file named META-INF/services/org.apache.hadoop.hive.ql.udf.generic.GenericUDF whose content is the fully qualified class name com.journey.udf.MyHashUDF.

4.2. Creating a temporary function

-- add the jar to the session classpath
add jar /opt/journey/hive/auxlib/hive-udf-1.0-SNAPSHOT-jar-with-dependencies.jar;

-- create a temporary function
create temporary function my_udf as 'com.journey.udf.MyUDF';

-- use the function
select my_udf('zhangsan');
-- returns: journey_zhangsan

-- drop the temporary function
drop temporary function my_udf;

Note: a temporary function cannot be qualified with a database name; it is visible in every database for the lifetime of the session, so no db prefix is needed when calling it.

4.3. Creating a permanent function

-- put the jar on HDFS
hdfs dfs -put /opt/journey/hive/auxlib/hive-udf-1.0-SNAPSHOT-jar-with-dependencies.jar /journey

-- create a permanent function
create function db1.my_udf as 'com.journey.udf.MyUDF' using jar 'hdfs://master-52600d0:8020/journey/hive-udf-1.0-SNAPSHOT-jar-with-dependencies.jar';

-- use the function (qualified with its database)
select db1.my_udf('zhangsan');

-- MyHashUDF can be registered the same way (or loaded as a built-in, as in section 4.1);
-- it is then called with a column value and a JSON options string:
select my_hash('zhangsan', '{"hashType":"md2"}');
select my_hash('zhangsan', '{"hashType":"md5"}');

-- drop the permanent function
drop function db1.my_udf;

Note: if you omit the database when creating a function, it is created in the default database.

If you found this useful, a like and a follow are appreciated. Thanks!

