Converting an RDD to a DataFrame: the Reflection Approach

The first approach uses reflection: Spark infers the RDD's schema from its element type (a case class in Scala, a JavaBean in Java). It is simple, but not the recommended one, because in real work it runs into the limitation described next.

In earlier Scala versions (2.10 and before), a case class supported at most 22 fields; beyond that, you had to write your own class implementing the Product trait (see the sketch below). So although this approach is simple, it is not general-purpose: production records routinely carry far more than twenty-odd fields.
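For reference, that old workaround looked roughly like the following sketch. BigRecord and its fields are hypothetical, and only three of the would-be 23-plus fields are shown:

//Scala
class BigRecord(val f1: String, val f2: Long, val f3: Double /* ... up to f23 and beyond ... */)
  extends Product with Serializable {
  // Product requires reporting the number of fields ...
  override def productArity: Int = 3
  // ... and positional access to each field
  override def productElement(n: Int): Any = n match {
    case 0 => f1
    case 1 => f2
    case 2 => f3
    case _ => throw new IndexOutOfBoundsException(n.toString)
  }
  override def canEqual(that: Any): Boolean = that.isInstanceOf[BigRecord]
}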

//Java
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;


public class rddtoDFreflectionJava {
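
    // The post never shows PersonJava. The code below assumes a plain JavaBean
    // along these lines (a sketch, not the original author's class); the public
    // getters/setters are what createDataFrame(rdd, beanClass) reflects on.
    public static class PersonJava implements java.io.Serializable {
        private String name;
        private Long age;

        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
        public Long getAge() { return age; }
        public void setAge(Long age) { this.age = age; }
    }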

public static void main(String[] args) {
    SparkSession spark = SparkSession
            .builder()
            .appName("program")
            .master("local").config("spark.sql.warehouse.dir", "file:/Users/zhangjingyu/Desktop/Spark架构/spark-warehouse")
            .getOrCreate();
    String Path = "file:/Users/zhangjingyu/Desktop/spark-2.4.0/examples/src/main/resources/people.txt";


    JavaRDD<PersonJava> personRDD = spark.read().textFile(Path).javaRDD().map(line -> {
        String name = line.split(",")[0];
        Long age = Long.valueOf(line.split(",")[1].trim());
        PersonJava person = new PersonJava();
        person.setName(name);
        person.setAge(age);
        return person;
    });

    /* Equivalent version using an anonymous inner class instead of a lambda:

    JavaRDD<PersonJava> personRdd = spark.read().textFile(Path).javaRDD().map(new Function<String, PersonJava>() {
        @Override
        public PersonJava call(String line) throws Exception {
            String name = line.split(",")[0];
            Long age = Long.valueOf(line.split(",")[1].trim());
            PersonJava person = new PersonJava();
            person.setName(name);
            person.setAge(age);
            return person;
        }
    });
    */

    Dataset<Row> personDF = spark.createDataFrame(personRDD, PersonJava.class);
    personDF.createOrReplaceTempView("test");
    Dataset<Row> ResultDF = spark.sql("select * from test a where a.age < 30");
    ResultDF.show();
    
    JavaRDD<PersonJava> ResultRDD = ResultDF.javaRDD().map(line -> {
        PersonJava person = new PersonJava();
        person.setName(line.getAs("name"));
        person.setAge(line.getAs("age"));
        return person;
    });


    for (PersonJava personJava : ResultRDD.collect()) {
        System.out.println(personJava.getName()+":"+personJava.getAge());
    }

    /* Equivalent version using anonymous inner classes instead of lambdas:

    JavaRDD<PersonJava> resultRdd = ResultDF.javaRDD().map(new Function<Row, PersonJava>() {
        @Override
        public PersonJava call(Row row) throws Exception {
            PersonJava person = new PersonJava();
            String name = row.getAs("name");
            long age = row.getAs("age");
            person.setName(name);
            person.setAge(age);
            return person;
        }
    });
    resultRdd.foreach(new VoidFunction<PersonJava>() {
        @Override
        public void call(PersonJava personJava) throws Exception {
            System.out.println(personJava.getName() + ":" + personJava.getAge());
        }
    });
    */

    }

}

//Scala
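Both Scala examples call a CommSparkSessionScala helper that the post never shows. A minimal sketch of what it presumably looks like, mirroring the SparkSession setup from the Java version above:

import org.apache.spark.sql.SparkSession

object CommSparkSessionScala {
  def getSparkSession(): SparkSession = {
    SparkSession
      .builder()
      .appName("program")
      .master("local")
      .config("spark.sql.warehouse.dir", "file:/Users/zhangjingyu/Desktop/Spark架构/spark-warehouse")
      .getOrCreate()
  }
}
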
object rddtoDFreflectionScala {
case class Person(name : String , age : Long)

def main(args: Array[String]): Unit = {

val spark = CommSparkSessionScala.getSparkSession()
val path = "file:/Users/zhangjingyu/Desktop/spark-2.4.0/examples/src/main/resources/people.txt"
import spark.implicits._
val personDF = spark.sparkContext.textFile(path).map(row => row.split(",")).map(line => {
  Person(line(0),line(1).trim.toLong)
}).toDF
personDF.createOrReplaceTempView("test")
val resultDF = spark.sql("select * from test a where a.age > 20")
val resultrdd = resultDF.rdd.map(x =>{
  val name = x.getAs[String]("name")
  val age = x.getAs[Long]("age")
  Person(name,age)
})


for (elem <- resultrdd.collect()) {
  System.out.println(elem.name+" : "+ elem.age)
}

}
}

Converting an RDD to a DataFrame: the Programmatic Approach

The second approach builds the DataFrame by specifying the schema programmatically, and it is the one used most often in practice. With the reflection approach the schema is effectively fixed at compile time inside the case class; here it is constructed explicitly at runtime, as the sketch below illustrates.
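The real payoff of the programmatic route is that the schema can be assembled at runtime, for example from a column list that only arrives as a string. A minimal Scala sketch; the column string and field names here are made up for illustration:

//Scala
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// Hypothetical: column names arrive at runtime, e.g. from a config file or a header line.
val columns = "name age city"
val schema = StructType(columns.split(" ").map(field => StructField(field, StringType, nullable = true)))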

//Java
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.ArrayList;
import java.util.List;

public class rddtoDFprogrammJava {

public static void main(String[] args) {


    SparkSession spark = SparkSession
            .builder()
            .appName("program")
            .master("local").config("spark.sql.warehouse.dir", "file:/Users/zhangjingyu/Desktop/Spark架构/spark-warehouse")
            .getOrCreate();
    String Path = "file:/Users/zhangjingyu/Desktop/spark-2.4.0/examples/src/main/resources/people.txt";


    // Define the column fields that make up the schema
    List<StructField> fields = new ArrayList<>();
    StructField structField_name = DataTypes.createStructField("name", DataTypes.StringType, true);
    StructField structField_age = DataTypes.createStructField("age", DataTypes.LongType, true);
    fields.add(structField_name);
    fields.add(structField_age);
    StructType scheme = DataTypes.createStructType(fields);


    JavaRDD<Row> PersonRdd = spark.read().textFile(Path).javaRDD().map(x -> {
        String[] lines = x.split(",");
        return RowFactory.create(lines[0], Long.valueOf(lines[1].trim()));
    });


    Dataset<Row> PersonDF = spark.createDataFrame(PersonRdd, scheme);
    PersonDF.createOrReplaceTempView("program");
    Dataset<Row> ResultDF = spark.sql("select * from program ");
    ResultDF.show();


    for (Row row : ResultDF.javaRDD().collect()) {
        System.out.println(row);
    }
}

}

//Scala

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

object rddtoDFprogrammScala {
def main(args: Array[String]): Unit = {

val spark = CommSparkSessionScala.getSparkSession()
val path = "file:/Users/zhangjingyu/Desktop/spark-2.4.0/examples/src/main/resources/people.txt"
val scheme = StructType(Array(
  StructField("name",StringType,true),
  StructField("age",LongType,true)
))
val rdd = spark.sparkContext.textFile(path).map(line => line.split(",")).map(x => {
  Row(x(0),x(1).trim.toLong)
})
val PersonDF = spark.createDataFrame(rdd,scheme)
PersonDF.createOrReplaceTempView("person")
val resultDF = spark.sql("select * from person a where a.age < 30")
for (elem <- resultDF.collect()) {
  System.out.println(elem.get(0)+":"+elem.get(1))
}

}
}
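
Assuming the stock people.txt that ships with Spark's examples (Michael, 29; Andy, 30; Justin, 19), the final loop should print something like:

Michael:29
Justin:19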

Original author: Zhang Jingyu

