
Runtime environment

JDK    1.8.0 
Hadoop 2.6.0
Scala  2.11.8  
Spark  2.1.2
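
All of the examples below create their SparkSession with getOrCreate() and read/write HDFS paths, so they are meant to be packaged into a jar and launched with spark-submit against the cluster above. For a quick local test, a minimal sketch of a local-mode driver (the local[*] master and the file:// input path are assumptions, not part of the original setup):

import org.apache.spark.sql.SparkSession;

public class LocalCheck {
    public static void main(String[] args) {
        // Local-mode session just for testing; "local[*]" and the input path are assumed values
        SparkSession spark = SparkSession.builder()
                .appName("wordcount-local")
                .master("local[*]")
                .getOrCreate();
        // Read a local text file instead of HDFS and preview a few lines
        spark.read().textFile("file:///tmp/idea.txt").show(10, false);
        spark.stop();
    }
}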

RDD + reduceByKey (anonymous classes, no lambdas)

  • import
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
  • main
public class WordCount {
    public static void main(String[] args) {
        // Input file
        String wordFile = "/user/walker/input/wordcount/idea.txt";
        SparkSession spark = SparkSession.builder().appName("wordcount")
                .config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
                .getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
        JavaRDD<String> hdfstext = jsc.textFile(wordFile);
        // Split each line into words (transformation)
        JavaRDD<String> words = hdfstext.flatMap(new FlatMapFunction<String, String>() {
            public Iterator<String> call(String x) {
                return Arrays.asList(x.split(" ")).iterator();
            }
        });
        // Map each word to a count of 1 (transformation)
        JavaPairRDD<String, Integer> pairs = words.mapToPair(    
                                        new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String word) {
                return new Tuple2<>(word, 1);
            }
        }); 
        // Sum the counts for each word (transformation)
        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey(    
                                        new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer v1, Integer v2) {
                return v1 + v2;
            }
        }).repartition(1);     
        // Output directory
        String outDir = "/user/walker/output/wordcount";
        wordCounts.saveAsTextFile(outDir);
        jsc.close();
    }
}
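
In this listing, repartition(1) triggers a full shuffle just to merge the result into a single output file. Since reduceByKey has already shuffled the data, coalesce(1) can merge the partitions without another shuffle; a minimal sketch building on the pairs RDD above (the coalesce substitution and the output path are my assumptions, not part of the original code):

// Same aggregation, but coalesce(1) merges partitions without a second full shuffle
JavaPairRDD<String, Integer> wordCountsSingle = pairs.reduceByKey(
        new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer v1, Integer v2) {
                return v1 + v2;
            }
        }).coalesce(1);
wordCountsSingle.saveAsTextFile("/user/walker/output/wordcount_coalesce");    // hypothetical output directory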

RDD + reduceByKey (lambdas)

  • import
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.util.Arrays;
  • main
public class WordCount2 {
    public static void main(String[] args) {
        // Input file
        String wordFile = "/user/walker/input/wordcount/idea.txt";
        SparkSession spark = SparkSession.builder().appName("wordcount")
                .config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
                .getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
        JavaRDD<String> hdfstext = jsc.textFile(wordFile);
        // Split each line into words (transformation)
        JavaRDD<String> words = hdfstext.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        // Map each word to a count of 1 (transformation)
        JavaPairRDD<String, Integer> pairs = words.mapToPair(word -> new Tuple2<>(word, 1));
        // Sum the counts for each word (transformation)
        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey((v1, v2) -> v1 + v2)
                                                        .repartition(1);
        // Output directory
        String outDir = "/user/walker/output/wordcount2";
        wordCounts.saveAsTextFile(outDir);
        jsc.close();
    }
}
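
If the output should also be ordered by frequency, the (word, count) pairs can be swapped, sorted by key, and swapped back; a minimal sketch building on the wordCounts RDD above (the sorting step and the output path are my additions, not part of the original):

// Swap (word, count) into (count, word), sort by count descending, then swap back
JavaPairRDD<String, Integer> sorted = wordCounts
        .mapToPair(t -> new Tuple2<>(t._2(), t._1()))
        .sortByKey(false)
        .mapToPair(t -> new Tuple2<>(t._2(), t._1()));
sorted.saveAsTextFile("/user/walker/output/wordcount2_sorted");    // hypothetical output directory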

RDD + countByValue

  • import
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
  • main
public class WordCount3 {
    public static void main(String[] args) {
        // Input file
        String wordFile = "/user/walker/input/wordcount/idea.txt";
        SparkSession spark = SparkSession.builder().appName("wordcount")
                .config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
                .getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
        JavaRDD<String> hdfstext = jsc.textFile(wordFile);
        // Split each line into words (transformation)
        JavaRDD<String> words = hdfstext.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        // Count each word (action)
        Map<String, Long> wordCounts =  words.countByValue(); 
        // Convert the Map back into an RDD
        List<String> lst = new LinkedList<>();
        wordCounts.forEach((k, v) -> lst.add(k + "\t" + v));
        JavaRDD<String> result = jsc.parallelize(lst).repartition(1);
        // Save the result to HDFS
        String outDir = "/user/walker/output/wordcount3";    // Output directory
        result.saveAsTextFile(outDir);
        jsc.close();
    }
}
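
Note that countByValue is an action: it ships every (word, count) pair back to the driver, so this approach only fits when the number of distinct words is small enough for driver memory. Because the result is already a plain Java Map, it can also be post-processed on the driver before being parallelized; a minimal sketch sorting by count in descending order (the sorting step is my addition, not part of the original code):

// Sort the driver-side Map by count (descending) before turning it back into an RDD
List<String> sortedLines = wordCounts.entrySet().stream()
        .sorted(Map.Entry.<String, Long>comparingByValue().reversed())
        .map(e -> e.getKey() + "\t" + e.getValue())
        .collect(java.util.stream.Collectors.toList());
JavaRDD<String> sortedResult = jsc.parallelize(sortedLines).repartition(1);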

DataFrame/DataSet + SQL

  • import
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
  • main
public class WordCount4 {
    public static void main(String[] args) {
        // Input file
        String wordFile = "/user/qhy/input/wordcount/idea.txt";
        SparkSession spark = SparkSession.builder().appName("wordcount")
                .config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
                .getOrCreate();
        Dataset<Row> dfRow = spark.read().text(wordFile);
        // The string from row.toString() would be wrapped in [], so use row.getString(0) instead
        Dataset<String> words = dfRow.flatMap((FlatMapFunction<Row, String>) row ->
                    Arrays.asList(row.getString(0).split("\\s+")).iterator(), Encoders.STRING());
        Dataset<Row> dfWord = words.toDF("word");
        dfWord.createOrReplaceTempView("words");
        Dataset<Row> dfCnt = dfWord.sqlContext().sql("SELECT word, count(*) FROM words GROUP BY word");
        // Save the result to HDFS
        String outDir = "/user/qhy/output/wordcount4";
        dfCnt.repartition(1).write().csv(outDir);
        spark.stop();
    }
}
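
As written, the aggregate comes out under a generated column name (typically count(1) in Spark 2.x), which is awkward downstream. A minimal variant of the query that names the column explicitly and writes the CSV with a header row (the cnt alias, the header option, and the output path are my choices, not from the original):

// Name the aggregate column explicitly and include a header row in the CSV output
Dataset<Row> dfNamed = dfWord.sqlContext()
        .sql("SELECT word, count(*) AS cnt FROM words GROUP BY word");
dfNamed.repartition(1).write().option("header", "true").csv("/user/qhy/output/wordcount4_named");    // hypothetical output directory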

DataSet/DataFrame + groupBy

  • import
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
  • main
public class WordCount5 {
    public static void main(String[] args) {
        // Input file
        String wordFile = "/user/qhy/input/wordcount/idea.txt";
        SparkSession spark = SparkSession.builder().appName("wordcount")
                .config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
                .getOrCreate();
        Dataset<String> ds = spark.read().textFile(wordFile);
        Dataset<String> words = ds.flatMap((FlatMapFunction<String, String>) line ->
                               Arrays.asList(line.split("\\s+")).iterator(), Encoders.STRING());
        // words.printSchema();   // value: string (nullable = true)
        Dataset<Row> df = words.groupBy("value").count();
        // Save the result to HDFS
        String outDir = "/user/qhy/output/wordcount5";
        df.repartition(1).write().csv(outDir);
        spark.stop();
    }
}
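
To inspect the most frequent words directly, the grouped Dataset can be ordered by the generated count column before (or instead of) writing it out; a minimal sketch (the ordering and the show() preview are my additions):

// Sort by the "count" column produced by groupBy(...).count(), descending, and preview the top rows
Dataset<Row> top = df.orderBy(org.apache.spark.sql.functions.col("count").desc());
top.show(20, false);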

Execution flow diagram

[Figure: Spark wordcount execution flow]


This article is from walker snapshot.
