Runtime environment
JDK 1.8.0
Hadoop 2.6.0
Scala 2.11.8
Spark 2.1.2
RDD + reduceByKey (without lambdas)
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
public class WordCount {
public static void main(String[] args) {
// Input file
String wordFile = "/user/walker/input/wordcount/idea.txt";
SparkSession spark = SparkSession.builder().appName("wordcount")
.config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
.getOrCreate();
JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
JavaRDD<String> hdfstext = jsc.textFile(wordFile);
// Split lines into words (transformation)
JavaRDD<String> words = hdfstext.flatMap(new FlatMapFunction<String, String>() {
public Iterator<String> call(String x) {
return Arrays.asList(x.split(" ")).iterator();
}
});
// Map each word to a count of 1 (transformation)
JavaPairRDD<String, Integer> pairs = words.mapToPair(
new PairFunction<String, String, Integer>() {
public Tuple2<String, Integer> call(String word) {
return new Tuple2<>(word, 1);
}
});
// Sum the counts per word (transformation)
JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey(
new Function2<Integer, Integer, Integer>() {
public Integer call(Integer v1, Integer v2) {
return v1 + v2;
}
}).repartition(1);
// Output directory
String outDir = "/user/walker/output/wordcount";
wordCounts.saveAsTextFile(outDir);
jsc.close();
}
}
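saveAsTextFile writes each pair using Tuple2's default toString, so every output line looks like (word,count). If tab-separated lines are preferred, a minimal sketch (still without lambdas, using the same wordCounts and outDir as above) could convert each pair to a string before saving; this is an illustration, not part of the original program:

// Hedged sketch: write "word<TAB>count" lines instead of Tuple2's "(word,count)" form,
// replacing the wordCounts.saveAsTextFile(outDir) call above
JavaRDD<String> outLines = wordCounts.map(
        new org.apache.spark.api.java.function.Function<Tuple2<String, Integer>, String>() {
            public String call(Tuple2<String, Integer> t) {
                return t._1() + "\t" + t._2();
            }
        });
outLines.saveAsTextFile(outDir);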
RDD + reduceByKey (with lambdas)
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.util.Arrays;
public class WordCount2 {
public static void main(String[] args) {
// Input file
String wordFile = "/user/walker/input/wordcount/idea.txt";
SparkSession spark = SparkSession.builder().appName("wordcount")
.config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
.getOrCreate();
JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
JavaRDD<String> hdfstext = jsc.textFile(wordFile);
// Split lines into words (transformation)
JavaRDD<String> words = hdfstext.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
// Map each word to a count of 1 (transformation)
JavaPairRDD<String, Integer> pairs = words.mapToPair(word -> new Tuple2<>(word, 1));
// Sum the counts per word (transformation)
JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey((v1, v2) -> v1 + v2)
.repartition(1);
// Output directory
String outDir = "/user/walker/output/wordcount2";
wordCounts.saveAsTextFile(outDir);
jsc.close();
}
}
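If the result should be ordered by frequency before saving, a hedged sketch (reusing the wordCounts and outDir variables above) is to swap key and value, sort by the count descending, and swap back; the variable name sorted is illustrative:

// Hedged sketch: sort words by count, descending, before saving
// (replaces the wordCounts.saveAsTextFile(outDir) call above)
JavaPairRDD<String, Integer> sorted = wordCounts
        .mapToPair(t -> t.swap())    // (word, count) -> (count, word)
        .sortByKey(false)            // descending by count
        .mapToPair(t -> t.swap());   // back to (word, count)
sorted.saveAsTextFile(outDir);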
RDD + countByValue
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
public class WordCount3 {
public static void main(String[] args) {
// Input file
String wordFile = "/user/walker/input/wordcount/idea.txt";
SparkSession spark = SparkSession.builder().appName("wordcount")
.config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
.getOrCreate();
JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
JavaRDD<String> hdfstext = jsc.textFile(wordFile);
// Split lines into words (transformation)
JavaRDD<String> words = hdfstext.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
// Count occurrences (action)
Map<String, Long> wordCounts = words.countByValue();
// Convert the Map back to an RDD
List<String> lst = new LinkedList<>();
wordCounts.forEach((k, v) -> lst.add(k + "\t" + v));
JavaRDD<String> result = jsc.parallelize(lst).repartition(1);
// Save the result to HDFS
String outDir = "/user/walker/output/wordcount3"; // Output directory
result.saveAsTextFile(outDir);
jsc.close();
}
}
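Note that countByValue is an action that returns the whole Map to the driver, so this variant only suits vocabularies small enough to fit in driver memory. For quick inspection on the driver side, a hedged sketch using plain Java streams could print the most frequent words (the limit of 10 is arbitrary):

// Hedged sketch: print the 10 most frequent words from the driver-side Map
wordCounts.entrySet().stream()
        .sorted(Map.Entry.<String, Long>comparingByValue().reversed())
        .limit(10)
        .forEach(e -> System.out.println(e.getKey() + "\t" + e.getValue()));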
DataFrame/DataSet + SQL
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
public class WordCount4 {
public static void main(String[] args) {
// Input file
String wordFile = "/user/qhy/input/wordcount/idea.txt";
SparkSession spark = SparkSession.builder().appName("wordcount")
.config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
.getOrCreate();
Dataset<Row> dfRow = spark.read().text(wordFile);
// The string returned by row.toString() is wrapped in [], so read the column with row.getString(0) instead
Dataset<String> words = dfRow.flatMap((FlatMapFunction<Row, String>) row ->
Arrays.asList(row.getString(0).split("\\s+")).iterator(), Encoders.STRING());
Dataset<Row> dfWord = words.toDF("word");
dfWord.createOrReplaceTempView("words");
Dataset<Row> dfCnt = dfWord.sqlContext().sql("SELECT word, count(*) FROM words GROUP BY word");
// Save the result to HDFS
String outDir = "/user/qhy/output/wordcount4";
dfCnt.repartition(1).write().csv(outDir);
spark.stop();
}
}
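The count(*) column gets an auto-generated name in the result. If a friendlier column name or an ordering by frequency is wanted, a hedged variant of the query (using spark.sql, which is equivalent to dfWord.sqlContext().sql here) could be:

// Hedged sketch: alias the count column and order by it, descending
// (replaces the dfCnt query above)
Dataset<Row> dfCnt = spark.sql(
        "SELECT word, count(*) AS cnt FROM words GROUP BY word ORDER BY cnt DESC");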
DataSet/DataFrame + groupBy
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
public class WordCount5 {
public static void main(String[] args) {
// Input file
String wordFile = "/user/qhy/input/wordcount/idea.txt";
SparkSession spark = SparkSession.builder().appName("wordcount")
.config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
.getOrCreate();
Dataset<String> ds = spark.read().textFile(wordFile);
Dataset<String> words = ds.flatMap((FlatMapFunction<String, String>) line ->
Arrays.asList(line.split("\\s+")).iterator(), Encoders.STRING());
// words.printSchema(); // value: string (nullable = true)
Dataset<Row> df = words.groupBy("value").count();
// Save the result to HDFS
String outDir = "/user/qhy/output/wordcount5";
df.repartition(1).write().csv(outDir);
spark.stop();
}
}
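groupBy("value").count() yields columns named value and count. A hedged sketch renaming the word column and sorting by count before writing (functions.col comes from org.apache.spark.sql.functions; this is an illustrative variant, not part of the original program):

// Hedged sketch: rename the grouping column and sort by count, descending
// (replaces the df assignment above)
Dataset<Row> df = words.groupBy("value").count()
        .withColumnRenamed("value", "word")
        .orderBy(org.apache.spark.sql.functions.col("count").desc());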
Execution flow diagram
Related reading
This article is from walker snapshot.