1. Transform (transformation operators)

map

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import java.util.ArrayList;
import java.util.List;

/**
 * Maps each element of the data one by one. The transformation can change the type of the element or its value.
 */
public class MapRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("MapRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> nums = new ArrayList<>();
        nums.add(1);
        nums.add(2);
        nums.add(3);
        nums.add(4);

        JavaRDD<Integer> numsRDD = sc.parallelize(nums);

        JavaRDD<Integer> mapRDD = numsRDD.map(new Function<Integer, Integer>() {
            @Override
            public Integer call(Integer value) throws Exception {
                return value * 2;
            }
        });

        mapRDD.collect().forEach(System.out::println);


        JavaRDD<String> fileRDD = sc.textFile("datas/apache.log");

        JavaRDD<String> urlRDD = fileRDD.map(new Function<String, String>() {
            @Override
            public String call(String line) throws Exception {
                return line.split(" ")[6];
            }
        });

        urlRDD.collect().forEach(System.out::println);


        sc.stop();
    }
}

mapPartitions

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Sends the data to the compute nodes one partition at a time; the processing can be anything, even filtering
 * data out.
 *
 * What is the difference between map and mapPartitions?
 * Data-processing angle:
 * map executes element by element within a partition, similar to serial processing, while mapPartitions
 * processes a whole partition as one batch.
 *
 * Functional angle:
 * map transforms and changes the elements of the source data but can neither add nor remove elements.
 * mapPartitions takes an iterator and returns an iterator; there is no requirement that the number of elements
 * stays the same, so it can add or remove elements.
 *
 * Performance angle:
 * map behaves like serial processing, so its performance is lower; mapPartitions behaves like batch processing,
 * so it performs better. However, mapPartitions holds on to a whole partition in memory for a long time, which
 * may exhaust memory and cause out-of-memory errors. When memory is limited, map is recommended instead.
 */
public class MapPartitionsRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("MapPartitionsRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> nums = new ArrayList<>();
        nums.add(1);
        nums.add(2);
        nums.add(3);
        nums.add(4);

        JavaRDD<Integer> numsRDD = sc.parallelize(nums, 2);

        JavaRDD<Integer> mapPartitionsRDD = numsRDD.mapPartitions(new FlatMapFunction<Iterator<Integer>, Integer>() {
            @Override
            public Iterator<Integer> call(Iterator<Integer> iterator) throws Exception {
                // Note: this line is printed only twice. Why? Because there are two partitions and each partition is processed once
                System.out.println("xxxxxxxxxxx");
                List<Integer> result = new ArrayList<>();
                while (iterator.hasNext()) {
                    Integer num = iterator.next();
                    result.add(num * 2);
                }
                return result.iterator();
            }
        });

        mapPartitionsRDD.collect().forEach(System.out::println);

        // Compute the maximum value within each partition
        JavaRDD<Integer> maxPartitionValueRDD = mapPartitionsRDD.mapPartitions(new FlatMapFunction<Iterator<Integer>, Integer>() {
            @Override
            public Iterator<Integer> call(Iterator<Integer> iterator) throws Exception {

                List<Integer> result = new ArrayList<>();
                Integer maxValue = Integer.MIN_VALUE;
                while (iterator.hasNext()) {
                    Integer value = iterator.next();
                    if (value > maxValue) {
                        maxValue = value;
                    }
                }
                result.add(maxValue);
                return result.iterator();
            }
        });

        maxPartitionValueRDD.collect().forEach(System.out::println);


        sc.stop();
    }
}

mapPartitionsWithIndex

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

/**
 * Sends the data to the compute nodes one partition at a time; any processing is allowed, even filtering data
 * out, and the index of the current partition is available while processing.
 */
public class MapPartitionsWithIndexRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("MapPartitionsWithIndexRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> nums = new ArrayList<>();
        nums.add(1);
        nums.add(2);
        nums.add(3);
        nums.add(4);

        JavaRDD<Integer> numsRDD = sc.parallelize(nums, 2);

        Function2<Integer, Iterator<Integer>, Iterator<Integer>> mpIndexFunction = new Function2<Integer, Iterator<Integer>, Iterator<Integer>>() {
            @Override
            public Iterator<Integer> call(Integer index, Iterator<Integer> iterator) throws Exception {
                if(index == 0){
                    return iterator;
                }
                // Return an empty iterator for all other partitions
                return Collections.emptyIterator();
            }
        };

        // Note: the second argument of mapPartitionsWithIndex, preservesPartitioning, controls whether the partitioner is preserved
        // The function is declared outside the operator call above
        JavaRDD<Integer> mpRDD = numsRDD.mapPartitionsWithIndex(mpIndexFunction, true);

        mpRDD.collect().forEach(System.out::println);

        sc.stop();
    }
}

flatMap

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

/**
 * Flattens the data first and then maps it, which is why this operator is also called a flat map. Put simply,
 * one input element can produce multiple output elements.
 */
public class FlatMapRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("FlatMapRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> fileRDD = sc.textFile("datas/wc");

        JavaRDD<String> wordRDD = fileRDD.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String line) throws Exception {
                return Arrays.stream(line.split(" ")).iterator();
            }
        });

        wordRDD.collect().forEach(System.out::println);

        List<ArrayList<Integer>> nums = new ArrayList<>();

        ArrayList<Integer> nums1 = new ArrayList<>();
        nums1.add(1);
        nums1.add(2);
        nums.add(nums1);

        ArrayList<Integer> nums2 = new ArrayList<>();
        nums2.add(3);
        nums2.add(4);
        nums.add(nums2);

        JavaRDD<ArrayList<Integer>> numsRDD = sc.parallelize(nums);

        JavaRDD<Integer> numsFlatMapRDD = numsRDD.flatMap(new FlatMapFunction<ArrayList<Integer>, Integer>() {
            @Override
            public Iterator<Integer> call(ArrayList<Integer> integers) throws Exception {
                return integers.iterator();
            }
        });

        numsFlatMapRDD.collect().forEach(System.out::println);


        sc.stop();
    }
}

mapValues

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;

/**
 * Operates only on the values; the keys stay unchanged.
 */
public class MapValuesRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("MapValuesRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);


        List<Tuple2<String, Integer>> userInfos = new ArrayList<>();
        userInfos.add(Tuple2.apply("Alice", 300));
        userInfos.add(Tuple2.apply("zhangsan", 200));
        userInfos.add(Tuple2.apply("lisi", 309));
        userInfos.add(Tuple2.apply("wagnwu", 201));
        userInfos.add(Tuple2.apply("mayun", 234));
        userInfos.add(Tuple2.apply("haha", 223));

        JavaPairRDD<String, Integer> userInfosRDD = sc.parallelizePairs(userInfos, 2);

        // Give everyone a raise of 100
        JavaPairRDD<String, Integer>  userInfosSalaryAdd100 = userInfosRDD.mapValues(new Function<Integer, Integer>() {
            @Override
            public Integer call(Integer v1) throws Exception {
                return v1 + 100;
            }
        });

        userInfosSalaryAdd100.collect().forEach(System.out::println);

        sc.stop();
    }
}

glom

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Turns the data of each partition directly into an in-memory list of the same element type; the partitioning
 * stays unchanged.
 */
public class GlomRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("GlomRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> nums = new ArrayList<>();
        nums.add(1);
        nums.add(2);
        nums.add(3);
        nums.add(4);

        JavaRDD<Integer> numsRDD = sc.parallelize(nums, 2);

        JavaRDD<List<Integer>> glomRDD = numsRDD.glom();
        JavaRDD<Integer> mapRDD = glomRDD.map(new Function<List<Integer>, Integer>() {
            @Override
            public Integer call(List<Integer> nums) throws Exception {
                return Collections.max(nums);
            }
        });

        List<Integer> resultList = mapRDD.collect();
        Integer result = resultList.stream().reduce(Integer::sum).orElse(0);
        System.out.println(result);

        sc.stop();
    }
}

groupBy
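
groupBy groups the elements of an RDD by the key returned from a user-supplied function. A minimal sketch (the class name GroupByRDD and the grouping by parity are illustrative assumptions):

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import java.util.Arrays;

public class GroupByRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("GroupByRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<Integer> numsRDD = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2);

        // Group elements by the value returned from the function; here by parity (0 = even, 1 = odd)
        JavaPairRDD<Integer, Iterable<Integer>> groupByRDD = numsRDD.groupBy(new Function<Integer, Integer>() {
            @Override
            public Integer call(Integer value) throws Exception {
                return value % 2;
            }
        });

        groupByRDD.collect().forEach(System.out::println);

        sc.stop();
    }
}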


filter

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.List;

/**
 * Filters the data according to the given rule: elements that match the rule are kept, the rest are discarded.
 * After filtering, the number of partitions stays the same, but the data inside the partitions may become
 * unbalanced. In production this can lead to data skew, so it is common to repartition after a filter
 * (see the sketch below).
 */
public class FilterRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("FilterRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);


        JavaRDD<String> logFileRDD = sc.textFile("datas/apache.log");

        JavaRDD<String> filterRDD = logFileRDD.filter(new Function<String, Boolean>() {
            @Override
            public Boolean call(String value) throws Exception {
                return value.contains("7/05/2015");
            }
        });
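
        // Illustrative sketch (rebalancedRDD is a hypothetical follow-up step): after a selective filter the
        // remaining partitions may be sparse or skewed, so they can be rebalanced, e.g. via repartition.
        JavaRDD<String> rebalancedRDD = filterRDD.repartition(2);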

        JavaRDD<String> mapRDD = filterRDD.map(new Function<String, String>() {
            @Override
            public String call(String value) throws Exception {
                String[] fields = value.split(" ");
                return fields[6];
            }
        });

        mapRDD.collect().forEach(System.out::println);

        sc.stop();
    }
}

sample

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import java.util.ArrayList;
import java.util.List;

/**
 * Mainly used to get a quick look at the distribution of the data.
 */
public class SampleRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("SampleRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> nums = new ArrayList<>();
        nums.add(1);
        nums.add(2);
        nums.add(3);
        nums.add(4);

        JavaRDD<Integer> numsRDD = sc.parallelize(nums);
        /**
         * First parameter : whether sampled elements are replaced; false = sampling without replacement, true = with replacement
         * Second parameter : without replacement, the probability in [0, 1] that each element is selected;
         *                    with replacement, the expected number of times each element is selected (may be greater than 1)
         * Third parameter : the random seed
         */
        JavaRDD<Integer> sampleRDD1 = numsRDD.sample(false, 0.5);
        JavaRDD<Integer> sampleRDD2 = numsRDD.sample(true, 3);
        sampleRDD1.collect().forEach(System.out::println);
        System.out.println("**************************");
        sampleRDD2.collect().forEach(System.out::println);

        sc.stop();
    }
}

distinct

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.ArrayList;
import java.util.List;

/**
 * Removes duplicate elements from the dataset.
 */
public class DistinctRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("DistinctRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> nums = new ArrayList<>();
        nums.add(1);
        nums.add(1);
        nums.add(2);
        nums.add(3);
        nums.add(3);
        nums.add(1);

        JavaRDD<Integer> numsRDD = sc.parallelize(nums, 2);

        JavaRDD<Integer> distinctRDD = numsRDD.distinct(2);
        distinctRDD.collect().forEach(System.out::println);

        sc.stop();
    }
}

coalesce

package com.journey.core.rdd.transform;

import com.clearspring.analytics.util.Lists;
import org.apache.commons.collections.IteratorUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Shrinks the number of partitions according to the data volume; after filtering a large dataset this improves
 * the execution efficiency of the resulting small dataset.
 * When a Spark job has too many small tasks, coalesce can merge partitions to reduce the partition count and
 * lower the task-scheduling overhead.
 */
public class CoalesceRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("CoalesceRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> nums = new ArrayList<>();
        nums.add(1);
        nums.add(2);
        nums.add(3);
        nums.add(4);
        nums.add(5);
        nums.add(6);

        JavaRDD<Integer> numsRDD = sc.parallelize(nums, 6);

        /**
         * Note that coalesce defaults to shuffle = false: when shrinking the partition count it simply merges
         * existing partitions.
         * Without shuffle, coalesce cannot increase the number of partitions.
         */
        JavaRDD<Integer> coalesceRDD = numsRDD.coalesce(2);
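
        // Illustrative sketch: without shuffle, coalesce cannot raise the partition count above the current
        // one, while coalesce(n, true) can.
        System.out.println(numsRDD.coalesce(10).getNumPartitions());       // still 6
        System.out.println(numsRDD.coalesce(10, true).getNumPartitions()); // 10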

        coalesceRDD.saveAsTextFile("datas/output");
        sc.stop();
    }
}

repartition

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.ArrayList;
import java.util.List;

/**
 * Internally this operation just calls coalesce with shuffle = true. Whether you turn an RDD with many
 * partitions into one with few, or an RDD with few partitions into one with many, repartition can do it,
 * because a shuffle always takes place.
 */
public class RepartitionRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("RepartitionRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> nums = new ArrayList<>();
        nums.add(1);
        nums.add(2);
        nums.add(3);
        nums.add(4);
        nums.add(5);
        nums.add(6);

        JavaRDD<Integer> numsRDD = sc.parallelize(nums, 6);

        JavaRDD<Integer> coalesceRDD = numsRDD.repartition(10);
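        // Illustrative sketch: repartition(10) is equivalent to calling coalesce with shuffle enabled.
        JavaRDD<Integer> viaCoalesceRDD = numsRDD.coalesce(10, true);
        System.out.println(viaCoalesceRDD.getNumPartitions()); // 10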

        coalesceRDD.saveAsTextFile("datas/output");
        sc.stop();
    }
}

intersection & union & subtract & zip

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import java.util.ArrayList;
import java.util.List;

/**
 * Set-style operations on two RDDs:
 * intersection returns the elements that appear in both RDDs, union concatenates the two RDDs (keeping
 * duplicates), subtract returns the elements of the first RDD that do not appear in the second, and zip pairs
 * the elements of the two RDDs by position.
 */
public class IntersectionRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("SortByRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> nums1 = new ArrayList<>();
        nums1.add(1);
        nums1.add(2);
        nums1.add(3);
        nums1.add(4);

        List<Integer> nums2 = new ArrayList<>();
        nums2.add(3);
        nums2.add(4);
        nums2.add(5);
        nums2.add(6);

        List<String> nums3 = new ArrayList<>();
        nums3.add("3");


        JavaRDD<Integer> nums1RDD = sc.parallelize(nums1,1);
        JavaRDD<Integer> nums2RDD = sc.parallelize(nums2,1);

        // The two RDDs must have the same element type
        JavaRDD<Integer> intersectionRDD = nums1RDD.intersection(nums2RDD);
        JavaRDD<Integer> unionRDD = nums1RDD.union(nums2RDD);
        // The two RDDs must have the same element type
        JavaRDD<Integer> subtractRDD = nums1RDD.subtract(nums2RDD);
        // The two RDDs must have the same element type and the same number of partitions (and the same number of elements per partition)
        JavaPairRDD<Integer, Integer> zipRDD = nums1RDD.zip(nums2RDD);

        intersectionRDD.collect().forEach(System.out::println);
        System.out.println("******************************");
        unionRDD.collect().forEach(System.out::println);
        System.out.println("******************************");
        subtractRDD.collect().forEach(System.out::println);
        System.out.println("******************************");
        zipRDD.collect().forEach(System.out::println);

        sc.stop();
    }
}

partitionBy

package com.journey.core.rdd.transform;

import org.apache.spark.Partitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;

/**
 * Repartitions the data according to the given Partitioner. Spark's default partitioner is the HashPartitioner.
 */
public class PartitionerByRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("PartitionerByRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Tuple2<String, String>> infos = new ArrayList<>();
        infos.add(Tuple2.apply("1305261989234", "zhangsan"));
        infos.add(Tuple2.apply("1505261989234", "lisi"));
        infos.add(Tuple2.apply("1305261982343", "wagnwu"));
        infos.add(Tuple2.apply("1505261382343", "zhaoliu"));

        // Put keys starting with 130 into one partition and keys starting with 150 into another
        // Note: for key-value pairs, parallelizePairs must be used instead of parallelize
        JavaPairRDD<String, String> infosRDD = sc.parallelizePairs(infos, 2);

        JavaPairRDD<String, String> partitionByRDD = infosRDD.partitionBy(new Partitioner() {
            @Override
            public int numPartitions() {
                return 2;
            }

            @Override
            public int getPartition(Object key) {
                String item = key.toString();
                if (item.startsWith("130")) {
                    return 0;
                } else if (item.startsWith("150")) {
                    return 1;
                }
                return 0;
            }
        });

        partitionByRDD.collect().forEach(System.out::println);
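
        // Illustrative sketch: the built-in HashPartitioner, Spark's default, distributes records by
        // key.hashCode() modulo the number of partitions.
        JavaPairRDD<String, String> hashPartitionedRDD =
                infosRDD.partitionBy(new org.apache.spark.HashPartitioner(2));
        System.out.println(hashPartitionedRDD.getNumPartitions()); // 2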


        sc.stop();
    }
}

reduceByKey

package com.journey.core.rdd.transform;

import org.apache.spark.Partitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;

/**
 * Aggregates the values that share the same key.
 */
public class ReduceByKeyRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("ReduceByKeyRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<String> words = new ArrayList<>();
        words.add("Hello");
        words.add("Spark");
        words.add("Spark");
        words.add("World");

        JavaRDD<String> wordsRDD = sc.parallelize(words);
        JavaPairRDD<String, Integer> wordToPairRDD = wordsRDD.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return Tuple2.apply(word, 1);
            }
        });

        JavaPairRDD<String, Integer> wordCountRDD = wordToPairRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        wordCountRDD.collect().forEach(System.out::println);

        sc.stop();
    }
}

groupByKey

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

/**
 * What is the difference between reduceByKey and groupByKey?
 * From the shuffle angle: both involve a shuffle, but reduceByKey pre-aggregates (combines) the values of
 * identical keys within each partition before the shuffle, which reduces the amount of data written to disk;
 * groupByKey only groups and does not reduce the data volume, so reduceByKey performs better.
 *
 * From the functional angle: reduceByKey combines grouping and aggregation, while groupByKey can only group
 * and cannot aggregate. For grouping plus aggregation, reduceByKey is recommended; if you only need grouping
 * without aggregation, groupByKey is the one to use.
 */
public class GroupByKeyRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("GroupByKeyRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<String> words = new ArrayList<>();
        words.add("Hello");
        words.add("Spark");
        words.add("Spark");
        words.add("World");

        JavaRDD<String> wordsRDD = sc.parallelize(words);
        JavaPairRDD<String, Integer> wordToPairRDD = wordsRDD.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return Tuple2.apply(word, 1);
            }
        });

        JavaPairRDD<String, Iterable<Integer>> wordGroupByRDD = wordToPairRDD.groupByKey();

        JavaPairRDD<String, Integer> wordCountRDD = wordGroupByRDD.mapValues(new Function<Iterable<Integer>, Integer>() {
            @Override
            public Integer call(Iterable<Integer> iterable) throws Exception {
                return ((Collection<?>) iterable).size();
            }
        });

        wordCountRDD.collect().forEach(System.out::println);


        sc.stop();
    }
}

aggregateByKey

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

/**
 * The first parameter is the initial (zero) value,
 * the second parameter is the intra-partition combine function,
 * the third parameter is the inter-partition combine function.
 */
public class AggregateByKeyRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("AggregateByKeyRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Tuple2<String, Integer>> words = new ArrayList<>();
        words.add(Tuple2.apply("Hello", 3));
        words.add(Tuple2.apply("Spark", 2));
        words.add(Tuple2.apply("Hello", 10));
        words.add(Tuple2.apply("Spark", 17));

        JavaPairRDD<String, Integer> wordsRDD = sc.parallelizePairs(words, 2);



        // The initial value of aggregateByKey only participates in the intra-partition computation
        JavaPairRDD<String, Integer> aggregateByKeyRDD = wordsRDD.aggregateByKey(10,
                new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        }, new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
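        // Worked example (assuming the four pairs are split 2/2 across the two partitions):
        // partition 0 : Hello -> 10 + 3 = 13,  Spark -> 10 + 2 = 12
        // partition 1 : Hello -> 10 + 10 = 20, Spark -> 10 + 17 = 27
        // across partitions : Hello -> 13 + 20 = 33, Spark -> 12 + 27 = 39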

        aggregateByKeyRDD.collect().forEach(System.out::println);

        // The initial value of aggregateByKey only participates in the intra-partition computation
        JavaPairRDD<String, Integer> aggregateByKeyRDD2 = wordsRDD.aggregateByKey(10,
                new Function2<Integer, Integer, Integer>() {
                    @Override
                    public Integer call(Integer v1, Integer v2) throws Exception {
                        // Take the maximum within the partition
                        return Math.max(v1, v2);
                    }
                }, new Function2<Integer, Integer, Integer>() {
                    @Override
                    public Integer call(Integer v1, Integer v2) throws Exception {
                        return v1 + v2;
                    }
                });

        aggregateByKeyRDD2.collect().forEach(System.out::println);

        sc.stop();
    }
}

foldByKey

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;

/**
 * The first parameter is the initial (zero) value,
 * the second parameter is the combine function, used both within and across partitions.
 */
public class FoldByKeyRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("FoldByKeyRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Tuple2<String, Integer>> words = new ArrayList<>();
        words.add(Tuple2.apply("Hello", 3));
        words.add(Tuple2.apply("Spark", 2));
        words.add(Tuple2.apply("Hello", 10));
        words.add(Tuple2.apply("Spark", 17));

        JavaPairRDD<String, Integer> wordsRDD = sc.parallelizePairs(words, 2);

        JavaPairRDD<String, Integer> foldByKeyRDD = wordsRDD.foldByKey(10,
                new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
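        // Worked example (assuming the four pairs are split 2/2 across the two partitions):
        // Hello -> (10 + 3) + (10 + 10) = 33, Spark -> (10 + 2) + (10 + 17) = 39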

        foldByKeyRDD.collect().forEach(System.out::println);


        sc.stop();
    }
}

combineByKey

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;

/**
 * Example: computing an average per key.
 * The first parameter only converts the first value of each key into the accumulator structure,
 * the second parameter is the intra-partition combine function,
 * the third parameter is the inter-partition combine function.
 *
 * reduceByKey : the first value of each key goes through no computation; intra- and inter-partition rules are the same
 * foldByKey : the first value of each key is combined with the initial value within the partition; intra- and inter-partition rules are the same
 * aggregateByKey : the first value of each key is combined with the initial value within the partition; intra- and inter-partition rules can differ
 * combineByKey : when the data structure does not fit the computation, the first value of each key can be converted into a new structure; intra- and inter-partition rules can differ
 */
public class CombineByKeyRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("CombineByKeyRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Tuple2<String, Integer>> words = new ArrayList<>();
        words.add(Tuple2.apply("Hello", 3));
        words.add(Tuple2.apply("Spark", 2));
        words.add(Tuple2.apply("Hello", 3));
        words.add(Tuple2.apply("Spark", 2));
        words.add(Tuple2.apply("Spark", 2));
        words.add(Tuple2.apply("Spark", 2));

        JavaPairRDD<String, Integer> wordsRDD = sc.parallelizePairs(words, 2);

        JavaPairRDD<String, Tuple2<Integer, Integer>> combineByKeyRDD = wordsRDD.combineByKey(new Function<Integer, Tuple2<Integer, Integer>>() {
            @Override
            public Tuple2<Integer, Integer> call(Integer v1) throws Exception {
                return Tuple2.apply(v1, 1);
            }
        }, new Function2<Tuple2<Integer, Integer>, Integer, Tuple2<Integer, Integer>>() {
            @Override
            public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> v1, Integer v2) throws Exception {
                return Tuple2.apply(v1._1 + v2, v1._2 + 1);
            }
        }, new Function2<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>>() {
            @Override
            public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> v1, Tuple2<Integer, Integer> v2) throws Exception {
                return Tuple2.apply(v1._1 + v2._1, v1._2 + v2._2);
            }
        });
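        // Worked example: Hello -> (3 + 3) / 2 = 3, Spark -> (2 + 2 + 2 + 2) / 4 = 2 (integer division)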

        combineByKeyRDD.collect().forEach(t -> {
            String key = t._1;
            Tuple2<Integer, Integer> tuple = t._2;
            System.out.println(key + ":" + tuple._1 / tuple._2);
        });

        JavaPairRDD<String, Integer> wordCountRDD = wordsRDD.combineByKey(new Function<Integer, Integer>() {
            @Override
            public Integer call(Integer v1) throws Exception {
                return v1;
            }
        }, new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        }, new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        wordCountRDD.collect().forEach(System.out::println);

        sc.stop();
    }
}

sortByKey

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;

/**
 * Sorts the elements by key.
 */
public class SortByKeyRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("CombineByKeyRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Tuple2<String, Integer>> words = new ArrayList<>();
        words.add(Tuple2.apply("Alice", 3));
        words.add(Tuple2.apply("zhangsan", 2));
        words.add(Tuple2.apply("lisi", 3));
        words.add(Tuple2.apply("wagnwu", 2));
        words.add(Tuple2.apply("mayun", 2));
        words.add(Tuple2.apply("haha", 2));

        JavaPairRDD<String, Integer> wordsRDD = sc.parallelizePairs(words, 2);

        // Ascending by default; descending order or a custom ordering can also be specified (see the sketch below)
        JavaPairRDD<String, Integer> sortWordsRDD = wordsRDD.sortByKey(true);

        sortWordsRDD.collect().forEach(System.out::println);
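
        // Illustrative sketch: a custom ordering can be passed in as a Comparator; it must also be
        // Serializable so that Spark can ship it to the executors. LengthComparator is a hypothetical name.
        class LengthComparator implements java.util.Comparator<String>, java.io.Serializable {
            @Override
            public int compare(String a, String b) {
                return Integer.compare(a.length(), b.length());
            }
        }
        JavaPairRDD<String, Integer> sortByLengthRDD = wordsRDD.sortByKey(new LengthComparator());
        sortByLengthRDD.collect().forEach(System.out::println);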

        sc.stop();
    }
}

join & leftOuterJoin

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.Optional;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;

/**
 * Called on RDDs of type (K, V) and (K, W); returns an RDD of (K, (V, W)) in which all elements with the same
 * key are joined together.
 */
public class JoinRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("JoinRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Tuple2<Integer, String>> userInfos = new ArrayList<>();
        userInfos.add(Tuple2.apply(1, "zhagnsan"));
        userInfos.add(Tuple2.apply(2, "lisi"));
        userInfos.add(Tuple2.apply(3, "lisi"));


        List<Tuple2<Integer, String>> orders = new ArrayList<>();
        orders.add(Tuple2.apply(1, "iphone pad"));
        orders.add(Tuple2.apply(1, "mac pad"));
        orders.add(Tuple2.apply(2, "java book"));

        JavaPairRDD<Integer, String> userInfosRDD = sc.parallelizePairs(userInfos, 2);
        JavaPairRDD<Integer, String> ordersRDD = sc.parallelizePairs(orders, 2);

        JavaPairRDD<Integer, Tuple2<String, String>> joinRDD = userInfosRDD.join(ordersRDD);

        joinRDD.collect().forEach(System.out::println);

        // Left outer join: every key from the left side appears; when the right side has no match the value is Optional.empty
        JavaPairRDD<Integer, Tuple2<String, Optional<String>>> leftOuterJoinRDD = userInfosRDD.leftOuterJoin(ordersRDD);
        leftOuterJoinRDD.collect().forEach(System.out::println);


        sc.stop();
    }
}

cogroup

package com.journey.core.rdd.transform;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.Optional;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;

/**
 * Elements with the same key are grouped together; the value for each source RDD is an Iterable.
 */
public class CogroupRDD {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("CogroupRDD")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Tuple2<Integer, String>> userInfos = new ArrayList<>();
        userInfos.add(Tuple2.apply(1, "zhagnsan"));
        userInfos.add(Tuple2.apply(2, "lisi"));
        userInfos.add(Tuple2.apply(3, "lisi"));


        List<Tuple2<Integer, String>> orders = new ArrayList<>();
        orders.add(Tuple2.apply(1, "iphone pad"));
        orders.add(Tuple2.apply(1, "mac pad"));
        orders.add(Tuple2.apply(2, "java book"));

        JavaPairRDD<Integer, String> userInfosRDD = sc.parallelizePairs(userInfos, 2);
        JavaPairRDD<Integer, String> ordersRDD = sc.parallelizePairs(orders, 2);

        JavaPairRDD<Integer, Tuple2<Iterable<String>, Iterable<String>>> cogroupRDD = userInfosRDD.cogroup(ordersRDD);

        cogroupRDD.collect().forEach(System.out::println);

        sc.stop();
    }
}

Top N example

package com.journey.core.rdd.transform;

import org.apache.commons.collections.IteratorUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import scala.Tuple3;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;


/**
 * If the sub-list view returned by subList() is handed back to Spark directly, the job fails with:
 *
 * Serialization stack:
 *     - object not serializable (class: java.util.ArrayList$SubList, value: [(16,26), (26,25), (1,23)])
 *     - field (class: scala.Tuple2, name: _2, type: class java.lang.Object)
 *     - object (class scala.Tuple2, (7,[(16,26), (26,25), (1,23)]))
 *     - element of array (index: 0)
 *     - array (class [Lscala.Tuple2;, size 5)
 *     at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:41)
 *     at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
 *     at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
 *     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:489)
 *     at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
 *     at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
 *     at java.base/java.lang.Thread.run(Thread.java:835)
 * 23/05/09 20:29:01 ERROR Executor: Exception in task 0.0 in stage 2.0 (TID 4)
 * java.io.NotSerializableException: java.util.ArrayList$SubList
 * Serialization stack:
 *
 * The fix :
 * The List returned by the subList() method is an instance of 'RandomAccessSubList', which is not serializable.
 * Therefore you need to create a new ArrayList from the list returned by subList().
 */
public class Demo {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("Demo")
                .setMaster("local[*]");

        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> logRDD = sc.textFile("datas/agent.log");


        JavaPairRDD<Tuple2<String, String>, Integer> proviceAdRDD = logRDD.mapToPair(new PairFunction<String, Tuple2<String, String>, Integer>() {
            @Override
            public Tuple2<Tuple2<String, String>, Integer> call(String line) throws Exception {
                String[] fields = line.split(" ");
                String provice = fields[1];
                String ad = fields[4];
                return Tuple2.apply(Tuple2.apply(provice, ad), 1);
            }
        });

        JavaPairRDD<Tuple2<String, String>, Integer> proviceAdToCountRDD = proviceAdRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });


        JavaPairRDD<String, Tuple2<String, Integer>> proviceToAdCountRDD = proviceAdToCountRDD.mapToPair(new PairFunction<Tuple2<Tuple2<String, String>, Integer>, String, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Tuple2<String, Integer>> call(Tuple2<Tuple2<String, String>, Integer> value) throws Exception {
                return Tuple2.apply(value._1._1, Tuple2.apply(value._1._2, value._2));
            }
        });

        JavaPairRDD<String, Iterable<Tuple2<String, Integer>>> proviceToAdGroupRDD = proviceToAdCountRDD.groupByKey();

        // Sort within each group and take the top N of each group
        JavaPairRDD<String , Iterable<Tuple2<String , Integer>>> proviceToAdTop3RDD = proviceToAdGroupRDD.mapToPair(new PairFunction<Tuple2<String, Iterable<Tuple2<String, Integer>>>, String, Iterable<Tuple2<String, Integer>>>() {
            @Override
            public Tuple2<String, Iterable<Tuple2<String, Integer>>> call(Tuple2<String, Iterable<Tuple2<String, Integer>>> iterable) throws Exception {
                List<Tuple2<String, Integer>> result = IteratorUtils.toList(iterable._2.iterator());
                Collections.sort(result, new Comparator<Tuple2<String, Integer>>() {
                    @Override
                    public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
                        return o2._2 - o1._2;
                    }
                });
                // Important: wrap the result in new ArrayList<>(...), because the view returned by subList() is not serializable
                return Tuple2.apply(iterable._1, new ArrayList<>(result.subList(0, Math.min(3, result.size()))));
            }
        });

//        proviceToAdTop3RDD.foreach(new VoidFunction<Tuple2<String, Iterable<Tuple2<String, Integer>>>>() {
//            @Override
//            public void call(Tuple2<String, Iterable<Tuple2<String, Integer>>> stringIterableTuple2) throws Exception {
//                System.out.println(stringIterableTuple2);
//            }
//        });

        proviceToAdTop3RDD.collect().forEach(System.out::println);


        sc.stop();
    }
}

2. Action (action operators)

reduce

package com.journey.core.rdd.action;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

import java.util.ArrayList;
import java.util.List;

/**
 * Aggregates all elements of the RDD: first within each partition, then across partitions.
 */
public class ReduceRDD {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf()
                .setAppName("ReduceRDD")
                .setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        List<Integer> nums = new ArrayList<>();
        nums.add(1);
        nums.add(2);
        nums.add(3);
        nums.add(4);

        JavaRDD<Integer> numsRDD = sc.parallelize(nums, 2);

        Integer result = numsRDD.reduce(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        System.out.println(result);


        sc.stop();
    }
}

collect

package com.journey.core.rdd.action;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkJobInfo;
import org.apache.spark.SparkStageInfo;
import org.apache.spark.api.java.JavaFutureAction;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

import java.util.ArrayList;
import java.util.List;

/**
 * collect pulls the data back to the Driver. Note: with a large dataset this may cause the Driver to run out
 * of memory.
 */
public class CollectRDD {

    public static void main(String[] args) throws Exception {
        SparkConf sparkConf = new SparkConf()
                .setAppName("CollectRDD")
                .setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        List<Integer> nums = new ArrayList<>();
        nums.add(1);
        nums.add(2);
        nums.add(3);
        nums.add(4);

        JavaRDD<Integer> numsRDD = sc.parallelize(nums, 2);

        // Synchronous collection
//        numsRDD.collect().forEach(System.out::println);

        // Asynchronous collection
        JavaFutureAction<List<Integer>> jobFuture = numsRDD.collectAsync();
        while (!jobFuture.isDone()) {
            Thread.sleep(1000);  // 1 second
            List<Integer> jobIds = jobFuture.jobIds();
            if (jobIds.isEmpty()) {
                continue;
            }
            int currentJobId = jobIds.get(jobIds.size() - 1);
            SparkJobInfo jobInfo = sc.statusTracker().getJobInfo(currentJobId);
            SparkStageInfo stageInfo = sc.statusTracker().getStageInfo(jobInfo.stageIds()[0]);
            System.out.println(stageInfo.numTasks() + " tasks total: " + stageInfo.numActiveTasks() +
                    " active, " + stageInfo.numCompletedTasks() + " complete");
        }

        if (jobFuture.isDone()) {
            List<Integer> result = jobFuture.get();
            System.out.println(result);
        }

        sc.stop();
    }
}

count

package com.journey.core.rdd.action;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.ArrayList;
import java.util.List;

/**
 * Counts the number of elements in the RDD.
 */
public class CountRDD {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf()
                .setAppName("CountRDD")
                .setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        List<Integer> nums = new ArrayList<>();
        nums.add(1);
        nums.add(2);
        nums.add(3);
        nums.add(4);

        JavaRDD<Integer> numsRDD = sc.parallelize(nums, 2);

        long count = numsRDD.count();
        System.out.println(count);

        sc.stop();
    }
}

first

package com.journey.core.rdd.action;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.ArrayList;
import java.util.List;

/**
 * Returns the first element of the RDD.
 */
public class FirstRDD {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf()
                .setAppName("FirstRDD")
                .setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        List<Integer> nums = new ArrayList<>();
        nums.add(1);
        nums.add(2);
        nums.add(3);
        nums.add(4);

        JavaRDD<Integer> numsRDD = sc.parallelize(nums, 2);

        long firstItem = numsRDD.first();
        System.out.println(firstItem);

        sc.stop();
    }
}

take

package com.journey.core.rdd.action;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.ArrayList;
import java.util.List;

/**
 * Returns the first n elements of the RDD.
 */
public class TakeRDD {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf()
                .setAppName("TakeRDD")
                .setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        List<Integer> nums = new ArrayList<>();
        nums.add(1);
        nums.add(2);
        nums.add(3);
        nums.add(4);

        JavaRDD<Integer> numsRDD = sc.parallelize(nums, 2);

        List<Integer> items = numsRDD.take(3);
        System.out.println(items);

        sc.stop();
    }
}

takeOrdered

package com.journey.core.rdd.action;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.ArrayList;
import java.util.List;

/**
 * Returns the first n elements of the RDD after sorting.
 */
public class TakeOrderedRDD {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf()
                .setAppName("TakeOrderedRDD")
                .setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        List<Integer> nums = new ArrayList<>();
        nums.add(10);
        nums.add(22);
        nums.add(3);
        nums.add(40);

        JavaRDD<Integer> numsRDD = sc.parallelize(nums, 2);

        // Ascending by default; a custom Comparator can be passed in
        List<Integer> items = numsRDD.takeOrdered(2);
        System.out.println(items);

        sc.stop();
    }
}

aggregate

package com.journey.core.rdd.action;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

import java.util.ArrayList;
import java.util.List;

/**
 * Aggregates within each partition starting from the initial value, then aggregates across partitions using
 * the initial value again. This differs from aggregateByKey, where the initial value only takes part in the
 * intra-partition computation.
 */
public class AggregateRDD {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf()
                .setAppName("AggregateRDD")
                .setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        List<Integer> nums = new ArrayList<>();
        nums.add(10);
        nums.add(10);

        JavaRDD<Integer> numsRDD = sc.parallelize(nums, 2);

        /**
         * Partition 1 (intra-partition) : initial value (10) + 10
         * Partition 2 (intra-partition) : initial value (10) + 10
         *
         * Across partitions : initial value (10) + 20 + 20
         *
         * Note : both aggregateByKey and aggregate depend on the partitioning; with a different number of
         * partitions the initial value contributes differently to the result
         */
        Integer sum = numsRDD.aggregate(10, new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        }, new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        System.out.println(sum);

        sc.stop();
    }
}

fold

package com.journey.core.rdd.action;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

import java.util.ArrayList;
import java.util.List;

/**
 * Similar to aggregate, except the intra-partition and inter-partition functions must be the same.
 */
public class FoldRDD {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf()
                .setAppName("FoldRDD")
                .setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        List<Integer> nums = new ArrayList<>();
        nums.add(10);
        nums.add(10);

        JavaRDD<Integer> numsRDD = sc.parallelize(nums, 2);

        Integer sum = numsRDD.fold(10, new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
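        // Worked example: (10 + 10) + (10 + 10) + 10 = 50 - the initial value is applied once in each of the
        // two partitions and once more when the partition results are merged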

        System.out.println(sum);

        sc.stop();
    }
}

countByKey

package com.journey.core.rdd.action;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaFutureAction;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Counts how many times each key occurs.
 */
public class CountByKeyRDD {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf()
                .setAppName("CountByKeyRDD")
                .setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        List<Tuple2<String, Integer>> userInfos = new ArrayList<>();
        userInfos.add(Tuple2.apply("zhangsan", 23));
        userInfos.add(Tuple2.apply("lisi", 30));

        JavaPairRDD<String, Integer> userInfosRDD = sc.parallelizePairs(userInfos, 2);

        Map<String, Long> countByKey = userInfosRDD.countByKey();
        System.out.println(countByKey);

        sc.stop();
    }
}

save

package com.journey.core.rdd.action;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Save-related operators.
 */
public class SaveRDD {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf()
                .setAppName("SaveRDD")
                .setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        List<Tuple2<String, Integer>> userInfos = new ArrayList<>();
        userInfos.add(Tuple2.apply("zhangsan", 23));
        userInfos.add(Tuple2.apply("lisi", 30));

        JavaPairRDD<String, Integer> userInfosRDD = sc.parallelizePairs(userInfos, 2);

        // Save as a text file
        userInfosRDD.saveAsTextFile("datas/output1");
        // Serialize the objects and save them to a file
        userInfosRDD.saveAsObjectFile("datas/output2");

        sc.stop();
    }
}

foreach

package com.journey.core.rdd.action;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

import java.util.ArrayList;
import java.util.List;

/**
 * foreach differs from collect: collect pulls the data back to the Driver, while foreach runs directly on the
 * Executors, for example to print each element there.
 */
public class ForeachRDD {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf()
                .setAppName("ForeachRDD")
                .setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        List<Integer> nums = new ArrayList<>();
        nums.add(1);
        nums.add(2);
        nums.add(3);
        nums.add(4);

        JavaRDD<Integer> numsRDD = sc.parallelize(nums, 2);

        numsRDD.foreach(new VoidFunction<Integer>() {
            @Override
            public void call(Integer value) throws Exception {
                System.out.println(value);
            }
        });

        sc.stop();
    }
}

If you found this helpful, please like and follow. Thank you very much!

