package com.jason.example

import org.apache.spark.rdd.RDD

/**
 * Demo of common RDD transformations and actions on small in-memory data sets.
 *
 * `spark` and `stop()` are inherited from `SparkInstance`, which is not visible in this file.
 * NOTE(review): `stop()` is assumed to shut down the Spark session/context — confirm in
 * `SparkInstance`.
 *
 * Trailing comments show output recorded from one run; sampled values and the element order
 * of shuffled results may differ between runs.
 */
class RddTest extends SparkInstance {
  val sc = spark.sparkContext
  val rdd: RDD[Int] = sc.parallelize(1 to 10)
  val rdd2: RDD[Int] = sc.parallelize(9 to 15)
  val pairRdd: RDD[(Int, Int)] = rdd2.map(x => (x, x * 2))

  /**
   * Runs the transformation demo and then calls `stop()`.
   *
   * `stop()` is invoked even when a job fails, so the instance is always released.
   * After this returns the instance should be treated as finished; use [[runAll]] to run
   * both demos on one instance.
   */
  def trans(): Unit =
    try runTransformations()
    finally stop()

  /**
   * Runs the action demo and then calls `stop()` (also on failure).
   *
   * After this returns the instance should be treated as finished; use [[runAll]] to run
   * both demos on one instance.
   */
  def actionTest(): Unit =
    try runActions()
    finally stop()

  /**
   * Runs the transformation demo followed by the action demo, calling `stop()` exactly once
   * at the end.
   *
   * [[trans]] and [[actionTest]] each call `stop()` when they finish, so invoking them back
   * to back on the same instance would execute the second one after `stop()` has already
   * been called.
   */
  def runAll(): Unit =
    try {
      runTransformations()
      runActions()
    } finally stop()

  /** Prints the first (at most) 10 elements of `rdd`, comma separated, on one line. */
  def printRdd[U](rdd: RDD[U]): Unit =
    println(rdd.take(10).mkString(","))

  /** Transformation examples; does not call `stop()`. */
  private def runTransformations(): Unit = {
    // --- single-RDD transformations ---
    printRdd(rdd.filter(_ % 2 == 0)) // 2,4,6,8,10
    printRdd(rdd.map(x => 1 to x)) // Range(1),Range(1, 2),Range(1, 2, 3),...,Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    printRdd(rdd.flatMap(x => 1 to x)) // 1,1,2,1,2,3,1,2,3,4
    printRdd(rdd.mapPartitions(it => it.map(_ + 0.5))) // 1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5
    // The first lambda argument is the partition index (unused), the second the partition's iterator.
    printRdd(rdd.mapPartitionsWithIndex((_, it) => it.map(_ + 0.5))) // 1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5
    printRdd(rdd.sample(withReplacement = true, fraction = 0.2)) // e.g. 7
    printRdd(rdd2.union(rdd)) // 9,10,11,12,13,14,15,1,2,3
    printRdd(rdd.intersection(rdd2)) // 9,10
    printRdd(rdd.distinct()) // 4,8,1,9,5,6,10,2,3,7

    // Results are intentionally discarded: these calls only show the API.
    rdd.coalesce(1)
    rdd.repartition(2)
    rdd.groupBy(x => x)

    // --- pair-RDD transformations ---
    printRdd(pairRdd.groupBy(_._2)) // (28,CompactBuffer((14,28))),(24,CompactBuffer((12,24))),(20,CompactBuffer((10,20))),(22,CompactBuffer((11,22))),(30,CompactBuffer((15,30))),(18,CompactBuffer((9,18))),(26,CompactBuffer((13,26)))
    printRdd(pairRdd.groupByKey()) // (12,CompactBuffer(24)),(13,CompactBuffer(26)),(9,CompactBuffer(18)),(14,CompactBuffer(28)),(10,CompactBuffer(20)),(15,CompactBuffer(30)),(11,CompactBuffer(22))
    printRdd(pairRdd.reduceByKey(_ + _)) // (12,24),(13,26),(9,18),(14,28),(10,20),(15,30),(11,22)
    printRdd(pairRdd.aggregateByKey(0)(_ + _, _ + _)) // (12,24),(13,26),(9,18),(14,28),(10,20),(15,30),(11,22)
    printRdd(pairRdd.sortByKey(ascending = false)) // (15,30),(14,28),(13,26),(12,24),(11,22),(10,20),(9,18)
    printRdd(pairRdd.join(pairRdd)) // (12,(24,24)),(13,(26,26)),(9,(18,18)),(14,(28,28)),(10,(20,20)),(15,(30,30)),(11,(22,22))

    // Results are intentionally discarded: these calls only show the API.
    pairRdd.leftOuterJoin(pairRdd)
    pairRdd.rightOuterJoin(pairRdd)
    pairRdd.fullOuterJoin(pairRdd)

    printRdd(pairRdd.cogroup(pairRdd)) // (12,(CompactBuffer(24),CompactBuffer(24))),(13,(CompactBuffer(26),CompactBuffer(26))),(9,(CompactBuffer(18),CompactBuffer(18))),(14,(CompactBuffer(28),CompactBuffer(28))),(10,(CompactBuffer(20),CompactBuffer(20))),(15,(CompactBuffer(30),CompactBuffer(30))),(11,(CompactBuffer(22),CompactBuffer(22)))
    pairRdd.groupWith(pairRdd)

    // Cartesian product
    printRdd(rdd.cartesian(rdd2)) // (1,9),(2,9),(1,10),(1,11),(2,10),(2,11),(1,12),(1,13),(2,12),(2,13)

    rdd.setName("haha")
  }

  /** Action examples; does not call `stop()`. */
  private def runActions(): Unit = {
    // Results are intentionally discarded: these calls only show the API.
    rdd.aggregate(0)(_ + _, _ + _)
    rdd.reduce(_ + _)
    rdd.count()
    rdd.first()
    rdd.take(10)
    rdd.takeOrdered(10)
    rdd.takeSample(withReplacement = true, num = 10)

    println(pairRdd.countByKey()) // Map(10 -> 1, 14 -> 1, 9 -> 1, 13 -> 1, 12 -> 1, 11 -> 1, 15 -> 1)
    println(pairRdd.countByValue()) // Map((10,20) -> 1, (9,18) -> 1, (11,22) -> 1, (14,28) -> 1, (13,26) -> 1, (12,24) -> 1, (15,30) -> 1)
    rdd.countByValue()
    println(rdd.countApprox(timeout = 90))
  }
}

object RddTest {

  /** Entry point: runs both demos on a single instance, which is stopped once at the end. */
  def main(args: Array[String]): Unit =
    new RddTest().runAll()
}
// Original article: https://www.cnblogs.com/jason-dong/p/10446848.html