Spark WordCount
Maven dependency
<!-- Spark core library (artifact built for Scala 2.11), version 2.3.0. -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.3.0</version>
</dependency>
package cn.easyinfo.spark;

import java.util.Arrays;
import java.util.Iterator;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

/**
 * Spark word-count job: reads a text input, splits every line on whitespace,
 * counts how often each word occurs and saves the (word, count) pairs as text.
 *
 * @author
 * @date April 6, 2020, 6:26:51 PM
 */
public class MyJavaWordCount {

    /**
     * Entry point of the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the
     *             output path; with fewer than two arguments a usage message
     *             is printed to stderr and the JVM exits with status 1
     */
    public static void main(String[] args) {
        // Argument check.
        if (args.length < 2) {
            System.err.println("Usage:MyJavaWordCount input output");
            System.exit(1);
        }

        // Input path.
        String inputPath = args[0];
        // Output path.
        String outputPath = args[1];

        // Create the Spark context.
        SparkConf conf = new SparkConf().setAppName("MyJavaWordCount");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            // Read the input as an RDD of lines.
            JavaRDD<String> inputRDD = jsc.textFile(inputPath);

            // flatMap: turn every line into its individual words.
            JavaRDD<String> words = inputRDD.flatMap(new FlatMapFunction<String, String>() {

                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<String> call(String line) throws Exception {
                    String[] tokens = line.split("\\s+");
                    // With the "\\s+" delimiter an empty token can only appear at
                    // index 0 (blank line or leading whitespace); skip it so the
                    // empty string is not counted as a word.
                    int start = (tokens.length > 0 && tokens[0].isEmpty()) ? 1 : 0;
                    return Arrays.asList(tokens).subList(start, tokens.length).iterator();
                }
            });

            // map: pair every word with an initial count of 1.
            JavaPairRDD<String, Integer> pairRDD = words.mapToPair(new PairFunction<String, String, Integer>() {

                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Integer> call(String word) throws Exception {
                    return new Tuple2<String, Integer>(word, 1);
                }
            });

            // reduce: add up the counts of identical words.
            JavaPairRDD<String, Integer> result = pairRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {

                private static final long serialVersionUID = 1L;

                @Override
                public Integer call(Integer x, Integer y) throws Exception {
                    return x + y;
                }
            });

            // Save the result.
            result.saveAsTextFile(outputPath);
        } finally {
            // Close the context even when one of the steps above throws.
            jsc.close();
        }
    }
}
打包
运行结果:
原文:https://www.cnblogs.com/ptbx1t/p/12636545.html