一:MapReduce处理数据去重
Map输出的key在shuffle阶段会被框架按key分组,相同的key只会触发一次reduce调用,因此可以利用key实现数据去重
/**
 * Mapper for the de-duplication job.
 *
 * <p>Emits each non-blank input line (trimmed) as the output key together with
 * an empty {@link Text} placeholder value. Identical lines therefore share one
 * key, which lets the reduce side write each distinct line once.
 */
public class DelsameMap extends Mapper<LongWritable, Text, Text, Text> {

    /** Placeholder value; only the key carries information in this job. */
    private final Text emptyValue = new Text("");

    /** Reused output key to avoid allocating a new Text per record. */
    private final Text outKey = new Text();

    /**
     * Emits the trimmed line as key with an empty value.
     *
     * @param key     input key supplied by the input format (not used here)
     * @param value   one line of input text
     * @param context sink for the (line, "") output pair
     * @throws IOException          if writing the output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Trim BEFORE the emptiness check so whitespace-only lines are skipped
        // instead of being emitted as an empty key.
        String record = value.toString().trim();
        if (record.isEmpty()) {
            return;
        }
        outKey.set(record);
        context.write(outKey, emptyValue);
    }
}
/**
 * Reducer for the de-duplication job.
 *
 * <p>Writes every key it receives exactly once with a {@link NullWritable}
 * value; the grouped values are ignored.
 */
public class DelsameRedu extends Reducer<Text, Text, Text, NullWritable> {

    /**
     * Writes the key once, discarding all values grouped under it.
     *
     * @param key     a line emitted by the mapper
     * @param values  placeholder values grouped under {@code key} (not read)
     * @param context sink for the (key, null) output pair
     * @throws IOException          if writing the output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        NullWritable nothing = NullWritable.get();
        context.write(key, nothing);
    }
}
/**
 * Driver for the de-duplication job: wires {@code DelsameMap} and
 * {@code DelsameRedu} together and runs the job.
 */
public class DelsameMain {

    /**
     * Configures and runs the job, then exits with 0 on success, 1 on job
     * failure, or 2 when the required arguments are missing.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if job setup or submission fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: DelsameMain <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        // Job.getInstance replaces the deprecated Job(Configuration) constructor.
        Job job = Job.getInstance(conf);
        job.setJarByClass(DelsameMain.class);

        job.setMapperClass(DelsameMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(DelsameRedu.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Propagate the job result to the process exit code so callers
        // (shell scripts, schedulers) can detect a failed job.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
二:MapReduce处理数据排序
将原始数据转换为int类型后作为map输出的key。MapReduce框架会在shuffle阶段自动地根据key进行排序
/* * mapreduce处理数据排序 *将原始数据作为map输出的key设置为int类型。map会自动的根据key进行排序 */ public class SortMap extends Mapper<LongWritable, Text, IntWritable, Text> { @Override protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, IntWritable, Text>.Context context) throws IOException, InterruptedException { String line = value.toString(); if (line.length() > 0) { context.write(new IntWritable(Integer.parseInt(line.trim())), new Text("")); } } }
/* * 将values作为次序key。将map排序好的key作为value输出 */ public class SortRedu extends Reducer<IntWritable, Text, IntWritable, IntWritable> { private IntWritable num = new IntWritable(1); @Override protected void reduce(IntWritable key, Iterable<Text> values, Reducer<IntWritable, Text, IntWritable, IntWritable>.Context context) throws IOException, InterruptedException { // 将values作为排序的次序。将map拍好序的key作为reduce的value输出 for (Text val : values) { context.write(num, key); num = new IntWritable(num.get() + 1); } } }
/**
 * Driver for the sorting job: wires {@code SortMap} and {@code SortRedu}
 * together and runs the job.
 */
public class SortMain {

    /**
     * Configures and runs the job, then exits with 0 on success, 1 on job
     * failure, or 2 when the required arguments are missing.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if job setup or submission fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: SortMain <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        // Job.getInstance replaces the deprecated Job(Configuration) constructor.
        Job job = Job.getInstance(conf);
        job.setJarByClass(SortMain.class);

        job.setMapperClass(SortMap.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(SortRedu.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // SortRedu numbers records with a per-reducer counter, so a single
        // reduce task is required for the ranks to be global across all keys.
        job.setNumReduceTasks(1);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Propagate the job result to the process exit code so callers
        // (shell scripts, schedulers) can detect a failed job.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
原文:http://blog.csdn.net/u010366796/article/details/44589239