mapreduce求前k个最大值(topk 问题)

时间：2015-05-05 19:00:51 阅读：1012 评论：0 收藏：0 [点我收藏+]

思路：先用一个 MapReduce 作业统计每个单词的词频，再用第二个作业按词频排序，并取出词频最高的前 K 个单词。

----------词频统计---------

package TopK;
import java.io.IOException;
import java.util.StringTokenizer;
 
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

 /**
  * First job of the top-K pipeline: counts how often each word occurs in the input.
  *
  * <p>The job writes one {@code word<TAB>count} line per distinct word, using the
  * job's default output format.
  *
  * @author mlj
  */
 public class WordCount {

     /**
      * Splits every input line on whitespace, strips double quotes, the
      * left single quotation mark and periods from each token, and emits
      * {@code (word, 1)} for every token that is still non-empty afterwards.
      */
     public static class Map extends Mapper<Object, Text, Text, IntWritable> {

         /** Constant value emitted with every word. */
         private final IntWritable one = new IntWritable(1);

         /** Reused output key; avoids allocating a new Text per token. */
         private final Text word = new Text();

         @Override
         protected void map(Object key, Text value, Context context)
                 throws IOException, InterruptedException {
             StringTokenizer tokens = new StringTokenizer(value.toString());
             while (tokens.hasMoreTokens()) {
                 String cleaned = tokens.nextToken()
                         .replace("\"", "")
                         .replace("‘", "")
                         .replace(".", "");
                 // A token made only of stripped characters (e.g. ".") becomes empty.
                 // Emitting it would produce an output line with an empty key.
                 if (cleaned.isEmpty()) {
                     continue;
                 }
                 word.set(cleaned);
                 context.write(word, one);
             }
         }
     }

     /**
      * Sums the partial counts of a word and emits {@code (word, total)}.
      *
      * <p>The values are added up rather than merely counted, so the class
      * produces the same result whether it sees raw {@code 1}s from the mapper
      * or pre-aggregated counts from a combiner.
      */
     public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {

         /** Reused output value. */
         private final IntWritable total = new IntWritable();

         @Override
         protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                 throws IOException, InterruptedException {
             int sum = 0;
             for (IntWritable partial : values) {
                 sum += partial.get();
             }
             total.set(sum);
             context.write(key, total);
         }
     }

     /**
      * Configures and runs the word-count job, blocking until it finishes.
      *
      * @param in  input path to read text from
      * @param out output directory for the {@code word<TAB>count} lines
      * @return {@code true} if the job completed successfully
      * @throws IOException            if the job cannot be set up or submitted
      * @throws ClassNotFoundException propagated from {@code Job.waitForCompletion}
      * @throws InterruptedException   propagated from {@code Job.waitForCompletion}
      */
     // The Job constructor is flagged as deprecated (hence the suppression); it is
     // kept so the class builds against the same Hadoop API level as before.
     @SuppressWarnings("deprecation")
     public static boolean run(String in, String out)
             throws IOException, ClassNotFoundException, InterruptedException {

         Configuration conf = new Configuration();

         Job job = new Job(conf, "WordCount");
         job.setJarByClass(WordCount.class);
         job.setMapperClass(Map.class);
         // Reduce sums its inputs, so it can also pre-aggregate on the map side.
         job.setCombinerClass(Reduce.class);
         job.setReducerClass(Reduce.class);

         // Map output types.
         job.setMapOutputKeyClass(Text.class);
         job.setMapOutputValueClass(IntWritable.class);

         // Reduce output types.
         job.setOutputKeyClass(Text.class);
         job.setOutputValueClass(IntWritable.class);

         // Input and output locations.
         FileInputFormat.addInputPath(job, new Path(in));
         FileOutputFormat.setOutputPath(job, new Path(out));

         return job.waitForCompletion(true);
     }

 }

　　----------排序---------

package TopK;
  import java.io.IOException;
  import java.util.Comparator;
  import java.util.Map.Entry;
  import java.util.Set;
  import java.util.StringTokenizer;
  import java.util.TreeMap;
  import java.util.TreeSet;
  import java.util.regex.Pattern;
 
  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.io.IntWritable;
  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.mapreduce.Job;
  import org.apache.hadoop.mapreduce.Mapper;
  import org.apache.hadoop.mapreduce.Reducer;
  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
  import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
  import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

 /**
  * Second job of the top-K pipeline: orders words by frequency and extracts
  * the K most frequent ones.
  *
  * <p>The input is expected to contain one word and one non-negative integer
  * count per line, separated by whitespace. The job writes every
  * {@code (word, count)} pair to its regular output and additionally writes
  * the top-K pairs through a {@link MultipleOutputs} named output.
  */
 public class Sort {

     /** Configuration key under which {@code run} stores the top-K base output path. */
     private static final String TOP_K_OUT_KEY = "topKout";

     /** Optional configuration key overriding how many words are kept. */
     private static final String TOP_K_SIZE_KEY = "topKsize";

     /** Number of words kept when {@code topKsize} is not configured. */
     private static final int DEFAULT_K = 10;

     /** Name of the MultipleOutputs named output used for the top-K words. */
     private static final String NAMED_OUTPUT = "topKMOS";

     /**
      * Parses a line holding a word and its count and emits {@code (count, word)},
      * so the count becomes the key the pairs are grouped under for the reducer.
      *
      * <p>Lines that do not consist of exactly one word and one count are skipped
      * and tallied in the {@code Sort / MALFORMED_LINES} counter instead of
      * emitting values left over from a previous record.
      *
      * @author mlj
      */
     public static class Map extends Mapper<Object, Text, IntWritable, Text> {

         /** Matches a non-negative integer; compiled once instead of once per token. */
         private static final Pattern COUNT_PATTERN = Pattern.compile("\\d+");

         /** Reused output key: the word frequency. */
         private final IntWritable outKey = new IntWritable();

         /** Reused output value: the word. */
         private final Text outValue = new Text();

         @Override
         protected void map(Object key, Text value, Context context)
                 throws IOException, InterruptedException {

             StringTokenizer tokens = new StringTokenizer(value.toString());
             if (tokens.countTokens() != 2) {
                 skip(context);
                 return;
             }
             String first = tokens.nextToken();
             String second = tokens.nextToken();

             String word;
             String count;
             if (COUNT_PATTERN.matcher(second).matches()) {
                 // "word count" order; checked first so that an all-digit word
                 // (e.g. "2015 3") is not mistaken for the count.
                 word = first;
                 count = second;
             } else if (COUNT_PATTERN.matcher(first).matches()) {
                 // "count word" order is accepted as well.
                 word = second;
                 count = first;
             } else {
                 skip(context);
                 return;
             }

             try {
                 outKey.set(Integer.parseInt(count));
             } catch (NumberFormatException tooLarge) {
                 // All digits but outside the int range.
                 skip(context);
                 return;
             }
             outValue.set(word);
             context.write(outKey, outValue);
         }

         /** Records one skipped input line in a job counter. */
         private void skip(Context context) {
             context.getCounter("Sort", "MALFORMED_LINES").increment(1);
         }
     }

     /**
      * A word together with its frequency, ordered by descending frequency and
      * then by ascending word, so that distinct words with the same frequency
      * are distinct elements of a sorted set.
      */
     private static final class WordFreq implements Comparable<WordFreq> {

         private final int freq;
         private final String word;

         WordFreq(int freq, String word) {
             this.freq = freq;
             this.word = word;
         }

         @Override
         public int compareTo(WordFreq other) {
             if (freq != other.freq) {
                 // Higher frequency sorts first.
                 return freq > other.freq ? -1 : 1;
             }
             return word.compareTo(other.word);
         }

         @Override
         public boolean equals(Object obj) {
             if (this == obj) {
                 return true;
             }
             if (!(obj instanceof WordFreq)) {
                 return false;
             }
             WordFreq other = (WordFreq) obj;
             return freq == other.freq && word.equals(other.word);
         }

         @Override
         public int hashCode() {
             return 31 * freq + word.hashCode();
         }
     }

     /**
      * Writes every {@code (word, count)} pair it receives to the regular output
      * and keeps the K most frequent words seen by this reducer instance; those
      * are written to the named output when the task finishes.
      */
     public static class Reduce extends
             Reducer<IntWritable, Text, Text, IntWritable> {

         /** Best K words seen so far by this reducer instance, most frequent first. */
         private final TreeSet<WordFreq> topWords = new TreeSet<WordFreq>();

         private MultipleOutputs<Text, IntWritable> mos;

         /** How many words to keep; read from the job configuration in setup. */
         private int k = DEFAULT_K;

         @Override
         protected void setup(Context context)
                 throws IOException, InterruptedException {
             k = context.getConfiguration().getInt(TOP_K_SIZE_KEY, DEFAULT_K);
             mos = new MultipleOutputs<Text, IntWritable>(context);
         }

         @Override
         protected void reduce(IntWritable key, Iterable<Text> values,
                 Context context) throws IOException, InterruptedException {
             int freq = key.get();
             for (Text word : values) {
                 context.write(word, key);

                 // toString() copies the text, so the stored word does not
                 // alias the Text object handed out by the iterator.
                 topWords.add(new WordFreq(freq, word.toString()));

                 // The set is sorted best-first, so its last element is the
                 // weakest candidate and is dropped once more than K are held.
                 if (topWords.size() > k) {
                     topWords.pollLast();
                 }
             }
         }

         @Override
         protected void cleanup(Context context)
                 throws IOException, InterruptedException {
             try {
                 String path = context.getConfiguration().get(TOP_K_OUT_KEY);
                 Text outWord = new Text();
                 IntWritable outFreq = new IntWritable();
                 for (WordFreq entry : topWords) {
                     outWord.set(entry.word);
                     outFreq.set(entry.freq);
                     mos.write(NAMED_OUTPUT, outWord, outFreq, path);
                 }
             } finally {
                 // Close even if a write fails so the named output is released.
                 mos.close();
             }
         }
     }

     /**
      * Configures and runs the sort job, blocking until it finishes.
      *
      * @param in      input path holding the word/count lines
      * @param out     output directory for the full list of pairs
      * @param topKout base output path handed to the named output for the top-K words
      * @throws IOException            if the job cannot be set up, or if it
      *                                finishes unsuccessfully
      * @throws ClassNotFoundException propagated from {@code Job.waitForCompletion}
      * @throws InterruptedException   propagated from {@code Job.waitForCompletion}
      */
     // The Job constructor is flagged as deprecated (hence the suppression); it is
     // kept so the class builds against the same Hadoop API level as before.
     @SuppressWarnings("deprecation")
     public static void run(String in, String out, String topKout) throws IOException,
             ClassNotFoundException, InterruptedException {

         Path outPath = new Path(out);

         Configuration conf = new Configuration();

         // Tell the reducer where the top-K words should be written.
         conf.set(TOP_K_OUT_KEY, topKout);

         Job job = new Job(conf, "Sort");
         job.setJarByClass(Sort.class);
         job.setMapperClass(Map.class);
         job.setReducerClass(Reduce.class);

         // Each reducer keeps its own top-K, so a global top-K needs exactly one.
         job.setNumReduceTasks(1);

         // Map output types.
         job.setMapOutputKeyClass(IntWritable.class);
         job.setMapOutputValueClass(Text.class);

         // Reduce output types.
         job.setOutputKeyClass(Text.class);
         job.setOutputValueClass(IntWritable.class);

         // Named output for the top-K words; the declared value class now
         // matches the IntWritable the reducer actually writes.
         MultipleOutputs.addNamedOutput(job, NAMED_OUTPUT, TextOutputFormat.class,
                 Text.class, IntWritable.class);

         // Input and output locations.
         FileInputFormat.addInputPath(job, new Path(in));
         FileOutputFormat.setOutputPath(job, outPath);

         if (!job.waitForCompletion(true)) {
             throw new IOException("Sort job failed: " + in + " -> " + out);
         }
     }

 }

　　---------自定义int---------

package TopK;

public class MyInt implements Comparable<MyInt>{
    private Integer value;

     public MyInt(Integer value){
        this.value = value;
    }
    
    public int getValue() {
       return value;
    }
 
     public void setValue(int value) {
       this.value = value;
   }
 
   @Override
    public int compareTo(MyInt o) {
     return value.compareTo(o.getValue());
   }
    
    
}

　　------------------驱动--------------

package TopK;
  import java.io.IOException;
 
/**
 * Driver for the top-K pipeline: runs the word-count job and, if it succeeds,
 * the sort job that extracts the most frequent words.
 *
 * <p>Usage: {@code TopK [input [wordCountOut [sortOut [topKOut]]]]}. Every
 * argument that is omitted falls back to its built-in default path.
 *
 * @author mlj
 */
public class TopK {

    // NOTE(review): the default input uses host "localhost" while the default
    // outputs use host "mlj" — confirm both resolve to the same NameNode.

    /** Default text to count and rank. */
    private static final String DEFAULT_INPUT = "hdfs://localhost:9000/input/1.text";

    /** Default output directory of the word-count job. */
    private static final String DEFAULT_WORD_COUNT_OUT = "hdfs://mlj:9000/out/wordCout";

    /** Default output directory of the sort job. */
    private static final String DEFAULT_SORT_OUT = "hdfs://mlj:9000/out/sort";

    /** Default base output path for the top-K words. */
    private static final String DEFAULT_TOP_K_OUT = "hdfs://mlj:9000/out/topK";

    /**
     * Runs both jobs in sequence.
     *
     * @param args optional path overrides, see the class description
     * @throws ClassNotFoundException propagated from the jobs
     * @throws IOException            propagated from the jobs
     * @throws InterruptedException   propagated from the jobs
     */
    public static void main(String[] args)
            throws ClassNotFoundException, IOException, InterruptedException {

        String in = argOrDefault(args, 0, DEFAULT_INPUT);
        String wordCountOut = argOrDefault(args, 1, DEFAULT_WORD_COUNT_OUT);
        String sortOut = argOrDefault(args, 2, DEFAULT_SORT_OUT);
        String topKOut = argOrDefault(args, 3, DEFAULT_TOP_K_OUT);

        // The sort job consumes the word-count output, so it only makes sense
        // to start it once the word-count job has succeeded.
        if (!WordCount.run(in, wordCountOut)) {
            System.err.println("WordCount job failed; skipping the sort job.");
            System.exit(1);
        }
        Sort.run(wordCountOut, sortOut, topKOut);
    }

    /** Returns {@code args[index]} when present, otherwise {@code fallback}. */
    private static String argOrDefault(String[] args, int index, String fallback) {
        return (args != null && args.length > index) ? args[index] : fallback;
    }
}

mapreduce求前k个最大值(topk 问题)

原文：http://www.cnblogs.com/mlj5288/p/4479852.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)