使用MapReduce生成N个最常用的单词

时间:2018-04-14 17:15:52

标签: java mapreduce

我有MapReduce代码,用于打印文档中单词的显示次数。我想更改此代码以生成N个最常用的单词。 N是输入参数。这是代码:

    import java.io.IOException;
    import java.util.Locale;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;

    public class WordCount {

    public static void main(String [] args) throws Exception

    {

    Configuration c=new Configuration();        
    String[] files=new GenericOptionsParser(c,args).getRemainingArgs();        
    Path input=new Path(files[0]);        
    Path output=new Path(files[1]);        
    Job j=new Job(c,"wordcount");        
    j.setJarByClass(WordCount.class);        
    j.setMapperClass(MapForWordCount.class);        
    j.setReducerClass(ReduceForWordCount.class);        
    j.setOutputKeyClass(Text.class);        
    j.setOutputValueClass(IntWritable.class);        
    FileInputFormat.addInputPath(j, input);        
    FileOutputFormat.setOutputPath(j, output);        
    System.exit(j.waitForCompletion(true)?0:1);

    }

    public static class MapForWordCount extends Mapper<LongWritable, Text, Text, IntWritable>{

    public void map(LongWritable key, Text value, Context con) throws IOException, InterruptedException

    {

    String line = value.toString();

    String[] words=line.split("[^\\p{L}0-9]+");

    for(String word: words )

    {
         if(word.length()<=3){
             continue;
         }else{
             Text outputKey = new Text(word.toUpperCase().trim());

              IntWritable outputValue = new IntWritable(1);

              con.write(outputKey, outputValue); 
         }
          } } }

    public static class ReduceForWordCount extends Reducer<Text, IntWritable, Text, IntWritable>
    {

 public void reduce(Text word, Iterable<IntWritable> values, Context con) throws IOException, InterruptedException

    {

    int sum = 0;

       for(IntWritable value : values)

       {

       sum += value.get();

       }

       con.write(word, new IntWritable(sum));

    }

    }

    }

我在hadoop文件系统中使用一个文件来计算单词计数,现在我只得到一个单词列表,其中包含它们在文本中出现的次数。

0 个答案:

没有答案