How to get the top 5 most frequent words in Hadoop

Time: 2018-03-19 01:51:41

Tags: java hadoop mapreduce

I am new to the Hadoop world and am trying out a simple task.

How can I get the top 5 most frequent words in Hadoop within a single job?

The program's output currently looks like this:

good 10
haha 15
hello 5
morning 12
nice 13
ok 16
what 7 
you 20

But my goal is to get only the top 5 most frequent words, for example:

you 20
ok 16
haha 15
nice 13
morning 12

Below is the code, consisting of the mapper, the reducer, and the job control (driver).

Mapper (removes punctuation and converts the text to lowercase)

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<Object, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // Strip punctuation first, then tokenize the lowercased, cleaned text.
        String removePunct = value.toString().replaceAll("[\\pP+~$`^=|<>¥×]", " ");
        StringTokenizer itr = new StringTokenizer(removePunct.toLowerCase());

        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }
}
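
To see what the mapper's cleanup step produces, the snippet below runs the same regex and tokenizer on a sample sentence locally. The class name RegexCheck and the sample line are illustrative only, not part of the original question.

import java.util.StringTokenizer;

public class RegexCheck {
    public static void main(String[] args) {
        // Same regex as the mapper: replace punctuation with spaces, then lowercase.
        String line = "Good morning, HAHA! Nice -- ok?";
        String cleaned = line.replaceAll("[\\pP+~$`^=|<>¥×]", " ").toLowerCase();

        StringTokenizer itr = new StringTokenizer(cleaned);
        while (itr.hasMoreTokens()) {
            // Prints: good, morning, haha, nice, ok (one token per line)
            System.out.println(itr.nextToken());
        }
    }
}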

Reducer

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;

        for (IntWritable val : values) {
            sum += val.get();
        }

        result.set(sum);
        context.write(key, result);
    }
}
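
One common way to keep only the 5 most frequent words within the same job is to have the reducer remember candidates in a TreeMap keyed by count and emit them in cleanup(), after all keys have been processed. Below is a minimal sketch of that idea; the class name TopFiveReducer and the constant TOP_N are illustrative, and words that happen to share the same count would overwrite each other in this simple version.

import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TopFiveReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private static final int TOP_N = 5;
    // Candidate words keyed by their total count; the smallest count is evicted first.
    // Note: words with identical counts overwrite each other in this simple sketch.
    private final TreeMap<Integer, String> topWords = new TreeMap<>();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }

        topWords.put(sum, key.toString());
        if (topWords.size() > TOP_N) {
            topWords.remove(topWords.firstKey());  // drop the current smallest count
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Emit the surviving words in descending order of count.
        for (Map.Entry<Integer, String> entry : topWords.descendingMap().entrySet()) {
            context.write(new Text(entry.getValue()), new IntWritable(entry.getKey()));
        }
    }
}

This only yields a global top 5 if the job runs with a single reduce task, which the driver sketch after the job control section sets explicitly.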

Job control (driver)

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountJobControl {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCountJobControl.class);
        job.setMapperClass(WordCountMapper.class);
        job.setCombinerClass(WordCountReducer.class);
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
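
A corresponding driver sketch, assuming the illustrative TopFiveReducer above: the original WordCountReducer stays as the combiner (it only pre-sums per-word counts and never drops keys), the top-5 class becomes the reducer, and the number of reduce tasks is pinned to one so the ranking is global. The class name TopFiveJobControl is likewise illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TopFiveJobControl {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf, "top five words");
        job.setJarByClass(TopFiveJobControl.class);
        job.setMapperClass(WordCountMapper.class);
        // The word-count reducer is still safe as a combiner: it only pre-sums counts per word.
        job.setCombinerClass(WordCountReducer.class);
        job.setReducerClass(TopFiveReducer.class);
        // A single reduce task so the top 5 is computed over all words globally.
        job.setNumReduceTasks(1);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}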

0 Answers:

No answers yet.