I am trying to compute word frequencies using the order inversion design pattern.
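My understanding of the pattern: besides the ordinary (word, count) pairs, each mapper emits one special key per reducer carrying that mapper's total token count; a custom comparator forces those special keys to sort before every real word and a custom partitioner routes them, so each reducer already knows the total when the real words arrive and can emit relative frequencies directly. A rough sketch of the reducer-side logic I am aiming for (an illustrative fragment only, not part of my actual job; totalWords is a made-up field name):

protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
    if (key.toString().startsWith("special_key")) {
        // Special keys sort first, so the total is complete before any ordinary word arrives
        for (IntWritable v : values) totalWords += v.get();
    } else {
        long sum = 0;
        for (IntWritable v : values) sum += v.get(); // combine the partial counts for this word
        context.write(key, new FloatWritable((float) sum / totalWords)); // relative frequency
    }
}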
Here is my full Java code:
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.*;
import org.apache.hadoop.util.*;
import java.io.*;
import java.util.*;
public class WordFreq2 {
static enum StatusCounters {MAP_COUNTER, REDUCE_COUNTER, TOTAL_WORDS}
static enum MyExceptions {IO_EXCEPTION, INTERRUPTED_EXCEPTION, NULL_POINTER_EXCEPTION}
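// Comparator that forces the special total-count keys to sort ahead of every ordinary word key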
public static class MyComparator extends WritableComparator {
public int compare(WritableComparable a, WritableComparable b)
{
if (a.toString().equals("special_key0") && b.toString().equals("special_key1") )
return 0;
else
if ( a.toString().equals("special_key0") || a.toString().equals("special_key1") )
return -1;
else
if ( b.toString().equals("special_key0") || a.toString().equals("special_key1") )
return 1;
else
return a.toString().compareTo(b.toString());
}
}
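// Partitioner: special_key0 goes to reducer 0, special_key1 to reducer 1, everything else by hash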
public static class MyPartitioner extends Partitioner<Text,IntWritable>
{
public int getPartition(Text key, IntWritable value, int num)
{
if ( key.toString().equals("special_key0") )
return 0;
else
if ( key.toString().equals("special_key1") )
return 1;
else
return (key.hashCode() & Integer.MAX_VALUE) % num; // mask the sign bit so the partition index is never negative
}
}
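// Mapper with in-mapper combining: counts are buffered in a small HashMap and flushed to the context when it fills up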
public static class MyMap extends Mapper<LongWritable, Text, Text, IntWritable> {
private Text word = new Text();
private final int MEMORYHASHSIZE = 7;
private final HashMap<String,Integer> memoryHash = new HashMap<String,Integer>(MEMORYHASHSIZE);
private int special_key_count = 0;
protected void setup(Context context) throws IOException, InterruptedException {
}
protected void cleanup(Context context) throws IOException, InterruptedException {
flushMap(context);
for ( int c = 0; c < context.getNumReduceTasks(); c++)
{
word.set("special_key"+c);
context.write(word,new IntWritable(special_key_count));
}
}
private void flushMap(Context context) throws IOException, InterruptedException
{
Iterator<Map.Entry<String, Integer>> entries = memoryHash.entrySet().iterator();
while (entries.hasNext()) {
Map.Entry<String, Integer> entry = entries.next();
word.set(entry.getKey());
context.write(word,new IntWritable(entry.getValue()));
entries.remove();
}
}
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
context.progress(); //in case of long running code, report that something is happening
while (tokenizer.hasMoreTokens())
{
String current_token = tokenizer.nextToken();
// Key present in our in-memory hash table
if ( memoryHash.containsKey(current_token) )
{
// Increase the corresponding counter
Integer val = memoryHash.get(current_token);
memoryHash.put(current_token,++val);
}
else
{
// Flush the HashTable if size limit reached
if ( memoryHash.size() == MEMORYHASHSIZE)
flushMap(context);
memoryHash.put(current_token,1); // Make a new key with corresponding count 1
}
special_key_count++;
context.getCounter(StatusCounters.MAP_COUNTER).increment(1);
}
}
}
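// Reducer: first accumulates the total word count from the special keys, then emits each word's count divided by that total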
public static class Reduce extends Reducer<Text, IntWritable, Text, FloatWritable>
{
int total_words;
protected void setup(Context context) throws IOException, InterruptedException {
total_words=0;
}
protected void cleanup(Context context) throws IOException, InterruptedException {
context.getCounter(StatusCounters.TOTAL_WORDS).increment(total_words);
}
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
float frequency;
for (IntWritable val : values)
{
if(key.toString().equals("special0") || key.toString().equals("special1"))
{
total_words = total_words + val.get();
}
else
{
frequency = (float) val.get() / total_words; // cast so the division is not truncated to an integer
context.write(key, new FloatWritable(frequency));
}
}
context.progress(); //in case of long running code, report that something is happening
context.getCounter(StatusCounters.REDUCE_COUNTER).increment(1);
}
}
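// Delete the output directory if it already exists so the job can be re-run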
private static boolean deleteOutputDir(Job job, Path p) throws IOException {
boolean retvalue = false;
Configuration conf = job.getConfiguration();
FileSystem myfs = p.getFileSystem(conf);
if(myfs.exists(p) && myfs.isDirectory(p)) {
retvalue = myfs.delete(p,true);
}
return retvalue;
}
public static void main(String[] args) throws Exception {
Job job = Job.getInstance();
job.setJarByClass(WordFreq2.class);
job.setJobName("wordfreq");
/* type of map output */
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
/* type of reduce output */
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FloatWritable.class);
/* specify input/output directories */
FileInputFormat.setInputPaths(job, new Path(args[0]));
deleteOutputDir(job,new Path(args[1]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
/* How to read and write inputs/outputs */
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
/* specify program components */
job.setMapperClass(MyMap.class);
job.setReducerClass(Reduce.class);
job.setNumReduceTasks(2); // Set the number of reducer to two
job.setSortComparatorClass(MyComparator.class);
job.setPartitionerClass(MyPartitioner.class);
boolean result = job.waitForCompletion(true);
Counters counters = job.getCounters();
Counter acounter = counters.findCounter(MyExceptions.IO_EXCEPTION);
long iocount = acounter.getValue();
System.exit(result?0:1);
}
}
However, I keep running into this error:
Error: java.lang.NullPointerException
at org.apache.hadoop.io.WritableComparator.compare(WritableComparator.java:128)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.compare(MapTask.java:1245)
at org.apache.hadoop.util.QuickSort.sortInternal(QuickSort.java:74)
at org.apache.hadoop.util.QuickSort.sort(QuickSort.java:63)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.sortAndSpill(MapTask.java:1575)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.flush(MapTask.java:1462)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.close(MapTask.java:700)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:770)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:340)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:168)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1548)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:163)
I cannot figure out what is causing it. Can someone point me in the right direction?