我正在运行下面的 MapReduce 代码,以计算以每个英文字母开头的单词的总数和平均长度。
For example, if the document contains only the word 'and' 5 times, the expected output is:
letter | total words | average length
a 5 3
MapReduce 程序如下:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class LetterWiseAvgLengthV1
{
public static class TokenizerMapper
extends Mapper<LongWritable, Text, Text, Text>
{
public void map(LongWritable key, Text value, Context context
) throws IOException, InterruptedException
{
String st [] = value.toString().split("\\s+");
for(String word : st) {
String wordnew=word.replaceAll("[^a-zA-Z]","");
String firstLetter = wordnew.substring(0, 1);
if(!wordnew.isEmpty()){
// write ('a',3) if the word is and
context.write(new Text(firstLetter), new Text(String.valueOf(wordnew.length())));
}
else continue;
}
}
}
public static class IntSumReducer
extends Reducer<Text,Text,Text,Text>
{
public void reduce(Text key, Iterable<Text> values,
Context context
) throws IOException, InterruptedException
{
int sum=0,count=0;
for (Text val : values)
{
sum += Integer.parseInt(val.toString());
count+= 1;
}
float avg=(sum/(float)count);
String op="Average length of " + count + " words = " + avg;
context.write(new Text(key), new Text(op));
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "wordLenAvgCombiner");
job.setJarByClass(LetterWiseAvgLengthV1.class);
job.setMapperClass(TokenizerMapper.class);
job.setReducerClass(IntSumReducer.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
当我在文本文档上执行上述程序时,它会在HDFS中创建一个空的输出目录。执行期间没有报错,但输出文件夹始终为空。