我在 Hadoop 中有以下代码,运行时它会把 mapper 的输出原样作为 reducer 的输出,reducer 基本上什么都没做。两个输入文件的格式为:
文件A:Jan-1 #starwars,17115 (每一行都是这种格式。)VALUE 是数字 17115。
文件B:#starwars,2017/1/1 5696 (每一行都是这种格式。)VALUE 是数字 5696。
Mapper 类处理这些文件和输出(仅粗体字母):
JAN #STARWARS 17115 / A 其中KEY:JAN #STARWARS
JAN #STARWARS 5696 / B 其中KEY:JAN #STARWARS
reducer应该执行以下操作:
所有相同的键都会被送到同一个 reducer(我是 Hadoop 新手,如果我说错了请指正),每个 reducer 再把收到的值拆分成两部分:键和值
KEY:A,VALUE 17115
KEY:B,VALUE 5696
目前它只需要把所有值相加,而不关心值来自 A 还是 B,然后写出(仅粗体):
JAN #STARWARS 22811 (22811 = 17115 + 5696)
那么,为什么它只是原样写出 mapper 的输出,而 reducer 没有执行它应该做的事情?我并没有把 reducer 的数量设置为零。
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
public class WordCount {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, Text>{
//private final static IntWritable result = new IntWritable();
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString(),"\n");
while (itr.hasMoreTokens()) {
String nextWord = itr.nextToken().toUpperCase();
//System.out.println("'"+nextWord+"'");
if(isFromPlatformB(nextWord)){
//Procedure for words of Platform B.
String[] split1 = nextWord.split("(,)|(/)|(\\s)");
String seriesTitle = split1[0];
String numOfMonth = split1[2];
String numOfDay = split1[3];
String number = split1[4];//VALUE
int monthInt = Integer.parseInt(numOfMonth);
String monthString;
switch (monthInt) {
case 1: monthString = "JAN";
break;
case 2: monthString = "FEB";
break;
case 3: monthString = "MAR";
break;
case 4: monthString = "APR";
break;
case 5: monthString = "MAY";
break;
case 6: monthString = "JUN";
break;
case 7: monthString = "JUL";
break;
case 8: monthString = "AUG";
break;
case 9: monthString = "SEP";
break;
case 10: monthString = "OCT";
break;
case 11: monthString = "NOV";
break;
case 12: monthString = "DEC";
break;
default: monthString = "ERROR";
break;
}
//result.set(numberInt);
word.set(monthString + " " + seriesTitle);
System.out.println("key: "+monthString + " " + seriesTitle + ", value: "+number+"/B");
context.write(word, new Text(number + "/B"));
//FORMAT : <KEY,VALUE/B>
}
else{
//Procedure for words of Platform A.
String[] split5 = nextWord.split("(-)|( )|(,)");
String month = split5[0];
String seriesTitle = split5[2];
String value2 = split5[3];//OUTVALUE
String finalWord = month + " " + seriesTitle;//OUTKEY KEY: <APR #WESTWORLD>
word.set(finalWord);
//result.set(valueInt);
System.out.println("key: "+finalWord + ", value: "+value2+"/A");
context.write(word, new Text(value2 + "/A"));
//FORMAT : <KEY,VALUE/A>
}
}
}
/*
*This method takes the next token and returns true if the token is taken from platform B file,
*Or it returns false if the token comes from platform A file.
*
*/
public boolean isFromPlatformB(String nextToken){
// B platform has the form of : "#WestWorld ,2017/1/2){
if(nextToken.charAt(0) == '#'){
return true;
}
return false;
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,Text> {
//private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<Text> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (Text val : values) {
String valToString = val.toString();
String[] split = valToString.split("/");
//String keyOfValue;
String valueOfValue;
int intValueOfValue = 0;
// FORMAT : <KEY,VALUE/platform> [<KEY,VALUE>,VALUE = <key,value>]
// [0] [1]
if(split.length>1){
//keyOfValue = split[1];
valueOfValue = split[0];
//System.out.println(key);
//System.out.println(valueOfValue);
//System.out.println(keyOfValue);
intValueOfValue = Integer.parseInt(valueOfValue);
/*if(keyOfValue.equals("A")){//If value is from platform A
counterForPlatformA += intValueOfValue;
System.out.println("KEY = 'A' " + "VALUE :" +intValueOfValue);
System.out.println("counter A: "+ counterForPlatformA +"|| counter B: "+ counterForPlatformB + "||----||");
}
else if(keyOfValue.equals("B")){//If value is from platform B
counterForPlatformB += intValueOfValue;
System.out.println("KEY = 'B' " + "VALUE :" +intValueOfValue);
System.out.println("counter A: "+ counterForPlatformA +"|| counter B: "+ counterForPlatformB + "||----||");
}
else{
//ERROR
System.out.println("Not equal to A or B");
}*/
}
sum += intValueOfValue;
}
context.write(key, new Text(sum));
}
}
public static void main(String[] args) throws Exception{
if (args.length != 3 ){
System.err.println ("Usage :<inputlocation1> <inputlocation2> <outputlocation> >");
System.exit(0);
}
Configuration conf = new Configuration();
String[] files=new GenericOptionsParser(conf,args).getRemainingArgs();
Path input1=new Path(files[0]);
Path input2=new Path(files[1]);
Path output=new Path(files[2]);
//If OUTPUT already exists -> Delete it
FileSystem fs = FileSystem.get(conf);
if(fs.exists(output)){
fs.delete(output, true);
}
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
MultipleInputs.addInputPath(job, input1, TextInputFormat.class);
MultipleInputs.addInputPath(job, input2, TextInputFormat.class);
FileOutputFormat.setOutputPath(job, output);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
答案 0 :(得分:0)
看起来你的reducer接受一对Text对象并输出Text。如果是这种情况,看起来你有一些问题:
在你的 main 方法中:
job.setOutputValueClass(IntWritable.class)
应该是job.setOutputValueClass(Text.class)
您还将减速器定义为:
public static class IntSumReducer extends Reducer<Text,IntWritable,Text,Text>
它应该是public static class IntSumReducer extends Reducer<Text,Text,Text,Text>
reducer正在接收Text值,而不是IntWritables。
答案 1 :(得分:0)
最后是合并器(combiner)。如果您把 reducer 同时设置为合并器,那么 mapper 的输出类型和 reducer 的输出类型就不能不同。