我有一个简单的 Hadoop 应用程序:它读取一个 CSV 文件,用“,”分割每一行,然后统计每行第一个字段出现的次数。
以下是我的代码。
package com.bluedolphin; import java.io.IOException; import java.util.Iterator; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; public class MyJob extends Configured implements Tool { private final static LongWritable one = new LongWritable(1); public static class MapClass extends Mapper<Object, Text, Text, LongWritable> { private Text word = new Text(); public void map(Object key, Text value, OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException, InterruptedException { String[] citation = value.toString().split(","); word.set(citation[0]); output.collect(word, one); } } public static class Reduce extends Reducer<Text, LongWritable, Text, LongWritable> { public void reduce( Text key, Iterator<LongWritable> values, OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException, InterruptedException { int sum = 0; while (values.hasNext()) { sum += values.next().get(); } output.collect(key, new LongWritable(sum)); } } public static class Combiner extends Reducer<Text, IntWritable, Text, LongWritable> { public void reduce( Text key, Iterator<LongWritable> values, OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException, InterruptedException { int sum = 0; while (values.hasNext()) { sum += values.next().get(); } output.collect(key, new 
LongWritable(sum)); } } public int run(String[] args) throws Exception { Configuration conf = getConf(); Job job = new Job(conf, "MyJob"); job.setJarByClass(MyJob.class); Path in = new Path(args[0]); Path out = new Path(args[1]); FileInputFormat.setInputPaths(job, in); FileOutputFormat.setOutputPath(job, out); job.setMapperClass(MapClass.class); // job.setCombinerClass(Combiner.class); job.setReducerClass(Reduce.class); // job.setInputFormatClass(KeyValueInputFormat.class); job.setInputFormatClass(TextInputFormat.class); // job.setOutputFormatClass(KeyValueOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); System.exit(job.waitForCompletion(true) ? 0 : 1); return 0; } public static void main(String args[]) throws Exception { int res = ToolRunner.run(new Configuration(), new MyJob(), args); System.exit(res); } }
这是错误:
11/12/16 22:16:58 INFO mapred.JobClient: Task Id : attempt_201112161948_0005_m_000000_0, Status : FAILED java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, recieved org.apache.hadoop.io.LongWritable at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1013) at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:690) at org.apache.hadoop.mapreduce.TaskInputOutputContext.write(TaskInputOutputContext.java:80) at org.apache.hadoop.mapreduce.Mapper.map(Mapper.java:124) at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:144) at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:763) at org.apache.hadoop.mapred.MapTask.run(MapTask.java:369) at org.apache.hadoop.mapred.Child$4.run(Child.java:259) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:416) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1059) at org.apache.hadoop.mapred.Child.main(Child.java:253)
答案 0 :(得分:8)
代码中有几处需要修复。首先,下面这些导入混用了旧 API(org.apache.hadoop.mapred)和新 API(org.apache.hadoop.mapreduce),两者不兼容,不应混用:
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
确保 Mapper/Reducer 的输入和输出类型都是 o.a.h.io.Writable 的实现。Mapper 的输入键类型目前声明为 Object,应将其改为 LongWritable。
看起来Combiner和Reducer的功能是一样的,所以你不需要重复它。
// Reuse the job's own summing reducer (the Reduce class) as the combiner;
// the framework's base Reducer class would only pass values through unchanged.
job.setCombinerClass(Reduce.class);
此外,您可以使用WordCount示例,您的要求与WordCount示例之间没有太大区别。
答案 1 :(得分:5)
一般来说,如果我们有 Mapper<K1,V1, K2,V2> 和 Reducer<K2,V2, K3,V3>,最好在作业配置中显式声明 Map 输出的键/值类型,如下所示:
// K2 and V2 are placeholders for the mapper's output key and value types.
JobConf conf = new JobConf(MyJob.class);
...
// Explicitly declare the key type emitted by the mapper.
conf.setMapOutputKeyClass(K2.class);
// Explicitly declare the value type emitted by the mapper.
conf.setMapOutputValueClass(V2.class);
您可以看到另一个示例here。
答案 2 :(得分:0)
旧API(o.a.h.mapred)和新API(o.a.h.mapreduce)不兼容,因此不应混用。
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
您应该用 Context 替换 map 和 reduce 方法签名中的 OutputCollector 和 Reporter:把签名改为 map(K1 key, V1 val, Context context),并把 output.collect(k, v) 替换为 context.write(k, v)。
有关迁移到新 API 的更多细节,可参考此链接:http://www.slideshare.net/sh1mmer/upgrading-to-the-new-map-reduce-api#