我有一个非常紧凑的程序来链接grep作业和sort作业,如下所示:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.InverseMapper;
import org.apache.hadoop.mapreduce.lib.map.RegexMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer;
import java.util.Random;
public class grep {
public static void main(String[] args) throws Exception {
if (args.length < 3 || args.length > 4) {
System.out.println("Grep <in dir><out dir><regex>[<group>]");
return;
}
Configuration conf = new Configuration();
conf.set(RegexMapper.PATTERN, args[2]);
if (args.length == 4) {
conf.set(RegexMapper.GROUP, args[3]);
}
Job grepJob = new Job(conf);
grepJob.setJobName("grep");
Job sortJob = new Job(conf);
sortJob.setJobName("sort");
Path tmp = new Path("grep-temp" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
try {
FileInputFormat.setInputPaths(grepJob, args[0]);
grepJob.setMapOutputKeyClass(RegexMapper.class);
grepJob.setCombinerClass(LongSumReducer.class);
grepJob.setReducerClass(LongSumReducer.class);
FileOutputFormat.setOutputPath(grepJob, tmp);
grepJob.setOutputFormatClass(SequenceFileOutputFormat.class);
grepJob.setOutputKeyClass(Text.class);
grepJob.setOutputValueClass(LongWritable.class);
grepJob.waitForCompletion(true);
FileInputFormat.setInputPaths(sortJob, tmp);
sortJob.setInputFormatClass(SequenceFileInputFormat.class);
sortJob.setMapperClass(InverseMapper.class);
sortJob.setNumReduceTasks(1);
FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
sortJob.setSortComparatorClass(LongWritable.DecreasingComparator.class);
sortJob.waitForCompletion(true);
} finally {
FileSystem.get(conf).delete(tmp, true);
}
}
}
编译并打包,运行,然后出现此错误:
18/09/01 01:58:50 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
18/09/01 01:58:50 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
18/09/01 01:58:50 WARN mapreduce.JobResourceUploader: No job jar file set. User classes may not be found. See Job or Job#setJar(String).
18/09/01 01:58:50 INFO input.FileInputFormat: Total input files to process : 1
18/09/01 01:58:50 INFO mapreduce.JobSubmitter: number of splits:1
18/09/01 01:58:51 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1535418462954_0005
18/09/01 01:58:51 INFO mapred.YARNRunner: Job jar is not present. Not adding any jar to the list of resources.
18/09/01 01:58:51 INFO impl.YarnClientImpl: Submitted application application_1535418462954_0005
18/09/01 01:58:51 INFO mapreduce.Job: The url to track the job: http://ubuntu:8088/proxy/application_1535418462954_0005/
18/09/01 01:58:51 INFO mapreduce.Job: Running job: job_1535418462954_0005
18/09/01 01:58:57 INFO mapreduce.Job: Job job_1535418462954_0005 running in uber mode : false
18/09/01 01:58:57 INFO mapreduce.Job: map 0% reduce 0%
18/09/01 01:59:00 INFO mapreduce.Job: Task Id : attempt_1535418462954_0005_m_000000_0, Status : FAILED
Error: java.io.IOException: Initialization of all the collectors failed. Error in last collector was :class org.apache.hadoop.mapreduce.lib.map.RegexMapper
at org.apache.hadoop.mapred.MapTask.createSortingCollector(MapTask.java:414)
at org.apache.hadoop.mapred.MapTask.access$100(MapTask.java:81)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.<init>(MapTask.java:698)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:770)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:175)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1840)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:169)
Caused by: java.lang.ClassCastException: class org.apache.hadoop.mapreduce.lib.map.RegexMapper
at java.lang.Class.asSubclass(Class.java:3218)
at org.apache.hadoop.mapred.JobConf.getOutputKeyComparator(JobConf.java:887)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.init(MapTask.java:1004)
at org.apache.hadoop.mapred.MapTask.createSortingCollector(MapTask.java:402)
... 9 more
那么我的程序哪里出错了?如何解决?