Executing a jar file with Hadoop

Asked: 2013-12-30 13:16:32

Tags: hadoop mapreduce

I want to execute a jar file that works fine when run from the command line:

java -Xmx3g -jar jarname.jar -T class_name_in_jar -R filename1 -I filename2 -known filename3 -o filename4

The command above runs *class_name_in_jar* with filename1, filename2, and filename3 as inputs, and writes its output to filename4.

Here is my MapReduce program:

    import java.io.IOException;
    import java.util.*;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.conf.*;
    import org.apache.hadoop.io.*;
    import org.apache.hadoop.mapred.*;
    import org.apache.hadoop.util.*;

    public class GatkWordCount {

        public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
            public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
                String find_targets_cmd = "java -Xmx3g -jar <jarname>.jar -T <class name in jar> -R <filename1> -I <filename2> -known <filename3> -o <filename4>";

                exceptionOnError(execAndReconnect(find_targets_cmd));
            }
        }

        public static int execAndReconnect(String cmd) throws IOException {
            Process p = Runtime.getRuntime().exec(cmd);
            try {
                // waitFor() throws InterruptedException, which must be wrapped
                // here because Reducer.reduce() may only declare IOException.
                p.waitFor();
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return p.exitValue();
        }

        public static void exceptionOnError(int errorCode) throws IOException {
            if (0 != errorCode)
                throw new IOException(String.valueOf(errorCode));
        }

        public static void main(String[] args) throws Exception {
            JobConf conf = new JobConf(GatkWordCount.class);
            conf.setJobName("GatkWordCount");

            conf.setOutputKeyClass(Text.class);
            conf.setOutputValueClass(IntWritable.class);

            conf.setReducerClass(Reduce.class);

            conf.setInputFormat(TextInputFormat.class);
            conf.setOutputFormat(TextOutputFormat.class);

            FileInputFormat.setInputPaths(conf, new Path(args[0]));
            FileOutputFormat.setOutputPath(conf, new Path(args[1]));

            JobClient.runJob(conf);
        }
    }

I have placed all the required input files in HDFS. I then executed the following command:

    hadoop/bin/hadoop jar gatkword.jar GatkWordCount /user/hduser/gatkinput/gatkinput/group.bam /user/hduser/gatkword2

Here is the error output from running that command:

13/12/29 17:58:59 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
13/12/29 17:58:59 INFO util.NativeCodeLoader: Loaded the native-hadoop library
13/12/29 17:58:59 WARN snappy.LoadSnappy: Snappy native library not loaded
13/12/29 17:58:59 INFO mapred.FileInputFormat: Total input paths to process : 1
13/12/29 17:58:59 INFO mapred.JobClient: Running job: job_201312261425_0013
13/12/29 17:59:00 INFO mapred.JobClient:  map 0% reduce 0%
13/12/29 17:59:06 INFO mapred.JobClient: Task Id : attempt_201312261425_0013_m_000000_0, Status : FAILED
java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, recieved org.apache.hadoop.io.LongWritable
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1014)
    at org.apache.hadoop.mapred.MapTask$OldOutputCollector.collect(MapTask.java:592)
    at org.apache.hadoop.mapred.lib.IdentityMapper.map(IdentityMapper.java:38)
    at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:50)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:436)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:372)
    at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1136)
    at org.apache.hadoop.mapred.Child.main(Child.java:249)

13/12/29 17:59:06 INFO mapred.JobClient: Task Id : attempt_201312261425_0013_m_000001_0, Status : FAILED
java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, recieved org.apache.hadoop.io.LongWritable
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1014)
    at org.apache.hadoop.mapred.MapTask$OldOutputCollector.collect(MapTask.java:592)
    at org.apache.hadoop.mapred.lib.IdentityMapper.map(IdentityMapper.java:38)
    at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:50)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:436)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:372)
    at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1136)
    at org.apache.hadoop.mapred.Child.main(Child.java:249)

13/12/29 17:59:11 INFO mapred.JobClient: Task Id : attempt_201312261425_0013_m_000000_1, Status : FAILED
java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, recieved org.apache.hadoop.io.LongWritable
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1014)
    at org.apache.hadoop.mapred.MapTask$OldOutputCollector.collect(MapTask.java:592)
    at org.apache.hadoop.mapred.lib.IdentityMapper.map(IdentityMapper.java:38)
    at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:50)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:436)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:372)
    at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1136)
    at org.apache.hadoop.mapred.Child.main(Child.java:249)

13/12/29 17:59:11 INFO mapred.JobClient: Task Id : attempt_201312261425_0013_m_000001_1, Status : FAILED
java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, recieved org.apache.hadoop.io.LongWritable
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1014)
    at org.apache.hadoop.mapred.MapTask$OldOutputCollector.collect(MapTask.java:592)
    at org.apache.hadoop.mapred.lib.IdentityMapper.map(IdentityMapper.java:38)
    at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:50)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:436)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:372)
    at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1136)
    at org.apache.hadoop.mapred.Child.main(Child.java:249)

13/12/29 17:59:17 INFO mapred.JobClient: Task Id : attempt_201312261425_0013_m_000000_2, Status : FAILED
java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, recieved org.apache.hadoop.io.LongWritable
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1014)
    at org.apache.hadoop.mapred.MapTask$OldOutputCollector.collect(MapTask.java:592)
    at org.apache.hadoop.mapred.lib.IdentityMapper.map(IdentityMapper.java:38)
    at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:50)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:436)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:372)
    at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1136)
    at org.apache.hadoop.mapred.Child.main(Child.java:249)

13/12/29 17:59:17 INFO mapred.JobClient: Task Id : attempt_201312261425_0013_m_000001_2, Status : FAILED
java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, recieved org.apache.hadoop.io.LongWritable
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1014)
    at org.apache.hadoop.mapred.MapTask$OldOutputCollector.collect(MapTask.java:592)
    at org.apache.hadoop.mapred.lib.IdentityMapper.map(IdentityMapper.java:38)
    at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:50)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:436)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:372)
    at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1136)
    at org.apache.hadoop.mapred.Child.main(Child.java:249)

13/12/29 17:59:22 INFO mapred.JobClient: Job complete: job_201312261425_0013
13/12/29 17:59:22 INFO mapred.JobClient: Counters: 7
13/12/29 17:59:22 INFO mapred.JobClient:   Job Counters 
13/12/29 17:59:22 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=42572
13/12/29 17:59:22 INFO mapred.JobClient:     Total time spent by all reduces waiting after reserving slots (ms)=0
13/12/29 17:59:22 INFO mapred.JobClient:     Total time spent by all maps waiting after reserving slots (ms)=0
13/12/29 17:59:22 INFO mapred.JobClient:     Launched map tasks=8
13/12/29 17:59:22 INFO mapred.JobClient:     Data-local map tasks=8
13/12/29 17:59:22 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=0
13/12/29 17:59:22 INFO mapred.JobClient:     Failed map tasks=1
13/12/29 17:59:22 INFO mapred.JobClient: Job Failed: # of failed Map Tasks exceeded allowed limit. FailedCount: 1. LastFailedTask: task_201312261425_0013_m_000000
Exception in thread "main" java.io.IOException: Job failed!
    at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1327)
    at GatkWordCount.main(GatkWordCount.java:51)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:601)
    at org.apache.hadoop.util.RunJar.main(RunJar.java:156)

Please suggest what I need to change in my code to make it run correctly. Thanks for your help.

2 Answers:

Answer 0 (score: 3):

In this example you have not specified a mapper class, so the default identity mapper is used, and it cannot do the job here.

This is because the TextInputFormat you specified produces LongWritable (the byte offset of the line) as the key and Text (the line contents) as the value. The default identity mapper therefore faithfully emits the LongWritable and Text unchanged. Since you declared the output key class as Text, there is a class mismatch between the key the mapper emits (LongWritable) and the key type expected by the map-output collection system (Text). You have a similar mismatch on the value field, but the system fails on the key field first.

To fix this, you have to write your own mapper class that takes LongWritable, Text as input and outputs Text, IntWritable, as sketched below.
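
For reference, a minimal mapper along those lines might look like this, using the old mapred API to match the question's code. The word-count-style tokenization is only an assumed placeholder, since the question never says what the map step should actually compute:

    // Assumes the same imports as the question's code
    // (java.util.*, org.apache.hadoop.io.*, org.apache.hadoop.mapred.*).
    public static class Map extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        public void map(LongWritable key, Text value,
                        OutputCollector<Text, IntWritable> output,
                        Reporter reporter) throws IOException {
            // Emit each token with a count of 1, so both the key and the value
            // match the types declared via setOutputKeyClass/setOutputValueClass.
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, ONE);
            }
        }
    }

You would then register it in main() with conf.setMapperClass(Map.class);.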

Edit: I just took a closer look at your code. You are simply using the MapReduce framework to execute a Java jar inside the reducer, which seems to plainly go against the spirit of Hadoop (using MapReduce to compute over data in HDFS). I would actually re-examine what you are trying to accomplish with this application rather than spend more time making it work inside MapReduce.

Answer 1 (score: 0):

I modified the code as shown below:

    import java.io.*;
    import java.util.*;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.conf.*;
    import org.apache.hadoop.io.*;
    import org.apache.hadoop.mapred.*;
    import org.apache.hadoop.util.*;

    public class GatkWordCount {

        public static class Reduce extends MapReduceBase implements Reducer<LongWritable, Text, LongWritable, Text> {
            public void reduce(LongWritable key, Iterator<Text> values, OutputCollector<LongWritable, Text> output, Reporter reporter) throws IOException {
                String cmd = "java -jar /home/hduser/apps/hadoop/GenomeAnalysisTK.jar -T RealignerTargetCreator -R /user/hduser/gatkinput/gatkinput/ucsc.hg19.fasta -I /user/hduser/gatkinput/gatkinput/groupbam -known /user/hduser/gatkinput/gatkinput/Mills_and_1000G_gold_standard.indels.hg19.vcf -o /user/hduser/gatkinput/gatkinput/target_intervals.list";

                try {
                    System.out.println("GATK cmd===>" + cmd);
                    Process p = Runtime.getRuntime().exec(cmd);
                    p.waitFor();
                    System.out.println("p.exitValue-->" + p.exitValue());
                } catch (Exception e) {
                    System.out.println("Exception-->" + e.getMessage());
                }
            }
        }

        public static void main(String[] args) throws Exception {
            JobConf conf = new JobConf(GatkWordCount.class);
            conf.setJobName("GatkWordCount");

            conf.setReducerClass(Reduce.class);
            //conf.setMapperClass(Map.class);

            conf.setOutputKeyClass(LongWritable.class);
            conf.setOutputValueClass(Text.class);

            conf.setInputFormat(TextInputFormat.class);
            conf.setOutputFormat(TextOutputFormat.class);

            FileInputFormat.setInputPaths(conf, new Path(args[0]));
            FileOutputFormat.setOutputPath(conf, new Path(args[1]));

            JobClient.runJob(conf);
        }
    }

[1] Now the job runs without errors, but the reduce phase gets stuck at 66%.
[2] There are no errors in the userlogs/../stderr file. The System.out.println output does appear in the stdout file, but why is it printed there over and over again?
[3] p.exitValue-->1 is printed by the code, which means the child process terminated abnormally, yet there is nothing wrong in the logs.

Could you tell me what the problem with this code is, or suggest an alternative way to execute the jar file?

PS: I cannot upvote the answer because I don't have 15 reputation yet. Thanks.
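
A few observations on these symptoms. The reduce progress counter spends its first two thirds on shuffle and sort, so sitting at 66% just means the reduce function itself is running, i.e. the job is waiting on the GATK child process. The println repeats because reduce() is invoked once per distinct key, so the command is launched again for every key in the input. As for the exit value of 1: the command passes HDFS paths (/user/hduser/...) to a plain java child process, which resolves them against the task node's local filesystem, so GATK likely fails because it cannot find its input files. Moreover, the child's stdout/stderr are never read, so its actual error message is lost, and a full pipe buffer can even block the child. Below is a sketch, not taken from the original code (the helper name execAndDrain is made up here), of draining the child's output into the task log so the real GATK error becomes visible:

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStreamReader;

    // Illustrative helper, not part of the original code: run a command and
    // drain its combined stdout/stderr so the child cannot block on a full
    // pipe buffer and so its error text shows up in the task's stdout log.
    public static int execAndDrain(String... cmd) throws IOException, InterruptedException {
        ProcessBuilder pb = new ProcessBuilder(cmd);
        pb.redirectErrorStream(true);             // merge stderr into stdout
        Process p = pb.start();
        BufferedReader r = new BufferedReader(new InputStreamReader(p.getInputStream()));
        String line;
        while ((line = r.readLine()) != null) {
            System.out.println("child> " + line); // echo into the task log
        }
        return p.waitFor();                       // the child's exit code
    }

With the output drained this way, the stdout log of the failing task attempt should show GATK's own error message, which would confirm whether the local-versus-HDFS path mismatch is the cause.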