这个Java for HDInsight Hadoop有什么问题?

时间:2013-07-30 18:33:12

标签: java hadoop hdinsight

我试图找出当我尝试在hadoop上运行时,下面的Java无效的原因。

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

public class PageStat implements Tool {
private Configuration conf;

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Page visit statistics MR";
    job.setJobName(jobName);

    job.setJarByClass(PageStat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(PageStat.PageStatMapper.class);
    job.setReducerClass(PageStat.PageStatReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

    int status =  job.waitForCompletion(true) ? 0 : 1;
    return status;
}

public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new PageStat(), args);
    System.exit(exitCode);
}

public void setConf(Configuration conf) {
   this.conf = conf;
}

public Configuration getConf() {
    return conf;
}

public static class PageStatMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private Text keyHolder = new Text();
    private IntWritable valueHolder = new IntWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
        String[] items  =  value.toString().split(",");
        if (items.length == 3) {
            String url = items[1];
            keyHolder.set(url);
            Integer duration = Integer.parseInt(items[2]);
            valueHolder.set(duration);
            context.write(keyHolder, valueHolder);
        } else {
            context.getCounter("Error", "invalidData").increment(1);
        }
    }        
}   

public static class PageStatReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private Text keyHolder = new Text();
    private IntWritable valueHolder = new IntWritable();
    private String statType;
    private int count;
    private int totalTime;
    private int avTime;

    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        statType = conf.get("page.stat");
    }

    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
    throws IOException, InterruptedException {
        count = 0;
        totalTime = 0;
        for (IntWritable value : values){
            ++count;
            totalTime += value.get();
        } 
        avTime = totalTime / count;

        keyHolder.set(key);
        if (statType.equals("average")){
            valueHolder.set(avTime);
        } else {
            valueHolder.set(totalTime);
        }
        context.write(keyHolder, valueHolder);
    }
}    

}

错误是:

  

c:\ hadoop-training \ tutorial02-jobtracker&gt; hadoop jar PageStat.jar   PageStat jobtra cker / input / visit_5000000.txt jobtracker / output   13/07/29 11:24:50 INFO input.FileInputFormat:到的总输入路径   进程:1 log4j:ERROR无法重命名   [c:\ Hadoop \ hadoop-1.1.0-SNAPSHOT \ logs / hadoop.log] t o   [C:\ Hadoop的\ Hadoop的1.1.0-SNAPSHOT \日志/ hadoop.log.2013-07-26]。 13/07/29   11:24:51 INFO util.NativeCodeLoader:加载native-hadoop库   13/07/29 11:24:51 WARN snappy.LoadSnappy:Snappy原生图书馆没有   13/07/29 11:24:54 INFO mapred.JobClient:正在运行的工作:   job_201307261340_0001 13/07/29 11:24:55 INFO mapred.JobClient:map 0%   减少0%13/07/29 11:25:24 INFO mapred.JobClient:地图1%减少0%   13/07/29 11:25:27 INFO mapred.JobClient:地图6%减少0%13/07/29   11:25:30 INFO mapred.JobClient:地图14%减少0%13/07/29 11:25:35   INFO mapred.JobClient:map 22%reduce 0%13/07/29 11:25:38 INFO   mapred.JobClient:地图31%减少0%13/07/29 11:25:41信息   mapred.JobClient:地图35%减少0%13/07/29 11:25:44信息   mapred.JobClient:map 44%reduce 0%13/07/29 11:25:47 INFO   mapred.JobClient:map 50%reduce 0%13/07/29 11:26:03 INFO   mapred.JobClient:地图60%减少0%13/07/29 11:26:06信息   mapred.JobClient:map 64%reduce 0%13/07/29 11:26:09 INFO   mapred.JobClient:地图69%减少0%13/07/29 11:26:12信息   mapred.JobClient:地图76%减少0%13/07/29 11:26:15信息   mapred.JobClient:地图81%减少0%13/07/29 11:26:18信息   mapred.JobClient:地图85%减少0%13/07/29 11:26:21信息   mapred.JobClient:地图87%减少0%13/07/29 11:26:24信息   mapred.JobClient:地图92%减少0%13/07/29 11:26:27信息   mapred.JobClient:地图94%减少0%13/07/29 11:26:30信息   mapred.JobClient:地图96%减少0%13/07/29 11:26:33信息   mapred.JobClient:map 97%reduce 0%13/07/29 11:26:37 INFO   mapred.JobClient:地图99%减少8%13/07/29 11:26:40信息   mapred.JobClient:地图100%减少8%13/07/29 11:26:46信息   mapred.JobClient:地图100%减少25%13/07/29 11:26:54信息   mapred.JobClient:任务ID:attempt_201307261340_0001_r_0 00000_0,   状态:FAILED java.lang.NullPointerException           在PageStat $ PageStatReducer.reduce(PageStat.java:120)            在PageStat $ PageStatReducer.reduce(PageStat.java:96)           在org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:177)           在org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:651)   )            在org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:418)           在org.apache.hadoop.mapred.Child $ 4.run(Child.java:271)           at java.security.AccessController.doPrivileged(Native Method)           在javax.security.auth.Subject.doAs(Subject.java:396)            在org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInforma   tion.java:1135)           在org.apache.hadoop.mapred.Child.main(Child.java:265)

     

13/07/29 11:26:56 INFO mapred.JobClient:地图100%减少0%13/07/29   11:27:05 INFO mapred.JobClient:地图100%减少8%13/07/29 11:27:08   INFO mapred.JobClient:地图100%减少33%13/07/29 11:27:10信息   mapred.JobClient:任务ID:attempt_201307261340_0001_r_0 00000_1,   状态:FAILED java.lang.NullPointerException           在PageStat $ PageStatReducer.reduce(PageStat.java:120)           在PageStat $ PageStatReducer.reduce(PageStat.java:96)           在org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:177)            在org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:651)   )           在org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:418)           在org.apache.hadoop.mapred.Child $ 4.run(Child.java:271)            at java.security.AccessController.doPrivileged(Native Method)           在javax.security.auth.Subject.doAs(Subject.java:396)           在org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInforma   tion.java:1135)           在org.apache.hadoop.mapred.Child.main(Child.java:265)

     

13/07/29 11:27:11 INFO mapred.JobClient:地图100%减少0%13/07/29   11:27:20 INFO mapred.JobClient:地图100%减少8%13/07/29 11:27:23   INFO mapred.JobClient:地图100%减少25%13/07/29 11:27:25信息   mapred.JobClient:任务ID:attempt_201307261340_0001_r_0 00000_2,   状态:FAILED java.lang.NullPointerException           在PageStat $ PageStatReducer.reduce(PageStat.java:120)           在PageStat $ PageStatReducer.reduce(PageStat.java:96)            在org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:177)           在org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:651)   )           在org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:418)            在org.apache.hadoop.mapred.Child $ 4.run(Child.java:271)           at java.security.AccessController.doPrivileged(Native Method)           在javax.security.auth.Subject.doAs(Subject.java:396)           在org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInforma   tion.java:1135)           在org.apache.hadoop.mapred.Child.main(Child.java:265)

     

13/07/29 11:27:26 INFO mapred.JobClient:地图100%减少0%13/07/29   11:27:38 INFO mapred.JobClient:地图100%减少25%13/07/29 11:27:41   INFO mapred.JobClient:map 100%reduce 0%13/07/29 11:27:43 INFO   mapred.JobClient:工作完成:job_201307261340_0001 13/07/29   11:27:43 INFO mapred.JobClient:Counters:24 13/07/29 11:27:43 INFO   mapred.JobClient:Job Counters 13/07/29 11:27:43 INFO   mapred.JobClient:推出减少任务= 4 13/07/29 11:27:43 INFO   mapred.JobClient:SLOTS_MILLIS_MAPS = 179086 13/07/29 11:27:43 INFO   mapred.JobClient:所有人花费的总时间减少了wai ting之后   保留插槽(ms)= 0 13/07/29 11:27:43 INFO mapred.JobClient:
  在保留插槽(ms)= 0后,所有映射等待的总时间   13/07/29 11:27:43 INFO mapred.JobClient:推出地图任务= 4   13/07/29 11:27:43 INFO mapred.JobClient:数据本地地图任务= 4   13/07/29 11:27:43 INFO mapred.JobClient:失败减少任务= 1   13/07/29 11:27:43 INFO mapred.JobClient:
  SLOTS_MILLIS_REDUCES = 106513 13/07/29 11:27:43 INFO mapred.JobClient:
  FileSystemCounters 13/07/29 11:27:43 INFO mapred.JobClient:
  FILE_BYTES_READ = 179504086 13/07/29 11:27:43 INFO mapred.JobClient:
  HDFS_BYTES_READ = 254931072 13/07/29 11:27:43 INFO mapred.JobClient:
  FILE_BYTES_WRITTEN = 359099432 13/07/29 11:27:43 INFO mapred.JobClient:   文件输入格式计数器13/07/29 11:27:43 INFO mapred.JobClient:
  字节读取= 254930544 13/07/29 11:27:43 INFO mapred.JobClient:
  Map-Reduce Framework 13/07/29 11:27:43 INFO mapred.JobClient:Map   输出物化字节= 17949 9502 13/07/29 11:27:43信息   mapred.JobClient:合并输出记录= 0 13/07/29 11:27:43 INFO   mapred.JobClient:地图输入记录= 5000000 13/07/29 11:27:43   INFO mapred.JobClient:物理内存(字节)snapshot = 85 1607552   13/07/29 11:27:43 INFO mapred.JobClient:溢出记录= 10000000   13/07/29 11:27:43 INFO mapred.JobClient:地图输出   bytes = 169499478 13/07/29 11:27:43 INFO mapred.JobClient:CPU时间   花(ms)= 81308 13/07/29 11:27:43 INFO mapred.JobClient:总计   已提交的堆使用量(字节)= 746323968 13/07/29 11:27:43信息   mapred.JobClient:虚拟内存(字节)snapshot = 988 401664   13/07/29 11:27:43 INFO mapred.JobClient:合并输入记录= 0   13/07/29 11:27:43 INFO mapred.JobClient:地图输出   记录= 5000000 13/07/29 11:27:43 INFO mapred.JobClient:
  SPLIT_RAW_BYTES = 528

感谢!!!

2 个答案:

答案 0 :(得分:1)

我有类似的问题,你需要使用-D标志来执行:

-Dpage.stat=total

您可能会看到错误:

log4j:WARN No appenders could be found for logger (org.apache.hadoop.hdfs.DFSClient).
log4j:WARN Please initialize the log4j system properly.

这不是完整的答案,我自己仍然深究其中。

答案 1 :(得分:0)

堆栈跟踪中的行号似乎与发布的源代码对齐。自此次运行后代码是否已更改?

可能在if(statType ...)行上发生NullPointerException。我没有在配置中看到任何设置“page.stat”的内容,要么在run方法中进行硬编码,要么在作业提交中作为参数传递。这将导致statType成员初始化为null。