Sorting Hadoop output by value

Time: 2014-12-22 02:52:23

Tags: hadoop

I am trying to sort by value by running two mappers and two reducers, but when the second job starts it fails with:

    14/12/21 18:43:35 ERROR security.UserGroupInformation: PriviledgedActionException as:cloudera (auth:SIMPLE) cause:org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory hdfs://localhost.localdomain:8020/user/cloudera/wordcount/output already exists
    Exception in thread "main" org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory hdfs://localhost.localdomain:8020/user/cloudera/wordcount/output already exists

Here is my code:

package org.myorg;

import java.io.IOException;
import java.util.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapreduce.Job;

public class WordCount {

public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
        String line = value.toString();
        StringTokenizer tokenizer = new StringTokenizer(line);
        while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            output.collect(word, one);
        }
    }
}

public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {

    public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
        int sum = 0;
        while (values.hasNext()) {
            sum += values.next().get();
        }
        output.collect(key, new IntWritable(sum));
    }
}

class Map1 extends MapReduceBase implements Mapper<Object, Text, IntWritable, Text> {

    public void map(Object key, Text value, OutputCollector<IntWritable, Text> collector, Reporter arg3) throws IOException {
        String line = value.toString();
        StringTokenizer stringTokenizer = new StringTokenizer(line);
        {
            int number = 999;
            String word = "empty";

            if (stringTokenizer.hasMoreTokens()) {
                String str0 = stringTokenizer.nextToken();
                word = str0.trim();
            }

            if (stringTokenizer.hasMoreElements()) {
                String str1 = stringTokenizer.nextToken();
                number = Integer.parseInt(str1.trim());
            }
            collector.collect(new IntWritable(number), new Text(word));
        }

    }

}

class Reduce1 extends MapReduceBase implements Reducer<IntWritable, Text, IntWritable, Text> {

    public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<IntWritable, Text> arg2, Reporter arg3) throws IOException {
        while ((values.hasNext())) {
            arg2.collect(key, values.next());
        }
    }

}

public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("wordCount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path("wordcount/output"));

    // JobClient.runJob(conf);
    //------------------------------------------------------------------
    JobConf conf2 = new JobConf(WordCount.class);
    conf2.setJobName("WordCount1");

    conf2.setOutputKeyClass(Text.class);
    conf2.setOutputValueClass(IntWritable.class);

    conf2.setMapperClass(Map1.class);
    conf2.setCombinerClass(Reduce1.class);
    conf2.setReducerClass(Reduce1.class);

    conf2.setInputFormat(TextInputFormat.class);
    conf2.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf2, new Path("wordcount/output/part-00000"));
    FileOutputFormat.setOutputPath(conf2, new Path(args[1]));

    Job job1 = new Job(conf);
    Job job2 = new Job(conf2);

    job1.submit();
    if (job1.waitForCompletion(true)) {
        job2.submit();
        job2.waitForCompletion(true);
    }
}
}

I have already tried changing the paths several times, and even created a new directory named tmp, but no luck.

Current error message:

    14/12/21 19:58:12 INFO mapred.JobClient: Running job: job_201412211623_0042
    14/12/21 19:58:13 INFO mapred.JobClient:  map 0% reduce 0%
    14/12/21 19:58:35 INFO mapred.JobClient: Task Id :      attempt_201412211623_0042_m_000001_0, Status : FAILED
    java.lang.RuntimeException: Error in configuring object
    at org.apache.hadoop.util.ReflectionUtils.setJobConf(ReflectionUtils.java:109)
    at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:75)
    at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:133)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:413)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:332)
    at org.apache.hadoop.mapred.Child$4.run(Child.java:268)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:396)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1438)
    at org.apache.hadoop.mapred.Child.main(Child.java:262)
Caused by: java.lang.reflect.InvocationTargetException
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.ja
14/12/21 19:58:35 INFO mapred.JobClient: Task Id : attempt_201412211623_0042_m_000000_0, Status : FAILED
java.lang.RuntimeException: Error in configuring object
    at org.apache.hadoop.util.ReflectionUtils.setJobConf(ReflectionUtils.java:109)
    at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:75)
    at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:133)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:413)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:332)
    at org.apache.hadoop.mapred.Child$4.run(Child.java:268)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:396)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1438)
    at org.apache.hadoop.mapred.Child.main(Child.java:262)
Caused by: java.lang.reflect.InvocationTargetException
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.ja
14/12/21 19:58:54 INFO mapred.JobClient: Task Id : attempt_201412211623_0042_m_000001_1, Status : FAILED
java.lang.RuntimeException: Error in configuring object

3 Answers:

Answer 0 (score: 0)

All the errors say that the wordcount/output directory already exists. I see that you hard-coded the output directory of the first MR job (FileOutputFormat.setOutputPath(conf, new Path("wordcount/output"));).

If that directory (output) already exists, the job fails, because Hadoop refuses to overwrite existing output. Try deleting the directory and running the job against a fresh one.
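Rather than deleting the directory by hand before every run, you can also remove it programmatically before submitting. A minimal sketch against the question's conf, using the standard FileSystem API (the path is the one hard-coded above):

    // Recursively delete the stale output directory before submitting the job,
    // so that FileOutputFormat does not fail with FileAlreadyExistsException.
    Path out = new Path("wordcount/output");
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(out)) {
        fs.delete(out, true); // true = recursive
    }
    FileOutputFormat.setOutputPath(conf, out);

FileSystem here is org.apache.hadoop.fs.FileSystem.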

Answer 1 (score: 0)

Your two jobs may be trying to write to the same location. In HDFS we cannot update or overwrite files in place; if you want to write to the same location again, you first have to delete the existing file or directory (for example with hadoop fs -rm -r <path>).
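If you prefer not to delete anything, one alternative (a sketch, not taken from this answer) is to give every run a fresh output path, for example by appending a timestamp:

    // Sketch: avoid the collision entirely with a unique output path per run.
    Path out = new Path("wordcount/output_" + System.currentTimeMillis());
    FileOutputFormat.setOutputPath(conf, out);

The second job then has to read from this generated path rather than a hard-coded one.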

Here are some useful references:

chaining-multiple-mapreduce-jobs-in-hadoop

job chaining

Answer 2 (score: 0)

I suggest you use the new API.

This example is based on the new API:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;

// MyMapper1/MyReducer1 and MyMapper2/MyReducer2 are the mapper and reducer
// implementations for the two jobs (not shown here).
public class ChainJobs extends Configured implements Tool {

 // Output directory of job 1, which becomes the input of job 2.
 private static final String OUTPUT_PATH = "intermediate_output";

 @Override
 public int run(String[] args) throws Exception {
  /*
   * Job 1
   */
  Configuration conf = getConf();
  Job job = new Job(conf, "Job1");
  job.setJarByClass(ChainJobs.class);

  job.setMapperClass(MyMapper1.class);
  job.setReducerClass(MyReducer1.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  // Job 1 reads the user-supplied input and writes to the intermediate path.
  TextInputFormat.addInputPath(job, new Path(args[0]));
  TextOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));

  // Block until job 1 has finished before starting job 2.
  job.waitForCompletion(true);

  /*
   * Job 2
   */
  Configuration conf2 = getConf();
  Job job2 = new Job(conf2, "Job 2");
  job2.setJarByClass(ChainJobs.class);

  job2.setMapperClass(MyMapper2.class);
  job2.setReducerClass(MyReducer2.class);

  job2.setOutputKeyClass(Text.class);
  job2.setOutputValueClass(Text.class);

  job2.setInputFormatClass(TextInputFormat.class);
  job2.setOutputFormatClass(TextOutputFormat.class);

  // Job 2 reads job 1's intermediate output and writes the final result.
  TextInputFormat.addInputPath(job2, new Path(OUTPUT_PATH));
  TextOutputFormat.setOutputPath(job2, new Path(args[1]));

  return job2.waitForCompletion(true) ? 0 : 1;
 }
}

The line private static final String OUTPUT_PATH = "intermediate_output"; defines the output path of the first job, which then serves as the input of the second job.
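Since ChainJobs implements Tool, you would typically launch it through ToolRunner, which also parses the generic Hadoop options. A minimal driver sketch, assuming the ChainJobs class above:

    // Hypothetical entry point: run the chained jobs through ToolRunner.
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Configuration(), new ChainJobs(), args);
        System.exit(exitCode);
    }

ToolRunner lives in org.apache.hadoop.util.ToolRunner.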

Refer to this.

Hope this helps.