Purpose:
I am trying to merge data using MapReduce. I have several sets of data in the same folder.
Approach:
So I run the MapReduce merge job several times in sequence within one program/flow.
Problem:
The problem I face is not failed jobs but successful jobs that produce no output. The first iteration (sometimes the first two) always produces output (part-r-00000), but the following ones do not. I am using a sample dataset that is very small in size and volume (1~2 KB, about 5 files).
What I have tried:
Making the thread sleep 5 seconds after each run, to no effect. I also tried checking with WebHDFS after a longer wait, and there is still no such file.
Could you shed some light on this? Thanks in advance. Image:
Code:
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package mergedata;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
*
* @author abcdefg
*/
public class MergeData extends Configured implements Tool {

    /**
     * @param args the command line arguments
     */
    public static class ReadMapper
            extends Mapper<Object, Text, Text, IntWritable> {

        @Override
        public void map(Object key, Text value, Mapper.Context context
        ) throws IOException, InterruptedException {
            context.write(new Text(value.toString()), new IntWritable(1));
        }
    }

    public static class MergeReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values,
                           Reducer.Context context
        ) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        FileSystem hdfs = FileSystem.get(conf);
        args = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (args.length != 3) {
            System.err.println(args.length);
            System.err.println("Usage: mergedata <input folder> <temporary folder> <output folder>");
            System.exit(1);
        }
//        FileSystem fs = FileSystem.get(conf);
//        ContentSummary cs = fs.getContentSummary(new Path(args[0]));
//        long fileCount = cs.getFileCount();
        Job job = Job.getInstance(conf);
        job.setJarByClass(MergeData.class);
        job.setMapperClass(ReadMapper.class);
        job.setReducerClass(MergeReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
//        String files = ().replaceAll(",", "," + args[0] + "/");
//        FileInputFormat.addInputPaths(job, files);
        int jobComplete = 1;

        // Group input files by the first 12 characters of their names (e.g. "cdr_20150701"),
        // keeping a comma-separated list of paths and the accumulated size per group.
        FileStatus[] fileStatus = hdfs.listStatus(new Path(args[0]));
        HashMap<String, Pair<String, Long>> map = new HashMap<String, Pair<String, Long>>();
        String tempName;
        String tempKey;
        Path tempPath;
        for (FileStatus fileStatu : fileStatus) {
            tempPath = fileStatu.getPath();
            tempName = tempPath.getName();
            tempKey = tempName.substring(0, 12);
            if (map.containsKey(tempKey)) {
                map.put(tempKey, new Pair(map.get(tempKey).getLeft() + "," +
                        tempPath.toString(),
                        map.get(tempKey).getRight() + fileStatu.getLen()));
            } else {
                map.put(tempKey, new Pair(tempPath.toString(), fileStatu.getLen()));
            }
        }

        String[] files = map.keySet().toArray(new String[map.keySet().size()]);
        String[] inputFiles;
//        String[] files = args[1].split(",");

        // Run one merge job per group, then replace the main file in args[2] with the new output.
        for (String file : files) {
            System.out.println("file = " + file);
//            FileInputFormat.addInputPath(job, new Path(args[0] + "/" + file + "*"));
            System.out.println(args[2] + "/" + file);
            if (hdfs.exists(new Path(args[2] + "/" + file))) {
                System.out.println(file + " exists in " + args[2]);
                map.put(file, new Pair(
                        map.get(file).getLeft() + "," + args[2] + "/" + file,
                        map.get(file).getRight() + hdfs.getFileStatus(new Path(args[2] + "/" + file)).getLen()
                ));
            }
            System.out.println("MR job input files : " + map.get(file).getLeft());
            FileInputFormat.setInputPaths(job, map.get(file).getLeft());

            System.out.println("MR job output dir : " + args[1] + "/" + file);
            FileOutputFormat.setOutputPath(job, new Path(args[1] + "/" + file));
            if (hdfs.exists(new Path(args[1] + "/" + file))) {
                hdfs.delete(new Path(args[1] + "/" + file), true); // Shouldn't occur
            }

            jobComplete = Math.max(jobComplete, (job.waitForCompletion(true)) ? 0 : 1);
//            hdfs.getFileStatus(tempFile)
            if (job.isSuccessful()) {
                // Following sequence includes size check before deleting files
                FileStatus[] filesStatuz = hdfs.listStatus(new Path(args[1] + "/" + file + "/part-r-00000"));
                System.out.println("filesStatuz[0].getLen() = " + filesStatuz[0].getLen());
                System.out.println("totalLen = " + map.get(file).getRight());
                if (filesStatuz[0].getLen() >= map.get(file).getRight()) {
                    if (hdfs.exists(new Path(args[2] + "/" + file))) {
                        System.out.println("Found the main file of " + file);
                        hdfs.rename(new Path(args[2] + "/" + file), new Path(args[2] + "/" + file + "_tmp"));
                    }
                    hdfs.rename(new Path(args[1] + "/" + file + "/part-r-00000"), new Path(args[2] + "/" + file));
                    hdfs.delete(new Path(args[1] + "/" + file), true);
                    System.out.println("Done safe replacement");
//                    hdfs.delete(new Path(args[0] + "/" + file + "*"), false);
                    inputFiles = map.get(file).getLeft().split(",");
                    for (String inputFile : inputFiles) {
                        if (!inputFile.equals(args[2] + "/" + file)) {
                            hdfs.delete(new Path(inputFile), false);
                            System.out.println(inputFile + " has been deleted");
                        }
                    }
                    if (hdfs.exists(new Path(args[2] + "/" + file + "_tmp"))) {
                        hdfs.delete(new Path(args[2] + "/" + file + "_tmp"), false);
                        System.out.println("Deleted previous main file of " + file);
                    }
                } else {
                    System.out.println("Merging of " + file + " might have failed. Input and output size doesn't tally");
                }
            }
        }
        return (jobComplete);
    }

    public static void main(String[] args) throws Exception {
        // TODO code application logic here
        int exitCode = ToolRunner.run(new MergeData(), args);
        System.exit(exitCode);
    }

    public class Pair<L, R> {

        private final L left;
        private final R right;

        public Pair(L left, R right) {
            this.left = left;
            this.right = right;
        }

        public L getLeft() { return left; }

        public R getRight() { return right; }

        @Override
        public int hashCode() { return left.hashCode() ^ right.hashCode(); }

        @Override
        public boolean equals(Object o) {
            if (!(o instanceof Pair)) return false;
            Pair pairo = (Pair) o;
            return this.left.equals(pairo.getLeft()) &&
                   this.right.equals(pairo.getRight());
        }
    }
}
Flow:
In essence, it groups files with the same date together, e.g. cdr_20150701_0 and cdr_20150701_1 from the input folder (args[0]), into a main file, e.g. cdr_20150701, placed in the merge folder (args[2]). But if such a main file already exists before the merge, then all the files, e.g. cdr_20150701_0, cdr_20150701_1 and cdr_20150701, are merged into a new cdr_20150701. The part-r-00000 file is stored in the temporary folder (args[1]). After a successful transfer, the temporary folder and the part file are deleted.
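In short, the replacement step described in the flow above boils down to something like the sketch below (the names fs, tmpDir, mergeDir and key are illustrative placeholders, not taken from the code):

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

class SafeReplace {
    // tmpDir corresponds to args[1], mergeDir to args[2], and key to a date key such as "cdr_20150701".
    static void promote(FileSystem fs, String tmpDir, String mergeDir, String key) throws IOException {
        Path jobOut  = new Path(tmpDir + "/" + key + "/part-r-00000");
        Path mainOut = new Path(mergeDir + "/" + key);
        Path backup  = new Path(mergeDir + "/" + key + "_tmp");
        if (fs.exists(mainOut)) {
            fs.rename(mainOut, backup);   // keep the old main file until the new one is in place
        }
        fs.rename(jobOut, mainOut);       // promote the reducer output to become the new main file
        fs.delete(new Path(tmpDir + "/" + key), true); // drop the temporary job output folder
        if (fs.exists(backup)) {
            fs.delete(backup, false);     // discard the previous main file
        }
    }
}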
Answer (score: 1):
Have you tried the getmerge command? It might be useful in your case. If all you need is to merge the data, you may not need a MapReduce job for the merge at all.
hadoop fs -getmerge <src> <localdst> [addnl]
It takes a source directory and a destination file as input and concatenates the files in src into the destination local file. Optionally, addnl can be set to enable adding a newline character at the end of each file.
http://hadoop.apache.org/docs/r2.7.0/hadoop-project-dist/hadoop-common/FileSystemShell.html
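If you would rather stay inside a Java driver, FileUtil.copyMerge from the Hadoop 2.x API gives a similar effect to getmerge, concatenating every file under a source directory into a single destination file. A minimal sketch, with hypothetical paths:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class CopyMergeExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Concatenate every file under /input/cdr_20150701 into /merged/cdr_20150701
        // (both paths are hypothetical). The last argument is an optional string
        // appended after each file, e.g. "\n" to mimic getmerge's addnl option.
        FileUtil.copyMerge(fs, new Path("/input/cdr_20150701"),
                           fs, new Path("/merged/cdr_20150701"),
                           false /* keep the source files */, conf, null);
    }
}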