Purpose:
I am trying to merge data using MapReduce. I have several sets of data in the same folder.
Approach:
So I run the MapReduce merge job several times in sequence within one program/flow.
Problem:
The problem I face is not failed jobs but successful jobs that produce no output. The first iteration (sometimes the first two) always produces output (part-r-00000), but the following ones do not. I am using a sample dataset that is very small in size and volume (1~2 KB, about 5 files).
What I have tried:
Making the thread sleep 5 seconds after each run, to no effect. I also tried checking with WebHDFS after a longer wait, and there is still no such file.
Could you shed some light on this? Thanks in advance. Image:
Code:
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package mergedata;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
*
* @author abcdefg
*/
public class MergeData extends Configured implements Tool {

    /**
     * @param args the command line arguments
     */
    public static class ReadMapper
            extends Mapper<Object, Text, Text, IntWritable> {

        @Override
        public void map(Object key, Text value, Mapper.Context context
        ) throws IOException, InterruptedException {
            context.write(new Text(value.toString()), new IntWritable(1));
        }
    }

    public static class MergeReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values,
                           Reducer.Context context
        ) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        FileSystem hdfs = FileSystem.get(conf);
        args = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (args.length != 3) {
            System.err.println(args.length);
            System.err.println("Usage: mergedata <input folder> <temporary folder> <output folder>");
            System.exit(1);
        }
//        FileSystem fs = FileSystem.get(conf);
//        ContentSummary cs = fs.getContentSummary(new Path(args[0]));
//        long fileCount = cs.getFileCount();
        Job job = Job.getInstance(conf);
        job.setJarByClass(MergeData.class);
        job.setMapperClass(ReadMapper.class);
        job.setReducerClass(MergeReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
//        String files = ().replaceAll(",", "," + args[0] + "/");
//        FileInputFormat.addInputPaths(job, files);
        int jobComplete = 1;

        // Group input files by the first 12 characters of their names (e.g. "cdr_20150701"),
        // keeping a comma-separated list of paths and the accumulated size per group.
        FileStatus[] fileStatus = hdfs.listStatus(new Path(args[0]));
        HashMap<String, Pair<String, Long>> map = new HashMap<String, Pair<String, Long>>();
        String tempName;
        String tempKey;
        Path tempPath;
        for (FileStatus fileStatu : fileStatus) {
            tempPath = fileStatu.getPath();
            tempName = tempPath.getName();
            tempKey = tempName.substring(0, 12);
            if (map.containsKey(tempKey)) {
                map.put(tempKey, new Pair(map.get(tempKey).getLeft() + "," +
                        tempPath.toString(),
                        map.get(tempKey).getRight() + fileStatu.getLen()));
            } else {
                map.put(tempKey, new Pair(tempPath.toString(), fileStatu.getLen()));
            }
        }

        String[] files = map.keySet().toArray(new String[map.keySet().size()]);
        String[] inputFiles;
//        String[] files = args[1].split(",");

        // Run one merge job per group, then replace the main file in args[2] with the new output.
        for (String file : files) {
            System.out.println("file = " + file);
//            FileInputFormat.addInputPath(job, new Path(args[0] + "/" + file + "*"));
            System.out.println(args[2] + "/" + file);
            if (hdfs.exists(new Path(args[2] + "/" + file))) {
                System.out.println(file + " exists in " + args[2]);
                map.put(file, new Pair(
                        map.get(file).getLeft() + "," + args[2] + "/" + file,
                        map.get(file).getRight() + hdfs.getFileStatus(new Path(args[2] + "/" + file)).getLen()
                ));
            }
            System.out.println("MR job input files : " + map.get(file).getLeft());
            FileInputFormat.setInputPaths(job, map.get(file).getLeft());

            System.out.println("MR job output dir : " + args[1] + "/" + file);
            FileOutputFormat.setOutputPath(job, new Path(args[1] + "/" + file));
            if (hdfs.exists(new Path(args[1] + "/" + file))) {
                hdfs.delete(new Path(args[1] + "/" + file), true); // Shouldn't occur
            }

            jobComplete = Math.max(jobComplete, (job.waitForCompletion(true)) ? 0 : 1);
//            hdfs.getFileStatus(tempFile)
            if (job.isSuccessful()) {
                // Following sequence includes size check before deleting files
                FileStatus[] filesStatuz = hdfs.listStatus(new Path(args[1] + "/" + file + "/part-r-00000"));
                System.out.println("filesStatuz[0].getLen() = " + filesStatuz[0].getLen());
                System.out.println("totalLen = " + map.get(file).getRight());
                if (filesStatuz[0].getLen() >= map.get(file).getRight()) {
                    if (hdfs.exists(new Path(args[2] + "/" + file))) {
                        System.out.println("Found the main file of " + file);
                        hdfs.rename(new Path(args[2] + "/" + file), new Path(args[2] + "/" + file + "_tmp"));
                    }
                    hdfs.rename(new Path(args[1] + "/" + file + "/part-r-00000"), new Path(args[2] + "/" + file));
                    hdfs.delete(new Path(args[1] + "/" + file), true);
                    System.out.println("Done safe replacement");
//                    hdfs.delete(new Path(args[0] + "/" + file + "*"), false);
                    inputFiles = map.get(file).getLeft().split(",");
                    for (String inputFile : inputFiles) {
                        if (!inputFile.equals(args[2] + "/" + file)) {
                            hdfs.delete(new Path(inputFile), false);
                            System.out.println(inputFile + " has been deleted");
                        }
                    }
                    if (hdfs.exists(new Path(args[2] + "/" + file + "_tmp"))) {
                        hdfs.delete(new Path(args[2] + "/" + file + "_tmp"), false);
                        System.out.println("Deleted previous main file of " + file);
                    }
                } else {
                    System.out.println("Merging of " + file + " might have failed. Input and output size doesn't tally");
                }
            }
        }
        return (jobComplete);
    }

    public static void main(String[] args) throws Exception {
        // TODO code application logic here
        int exitCode = ToolRunner.run(new MergeData(), args);
        System.exit(exitCode);
    }

    public class Pair<L, R> {

        private final L left;
        private final R right;

        public Pair(L left, R right) {
            this.left = left;
            this.right = right;
        }

        public L getLeft() { return left; }

        public R getRight() { return right; }

        @Override
        public int hashCode() { return left.hashCode() ^ right.hashCode(); }

        @Override
        public boolean equals(Object o) {
            if (!(o instanceof Pair)) return false;
            Pair pairo = (Pair) o;
            return this.left.equals(pairo.getLeft()) &&
                   this.right.equals(pairo.getRight());
        }
    }
}
Flow:
In essence, it groups files with the same date together, e.g. cdr_20150701_0 and cdr_20150701_1 from the input folder (args[0]), into a main file, e.g. cdr_20150701, placed in the merge folder (args[2]). But if such a main file already exists before the merge, then all the files, e.g. cdr_20150701_0, cdr_20150701_1 and cdr_20150701, are merged into a new cdr_20150701. The part-r-00000 file is stored in the temporary folder (args[1]). After a successful transfer, the temporary folder and the part file are deleted.
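In short, the replacement step described in the flow above boils down to something like the sketch below (the names fs, tmpDir, mergeDir and key are illustrative placeholders, not taken from the code):

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

class SafeReplace {
    // tmpDir corresponds to args[1], mergeDir to args[2], and key to a date key such as "cdr_20150701".
    static void promote(FileSystem fs, String tmpDir, String mergeDir, String key) throws IOException {
        Path jobOut  = new Path(tmpDir + "/" + key + "/part-r-00000");
        Path mainOut = new Path(mergeDir + "/" + key);
        Path backup  = new Path(mergeDir + "/" + key + "_tmp");
        if (fs.exists(mainOut)) {
            fs.rename(mainOut, backup);   // keep the old main file until the new one is in place
        }
        fs.rename(jobOut, mainOut);       // promote the reducer output to become the new main file
        fs.delete(new Path(tmpDir + "/" + key), true); // drop the temporary job output folder
        if (fs.exists(backup)) {
            fs.delete(backup, false);     // discard the previous main file
        }
    }
}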
Answer (score: 1):
Have you tried the getmerge command? It might be useful in your case. If all you need is to merge the data, you may not need a MapReduce job for the merge at all.
hadoop fs -getmerge <src> <localdst> [addnl]
It takes a source directory and a destination file as input and concatenates the files in src into the destination local file. Optionally, addnl can be set to enable adding a newline character at the end of each file.
http://hadoop.apache.org/docs/r2.7.0/hadoop-project-dist/hadoop-common/FileSystemShell.html
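If you would rather stay inside a Java driver, FileUtil.copyMerge from the Hadoop 2.x API gives a similar effect to getmerge, concatenating every file under a source directory into a single destination file. A minimal sketch, with hypothetical paths:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class CopyMergeExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Concatenate every file under /input/cdr_20150701 into /merged/cdr_20150701
        // (both paths are hypothetical). The last argument is an optional string
        // appended after each file, e.g. "\n" to mimic getmerge's addnl option.
        FileUtil.copyMerge(fs, new Path("/input/cdr_20150701"),
                           fs, new Path("/merged/cdr_20150701"),
                           false /* keep the source files */, conf, null);
    }
}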