col1 col2 col3 col4 col5
A 120 140 160 180
B 200 220 240 260
D 400 420 440 460
col1 col2 col3 col4 col5
A 110 140 160 180
B 200 220 240 260
C 600 620 640 660
A 120 140 160 180
A 110 140 160 180
B 200 220 240 260
D 400 420 440 460
C 600 620 640 660
1) col1 和 col2 是这些记录的主键。如果任一主键对应的其余列的值发生了变化，我们就显示两条记录，例如：
in A.txt contain 1st Records:- A 120 140 160 180
in B.txt contain 1st Records:- A 110 140 160 180
此处 col2 之后的值已更改，因此必须显示两条记录
2) 如果两个文件中的记录没有变化（即完全相同），我们只需显示一条记录
3)在两个文件中显示所有其他记录
最终输出应该如下所示
A 120 140 160 180
A 110 140 160 180
B 200 220 240 260
D 400 420 440 460
C 600 620 640 660
答案 0 :(得分:3)
使用 PIG。加载这两个文件，用 UNION 合并记录，然后用 DISTINCT 去重。
-- Load both tab-delimited input files.
left_recs  = LOAD 'A.txt' USING PigStorage('\t');
right_recs = LOAD 'B.txt' USING PigStorage('\t');
-- Concatenate the two relations, then drop exact-duplicate rows,
-- leaving one copy of unchanged records and both copies of changed ones.
combined = UNION left_recs, right_recs;
deduped  = DISTINCT combined;
DUMP deduped;
答案 1 :(得分:1)
这是一个 MapReduce 解决方案：
将 2 个或更多文件放到同一个目录中（作为输入参数 arg1），它会把所有文件合并成一个满足您全部要求的输出文件。对于键（col1+col2）相同但 col3 到末列不一致的行，它会把两条记录都输出。更多信息请参阅代码注释……
/**
 * MapReduce job that merges all files in an input directory, keyed on
 * (col1, col2). Records whose remaining columns are identical in every
 * file collapse to one output row; records whose remaining columns differ
 * are all emitted. Output is tab-delimited.
 *
 * <p>Usage: {@code FileCompare <input-dir> <output-dir>} — the output
 * directory is deleted first if it already exists.
 */
public class FileCompare extends Configured implements Tool {

    /**
     * Mapper: emits key = "col1,col2" and value = "col3,...,colN" for each
     * whitespace-separated input record, skipping the first line of each split.
     * (Name keeps the original "Comapre" typo — it is part of the public API.)
     */
    public static class FileComapreMapper extends Mapper<Object, Text, Text, Text> {
        // Per-mapper line counter used only to skip the header row.
        // NOTE(review): each input split gets its own mapper instance, so this
        // reliably skips headers only when every file is a single split — confirm
        // for large inputs.
        int lineno = 0;

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            try {
                lineno++;
                System.out.println(lineno + " -> " + value);
                // Skip the header; comment this line out to include the header
                // in the output. (Original comment said "uncomment", which was
                // backwards — the line is active.)
                if (lineno == 1) return;

                // Input records are whitespace-separated.
                String[] fields = value.toString().split("\\s+");
                String col1Col2 = fields[0] + "," + fields[1]; // composite key

                // Join col3..colN with ','. StringBuilder avoids the original
                // O(n^2) concatenation, and building it this way also avoids the
                // substring(0, -1) StringIndexOutOfBoundsException that silently
                // dropped records with only two columns.
                StringBuilder col3ToLast = new StringBuilder();
                for (int i = 2; i < fields.length; i++) {
                    if (col3ToLast.length() > 0) {
                        col3ToLast.append(',');
                    }
                    col3ToLast.append(fields[i]);
                }
                // Send key/value pairs to the reducer.
                context.write(new Text(col1Col2), new Text(col3ToLast.toString()));
            } catch (Exception e) {
                // Best effort: log and skip malformed records instead of failing the job.
                System.err.println("Invalid Data at line: " + lineno + " Error: " + e.getMessage());
            }
        }
    }

    /**
     * Reducer: keeps each distinct "col3,...,colN" value per key and writes
     * tab-delimited records, so unchanged rows appear once and changed rows
     * appear once per variant.
     */
    public static class FileComapreReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Hadoop reuses a single Text instance while iterating over values,
            // so every element must be deep-copied before it is stored in the
            // set. The original add(record) stored the same mutating object and
            // could produce duplicate/corrupted entries.
            Set<Text> uniqueCol3tolast = new HashSet<Text>();
            for (Text record : values) {
                uniqueCol3tolast.add(new Text(record));
            }
            // Emit one tab-delimited record per distinct value.
            for (Text col3tolast : uniqueCol3tolast) {
                context.write(new Text(key.toString().replaceAll(",", "\t")),
                        new Text(col3tolast.toString().replaceAll(",", "\t")));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new FileCompare(), args);
        System.exit(res);
    }

    /**
     * Configures and runs the merge job.
     *
     * @param args args[0] = input directory, args[1] = output directory
     * @return 0 on success, 1 on job failure
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: <in> <out>");
            System.exit(2);
        }
        Configuration conf = this.getConf();
        Job job = Job.getInstance(conf, "merge-two-files");
        job.setJarByClass(FileCompare.class);
        job.setMapperClass(FileComapreMapper.class);
        job.setReducerClass(FileComapreReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Delete a pre-existing output directory so reruns do not fail with
        // FileAlreadyExistsException.
        Path dstFilePath = new Path(args[1]);
        try {
            FileSystem fs = dstFilePath.getFileSystem(conf);
            if (fs.exists(dstFilePath)) {
                fs.delete(dstFilePath, true);
            }
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        return job.waitForCompletion(true) ? 0 : 1;
    }
}