有两个数据集 A 和 B(每个数据集只有一列:ID)
Cat A
1
2
3
4
5
6
7
cat B
4
5
2
8
18
19
2197
Cat A-B
1
3
6
7
此减法分两步完成。步骤1:按 ID 将 A 与 B 做左连接(A LEFT JOIN B BY ID)。这将得到一个包含两列的数据集,其中第1列包含数据集 A 的所有条目,第2列只包含来自 B 的匹配条目(没有匹配时为空):
1
2 2
3
4 4
5 5
6
7
步骤2:对步骤1得到的数据集进行过滤,只保留第二个字段为空的记录。这样,我们就通过 LEFT JOIN 实现了 A-B。
我能够完成第1步,但无法完成第2步。以下是第1步的源代码:
/**
 * Cascading job that left-joins data set A with data set B on their single ID column.
 *
 * <p>The flow built here performs the join only: it writes the distinct
 * {@code (A_id, B_id)} pairs produced by the join to the output tap. It does not
 * remove the rows that found a match in B.
 *
 * <p>NOTE(review): to obtain the actual difference A-B, a filter that keeps only the
 * tuples whose {@code B_id} is empty still has to be added after the join (Cascading's
 * {@code FilterNotNull} applied to {@code B_id} through an {@code Each} pipe looks like
 * the right tool - confirm against the Cascading documentation).
 */
public class AMinusB {

    /**
     * Assembles the flow definition for {@code A LEFT JOIN B} on {@code A_id = B_id}.
     *
     * @param aTap      source tap bound to the left-hand pipe; must expose field {@code A_id}
     * @param bTap      source tap bound to the right-hand pipe; must expose field {@code B_id}
     * @param outputTap sink tap that receives the distinct {@code (A_id, B_id)} tuples
     * @return the flow definition, named {@code "A-B using left outer join"}
     */
    public static FlowDef createWorkflowLeftJoin(Tap aTap, Tap bTap,
            Tap outputTap) {
        Pipe bpipe = new Pipe("b_pipe");
        Pipe apipe = new Pipe("a_pipe");

        Fields b_user_id = new Fields("B_id");
        Fields a_user_id = new Fields("A_id");

        // A is the left-hand side, so the LeftJoin joiner keeps every A_id.
        Pipe joinPipe = new HashJoin(apipe, a_user_id, bpipe, b_user_id,
                new LeftJoin());

        // Keep only the two ID columns of the joined stream.
        Pipe retainPipe = new Pipe("retain", joinPipe);
        retainPipe = new Retain(retainPipe, new Fields("A_id", "B_id"));

        // Collapse duplicate (A_id, B_id) pairs.
        Pipe cdistPipe = new Pipe("UniquePipe", retainPipe);
        Fields selector = new Fields("A_id", "B_id");
        cdistPipe = new Unique(cdistPipe, selector);

        FlowDef flowDef = FlowDef.flowDef().addSource(apipe, aTap)
                .addSource(bpipe, bTap).addTailSink(cdistPipe, outputTap)
                .setName("A-B using left outer join");
        return flowDef;
    }

    /**
     * Wires tab-delimited, header-less HDFS taps for A, B and the output, then runs
     * the left-join flow to completion.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String aPath = "path to data set A";
        String bPath = "path to data set B";
        String outputPath = "path to output";

        Properties properties = new Properties();
        // Register this job's own class as the application jar class; the previous
        // reference pointed at an unrelated class (LocationsNumForAProduct).
        AppProps.setApplicationJarClass(properties, AMinusB.class);
        FlowConnector flowConnector = new Hadoop2MR1FlowConnector(properties);

        Fields aFields = new Fields("A_id");
        Tap aTap = new Hfs(new TextDelimited(aFields, false, "\t"), aPath);
        Fields bFields = new Fields("B_id");
        Tap bTap = new Hfs(new TextDelimited(bFields, false, "\t"), bPath);
        Tap outputTap = new Hfs(new TextDelimited(false, "\t"), outputPath);

        FlowDef flowDefLeftJoin = createWorkflowLeftJoin(aTap, bTap, outputTap);
        flowConnector.connect(flowDefLeftJoin).complete();
    }
}