我必须在reduce side join算法中使用bloom filter来过滤我的一个输入,但是我对函数readFields
有一个问题,它将分布式缓存的输入流反序列化(bloom filter)进入布隆过滤器。
public class BloomJoin {
//function map : input transaction.txt
public static class TransactionJoin extends
Mapper<LongWritable, Text, Text, Text> {
private Text CID=new Text();
private Text outValue=new Text();
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String record[] = line.split(",", -1);
CID.set(record[1]);
outValue.set("A"+value);
context.write(CID, outValue);
}
}
//function map : input customer.txt
public static class CustomerJoinMapper extends
Mapper<LongWritable, Text, Text, Text> {
private Text outkey=new Text();
private Text outvalue = new Text();
private BloomFilter bfilter = new BloomFilter();
public void setup(Context context) throws IOException {
URI[] files = DistributedCache
.getCacheFiles(context.getConfiguration());
// if the files in the distributed cache are set
if (files != null) {
System.out.println("Reading Bloom filter from: "
+ files[0].getPath());
// Open local file for read.
DataInputStream strm = new DataInputStream(new FileInputStream(
files[0].toString()));
bfilter.readFields(strm);
strm.close();
// Read into our Bloom filter.
} else {
throw new IOException(
"Bloom filter file not set in the DistributedCache.");
}
};
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String record[] = line.split(",", -1);
outkey.set(record[0]);
if (bfilter.membershipTest(new Key(outkey.getBytes()))) {
outvalue.set("B"+value);
context.write(outkey, outvalue);
}
}
}
//function reducer: join customer with transaction
public static class JoinReducer extends
Reducer<Text, Text, Text, Text> {
private ArrayList<Text> listA = new ArrayList<Text>();
private ArrayList<Text> listB = new ArrayList<Text>();
@Override
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
listA.clear();
listB.clear();
for (Text t : values) {
if (t.charAt(0) == 'A') {
listA.add(new Text(t.toString().substring(1)));
System.out.println("liste A: "+listA);
} else /* if (t.charAt('0') == 'B') */{
listB.add(new Text(t.toString().substring(1)));
System.out.println("listeB :"+listB);
}
}
executeJoinLogic(context);
}
private void executeJoinLogic(Context context) throws IOException,
InterruptedException {
if (!listA.isEmpty() && !listB.isEmpty()) {
for (Text A : listB) {
for (Text B : listA) {
context.write(A, B);
System.out.println("A="+A+",B="+B);
}
}
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Path bloompath=new Path("/user/biadmin/ezzaki/bloomfilter/output/part-00000");
DistributedCache.addCacheFile(bloompath.toUri(),conf);
Job job = new Job(conf, "Bloom Join");
job.setJarByClass(BloomJoin.class);
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 3) {
System.err
.println("ReduceSideJoin <Transaction data> <Customer data> <out> ");
System.exit(1);
}
MultipleInputs.addInputPath(job, new Path(otherArgs[0]),
TextInputFormat.class,TransactionJoin.class);
MultipleInputs.addInputPath(job, new Path(otherArgs[1]),
TextInputFormat.class, CustomerJoinMapper.class);
job.setReducerClass(JoinReducer.class);
FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
//job.setMapOutputKeyClass(Text.class);
//job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
System.exit(job.waitForCompletion(true) ? 0 : 3);
}
}
我该如何解决这个问题?
答案 0 :(得分:1)
您可以尝试更改
吗?URI[] files = DistributedCache.getCacheFiles(context.getConfiguration());
到
Path[] cacheFilePaths = DistributedCache.getLocalCacheFiles(conf);
for (Path cacheFilePath : cacheFilePaths) {
DataInputStream fileInputStream = fs.open(cacheFilePath);
}
bloomFilter.readFields(fileInputStream);
fileInputStream.close();
另外,我认为你使用的是Map side join而不是Reduce,因为你在Mapper中使用了分布式缓存。
答案 1 :(得分:1)
您可以从此处使用Bloom过滤器: https://github.com/odnoklassniki/apache-cassandra/blob/master/src/java/org/apache/cassandra/utils/BloomFilter.java
你可以像这样序列化:
Path file = new Path(bloomFilterPath);
FileSystem hdfs = file.getFileSystem(context.getConfiguration());
OutputStream os = hdfs.create(file);
BloomFilterSerializer serializer = new BloomFilterSerializer();
serializer.serialize(bloomFilter, new DataOutputStream(os));
反序列化:
InputStream is = getInputStreamFromHdfs(context, bloomFilterPath);
Path path = new Path(bloomFilterPath);
InputStream is = path.getFileSystem(context.getConfiguration()).open(path);
BloomFilterSerializer serializer = new BloomFilterSerializer();
BloomFilter bloomFilter = serializer.deserialize(
new DataInputStream(new BufferedInputStream(is)));