Question

我需要在 reduce side join 算法中使用 Bloom filter（布隆过滤器）来过滤其中一个输入，但在调用 readFields 函数时遇到了问题：该函数应当把分布式缓存中 Bloom filter 文件的输入流反序列化为一个布隆过滤器对象。

public class BloomJoin {

    //function map : input transaction.txt
    public static class TransactionJoin extends
            Mapper<LongWritable, Text, Text, Text> {

         private Text CID=new Text();
         private Text outValue=new Text();

         public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

             String line = value.toString();
              String record[] = line.split(",", -1);
              CID.set(record[1]);

              outValue.set("A"+value);
              context.write(CID, outValue);
              }
        }
    //function map : input customer.txt
            public static class CustomerJoinMapper extends
                    Mapper<LongWritable, Text, Text, Text> {

                private Text outkey=new Text();
                private Text outvalue = new Text();
                private BloomFilter bfilter = new BloomFilter();
                public void setup(Context context) throws IOException {

                    URI[] files = DistributedCache
                            .getCacheFiles(context.getConfiguration());

                    // if the files in the distributed cache are set
                    if (files != null) {
                    System.out.println("Reading Bloom filter from: "
                    + files[0].getPath());
                    // Open local file for read.

                    DataInputStream strm = new DataInputStream(new FileInputStream(
                    files[0].toString()));
                    bfilter.readFields(strm);
                    strm.close();

                    // Read into our Bloom filter.

                    } else {
                    throw new IOException(
                    "Bloom filter file not set in the DistributedCache.");
                    }
                 };

                public void map(LongWritable key, Text value, Context context)
                        throws IOException, InterruptedException {
                    String line = value.toString();  
                    String record[] = line.split(",", -1);

                         outkey.set(record[0]);
                         if (bfilter.membershipTest(new Key(outkey.getBytes()))) {
                         outvalue.set("B"+value);
                         context.write(outkey, outvalue);
                         }
            }
            }

    /**
     * Reducer that joins customer records with transaction records.
     *
     * <p>Values tagged "A" and values tagged "B" are collected into separate
     * lists for each key; if both lists are non-empty, every combination of a
     * "B" record (written as output key) and an "A" record (written as output
     * value) is emitted.
     */
    public static class JoinReducer extends
            Reducer<Text, Text, Text, Text> {

        // Records tagged "A" / "B" for the key currently being reduced.
        private final ArrayList<Text> listA = new ArrayList<Text>();
        private final ArrayList<Text> listB = new ArrayList<Text>();

        /**
         * Splits the values of one key by their tag and emits the join result.
         *
         * @param key     the join key shared by all values
         * @param values  tagged records ("A..." or "B...") for this key
         * @param context task context used to emit joined pairs
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {

            listA.clear();
            listB.clear();

            for (Text t : values) {
                String tagged = t.toString();
                if (tagged.isEmpty()) {
                    // No tag character to inspect; nothing to join.
                    continue;
                }

                // Strip the one-character tag and keep the payload in a
                // fresh Text instance.
                Text payload = new Text(tagged.substring(1));
                char tag = tagged.charAt(0);
                if (tag == 'A') {
                    listA.add(payload);
                } else if (tag == 'B') {
                    listB.add(payload);
                }
                // Records with any other tag are ignored.
            }

            executeJoinLogic(context);
        }

        /**
         * Emits every ("B" record, "A" record) combination when both sides
         * have at least one record (inner join).
         *
         * @param context task context used to emit joined pairs
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        private void executeJoinLogic(Context context) throws IOException,
                InterruptedException {
            if (listA.isEmpty() || listB.isEmpty()) {
                return;
            }
            for (Text recordB : listB) {
                for (Text recordA : listA) {
                    context.write(recordB, recordA);
                }
            }
        }
    }

    /**
     * Configures and runs the Bloom-filtered reduce-side join job.
     *
     * <p>Expects three arguments after any generic Hadoop options: the
     * transaction input path, the customer input path and the output path.
     * Exits with 0 when the job succeeds, 3 when it fails and 1 on a usage
     * error.
     *
     * @param args command-line arguments
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();

        // Parse generic options before the Job is created: Job takes a copy
        // of the Configuration, so options applied to conf afterwards would
        // never reach the job.
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (otherArgs.length != 3) {
            System.err
                    .println("Usage: BloomJoin <Transaction data> <Customer data> <out>");
            System.exit(1);
        }

        // Register the serialized Bloom filter so mappers can load it.
        Path bloompath = new Path("/user/biadmin/ezzaki/bloomfilter/output/part-00000");
        DistributedCache.addCacheFile(bloompath.toUri(), conf);

        Job job = new Job(conf, "Bloom Join");
        job.setJarByClass(BloomJoin.class);

        // One mapper class per input.
        MultipleInputs.addInputPath(job, new Path(otherArgs[0]),
                TextInputFormat.class, TransactionJoin.class);
        MultipleInputs.addInputPath(job, new Path(otherArgs[1]),
                TextInputFormat.class, CustomerJoinMapper.class);

        job.setReducerClass(JoinReducer.class);

        FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
        // Map output classes are not set explicitly; both mappers emit
        // Text/Text, the same as the job output classes below.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        System.exit(job.waitForCompletion(true) ? 0 : 3);
    }
}

我该如何解决这个问题？

Answer 1

您可以尝试把下面这行代码更改为其后给出的写法吗？

// Line quoted from the question's setup(): fetches the cache entries as URIs from the job configuration.
URI[] files = DistributedCache.getCacheFiles(context.getConfiguration());

改为：

// Meant to run inside the mapper's setup(Context context).
Configuration conf = context.getConfiguration();
Path[] cacheFilePaths = DistributedCache.getLocalCacheFiles(conf);
// NOTE(review): the original snippet used an undeclared "fs"; the local file
// system is assumed here because these are local cache paths -- confirm.
FileSystem fs = FileSystem.getLocal(conf);
for (Path cacheFilePath : cacheFilePaths) {
    DataInputStream fileInputStream = fs.open(cacheFilePath);
    try {
        // Read inside the loop: the stream variable is scoped to the loop
        // body and is not visible after it.
        bloomFilter.readFields(fileInputStream);
    } finally {
        fileInputStream.close();
    }
}

另外，我认为你实现的其实是 Map side join 而不是 Reduce side join，因为你在 Mapper 中使用了分布式缓存。

Answer 2

您可以从此处使用Bloom过滤器： https://github.com/odnoklassniki/apache-cassandra/blob/master/src/java/org/apache/cassandra/utils/BloomFilter.java

它配有专用的序列化器： https://github.com/odnoklassniki/apache-cassandra/blob/master/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java

你可以像这样序列化：

// Write the Bloom filter to the file system that owns bloomFilterPath.
Path file = new Path(bloomFilterPath);
FileSystem hdfs = file.getFileSystem(context.getConfiguration());
OutputStream os = hdfs.create(file);
try {
    BloomFilterSerializer serializer = new BloomFilterSerializer();
    serializer.serialize(bloomFilter, new DataOutputStream(os));
} finally {
    // Closing the stream releases it and completes the write.
    os.close();
}

反序列化：

InputStream is = getInputStreamFromHdfs(context, bloomFilterPath);
Path path = new Path(bloomFilterPath);   
InputStream is = path.getFileSystem(context.getConfiguration()).open(path);
BloomFilterSerializer serializer = new BloomFilterSerializer();
BloomFilter bloomFilter = serializer.deserialize(
                              new DataInputStream(new BufferedInputStream(is)));

MapReduce中的Bloom Filter

2 个答案: