Reducer不能按键分组用于不同的映射器

时间:2016-09-22 18:12:53

标签: hadoop mapreduce

用例:

  1. 文件1包含包含trackerId +其他字段的展示数据
  2. 文件2包含点击详情,其中包含trackerId +点击
  3. 我使用不同的映射器用于上面的两个和一个reducer,但似乎reducer无法组合这两个文件数据。

    package com.hadoop.intellipaat;
    
    import java.io.IOException;
    import java.util.List;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    
    import com.google.common.collect.Lists;
    
    /**
     * This job will combine click and impression on TrackerId
     * 
     * @author raghunandangupta
     *
     */
    
    public class JoinClickImpressionDetailJob {
    
        public static final String IMPRESSION_PREFIX = "IMPRESSION_PREFIX";
        public static final String CLICK_PREFIX = "CLICK_PREFIX";
        public static final String SEPERATOR = "~";
    
        private static class ImpressionMapper extends Mapper<LongWritable, Text, Text, Text> {
    
            @Override
            protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
                    throws IOException, InterruptedException {
                /**
                 * Excluding header
                 */
                if (!(value.toString().indexOf("accountId") != -1)) {
                    String words[] = value.toString().split(",");
                    if (words.length > 18) {
                        context.write(new Text(words[18].trim()), new Text(IMPRESSION_PREFIX + SEPERATOR + value.toString()));
                    }
                } else {
                    context.write(new Text(""), value);
                }
            }
        }
    
        private static class ClickMapper extends Mapper<LongWritable, Text, Text, Text> {
    
            @Override
            protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
                    throws IOException, InterruptedException {
                String words[] = value.toString().split(",");
                if (words.length > 18) {
                    context.write(new Text(words[18].trim()), new Text(CLICK_PREFIX + SEPERATOR + value.toString()));
                } else {
                    context.write(new Text(""), new Text("1"));
                }
            }
        }
    
        private static class ImpressionClickReducer extends Reducer<Text, Text, Text, Text> {
            @Override
            protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context) {
                try {
                    System.out.println("=========="+key.toString());
                    if (key.toString().length() != 0) {
                        List<Text> myList = Lists.newArrayList(values);
    
                        for(Text t : myList){
                            System.out.println("#######"+t.toString());
                        }
                        System.out.println("@@@@@@@@@@@@@@@@@@@@@@@@@");
                        if (myList.size() == 2) {
                            if (myList.get(0).toString().indexOf(IMPRESSION_PREFIX) != -1 && myList.get(1).toString().indexOf(CLICK_PREFIX) != -1) {
                                String line = myList.get(0).toString().split(SEPERATOR)[1] + ",1";
                                context.write(key, new Text(line));
                            } else if (myList.get(1).toString().indexOf(IMPRESSION_PREFIX) != -1
                                    && myList.get(0).toString().indexOf(CLICK_PREFIX) != -1) {
                                String line = myList.get(1).toString().split(SEPERATOR)[1] + ",1";
                                context.write(key, new Text(line));
                            }
                        }
                    }
                } catch (Exception exception) {
                    exception.printStackTrace();
                }
            }
        }
    
        public static void main(String[] args) {
            try {
                Configuration conf = new Configuration();
                // conf.set("mapreduce.output.fileoutputformat.compress", "true");
                // conf.set("mapreduce.output.fileoutputformat.compress.codec",
                // "org.apache.hadoop.io.compress.GzipCodec");
                // conf.set("mapreduce.map.output.compress.codec",
                // "org.apache.hadoop.io.compress.SnappyCodec");
                // conf.set("mapreduce.output.fileoutputformat.compress.type",
                // "BLOCK");
                Job job = Job.getInstance(conf, "IMPRESSION_CLICK_COMBINE_JOB");
    
                job.setMapOutputKeyClass(Text.class);
                job.setMapOutputValueClass(Text.class);
    
                job.setInputFormatClass(TextInputFormat.class);
                job.setOutputFormatClass(TextOutputFormat.class);
    
                job.setReducerClass(ImpressionClickReducer.class);
    
                FileInputFormat.setInputDirRecursive(job, true);
    
                // FileInputFormat.addInputPath(job, new Path(args[0]));
                // job.setMapperClass(ImpressionMapper.class);
    
                /**
                 * Here directory of impressions will be present
                 */
                MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, ImpressionMapper.class);
                /**
                 * Here directory of clicks will be present
                 */
                MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, ClickMapper.class);
    
                FileOutputFormat.setOutputPath(job, new Path(args[2]));
    
                job.waitForCompletion(true);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    
    }
    

    任何线索都将受到赞赏。

    EG。 File 1 [trackerId1,record1]     File2 [treackerId1, Clicked]

    在减速机中我得到了:

    trackerId,[record1,record1]理想情况下应为trackerId ,[record1,clicked]

1 个答案:

答案 0 :(得分:1)

你的问题最有可能是减速器中的这一行:

List<Text> myList = Lists.newArrayList(values);

要记住的主要事情是Iterable<Text> values重复使用它在迭代时为您提供的Text对象。所以你可能会在数组中添加两个Text个对象,但是它们指向同一个对象。

如果你看看Lists.newArrayList()是如何工作的,它只是将对象添加到数组中,而不是创建一个新对象。

因此,如果您要使用Text个对象,则每次向数组添加值时都需要创建一个新对象。这通常是人们在这种情况下使用字符串的原因。快速检查是否存在问题的方法是将此代码更改为:

List<Text> myList = new ArrayList<Text>();
for (Text v : values) {
    myList.add(new Text(v));
}

因此,您每次都会创建一个新的Text