MapReduce: reading two additional columns from a text file

Time: 2015-08-27 21:27:59

Tags: java hadoop mapreduce

I am new to MapReduce and would really appreciate your feedback. I have a text file containing data in the following format -

State1  County1  Students#(Integer) Teacher#(Integer) Classrooms#(Integer)
State1  County2  Students#(Integer) Teacher#(Integer) Classrooms#(Integer)
State1  County3  Students#(Integer) Teacher#(Integer) Classrooms#(Integer)
State2  County1  Students#(Integer) Teacher#(Integer) Classrooms#(Integer)
State2  County2  Students#(Integer) Teacher#(Integer) Classrooms#(Integer)
State2  County3  Students#(Integer) Teacher#(Integer) Classrooms#(Integer)

I wrote a MapReduce job that gives me the following output -

State1 Total<Students#>
State2 Total<Students#>

I need the same for the Teacher and Classroom groups as well - the final output needed:

State1 Total<Students#> Total<Teacher#>  Total<Classrooms#>
State2 Total<Students#> Total<Teacher#>  Total<Classrooms#>

3 Answers:

Answer 0 (score: 1)

You can also try the following:

import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MultiColSumDemo extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new MultiColSumDemo(), args));
    }

    @Override
    public int run(String[] arg0) throws Exception {
        // KeyValueTextInputFormat splits each line into key/value at the first
        // occurrence of the separator (only the first character of this string is used),
        // so the key becomes the state and the value the rest of the line.
        getConf().set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "  ");

        Job job = Job.getInstance(getConf());
        job.setJobName("MultiColSumDemo");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(MultiColMapper.class);
        job.setReducerClass(MultiColReduce.class);

        job.setInputFormatClass(KeyValueTextInputFormat.class);

        FileInputFormat.setInputPaths(job, new Path("input/sum_multi_col"));
        FileOutputFormat.setOutputPath(job, new Path("sum_multi_col_output" + System.currentTimeMillis()));

        job.setJarByClass(MultiColSumDemo.class);

        // Block until the job finishes; ToolRunner expects 0 on success.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    // The mapper and reducer must be static nested classes (or top-level classes);
    // Hadoop cannot instantiate a non-static inner class.
    public static class MultiColMapper extends Mapper<Text, Text, Text, Text> {

        @Override
        protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
            // key = state, value = rest of the line; pass the record through unchanged.
            context.write(key, value);
        }
    }

    public static class MultiColReduce extends Reducer<Text, Text, Text, Text> {

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {

            // LinkedHashMap keeps the columns in first-seen order
            // (Students, Teacher, Classrooms); a plain HashMap gives no order guarantee.
            Map<String, Integer> sumCol = new LinkedHashMap<>();

            Iterator<Text> it = values.iterator();
            while (it.hasNext()) {
                StringTokenizer st = new StringTokenizer(it.next().toString(), " ");
                st.nextToken(); // skip the county column
                while (st.hasMoreTokens()) {
                    // Each remaining token has the form Name#Count, e.g. Students#120.
                    String[] name = st.nextToken().split("#");
                    if (sumCol.get(name[0]) == null)
                        sumCol.put(name[0], Integer.parseInt(name[1]));
                    else
                        sumCol.put(name[0], sumCol.get(name[0]) + Integer.parseInt(name[1]));
                }
            }

            StringBuilder sb = new StringBuilder();
            for (Entry<String, Integer> val : sumCol.entrySet()) {
                if (sb.length() > 0)
                    sb.append('\t'); // tab-separate the per-column totals
                sb.append(val.getKey()).append('#').append(val.getValue());
            }
            context.write(key, new Text(sb.toString()));
        }
    }
}
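For illustration (the sample numbers below are made up), with input such as:

State1  County1  Students#10 Teacher#2 Classrooms#3
State1  County2  Students#20 Teacher#4 Classrooms#5
State2  County1  Students#15 Teacher#3 Classrooms#4

the job above would emit one line per state, with the key followed by the tab-separated column totals:

State1  Students#30  Teacher#6  Classrooms#8
State2  Students#15  Teacher#3  Classrooms#4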

Answer 1 (score: 0)

You did not provide the code you have tried, so I will assume you map by state and then sum the Students in the reducer.

The logic for summing Teachers and Classrooms is exactly the same. Instead of emitting a (key, value) pair with the value in an IntWritable, you can set the value to a Text in which you append all of the sums. Alternatively, you can define your own Writable class that holds the three integers (students, teachers, classrooms).

The same goes for the values in the map phase: instead of an IntWritable, emit a Text (a concatenation of the fields you are interested in, which you then parse in the reduce phase), or a custom Writable class.

I assume you already have the logic for aggregating (summing) the counts, since it is exactly the same as for the students. A sketch of the custom Writable option follows.
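As a minimal sketch of the custom Writable option (the class and method names here are illustrative, not from the original answer), a value type holding the three counts could look like this:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

// Hypothetical value class holding the three per-county counts; the reducer
// would sum the fields of all values belonging to one state.
public class CountyCountsWritable implements Writable {

    private int students;
    private int teachers;
    private int classrooms;

    public CountyCountsWritable() {} // no-arg constructor required by Hadoop for deserialization

    public CountyCountsWritable(int students, int teachers, int classrooms) {
        this.students = students;
        this.teachers = teachers;
        this.classrooms = classrooms;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(students);
        out.writeInt(teachers);
        out.writeInt(classrooms);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        students = in.readInt();
        teachers = in.readInt();
        classrooms = in.readInt();
    }

    // Accumulate another county's counts into this instance.
    public void add(CountyCountsWritable other) {
        students += other.students;
        teachers += other.teachers;
        classrooms += other.classrooms;
    }

    @Override
    public String toString() {
        return students + "\t" + teachers + "\t" + classrooms;
    }
}

The map phase would then emit (state, CountyCountsWritable) pairs, the reducer would call add(...) across the values for each state and write the accumulated result, and the job would register the type with job.setMapOutputValueClass(CountyCountsWritable.class).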

Answer 2 (score: 0)

You can try this. I was able to get the output with it.

Mapper code

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class Exercisemapper extends Mapper<LongWritable, Text, Text, Text> {

    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Note: this answer assumes comma-separated input, e.g. State1,County1,100,5,3;
        // for the whitespace-separated format shown in the question, split on "\\s+" instead.
        String[] fields = value.toString().split(",");
        String state = fields[0];
        // Forward the three numeric columns (students, teachers, classrooms),
        // skipping the county column, keyed by state.
        String otherCounts = fields[2] + "," + fields[3] + "," + fields[4];
        context.write(new Text(state), new Text(otherCounts));
    }
}

Reducer code:

import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ExerciseReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // TreeMap keyed by column index keeps the totals in column order
        // (students, teachers, classrooms); a plain HashMap does not guarantee
        // iteration order.
        Map<Integer, Integer> totals = new TreeMap<>();
        for (Text v : values) {
            String[] counts = v.toString().split(",");
            for (int i = 0; i < counts.length; i++) {
                int n = Integer.parseInt(counts[i]);
                if (totals.get(i) == null)
                    totals.put(i, n);
                else
                    totals.put(i, totals.get(i) + n);
            }
        }
        StringBuilder sb = new StringBuilder();
        for (Integer total : totals.values()) {
            sb.append(total);
            sb.append("\t");
        }
        // Trim the trailing tab before writing the line.
        context.write(key, new Text(sb.toString().replaceAll("\t$", "")));
    }
}

Driver code

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ExerciseDriver {

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: ExerciseDriver <input path> <output path>");
            System.exit(-1);
        }

        // Job.getInstance() replaces the deprecated new Job() constructor.
        Job job = Job.getInstance();

        job.setJarByClass(ExerciseDriver.class);
        job.setJobName("ExerciseDriver");

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(Exercisemapper.class);
        job.setReducerClass(ExerciseReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // A single reducer puts all state totals into one output file.
        job.setNumReduceTasks(1);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

And the number of reducers is set to 1 in the driver code, so all state totals land in a single output file.