我编写了一个MapReduce程序来解析CSV中的值。
数据集如下 -
PRAVEEN,40020,婴儿,026A2,12 / 04/2015
PRAVEEN,40020,玩具,0383,1 / 04/2014
PRAVEEN,2727272,BOOK,03383,03 /二千零十三分之十四
PRAVEEN,22636,BIKE,7373737,12 /二千〇一十二分之二十四
我的地图功能正在从CSV读取第一个值(即UserName)作为KEY,最后一个值即Date为VALUE
My Reduce功能也非常简单,我必须从特定KEY的VALUES列表中选择最新日期作为VALUE,即UserName
代码如下 -
package com.test.mapreduce;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class RetailCustomerAnalysis_2 extends Configured implements Tool {
public static class MapClass extends MapReduceBase
implements Mapper<LongWritable, Text, Text, Text> {
private Text key1 = new Text();
private Text value1 = new Text();
private int noofFields = 5;
public void map(LongWritable key, Text value,
OutputCollector<Text, Text> output,
Reporter reporter) throws IOException {
String line = value.toString().replaceAll("\\s+","");
String[] split = line.split(",");
if(split.length!=noofFields){
return;
}
else {
key1.set(split[0].toString().trim());
value1.set(split[4].toString().trim());
System.out.println(split[4].toString().trim());
output.collect(key1, value1);
}
}
}
public static class Reduce extends MapReduceBase
implements Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterator<Text> values,
OutputCollector<Text, Text> output,
Reporter reporter) throws IOException {
SimpleDateFormat formatter = new SimpleDateFormat("MM/dd/yyyy");
Date date = new Date();
List<Text> dateList = new ArrayList<Text>();
for(Iterator<Text> it = values; it.hasNext();) {
// add the values in the arrayList
dateList.add((Text) it.next());
}
if(dateList.size()==1){ //If the mapper output has only one date , then select that date
// as the VALUE
try {
date = formatter.parse(dateList.get(0).toString());
} catch (ParseException e) {
e.printStackTrace();
}
} //If part ends
else {
try {
date = formatter.parse(dateList.get(0).toString());
//select the first date from list
} catch (ParseException e1) {
e1.printStackTrace();
}
for(int i=0 ; i <dateList.size();++i){
try {
//compare the selected date with the rest of the dates in the list.
if((formatter.parse(dateList.get(i).toString())).compareTo(date)>0){
date=formatter.parse(dateList.get(i).toString());
// getting the max date from the list
}
}
catch (ParseException e) {
e.printStackTrace();
}
} //for loops ends
} // else part ends
Text value = new Text(date.toString());
output.collect(key, value);
}
}
public int run(String[] args) throws Exception {
Configuration conf = getConf();
JobConf job = new JobConf(conf, RetailCustomerAnalysis_2.class);
Path in = new Path(args[0]);
Path out = new Path(args[1]);
FileInputFormat.setInputPaths(job, in);
FileOutputFormat.setOutputPath(job, out);
job.setJobName("RetailCustomerAnalysis_2");
job.setMapperClass(MapClass.class);
job.setReducerClass(Reduce.class);
job.setInputFormat(TextInputFormat.class);
job.setOutputFormat(TextOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.set("key.value.separator.in.input.line", ",");
JobClient.runJob(job);
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new RetailCustomerAnalysis_2(), args);
System.exit(res);
}
}
但我从列表中得到随机日期作为结果。任何人都可以提供帮助。
答案 0 :(得分:0)
代码大多是正确的。必须稍微修改reducer实现。下面的代码片段创建了问题
for(Iterator<Text> it = values; it.hasNext();) {
// add the values in the arrayList
dateList.add((Text) it.next());
}
在上面的代码片段中,每个itreation使用相同的值对象,只更改其内容。
例如,假设Mapreduce运行以下输入
PRAVEEN,4002013454,婴儿,026A12,12 / 04/2015
PRAVEEN,4002013454,玩具,020383,1 / 04/2014
PRAVEEN,2727272727272,BOOK,03383,03 /二千零十三分之十四
PRAVEEN,2263637373,BIKE,7373737,12 /二千〇一十二分之二十四
在reduce方法&#39; dateList&#39;对象元素在值的for循环完成后具有值(12/24 / 2012,12 / 24 / 2012,12 / 24 / 2012,12 / 24/2012)。这导致剩余代码的执行不正确,最终输出错误。
而是将代码更改为
public static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
SimpleDateFormat formatter = new SimpleDateFormat("MM/dd/yyyy");
Date date = new Date();
//-----Modified section START-----------
List<String> dateList = new ArrayList<String>();
for(Iterator<Text> it = values; it.hasNext();) {
// add the values in the arrayList
dateList.add(((Text)it.next()).toString());
}
//----Modified section END--------------
if(dateList.size()==1){ //If the mapper output has only one date , then select that date
// as the VALUE
try {
date = formatter.parse(dateList.get(0).toString());
} catch (ParseException e) {
e.printStackTrace();
}
} //If part ends
else {
String str = dateList.get(0).toString();
try {
date = formatter.parse(dateList.get(0).toString());
//select the first date from list
} catch (ParseException e1) {
e1.printStackTrace();
}
for(int i=0 ; i <dateList.size();++i){
try {
//compare the selected date with the rest of the dates in the list.
if((formatter.parse(dateList.get(i).toString())).compareTo(date)>0){
date=formatter.parse(dateList.get(i).toString());
// getting the max date from the list
}
}
catch (ParseException e) {
e.printStackTrace();
}
} //for loops ends
} // else part ends
Text value = new Text(date.toString());
output.collect(key, value);
}
}
请参阅Hadoop Reducer Values in Memory?以获取有关地图中对象引用的更多详细信息,减少方法。