I'm working on a project for class where I have to write a MapReduce job that reads data from a source and outputs results from it. My data source is the most popular boys' names in Illinois. I'm trying to output the top 5 names for each of the 25 years listed in the source. I'm parsing each line with a CSV reader, based on code I've used before. This is what I have so far. If anyone can point me in the right direction, I would really appreciate it.
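Just to make the goal concrete before the actual code: this is the per-year selection I'm trying to end up with, written as a plain-Java sketch outside of Hadoop. The rank, year, name, frequency column order is what my file uses; the class and method names here are only for illustration, not part of the job.

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;

// Sketch only: given one year's worth of "name,frequency" strings,
// keep the five names with the highest frequency.
public class TopFiveSketch {
    public static List<String> topFive(List<String> nameFreqPairs) {
        // Min-heap ordered by frequency, so the weakest of the current
        // top five is always at the head and easy to evict.
        PriorityQueue<String[]> heap = new PriorityQueue<String[]>(
                Comparator.comparingInt((String[] p) -> Integer.parseInt(p[1])));
        for (String pair : nameFreqPairs) {
            String[] parts = pair.split(",");   // parts[0] = name, parts[1] = frequency
            heap.offer(parts);
            if (heap.size() > 5) {
                heap.poll();                    // drop the lowest-frequency entry
            }
        }
        List<String> result = new ArrayList<String>();
        while (!heap.isEmpty()) {
            String[] p = heap.poll();
            result.add(p[0] + "," + p[1]);
        }
        Collections.reverse(result);            // highest frequency first
        return result;
    }
}

In the job itself I'm hoping the reducer can do the equivalent of this for each year key. The driver comes first, then the mapper and reducer.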
package nameReader;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
public class nameJob {

    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(nameJob.class);
        conf.setJobName("Name Reader");

        // Both the map output and the final output are Text/Text pairs
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(Text.class);

        conf.setMapperClass(boyNameMapper.class);
        conf.setReducerClass(boyNameReducer.class);

        // Input file and output directory come from the command line
        FileInputFormat.addInputPath(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}
Mapper
package nameReader;
import java.io.IOException;
import java.io.StringReader;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import com.opencsv.CSVReader;
public class boyNameMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, Text> {

    private Text Year = new Text();
    private Text NameAndFrequency = new Text();

    public void map(LongWritable key, Text value,
            OutputCollector<Text, Text> output,
            Reporter reporter) throws IOException {

        String line = value.toString();

        // Parse the single CSV line: rank, year, name, frequency
        CSVReader R = new CSVReader(new StringReader(line));
        String[] ParsedLine = R.readNext();
        R.close();

        // Skip blank or malformed lines instead of crashing on them
        if (ParsedLine == null || ParsedLine.length < 4) {
            return;
        }

        // Key on the year so the reducer sees every name for that year;
        // the value carries the name and its frequency together
        Year.set(ParsedLine[1]);
        NameAndFrequency.set(ParsedLine[2] + "," + ParsedLine[3]);
        output.collect(Year, NameAndFrequency);
    }
}
Reducer
package nameReader;
import java.io.IOException;
import java.util.Iterator;
import mrtools.CountMap;
import mrtools.NBest;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
public class boyNameReducer extends MapReduceBase
        implements Reducer<Text, Text, Text, Text> {

    // Keep the best 5 entries per key (one key per year);
    // NBest and CountMap come from the mrtools helpers I'm using
    NBest<Text> Best = new NBest<Text>(5);
    CountMap<Text> Counts = new CountMap<Text>();

    public void reduce(Text key, Iterator<Text> values,
            OutputCollector<Text, Text> output,
            Reporter reporter) throws IOException {

        // Tally this year's values, then pull the top 5 out of the tally
        Counts.clear();
        Counts.countInto(values);
        Best.clear();
        Best.putMap(Counts);

        output.collect(key, new Text(Best.bestEntryCountTSV(",")));
    }
}
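For completeness, this is how I'm invoking the job once it's packaged into a jar; the jar name and the input/output paths below are just placeholders from my own setup.

hadoop jar nameReader.jar nameReader.nameJob /user/me/boynames.csv /user/me/output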