我最近开始学习hadoop / mapreduce / java。我正在进行以下任务。
我们有2个输入文件。
File1有2个字段。航空公司和航线数量。(航空公司是独一无二的) File2有4个字段。航空公司,航空公司名称,国家。(航空公司是独一无二的)
我需要找到最活跃的航空公司。 即 - 对于每个国家,我需要找到航线数量最多的航空公司。
让我用一个例子解释一下。让我们说
File1具有以下内容。
airlineid1 10
airlineid2 20
airlineid3 30
File2具有以下内容
airlineid1 airlinename1 country1
airlineid2 airlinename2 country1
airlineid5 airlinename5 country1
输出应为
country1 airlineid2 airlinename2 20
我为此写了2个mapreduce代码。
我在运行程序2时遇到 arrayindexoutofbound:4 错误。我认为这是因为在程序1中创建的输出文件没有做equijoin。所以它有一些记录没有count field。(airlineid仅出现在文件2中)。当我尝试为这些记录分配计数时,它给出了arrayindexoutofbound选项。
正如我之前提到的,我是Java新手,我不知道如何处理这个问题。 有人可以帮忙!
以下是我的2个程序
public class JoinReduce {
public static class RoutesMapper
extends Mapper<LongWritable, Text, Text, Text>{
public void map(LongWritable key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
String airlineid = itr.nextToken();
String count = itr.nextToken();
context.write(new Text(airlineid), new Text("routes" + "\t" + count));
}
}
public static class AirlinesMapper
extends Mapper<LongWritable, Text, Text, Text>{
public void map(LongWritable key, Text value, Context context
) throws IOException, InterruptedException {
String record = value.toString();
String[] parts = record.split(",");
String airlineid = parts[0];
String airlinename = parts[1];
String country = parts[6];
if(airlinename.length()==0){
airlinename="NA";
}
if(country.length()==0){
country="NA";
}
context.write(new Text(airlineid),new Text("airlines"+ "\t" + airlineid + "\t" + airlinename + "\t" + country));
}
}
public static class JoinReducer
extends Reducer<Text,Text,Text,Text> {
public void reduce(Text key, Iterable<Text> values,
Context context
) throws IOException, InterruptedException {
String airlineid ="";
String count = "";
String name = "";
String country = "";
for (Text val : values) {
String parts[] = val.toString().split("\t");
if (parts[0].equals("routes")){
count = parts[1];
}else if (parts[0].equals("airlines")){
airlineid = parts[1];
name = parts[2] ;
country = parts[3];
}
}
context.write(new Text("Joinedfile"), new Text(airlineid + "\t" + name + "\t" + country + "\t" + count));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "Join Reduce");
job.setJarByClass(JoinReduce.class);
job.setReducerClass(JoinReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
MultipleInputs.addInputPath(job, new Path(args[0]),
TextInputFormat.class,RoutesMapper.class);
MultipleInputs.addInputPath(job, new Path(args[1]),
TextInputFormat.class,AirlinesMapper.class);
Path outputPath = new Path(args[2]);
FileOutputFormat.setOutputPath(job, outputPath);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
public class Activeairline {
public static class ActiveMapper
extends Mapper<LongWritable, Text, Text, Text>{
public void map(LongWritable key, Text value, Context context
) throws IOException, InterruptedException {
String record = value.toString();
String parts[] = record.split("\t");
String airlineid = parts[1];
String name = parts[2];
String country = parts[3];
if(!(parts[4].length()==0)){
String count = parts[4];
context.write(new Text(country),new Text(airlineid + "\t" + name + "\t" + count));
}
}
}
public static class ActiveReducer
extends Reducer<Text,Text,Text,Text> {
public void reduce(Text key, Iterable<Text> values,
Context context
) throws IOException, InterruptedException {
int max = 0;
String maxairlineid = "";
String maxname = "";
for (Text val : values) {
String parts[] = val.toString().split("\t");
int count = Integer.parseInt(parts[2]);
String airlineid = parts[0];
String name = parts[1];
if (count > max){
max = count;
maxairlineid = airlineid;
maxname = name;
}
}
context.write(key, new Text("Most Active airline " + "\t" + maxname + "\t" + maxairlineid));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "Active airline");
job.setJarByClass(Activeairline.class);
job.setMapperClass(ActiveMapper.class);
job.setReducerClass(ActiveReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}