我有一个输入文本文件,如下所示(部分):
{"author":"Martti Paturi","book":"Aiotko oppikouluun"}
{"author":"International Meeting of Neurobiologists Amsterdam 1959.","book":"Structure and function of the cerebral cortex"}
{"author":"Paraná (Brazil : State). Comissão de Desenvolvimento Municipal.","book":"Plano diretor de desenvolvimento de Maringá"}
我需要在这个文件上执行MapReduce,输出JSON对象:每个对象把同一作者的所有书籍放在一个JSON数组中,格式如下:
{"author": "Ian Fleming", "books": [{"book": "Goldfinger"},{"book": "Moonraker"}]}
我的代码如下:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.json.*;
public class CombineBooks {
//TODO define variables and implement necessary components
/*public static class MyTuple implements Writable{
private String author;
private String book;
public void readFields(DataInput in){
JSONObject obj = new JSONObject(in.readLine());
author = obj.getString("author");
book = obj.getString("book");
}
public void write(DataOutput out){
out.writeBytes(author);
out.writeBytes(book);
}
public static MyTuple read(DataInput in){
MyTuple tup = new MyTuple();
tup.readFields(in);
return tup;
}
}*/
public static class Map extends Mapper<LongWritable, Text, Text, Text>{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException{
String author;
String book;
String line = value.toString();
String[] tuple = line.split("\\n");
try{
for(int i=0;i<tuple.length; i++){
JSONObject obj = new JSONObject(tuple[i]);
author = obj.getString("author");
book = obj.getString("book");
context.write(new Text(author), new Text(book));
}
}catch(JSONException e){
e.printStackTrace();
}
}
}
public static class Combine extends Reducer<Text, Text, Text, Text>{
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException{
String booklist = null;
int i = 0;
for(Text val : values){
if(booklist.equals(null)){
booklist = booklist + val.toString();
}
else{
booklist = booklist + "," + val.toString();
}
i++;
}
context.write(key, new Text(booklist));
}
}
public static class Reduce extends Reducer<Text,Text,JSONObject,NullWritable>{
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException{
try{
JSONArray ja = new JSONArray();
String[] book = null;
for(Text val : values){
book = val.toString().split(",");
}
for(int i=0; i<book.length; i++){
JSONObject jo = new JSONObject().put("book", book[i]);
ja.put(jo);
}
JSONObject obj = new JSONObject();
obj.put("author", key.toString());
obj.put("books", ja);
context.write(obj, NullWritable.get());
}catch(JSONException e){
e.printStackTrace();
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: CombineBooks <in> <out>");
System.exit(2);
}
//TODO implement CombineBooks
Job job = new Job(conf, "CombineBooks");
job.setJarByClass(CombineBooks.class);
job.setMapperClass(Map.class);
job.setCombinerClass(Combine.class);
job.setReducerClass(Reduce.class);
job.setOutputKeyClass(JSONObject.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
//TODO implement CombineBooks
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
当我尝试运行它时,我遇到了以下错误:
java.lang.ClassCastException: class org.json.JSONObject
at java.lang.Class.asSubclass(Class.java:3165)
at org.apache.hadoop.mapred.JobConf.getOutputKeyComparator(JobConf.java:795)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.<init>(MapTask.java:964)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.<init>(MapTask.java:673)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:756)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:364)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1190)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
我使用java-json.jar作为外部依赖项。我不确定这里的错误是什么。任何帮助都不胜感激!
答案 0 :(得分:1)
必须先将json jar文件放到Hadoop的lib文件夹中,然后再尝试执行该程序。
答案 1 :(得分:0)
看看:Hadoop Writable。虽然您确实告诉了Hadoop输出键的类,但JSONObject并没有实现Writable接口。
为什么不直接输出Text呢?
context.write(new Text(jo.toString()), NullWritable.get());