MapReduce:将基于行的数据转换为层次结构

时间:2015-09-14 09:00:41

标签: xml hadoop design-patterns mapreduce hierarchical-data

以下问题和代码来自《MapReduce设计模式》一书。

问题:给定一个帖子和评论列表,创建一个结构化的XML层次结构,以便将评论与相关帖子嵌套。

运行时抛出了NullPointerException,我找不出自己错在哪里。如能得到任何帮助,我将不胜感激。

这是我的类和nestElements方法:

import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

//import javax.swing.text.html.parser.Element;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
//import org.apache.hadoop.mapreduce.*;
//import org.apache.hadoop.mapred.TextInputFormat;
//import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
//import org.apache.hadoop.mapreduce.lib.reduce.WrappedReducer.Context;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
//import org.w3c.dom.UserDataHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
public class PostCommentHierarchy {
        public static class PostMapper extends Mapper<Object,Text,Text,Text>{
                private Text outkey = new Text();
                private Text outvalue = new Text();

                public void map(Object key,Text value,Context context)throws IOException ,InterruptedException{
                        Map<String,String> parsed = MRDPUtils.transformXmlToMap(value.toString());
                        String line = value.toString();
                        if(!(line.length()<=0)){
                        outkey.set(parsed.get("Id"));
                        outvalue.set("P"+value.toString());
                        context.write(outkey, outvalue);
                        }
                }
        }

        public static class CommentMapper extends Mapper<Object,Text,Text,Text>{
                private Text outkey = new Text();
                private Text outvalue = new Text();

                public void map(Object key, Text value,Context context)throws IOException,InterruptedException{
                        Map<String,String> parsed = MRDPUtils.transformXmlToMap(value.toString());
                        String line = value.toString();
                        if(!(line.length()<=0)){
                        outkey.set(parsed.get("PostId"));

                        outvalue.set("C"+value.toString());
                        context.write(outkey, outvalue);
                        }
                }
        }
        public static class PostCommentHierarchyReducer extends Reducer <Text,Text,Text,NullWritable>{
                private ArrayList<String> comments = new ArrayList<String>();
                private DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
                private String post = null;

                public void reduce(Text key,Iterable<Text>values,Context context)throws IOException,InterruptedException{
                        post =null;
                        comments.clear();

                        for(Text t:values){
                            if(t.charAt(0)== 0){
                                return;
                            }else if(t.charAt(0)=='P'){
                                        post = t.toString().substring(1,t.toString().length()).trim();
                                }else{
                                        comments.add(t.toString().substring(1,t.toString().length()).trim());
                                }
                        }

                        if(post != null){
                                String postWithCommentChildren = null;
                                try {
                                        try {
                                                postWithCommentChildren = nestElements(post,comments);
                                        } catch (TransformerException e) {
                                                // TODO Auto-generated catch block
                                                e.printStackTrace();
                                        }
                                } catch (ParserConfigurationException e) {
                                        // TODO Auto-generated catch block
                                        e.printStackTrace();
                                } catch (SAXException e) {
                                        // TODO Auto-generated catch block
                                        e.printStackTrace();
                                }
                                context.write(new Text(postWithCommentChildren), NullWritable.get());
                        }
                }

                private String nestElements(String post,List<String> comments) throws ParserConfigurationException, SAXException, IOException, TransformerException{
                        DocumentBuilder bldr = dbf.newDocumentBuilder();
                        Document doc = bldr.newDocument();

                        org.w3c.dom.Element postEl = getXmlElementFromString(post);
                        org.w3c.dom.Element toAddPostEl = doc.createElement("post");
//                      Element toAddPostEl1 = doc.createElement("post");

                        copyAttributesToElement(postEl.getAttributes(),toAddPostEl);

                        for(String commentXml:comments){
                                org.w3c.dom.Element commentEl = getXmlElementFromString(commentXml);
                                org.w3c.dom.Element toAddCommentEl = doc.createElement("comments");

                                copyAttributesToElement(commentEl.getAttributes(),toAddCommentEl);
                                toAddPostEl.appendChild(toAddPostEl);
                        }
                        doc.appendChild(toAddPostEl);
                    // Transform the document into a String of XML and return
                        return transformDocumentToString(doc);
                }
                private org.w3c.dom.Element getXmlElementFromString(String xml) throws ParserConfigurationException, SAXException, IOException{
                        DocumentBuilder bldr = dbf.newDocumentBuilder();
                        return bldr.parse(new InputSource(new StringReader(xml)))
                                        .getDocumentElement();

                }

                private void copyAttributesToElement(NamedNodeMap attributes,org.w3c.dom.Element toAddPostEl){
                        for(int i = 0; i<attributes.getLength();++i){
                                Attr toCopy = (Attr) attributes.item(i);
                                toAddPostEl.setAttribute(toCopy.getName(),toCopy.getValue());

                        }
                }

                private String transformDocumentToString(Document doc) throws TransformerException{

                        TransformerFactory tf = TransformerFactory.newInstance();
                        Transformer transformer = tf.newTransformer();
                        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");

                        StringWriter writer = new StringWriter();
                        //Acts as a holder for a transformation Source tree in the form of a Document Object Model (DOM) tree.
                        transformer.transform(new DOMSource(doc), new StreamResult(writer));
                        return writer.getBuffer().toString().replaceAll("\n|\r", "");
                }

        }
        /**
         * Configures and runs the job: posts are read from args[0] with
         * PostMapper, comments from args[1] with CommentMapper, and the
         * reducer's output is written as text under args[2]. The process exits
         * with 0 when the job succeeds and 1 when it fails.
         *
         * @param args posts input path, comments input path, output path
         */
        public static void main(String[] args)throws IOException, ClassNotFoundException, InterruptedException{
                if(args.length < 3){
                        // Fail with a readable message instead of ArrayIndexOutOfBoundsException.
                        System.err.println("Usage: PostCommentHierarchy <posts> <comments> <outdir>");
                        System.exit(2);
                }

                Configuration conf = new Configuration();
                @SuppressWarnings("deprecation")
                Job job = new Job(conf,"PostCommentHierarchy");
                job.setJarByClass(PostCommentHierarchy.class);

                // Both mappers emit (Text, Text); the reducer emits (Text, NullWritable).
                // Declaring the two stages separately keeps the job configuration in
                // agreement with the classes' type parameters.
                job.setMapOutputKeyClass(Text.class);
                job.setMapOutputValueClass(Text.class);
                job.setOutputKeyClass(Text.class);
                job.setOutputValueClass(NullWritable.class);

                // One mapper class per input path.
                MultipleInputs.addInputPath(job, new Path(args[0]),TextInputFormat.class,PostMapper.class);
                MultipleInputs.addInputPath(job, new Path(args[1]),TextInputFormat.class ,CommentMapper.class);

                job.setReducerClass(PostCommentHierarchyReducer.class);
                job.setOutputFormatClass(TextOutputFormat.class);
                TextOutputFormat.setOutputPath(job, new Path(args[2]));

                System.exit(job.waitForCompletion(true)? 0 :1);
        }

}

0 个答案:

没有答案