Spark master does not call custom InputFormat

Date: 2014-12-01 10:39:49

Tags: xml hadoop apache-spark

I am trying to explore Apache Spark, and as part of that I want to customize the InputFormat. In my case I want to read XML files and turn every occurrence of <text> into a new record.

I wrote a custom TextInputFormat (XMLRecordInputFormat.java) that returns a custom **XMLRecordReader extends org.apache.hadoop.mapreduce.RecordReader**.

But I do not understand why the Spark master never calls the custom input format (XMLRecordInputFormat.class). For some reason it keeps behaving like a plain line splitter.

Here is the code:

import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;


import scala.Tuple2;

public class CustomizedXMLReader{

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf()
        .setMaster("local")
        .setAppName("CustomizedXMLReader")
        .set("spark.executor.memory", "512m").set("record.delimiter.regex", "</bermudaview>");

        JobConf jobConf = new JobConf(new Configuration(), CustomizedXMLReader.class);
        jobConf.setInputFormat(XMLRecordInputFormat.class);
        FileInputFormat.setInputPaths(jobConf, new Path(args[0]));


        JavaSparkContext ctx = new JavaSparkContext(sparkConf);
        JavaPairRDD<LongWritable, Text> lines = ctx.hadoopRDD(jobConf,XMLRecordInputFormat.class, LongWritable.class, Text.class);



        Function<Tuple2<LongWritable, Text>, XMLRecord> keyData =
              new Function<Tuple2<LongWritable, Text>, XMLRecord>() {
                     @Override
                    public XMLRecord call(Tuple2<LongWritable, Text> arg0)
                            throws Exception {
                        // TODO Auto-generated method stub
                         System.out.println(arg0.toString());
                         XMLRecord record = new XMLRecord();
                         record.setPos(Long.getLong(arg0._1.toString()));
                         record.setXml(arg0._2.toString());                      
                         return record;                         
                    }
                };

        JavaRDD<XMLRecord> words = lines.map(keyData);      

        List<XMLRecord>  tupleList = words.collect();    

        Iterator<XMLRecord> itr = tupleList.iterator();

        while(itr.hasNext()){
            XMLRecord t = itr.next();
            System.out.println(t.getXml());
            System.out.println(t.getPos());
        }
    }
}

// The custom InputFormat implementation follows

public class XMLRecordInputFormat extends TextInputFormat{

    public RecordReader<LongWritable, Text> createRecordReader(
            InputSplit arg0, JobConf arg1, Reporter arg2) throws IOException {
        // TODO Auto-generated method stub
        XMLRecordReader r = new XMLRecordReader();

        return r;
    }


}

1 Answer:

Answer 0 (score: 1)

I think I figured out a way to do it.

I was confused between the two APIs, org.apache.hadoop.mapred.RecordReader (an interface) and org.apache.hadoop.mapreduce.RecordReader (a class), and also about which InputFormat to use with each.

It looks like the old-API FileInputFormat and org.apache.hadoop.mapred.RecordReader really do go hand in hand: the mapred TextInputFormat exposes getRecordReader(InputSplit, JobConf, Reporter) rather than createRecordReader, so the method in my original class was never actually an override and Spark quietly fell back to the inherited line reader. Please find below the complete code that parses the XML into a JavaRDD.
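
For reference, the pairing works roughly like this: ctx.hadoopRDD expects an InputFormat from the old org.apache.hadoop.mapred package registered on a JobConf, while ctx.newAPIHadoopFile / ctx.newAPIHadoopRDD expect one from org.apache.hadoop.mapreduce. A minimal sketch of the two entry points side by side (MyNewApiXmlInputFormat is a hypothetical mapreduce-API format, not part of the code below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ApiPairingSketch {

    public static void main(String[] args) {
        JavaSparkContext ctx = new JavaSparkContext(
                new SparkConf().setMaster("local").setAppName("ApiPairingSketch"));

        // Old (mapred) API: the InputFormat is set on a JobConf and read via hadoopRDD.
        JobConf jobConf = new JobConf(new Configuration(), ApiPairingSketch.class);
        jobConf.setInputFormat(XMLRecordFileInputFormat.class);   // extends org.apache.hadoop.mapred.FileInputFormat
        org.apache.hadoop.mapred.FileInputFormat.setInputPaths(jobConf, new Path(args[0]));
        JavaPairRDD<Text, Text> oldApi =
                ctx.hadoopRDD(jobConf, XMLRecordFileInputFormat.class, Text.class, Text.class);

        // New (mapreduce) API: the InputFormat class is passed directly to newAPIHadoopFile.
        // MyNewApiXmlInputFormat is a hypothetical org.apache.hadoop.mapreduce.InputFormat<Text, Text>.
        JavaPairRDD<Text, Text> newApi = ctx.newAPIHadoopFile(
                args[0], MyNewApiXmlInputFormat.class, Text.class, Text.class, new Configuration());

        System.out.println(oldApi.count() + " / " + newApi.count());
    }
}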

In this example I am looking to parse and extract XML tags.

Main class

import java.io.Serializable;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;


import scala.Tuple2;

public class CustomizedXMLReader implements Serializable{

    private static final long serialVersionUID = 1L;


    public static void main(String[] args) {        
        CustomizedXMLReader reader = new CustomizedXMLReader();     
        reader.readUsingFileInputFormat(args);
    }



    /**
     * Reads the input using the org.apache.hadoop.mapred.RecordReader interface. This works as expected.
     * @param args
     */
        public void readUsingFileInputFormat(String[] args){
            SparkConf sparkConf = new SparkConf()
            .setMaster("local")
            .setAppName("CustomizedXMLReader")
            .set("spark.executor.memory", "512m").set("record.delimiter.regex", "</name>");

            JobConf jobConf = new JobConf(new Configuration(), CustomizedXMLReader.class);
            jobConf.setInputFormat(XMLRecordFileInputFormat.class);
            FileInputFormat.setInputPaths(jobConf, new Path(args[0]));


            JavaSparkContext ctx = new JavaSparkContext(sparkConf);
            JavaPairRDD<Text, Text> lines = ctx.hadoopRDD(jobConf,XMLRecordFileInputFormat.class, Text.class, Text.class);



            Function<Tuple2<Text, Text>, XMLRecord> keyData =
                  new Function<Tuple2<Text, Text>, XMLRecord>() {
                        private static final long serialVersionUID = 1L;

                        @Override
                        public XMLRecord call(Tuple2<Text, Text> arg0)
                                throws Exception {
                             System.out.println(arg0.toString());
                             XMLRecord record = new XMLRecord();
                             record.setPos(arg0._1.toString());
                             record.setXml(arg0._2.toString());                      
                             return record;                         
                        }
                    };

            JavaRDD<XMLRecord> words = lines.map(keyData);      

            List<XMLRecord>  tupleList = words.collect();    

            Iterator<XMLRecord> itr = tupleList.iterator();

            while(itr.hasNext()){
                XMLRecord t = itr.next();
                System.out.println(t.getXml());
                System.out.println(t.getPos());
            }
        }
}
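
Note that collect() pulls every record back to the driver, which is fine for this small test but will not scale to large XML files. A minimal alternative sketch, assuming the words RDD from above and a hypothetical output path in args[1], keeps the data distributed and writes it out instead:

        // Instead of collect(), extract the XML payload and write it straight back out.
        JavaRDD<String> xmlOnly = words.map(new Function<XMLRecord, String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public String call(XMLRecord record) throws Exception {
                return record.getXml();   // keep only the XML text of each record
            }
        });
        xmlOnly.saveAsTextFile(args[1]);  // one part file per partition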

RecordReader

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.streaming.StreamXmlRecordReader;

public class XMLInterfaceRecordReader implements org.apache.hadoop.mapred.RecordReader<Text,Text>{

     private StreamXmlRecordReader in;
     private String delimiterRegex;
      private long start;
      private long pos;
      private long end;
      private static Long keyInt = 0L;


      public XMLInterfaceRecordReader(InputSplit split, JobConf arg1, Reporter rep) throws IOException {
          super();
          FileSplit fSplit = (FileSplit) split;

          this.delimiterRegex = "</name>";

            start = fSplit.getStart();
            end = start + fSplit.getLength();
            arg1.set("stream.recordreader.begin", "<name>");
            arg1.set("stream.recordreader.end", delimiterRegex);


            final Path file = fSplit.getPath();
            FileSystem fs = file.getFileSystem(arg1);
            FSDataInputStream fileIn = fs.open(fSplit.getPath());

            boolean skipFirstLine = false;
            if (start != 0) {
                skipFirstLine = true;           
                --start;
                fileIn.seek(start);
            }

            in = new StreamXmlRecordReader(fileIn, fSplit,rep, arg1,fs);

            this.pos = start;
    }


    @Override
    public void close() throws IOException {
        if (in != null) {
            in.close();
        }       
    }

    @Override
    public Text createKey() {
        return new Text();
    }

    @Override
    public Text createValue() {
        return new Text();
    }

    @Override
    public long getPos() throws IOException {
        return pos;
    }

    @Override
    public float getProgress() throws IOException {
        if (start == end) {
            return 0.0f;
        }
        else {
            return Math.min(1.0f, (pos - start) / (float)(end - start));
        }
    }

    @Override
    public boolean next(Text Key, Text Value) throws IOException {
        in.seekNextRecordBoundary();
        Text key = new Text();
        Text val = new Text();
        in.next(key, val);

        // StreamXmlRecordReader returns the matched XML block in 'key' and leaves
        // 'val' empty, so the record text is taken from 'key'.
        if(key.toString() != null && key.toString().length() > 0){
                System.out.println(key.toString());
                System.out.println(val.toString());
                pos = in.getPos();   // track how far we have read, for getPos()/getProgress()
                Key.set(new LongWritable(++keyInt).toString());   // synthetic record number as the key
                Value.set(key.toString());
                return true;
        }else
                return false;
    }

}

FileInputFormat

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class XMLRecordFileInputFormat extends FileInputFormat<Text, Text>{

    public XMLRecordFileInputFormat(){

    }

    @Override
    public RecordReader<Text, Text> getRecordReader(InputSplit arg0,
            JobConf arg1, Reporter arg2) throws IOException {
        // Create a fresh reader for each split.
        return new XMLInterfaceRecordReader(arg0, arg1, arg2);
    }

}
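
For completeness, the route I originally attempted (the new org.apache.hadoop.mapreduce API) can also work, but the method to override is createRecordReader(InputSplit, TaskAttemptContext) and the RDD has to be created with newAPIHadoopFile/newAPIHadoopRDD instead of hadoopRDD. A bare skeleton of that variant is sketched below (the reader body is left as a stub; this is not the code used above):

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class NewApiXmlInputFormat extends FileInputFormat<Text, Text> {

    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        // This is the signature Spark actually calls for the new API; a method named
        // createRecordReader(InputSplit, JobConf, Reporter) never overrides anything.
        return new RecordReader<Text, Text>() {
            private final Text key = new Text();
            private final Text value = new Text();

            @Override
            public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
                // open the split and position the stream on the first record boundary here
            }

            @Override
            public boolean nextKeyValue() throws IOException {
                // parse the next XML record into key/value; return false at the end of the split
                return false;
            }

            @Override
            public Text getCurrentKey() { return key; }

            @Override
            public Text getCurrentValue() { return value; }

            @Override
            public float getProgress() { return 0.0f; }

            @Override
            public void close() throws IOException {
            }
        };
    }
}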