我必须处理250个XML文件,每个文件的大小为25 MB。为了处理XML文件,我使用Apache Mahout的XMLInputFormat并生成一个序列文件。关键是文件名,值是序列文件中的整个文件内容。但是这种方法的问题是启动了250个Mapper,这使得MapReduce作业变慢。
我遇到了CombineFileInputFormat(在浏览Tom White书时),使用250个Mapper将无法启动250个文件。但是CombineFileInputFormat是一个抽象类,我面临着为XML文件实现它的困难,因为我不熟悉Java和Hadoop。
那么,有人可以请我为XML文件提供CombineFileInputFormat的实现。
驱动程序代码:
package com.ericsson.sequencefile;
//A MapReduce program for packaging a collection of small files as a single SequenceFile.
//hadoop jar sequencefiles.jar com.ericsson.sequencefile.SmallFilesToSequenceFileConverter -D xmlinput.start="<XMLstart>" -D xmlinput.end="</XMLstart>" /IRIS_NG/pfinder2/ccn/archive /IRIS_NG/pfinder2/output
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class SmallFilesToSequenceFileConverter extends Configured implements Tool {
public static class SequenceFileMapper extends Mapper<LongWritable, Text, Text, Text> {
private Text filenameKey;
@Override
public void setup(Context context) throws IOException, InterruptedException {
InputSplit split = context.getInputSplit();
Path path = ((FileSplit) split).getPath();
filenameKey = new Text(path.toString() + "\n");
}
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String document = value.toString();
context.write(filenameKey, new Text(document));
}
}
@Override
public int run(String[] args) throws Exception {
if (args.length != 2) {
System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
ToolRunner.printGenericCommandUsage(System.err);
return -1;
}
Configuration conf = getConf();
Job job = Job.getInstance(conf,"SmallFilesToSequenceFile");
job.setJarByClass(getClass());
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setInputFormatClass(XmlInputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(SequenceFileMapper.class);
return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new SmallFilesToSequenceFileConverter(), args);
System.exit(exitCode);
}
}
XMLInputFormt.java
package com.ericsson.sequencefile;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.slf4j.*;
import java.io.IOException;
/**
* Reads records that are delimited by a specific begin/end tag.
*/
public class XmlInputFormat extends TextInputFormat {
private static final Logger log =
LoggerFactory.getLogger(XmlInputFormat.class);
public static final String START_TAG_KEY = "xmlinput.start";
public static final String END_TAG_KEY = "xmlinput.end";
@Override
public RecordReader<LongWritable, Text> createRecordReader(
InputSplit split, TaskAttemptContext context) {
try {
return new XmlRecordReader((FileSplit) split,
context.getConfiguration());
} catch (IOException ioe) {
log.warn("Error while creating XmlRecordReader", ioe);
return null;
}
}
/**
* XMLRecordReader class to read through a given xml document to
* output xml blocks as records as specified
* by the start tag and end tag
*/
public static class XmlRecordReader
extends RecordReader<LongWritable, Text> {
private final byte[] startTag;
private final byte[] endTag;
private final long start;
private final long end;
private final FSDataInputStream fsin;
private final DataOutputBuffer buffer = new DataOutputBuffer();
private LongWritable currentKey;
private Text currentValue;
public XmlRecordReader(FileSplit split, Configuration conf)
throws IOException {
startTag = conf.get(START_TAG_KEY).getBytes("UTF-8");
endTag = conf.get(END_TAG_KEY).getBytes("UTF-8");
// open the file and seek to the start of the split
start = split.getStart();
end = start + split.getLength();
Path file = split.getPath();
FileSystem fs = file.getFileSystem(conf);
fsin = fs.open(split.getPath());
fsin.seek(start);
}
private boolean next(LongWritable key, Text value)
throws IOException {
if (fsin.getPos() < end && readUntilMatch(startTag, false)) {
try {
buffer.write(startTag);
if (readUntilMatch(endTag, true)) {
key.set(fsin.getPos());
value.set(buffer.getData(), 0, buffer.getLength());
return true;
}
} finally {
buffer.reset();
}
}
return false;
}
@Override
public void close() throws IOException {
fsin.close();
}
@Override
public float getProgress() throws IOException {
return (fsin.getPos() - start) / (float) (end - start);
}
private boolean readUntilMatch(byte[] match, boolean withinBlock)
throws IOException {
int i = 0;
while (true) {
int b = fsin.read();
// end of file:
if (b == -1) {
return false;
}
// save to buffer:
if (withinBlock) {
buffer.write(b);
}
// check if we're matching:
if (b == match[i]) {
i++;
if (i >= match.length) {
return true;
}
} else {
i = 0;
}
// see if we've passed the stop point:
if (!withinBlock && i == 0 && fsin.getPos() >= end) {
return false;
}
}
}
@Override
public LongWritable getCurrentKey()
throws IOException, InterruptedException {
return currentKey;
}
@Override
public Text getCurrentValue()
throws IOException, InterruptedException {
return currentValue;
}
@Override
public void initialize(InputSplit split,
TaskAttemptContext context)
throws IOException, InterruptedException {
}
@Override
public boolean nextKeyValue()
throws IOException, InterruptedException {
currentKey = new LongWritable();
currentValue = new Text();
return next(currentKey, currentValue);
}
}
}