I am trying to parse PDF files in MapReduce using a custom input format class, as shown below:
package com.pdf.prac;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class PdfFileInputFormat extends FileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        System.out.println("Entered PdfFileInputFormat class");
        return new PdfRecordReader();
    }

    // PDFs cannot be split at arbitrary byte offsets, so each file is read as a whole.
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;
    }
}
And my PDF reader class is:
package com.pdf.prac;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.RenderListener;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;

public class PdfRecordReader extends RecordReader<LongWritable, Text> {

    // Index of the next record to emit.
    private int flag = 0;
    private LongWritable key = null;
    private Text value = null;
    private PdfReader reader;
    private PdfReaderContentParser parser;
    private TextExtractionStrategy strategy;
    private FSDataInputStream fileIn;
    // One entry per line of text extracted from the PDF.
    private List<String> records = new ArrayList<String>();

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException {
        System.out.println("Executing initialize........");
        FileSplit split = (FileSplit) genericSplit;
        Configuration conf = context.getConfiguration();
        final Path file = split.getPath();
        FileSystem fs = file.getFileSystem(conf);
        this.fileIn = fs.open(split.getPath());
        this.reader = new PdfReader(fileIn);
        this.parser = new PdfReaderContentParser(reader);
        readRecords();
    }
    @Override
    public synchronized boolean nextKeyValue() throws IOException {
        System.out.println("Executing nextKeyValue........Total Records : "
                + records.size() + "; Current : " + flag);
        // Stop before touching the list once every extracted line has been emitted.
        if (flag >= records.size()) {
            return false;
        }
        if (key == null) {
            key = new LongWritable(flag);
        } else {
            key.set(flag);
        }
        if (value == null) {
            value = new Text(records.get(flag));
        } else {
            value.set(records.get(flag));
        }
        flag++;
        return true;
    }
    @Override
    public LongWritable getCurrentKey() {
        return key;
    }

    @Override
    public Text getCurrentValue() {
        return value;
    }

    /**
     * Get the progress within the split.
     */
    @Override
    public float getProgress() {
        return records.isEmpty() ? 0 : (float) flag / records.size();
    }

    @Override
    public synchronized void close() throws IOException {
        if (fileIn != null) {
            fileIn.close();
        }
    }
    // Extracts the text of every page with iText and stores it line by line;
    // each stored line later becomes one key/value pair.
    private void readRecords() throws IOException {
        if (reader != null) {
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
                if (strategy != null) {
                    StringTokenizer tokens = new StringTokenizer(strategy.getResultantText(), "\n");
                    while (tokens.hasMoreTokens()) {
                        records.add(tokens.nextToken());
                    }
                }
            }
            reader.close();
        }
    }
}
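For completeness, the input format is wired into the job roughly as in the minimal driver sketch below (the driver and mapper class names, job name, and argument handling are placeholders rather than my exact code):

package com.pdf.prac;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class PdfDriver {

    // Identity mapper used only to illustrate the wiring; the real mapper is omitted here.
    public static class PdfMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "pdf-parse");
        job.setJarByClass(PdfDriver.class);

        // The custom format from above replaces the default TextInputFormat.
        job.setInputFormatClass(PdfFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setMapperClass(PdfMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}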
However, at runtime in the Hadoop environment this fails with a ClassNotFoundException for com.itextpdf.text.pdf.parser.RenderListener. All of the jar files are correctly added to the build path, and there are no duplicate jars. Any help is appreciated.
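For reference, a minimal sketch of one way the iText jar could be shipped to the task classpath at submission time is shown below; the HDFS path and jar version are placeholders, not my actual values. Passing the jar with -libjars or bundling it into a fat jar would be alternatives.

package com.pdf.prac;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;

public class ClasspathSetup {

    // Sketch only: assumes the iText jar has already been copied to HDFS;
    // the path and version number are placeholders.
    public static void addItextToTaskClasspath(Job job) throws IOException {
        job.addFileToClassPath(new Path("/libs/itextpdf-5.5.6.jar"));
    }
}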