I am trying to parse PDF files in MapReduce using a custom input format class, as shown below:
package com.pdf.prac;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class PdfFileInputFormat extends FileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        System.out.println("Entered PdfFileInputFormat class");
        return new PdfRecordReader();
    }

    // PDFs cannot be split at arbitrary byte offsets, so each file is read as a whole.
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;
    }
}
And my PDF reader class is:
package com.pdf.prac;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.RenderListener;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;

public class PdfRecordReader extends RecordReader<LongWritable, Text> {

    // Index of the next record to emit.
    private int flag = 0;
    private LongWritable key = null;
    private Text value = null;
    private PdfReader reader;
    private PdfReaderContentParser parser;
    private TextExtractionStrategy strategy;
    private FSDataInputStream fileIn;
    // One entry per line of text extracted from the PDF.
    private List<String> records = new ArrayList<String>();

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException {
        System.out.println("Executing initialize........");
        FileSplit split = (FileSplit) genericSplit;
        Configuration conf = context.getConfiguration();
        final Path file = split.getPath();
        FileSystem fs = file.getFileSystem(conf);
        this.fileIn = fs.open(split.getPath());
        this.reader = new PdfReader(fileIn);
        this.parser = new PdfReaderContentParser(reader);
        readRecords();
    }
    @Override
    public synchronized boolean nextKeyValue() throws IOException {
        System.out.println("Executing nextKeyValue........Total Records : "
                + records.size() + "; Current : " + flag);
        // Stop before touching the list once every extracted line has been emitted.
        if (flag >= records.size()) {
            return false;
        }
        if (key == null) {
            key = new LongWritable(flag);
        } else {
            key.set(flag);
        }
        if (value == null) {
            value = new Text(records.get(flag));
        } else {
            value.set(records.get(flag));
        }
        flag++;
        return true;
    }
    @Override
    public LongWritable getCurrentKey() {
        return key;
    }

    @Override
    public Text getCurrentValue() {
        return value;
    }

    /**
     * Get the progress within the split.
     */
    @Override
    public float getProgress() {
        return records.isEmpty() ? 0 : (float) flag / records.size();
    }

    @Override
    public synchronized void close() throws IOException {
        if (fileIn != null) {
            fileIn.close();
        }
    }
    // Extracts the text of every page with iText and stores it line by line;
    // each stored line later becomes one key/value pair.
    private void readRecords() throws IOException {
        if (reader != null) {
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
                if (strategy != null) {
                    StringTokenizer tokens = new StringTokenizer(strategy.getResultantText(), "\n");
                    while (tokens.hasMoreTokens()) {
                        records.add(tokens.nextToken());
                    }
                }
            }
            reader.close();
        }
    }
}
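For completeness, the input format is wired into the job roughly as in the minimal driver sketch below (the driver and mapper class names, job name, and argument handling are placeholders rather than my exact code):

package com.pdf.prac;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class PdfDriver {

    // Identity mapper used only to illustrate the wiring; the real mapper is omitted here.
    public static class PdfMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "pdf-parse");
        job.setJarByClass(PdfDriver.class);

        // The custom format from above replaces the default TextInputFormat.
        job.setInputFormatClass(PdfFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setMapperClass(PdfMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}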
However, at runtime in the Hadoop environment this fails with a ClassNotFoundException for com.itextpdf.text.pdf.parser.RenderListener. All of the jar files are correctly added to the build path, and there are no duplicate jars. Any help is appreciated.
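For reference, a minimal sketch of one way the iText jar could be shipped to the task classpath at submission time is shown below; the HDFS path and jar version are placeholders, not my actual values. Passing the jar with -libjars or bundling it into a fat jar would be alternatives.

package com.pdf.prac;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;

public class ClasspathSetup {

    // Sketch only: assumes the iText jar has already been copied to HDFS;
    // the path and version number are placeholders.
    public static void addItextToTaskClasspath(Job job) throws IOException {
        job.addFileToClassPath(new Path("/libs/itextpdf-5.5.6.jar"));
    }
}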