我需要从pdf文件中提取文字(逐字逐句)。
import java.io.*;
import com.itextpdf.text.*;
import com.itextpdf.text.pdf.*;
import com.itextpdf.text.pdf.parser.*;
/**
 * Question code: copies every page of an input PDF into a new PDF file, then
 * tries to print the text of each page using iText's PdfTextExtractor.
 *
 * NOTE(review): as posted, this class does not compile (see the notes on the
 * extraction step below), and its closing brace is missing from the excerpt.
 */
public class pdf {
// Location of the source PDF (an http URL in this example).
private static String INPUTFILE = "http://ontology.buffalo.edu/ontology%28PIC%29.pdf" ;
// Path of the PDF that the imported pages are written to.
private static String OUTPUTFILE = "c:/new3.pdf";
/**
 * Imports each page of INPUTFILE, adds it to a new document written to
 * OUTPUTFILE, then attempts to print the text of every page.
 *
 * @param args unused
 * @throws DocumentException declared for the iText document operations
 * @throws IOException declared for reading the input or writing the output
 */
public static void main(String[] args) throws DocumentException,
IOException {
Document document = new Document();
PdfWriter writer = PdfWriter.getInstance(document,
new FileOutputStream(OUTPUTFILE));
document.open();
PdfReader reader = new PdfReader(INPUTFILE);
int n = reader.getNumberOfPages();
PdfImportedPage page;
// Go through all pages
for (int i = 1; i <= n; i++) {
page = writer.getImportedPage(reader, i);
// Progress output: the 1-based number of the page being copied.
System.out.println(i);
// Each imported page is wrapped as an Image and appended to the new document.
Image instance = Image.getInstance(page);
document.add(instance);
}
document.close();
// Second reader, opened on the freshly written copy.
// NOTE(review): readerN is never used below; the loop passes `reader` instead.
PdfReader readerN = new PdfReader(OUTPUTFILE);
// This is the statement behind the reported compile error: the
// PdfTextExtractor constructor cannot be called from here.
PdfTextExtractor parse = new PdfTextExtractor();
for (int i = 1; i <= n; i++)
// NOTE(review): the variable declared above is `parse`, not `parser`, so
// this name is unresolved as well.
System.out.println(parser.getTextFromPage(reader,i));
}
编译代码时,出现此错误:
构造函数PdfTextExtractor未定义
我该如何解决这个问题?
答案 0 :(得分:8)
PdfTextExtractor 只包含静态方法,其构造函数是私有的(参见 iText 文档)。
你可以这样调用它:
String myLine = PDFTextExtractor.getTextFromPage(reader, pageNumber)
答案 1 :(得分:0)
如果您想从PDF文件中获取所有文本并将其保存到文本文件,您可以使用以下代码。
使用 pdfutil.jar 库。
import java.io.IOException;
import java.io.PrintWriter;
import com.testautomationguru.utility.PDFUtil;
/**
 * Extracts all text from a PDF file with PDFUtil and saves it to a text file.
 */
public class PDFToText{
    /**
     * Reads the text of the input PDF and writes it to the output text file.
     * Any IOException raised along the way is printed to standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String pdfFilePath = "C:\\abc.pdf";
        try {
            PDFUtil pdfUtil = new PDFUtil();
            String content = pdfUtil.getText(pdfFilePath);
            // try-with-resources guarantees the writer is closed on every exit path.
            try (PrintWriter out = new PrintWriter("C:\\abc.txt")) {
                out.println(content);
                // PrintWriter never throws on write failures; checkError() flushes
                // the stream and reports whether an error occurred.
                if (out.checkError()) {
                    throw new IOException("Failed to write C:\\abc.txt");
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
答案 2 :(得分:-1)
// Try Apache PDF Box
import java.io.FilterInputStream;
import java.io.InputStream;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
// Extracts the text of every page of a PDF with Apache PDFBox.
// On success the text ends up in statementPDF; on failure errorMessage holds
// the exception summary followed by its stack trace.

// Your PDF file
String filePath = "";
InputStream inputStream = null;
// Declared outside the try block so they remain usable (and closable) afterwards.
PDDocument pdDoc = null;
def statementPDF = null;
String errorMessage = "";
try
{
    inputStream = new FileInputStream(filePath);
    PDFParser parser = new PDFParser(inputStream);
    // This will parse the stream and populate the COSDocument object.
    parser.parse();
    // Get the document that was parsed.
    COSDocument cosDoc = parser.getDocument();
    // This class will take a pdf document and strip out all of the text and
    // ignore the formatting and such.
    PDFTextStripper pdfStripper = new PDFTextStripper();
    // This is the in-memory representation of the PDF document
    pdDoc = new PDDocument(cosDoc);
    pdfStripper.setStartPage(1);
    pdfStripper.setEndPage(pdDoc.getNumberOfPages());
    // This will return the text of a document.
    statementPDF = pdfStripper.getText(pdDoc);
}
catch(Exception e)
{
    // Append to the variable declared above; declaring it here with `+=`
    // is not valid syntax.
    errorMessage += "\nUnexpected Exception: " + e.getClass() + "\n" + e.getMessage();
    for (trace in e.getStackTrace())
    {
        errorMessage += "\n\t" + trace;
    }
}
finally
{
    // Close the parsed document before the stream it was read from.
    if (pdDoc != null)
    {
        pdDoc.close();
    }
    if (inputStream != null)
    {
        inputStream.close();
    }
}