我想从 PDF 中提取文本,但问题是:对于阿拉伯语文本,我得到的只是一些代码,而不是可读的文字。
同样的程序对法语文本可以正常工作。
这是我的源代码:
package com.example;
import java.io.*;
import java.nio.charset.StandardCharsets;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.text.PDFTextStripper;
import org.jsoup.Jsoup;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
 * Extracts the text of a PDF with PDFBox and stores it inside an existing XML file.
 *
 * <p>The program loads a fixed PDF, saves a copy of it, extracts its text, writes that text to a
 * temporary text file (which is deleted again), and finally appends the text as a {@code Value}
 * element to one {@code IndexValue} element of a fixed XML file.
 *
 * <p>All text that leaves the JVM as bytes is written as UTF-8 so that non-Latin scripts such as
 * Arabic are not corrupted by the platform default charset.
 */
public class convert2 {

  /** Zero-based position of the {@code IndexValue} element that receives the new child. */
  private static final int TARGET_INDEX_VALUE = 14;

  /**
   * Appends {@code content} as a new {@code Value} child of the 15th {@code IndexValue} element of
   * the given XML file and rewrites the file in place as indented UTF-8.
   *
   * <p>If the document has fewer than 15 {@code IndexValue} elements, nothing is appended but the
   * file is still re-serialized. Any failure is printed to standard error and not propagated.
   *
   * @param XMLFile location of the XML document to read and overwrite
   * @param content text placed in the new {@code Value} element
   */
  static void WriteFromPDFToXML(String XMLFile, String content) {
    try {
      DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
      DocumentBuilder db = dbf.newDocumentBuilder();
      Document document = db.parse(XMLFile);

      // Diagnostic output: root element name and number of IndexValue elements.
      System.out.println(document.getDocumentElement().getNodeName());
      NodeList nodeList = document.getElementsByTagName("IndexValue");
      System.out.println(nodeList.getLength());

      // Only one fixed element is extended, so look it up directly instead of scanning the list.
      if (nodeList.getLength() > TARGET_INDEX_VALUE) {
        Node node = nodeList.item(TARGET_INDEX_VALUE);
        Element valueElm = document.createElement("Value");
        valueElm.appendChild(document.createTextNode(content));
        node.appendChild(valueElm);
      }

      TransformerFactory tff = TransformerFactory.newInstance();
      Transformer transformer = tff.newTransformer();
      transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
      transformer.setOutputProperty(OutputKeys.INDENT, "yes");
      DOMSource xmlSource = new DOMSource(document);
      StreamResult outputTarget = new StreamResult(XMLFile);
      transformer.transform(xmlSource, outputTarget);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Deletes the given file and reports the outcome on standard output.
   *
   * <p>Exceptions raised while deleting are printed to standard error and not propagated.
   *
   * @param output file to delete
   */
  static void DeleteFile(File output) {
    try {
      if (output.delete()) {
        System.out.println(output.getName() + " is deleted!");
      } else {
        System.out.println("Delete operation is failed.");
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Runs the PDF-to-XML conversion on the hard-coded input and output paths.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    // The PDF file from where you would like to extract
    File input = new File("C:/Users/HP/Desktop/EXPORT1/00000002.pdf");
    // The text file where you are going to store the extracted data
    File output = new File("C:/Users/HP/Desktop/EXPORT1/verif.txt");
    try {
      String content;
      // try-with-resources closes the document even when extraction fails part-way.
      try (PDDocument pd = PDDocument.load(input)) {
        System.out.println(pd.getNumberOfPages());
        System.out.println(pd.isEncrypted());
        pd.save("CopyOfInvoice.pdf"); // Creates a copy called "CopyOfInvoice.pdf"

        // Extract once and reuse the result for the file, the console and the XML.
        PDFTextStripper stripper = new PDFTextStripper();
        content = stripper.getText(pd);

        // Explicit UTF-8: the platform default charset (e.g. a Windows code page) may not be
        // able to represent Arabic characters and would replace them.
        try (Writer wr =
            new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(output), StandardCharsets.UTF_8))) {
          wr.write(content);
        }

        // NOTE(review): the console uses its own encoding, so Arabic may still look wrong here
        // even when the extracted string is correct. If the string itself is wrong, the PDF's
        // fonts presumably lack a usable Unicode mapping - verify against the input file.
        System.out.println(content);
      }
      DeleteFile(output);
      String test = Jsoup.parse(content).text();
      WriteFromPDFToXML("C:/Users/HP/Desktop/EXPORT1/result1.xml", test);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}