使用JAVA从PDF中提取阿拉伯语文本

时间:2017-02-06 09:44:58

标签: java eclipse pdf ocr

我想从 PDF 中提取文本,但问题是:对于阿拉伯语文本,我得到的只是一些无法阅读的字符代码(乱码),而不是原来的文字。

同样的代码在处理法语文本时可以正常工作。

这是我的源代码:

package com.example;

import java.io.*;
import java.nio.charset.StandardCharsets;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.text.PDFTextStripper;
import org.jsoup.Jsoup;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Extracts the text of a PDF file with PDFBox and stores it inside an existing XML file.
 *
 * <p>{@link #main(String[])} loads the PDF, saves a copy of it, extracts its text, runs the text
 * through {@code Jsoup.parse(...).text()} and hands the result to
 * {@link #WriteFromPDFToXML(String, String)}.
 */
public class convert2 {

  /** Zero-based position of the {@code IndexValue} element that receives the new child. */
  private static final int TARGET_INDEX_VALUE = 14;

  /**
   * Appends a {@code <Value>} element holding {@code content} to the 15th {@code IndexValue}
   * element of the given XML file and rewrites the file in place as indented UTF-8.
   *
   * <p>The root element name and the number of {@code IndexValue} elements are printed to standard
   * output. When the document has fewer than 15 {@code IndexValue} elements nothing is appended,
   * but the file is still rewritten. Any failure is reported with a stack trace and otherwise
   * ignored.
   *
   * @param XMLFile location of the XML document to read and overwrite
   * @param content text placed inside the new {@code <Value>} element
   */
  static void WriteFromPDFToXML(String XMLFile, String content) {
    try {
      DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
      DocumentBuilder db = dbf.newDocumentBuilder();
      Document document = db.parse(XMLFile);

      // Root element name.
      System.out.println(document.getDocumentElement().getNodeName());

      // Number of IndexValue elements.
      NodeList nodeList = document.getElementsByTagName("IndexValue");
      System.out.println(nodeList.getLength());

      // Only the element at TARGET_INDEX_VALUE is modified, so no loop is required.
      if (nodeList.getLength() > TARGET_INDEX_VALUE) {
        Node target = nodeList.item(TARGET_INDEX_VALUE);
        Element valueElm = document.createElement("Value");
        valueElm.appendChild(document.createTextNode(content));
        target.appendChild(valueElm);
      }

      TransformerFactory tff = TransformerFactory.newInstance();
      Transformer transformer = tff.newTransformer();
      transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
      transformer.setOutputProperty(OutputKeys.INDENT, "yes");

      DOMSource xmlSource = new DOMSource(document);
      StreamResult outputTarget = new StreamResult(XMLFile);
      transformer.transform(xmlSource, outputTarget);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Deletes the given file and prints whether the deletion succeeded.
   *
   * @param output file to delete
   */
  static void DeleteFile(File output) {
    try {
      if (output.delete()) {
        System.out.println(output.getName() + " is deleted!");
      } else {
        System.out.println("Delete operation is failed.");
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Loads the PDF, saves a copy named {@code CopyOfInvoice.pdf}, extracts the text and writes it
   * into the result XML file.
   *
   * <p>The extracted text is also written to a scratch text file that is deleted again once the
   * writer has been closed.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    // The PDF file to extract text from.
    File input = new File("C:/Users/HP/Desktop/EXPORT1/00000002.pdf");
    // Scratch text file that receives the extracted text.
    File output = new File("C:/Users/HP/Desktop/EXPORT1/verif.txt");

    try {
      String content;
      // try-with-resources closes the document and the writer even when extraction fails.
      try (PDDocument pd = PDDocument.load(input)) {
        System.out.println(pd.getNumberOfPages());
        System.out.println(pd.isEncrypted());
        pd.save("CopyOfInvoice.pdf"); // Creates a copy called "CopyOfInvoice.pdf"
        PDFTextStripper stripper = new PDFTextStripper();

        // An explicit UTF-8 charset is required: without one OutputStreamWriter uses the platform
        // default encoding, which cannot represent Arabic on many Windows installations.
        try (BufferedWriter wr =
            new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(output), StandardCharsets.UTF_8))) {
          stripper.writeText(pd, wr);
        }

        // Extract once and reuse the result for both the console output and the XML file.
        content = stripper.getText(pd);
        System.out.println(content);
      }

      DeleteFile(output);
      String test = Jsoup.parse(content).text();
      WriteFromPDFToXML("C:/Users/HP/Desktop/EXPORT1/result1.xml", test);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}

0 个答案:

没有答案