我有一个使用自定义字体的阿拉伯语PDF文件。当我尝试读取该文件时,提取出的文本中出现了一些无法辨认的单词和字符,它们被替换成了其他字符或符号。
这是我正在处理的PDF文件的链接。
/**
 * Extracts the text of a document with the Apache Tika facade and stores the
 * result in a Word ({@code .docx}) file.
 *
 * <p>NOTE(review): this class only forwards whatever text Tika returns. If the
 * source PDF embeds custom fonts, garbled characters in the output presumably
 * originate in the PDF's own font encoding rather than in this code — confirm
 * against the actual file.
 */
public class TikaAnalysis {

    /** Name of the Word document that receives the extracted text. */
    private static final String OUTPUT_DOCX = "extractedContent.docx";

    /**
     * Parses the given stream with {@link Tika#parseToString(InputStream)} and
     * writes the extracted text to {@code extractedContent.docx}.
     *
     * <p>Writing the Word document is best-effort: a failure there is reported
     * via a stack trace and does not prevent the extracted text from being
     * returned.
     *
     * @param stream the document to parse
     * @return the text extracted by Tika
     * @throws IOException   if the stream cannot be read
     * @throws TikaException if Tika fails to parse the document
     */
    public static String extractContentUsingFacade(InputStream stream) throws IOException, TikaException {
        Tika tika = new Tika();
        String content = tika.parseToString(stream);
        try {
            // The original passed an undefined variable here; the extracted
            // text is what has to be written.
            WriteOnWordDoc(content);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return content;
    }

    /**
     * Writes the given text as a single 10pt run into
     * {@code extractedContent.docx} in the current working directory.
     *
     * @param fileContent the text to store in the document
     * @throws Exception if the document cannot be created or written
     */
    public static void WriteOnWordDoc(String fileContent) throws Exception {
        // try-with-resources closes both the document and the output stream
        // even when writing fails part-way through.
        try (XWPFDocument document = new XWPFDocument();
             FileOutputStream fos = new FileOutputStream(new File(OUTPUT_DOCX))) {
            XWPFParagraph paragraph = document.createParagraph();
            XWPFRun run = paragraph.createRun();
            run.setText(fileContent);
            run.setFontSize(10);
            document.write(fos);
        }
    }

    /**
     * Reads {@code File.pdf} from the working directory, extracts its text and
     * writes it to the Word document.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String path = "File.pdf";
        // The buffered stream is the one actually handed to Tika, and it is
        // closed exactly once by try-with-resources.
        try (InputStream input = new BufferedInputStream(new FileInputStream(new File(path)))) {
            try {
                TikaAnalysis.extractContentUsingFacade(input);
            } finally {
                // Printed only when the file was opened successfully.
                System.out.println("close the file ");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}