我正在尝试从 .docx 文件中提取阿拉伯语段落。我打算用英文打开它并用 Apache POI 读取它，但是当文件中含有任何阿拉伯字符时，这些字符会被显示为问号。
以下是我正在使用的代码:
import java.io.*;
import java.util.List;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.ss.usermodel.FontCharset;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
/**
 * Reads a .docx file with Apache POI and prints the text of every paragraph.
 *
 * <p>Paragraph text is written through a UTF-8 {@link PrintStream} instead of the
 * default {@code System.out}. The default stream encodes with the platform charset,
 * and characters that charset cannot represent (e.g. Arabic on a Western code page)
 * are emitted as {@code ?}.
 */
public class main_class {

    /** Charset name used when encoding paragraph text for standard output. */
    private static final String OUTPUT_ENCODING = "UTF-8";

    /**
     * Prints each paragraph's text to standard output, one per line, prefixed with
     * {@code "Text: "} and encoded as UTF-8.
     *
     * @param extractor paragraphs to print, in iteration order
     * @param filename  name of the source document; currently unused, kept so
     *                  existing callers continue to compile
     */
    public static void ReadDocXFile(List<XWPFParagraph> extractor, String filename) {
        PrintStream out = utf8Stdout();
        for (XWPFParagraph paragraph : extractor) {
            out.println("Text: " + paragraph.getParagraphText());
        }
        // Flush but do not close: closing the wrapper would also close System.out.
        out.flush();
    }

    /**
     * Wraps {@code System.out} in an auto-flushing stream that encodes text as UTF-8.
     *
     * <p>NOTE(review): the terminal or IDE console displaying the output presumably
     * also has to be configured for UTF-8 for the characters to render - confirm in
     * the target environment.
     *
     * @return a UTF-8 {@link PrintStream} over standard output
     * @throws IllegalStateException if the JVM does not support UTF-8
     */
    private static PrintStream utf8Stdout() {
        try {
            return new PrintStream(System.out, true, OUTPUT_ENCODING);
        } catch (UnsupportedEncodingException e) {
            throw new IllegalStateException(OUTPUT_ENCODING + " is not supported by this JVM", e);
        }
    }

    /**
     * Opens {@code C:\filename.docx}, loads it as an {@link XWPFDocument} and prints
     * its paragraphs.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the file cannot be opened or parsed
     */
    public static void main(String[] args) throws Exception {
        File file = new File("C:\\filename.docx");
        String filename = file.getName();
        // try-with-resources guarantees the file handle is released even when
        // parsing or printing throws.
        try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
            XWPFDocument documentX = new XWPFDocument(fis);
            List<XWPFParagraph> pera = documentX.getParagraphs();
            ReadDocXFile(pera, filename);
        }
    }
}