Question

如何使用 C# 7.1 读取用阿拉伯语编写的 PDF 文件？

我尝试过下面这个方法：

/// <summary>
/// Extracts the text of every page of a PDF and returns it as one string.
/// </summary>
/// <param name="path">Path of the PDF file passed to <c>PdfReader</c>.</param>
/// <returns>
/// The text of pages 1 through <c>NumberOfPages</c>, concatenated in page
/// order with no separator inserted between pages.
/// </returns>
private string GetTextFromPDF(String path)
{
    var extracted = new StringBuilder();

    using (var pdf = new PdfReader(path))
    {
        // PDF page numbers are 1-based.
        var pageNumber = 1;
        while (pageNumber <= pdf.NumberOfPages)
        {
            string pageText = PdfTextExtractor.GetTextFromPage(pdf, pageNumber);
            extracted.Append(pageText);
            pageNumber++;
        }
    }

    return extracted.ToString();
}

但它只适用于英文字符，有什么解决办法吗？

Answer 1

要读取 PDF，请尝试使用：

/// <summary>
/// Extracts the text of every page of the PDF at <paramref name="fileName"/>.
/// </summary>
/// <param name="fileName">Path of the PDF file to read.</param>
/// <returns>
/// The text of all pages concatenated in page order, or an empty string when
/// <paramref name="fileName"/> does not refer to an existing file.
/// </returns>
private static string ReadPdfFile(string fileName)
{
    StringBuilder text = new StringBuilder();

    // Missing file: keep the original best-effort contract and return "".
    if (!File.Exists(fileName))
    {
        return text.ToString();
    }

    // Dispose the reader deterministically so the resources it holds are
    // released even if extraction throws.
    using (PdfReader pdfReader = new PdfReader(fileName))
    {
        // PDF page numbers are 1-based; walk every page instead of only page 1.
        for (int page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            // Use a fresh strategy per page: a strategy instance keeps the text
            // it has already been fed, so reusing one would repeat earlier pages.
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
        }
    }

    return text.ToString();
}

然后对提取出的文本调用下面的方法进行转码：

/// <summary>
/// Re-decodes text whose characters are really Windows-1256 byte values that
/// were read as ISO-8859-1.
/// </summary>
/// <param name="data">The mis-decoded text to repair.</param>
/// <returns>
/// The string obtained by encoding <paramref name="data"/> to bytes with
/// ISO-8859-1 and decoding those bytes as Windows-1256. Despite the method
/// name, the result is an ordinary .NET <see cref="string"/>, not UTF-8 bytes.
/// </returns>
public static string Arabic1256ToUtf8(string data)
{
    // Turn the characters back into bytes through ISO-8859-1.
    byte[] rawBytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(data);

    // Interpret those same bytes with the Arabic Windows code page.
    Encoding windows1256 = Encoding.GetEncoding("Windows-1256");
    return windows1256.GetString(rawBytes);
}

使用 C# 读取阿拉伯语 PDF 文件

1 个答案: