C#:从PDF中提取文本时,只有部分PDF文件出现问题

时间:2018-02-15 08:24:51

标签: c# pdf

我需要从PDF文件中提取一些数据。 我使用 iTextSharp 来做到这一点。

我使用的是我在网上找到的代码:

using System;
using System.IO;
using iTextSharp.text.pdf;

namespace PdfToText
{
/// <summary>
/// Parses a PDF file and extracts the text from it by scanning the raw page
/// content streams for text objects.
/// </summary>
/// <remarks>
/// Content-stream operators the scanner reacts to:
///   BT      = beginning of a text object
///   ET      = end of a text object
///   Td / TD = move to the start of the next line
///   T*, ' and " = move to the next line
///   Tj      = show text
///    5 Ts   = superscript (not interpreted)
///   -5 Ts   = subscript (not interpreted)
/// Only bytes found inside parenthesised literal strings are collected, and
/// each byte is emitted as the character with the same code. No font
/// encoding is applied, so pages whose text is not stored as plain
/// single-byte literal strings yield little or no output.
/// </remarks>
public class PDFParser
{
    #region Fields

    /// <summary>
    /// The number of most recent characters kept while scanning, used to
    /// recognise operators that have just been read.
    /// </summary>
    private const int _numberOfCharsToKeep = 15;

    // Operator groups, allocated once instead of once per scanned byte.
    private static readonly string[] _beginTextTokens = new string[] { "BT" };
    private static readonly string[] _endTextTokens = new string[] { "ET" };
    private static readonly string[] _lineStartTokens = new string[] { "TD", "Td" };
    private static readonly string[] _nextLineTokens = new string[] { "'", "T*", "\"" };
    private static readonly string[] _showTextTokens = new string[] { "Tj" };

    #endregion

    #region ExtractText
    /// <summary>
    /// Extracts the text of every page of a PDF file and appends it to a
    /// UTF-8 text file, printing a progress bar to the console.
    /// </summary>
    /// <param name="inFileName">the full path to the pdf file.</param>
    /// <param name="outFileName">
    /// the output file name. When null or empty, "test.txt" in the
    /// application's base directory is used.
    /// </param>
    /// <returns>
    /// true when all pages were processed; false when any exception occurred
    /// (the message is written to the standard error stream).
    /// </returns>
    public bool ExtractText(string inFileName, string outFileName)
    {
        StreamWriter outFile = null;
        PdfReader reader = null;
        try
        {
            // Honour the caller's path; previously the parameter was always
            // overwritten with the hard-coded default below.
            if (String.IsNullOrEmpty(outFileName))
            {
                outFileName = Path.Combine(System.AppDomain.CurrentDomain.BaseDirectory, "test.txt");
            }

            // Create a reader for the given PDF file
            reader = new PdfReader(inFileName);

            // The writer is opened in append mode, so repeated runs add to an existing file.
            outFile = new StreamWriter(outFileName, true, System.Text.Encoding.UTF8);

            Console.Write("Processing: ");

            // Width of the progress bar in '#' characters.
            const int totalLen = 68;
            int pageCount = reader.NumberOfPages;
            float charUnit = ((float)totalLen) / (float)pageCount;
            int totalWritten = 0;
            float curUnit = 0;

            for (int page = 1; page <= pageCount; page++)
            {
                outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

                // Write the progress.
                if (charUnit >= 1.0f)
                {
                    // At least one '#' per page.
                    for (int i = 0; i < (int)charUnit; i++)
                    {
                        Console.Write("#");
                        totalWritten++;
                    }
                }
                else
                {
                    // Less than one '#' per page: accumulate until a whole one is due.
                    curUnit += charUnit;
                    if (curUnit >= 1.0f)
                    {
                        for (int i = 0; i < (int)curUnit; i++)
                        {
                            Console.Write("#");
                            totalWritten++;
                        }
                        curUnit = 0;
                    }
                }
            }

            // Pad the bar so it always ends at full width.
            for (int i = 0; i < (totalLen - totalWritten); i++)
            {
                Console.Write("#");
            }

            return true;
        }
        catch (Exception ex)
        {
            // Keep the bool contract, but do not hide why the extraction failed.
            Console.Error.WriteLine("PDF text extraction failed: " + ex.Message);
            return false;
        }
        finally
        {
            if (outFile != null)
            {
                outFile.Close();
            }

            // The reader was previously never released.
            if (reader != null)
            {
                reader.Close();
            }
        }
    }
    #endregion

    #region ExtractTextFromPDFBytes
    /// <summary>
    /// Scans the bytes of an uncompressed page content stream and collects
    /// the characters found in literal strings inside BT ... ET text objects.
    /// </summary>
    /// <param name="input">uncompressed content stream bytes; may be null or empty</param>
    /// <returns>
    /// the collected text, or an empty string when the input is null/empty
    /// or an unexpected error occurs while scanning
    /// </returns>
    private string ExtractTextFromPDFBytes(byte[] input)
    {
        if (input == null || input.Length == 0)
        {
            return "";
        }

        try
        {
            // A builder avoids the quadratic cost of per-character string concatenation.
            System.Text.StringBuilder result = new System.Text.StringBuilder();

            // Flag showing if we are currently inside a text object
            bool inTextObject = false;

            // Flag showing if the next character is literal
            // e.g. '\\' to get a '\' character or '\(' to get '('
            bool nextLiteral = false;

            // 1 while inside a ( ) literal string, 0 otherwise.
            int bracketDepth = 0;

            // Sliding window of the most recently read characters.
            char[] previousCharacters = new char[_numberOfCharsToKeep];
            for (int j = 0; j < _numberOfCharsToKeep; j++)
            {
                previousCharacters[j] = ' ';
            }

            for (int i = 0; i < input.Length; i++)
            {
                char c = (char)input[i];

                if (inTextObject)
                {
                    // Outside a string: translate positioning operators into separators.
                    if (bracketDepth == 0)
                    {
                        if (CheckToken(_lineStartTokens, previousCharacters))
                        {
                            result.Append("\n\r");
                        }
                        else if (CheckToken(_nextLineTokens, previousCharacters))
                        {
                            result.Append("\n");
                        }
                        else if (CheckToken(_showTextTokens, previousCharacters))
                        {
                            result.Append(" ");
                        }
                    }

                    if (bracketDepth == 0 && CheckToken(_endTextTokens, previousCharacters))
                    {
                        // End of a text object.
                        inTextObject = false;
                        result.Append(" ");
                    }
                    else if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
                    {
                        // Start outputting text
                        bracketDepth = 1;
                    }
                    else if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
                    {
                        // Stop outputting text
                        bracketDepth = 0;
                    }
                    else if (bracketDepth == 1)
                    {
                        if (c == '\\' && !nextLiteral)
                        {
                            // Escape character: emit the following character without interpreting it.
                            nextLiteral = true;
                        }
                        else
                        {
                            // Keep printable ASCII and the 128..254 range only.
                            if (((c >= ' ') && (c <= '~')) ||
                                ((c >= 128) && (c < 255)))
                            {
                                result.Append(c);
                            }

                            nextLiteral = false;
                        }
                    }
                }

                // Slide the window one position to the left and store the new character.
                Array.Copy(previousCharacters, 1, previousCharacters, 0, _numberOfCharsToKeep - 1);
                previousCharacters[_numberOfCharsToKeep - 1] = c;

                // Start of a text object
                if (!inTextObject && CheckToken(_beginTextTokens, previousCharacters))
                {
                    inTextObject = true;
                }
            }

            return result.ToString();
        }
        catch (Exception ex)
        {
            // Best effort per page: report the problem and return no text for this page.
            Console.Error.WriteLine("Failed to scan page content: " + ex.Message);
            return "";
        }
    }
    #endregion

    #region CheckToken
    /// <summary>
    /// Checks whether one of the given operator tokens has just been read,
    /// i.e. the window ends with: delimiter, token, delimiter.
    /// </summary>
    /// <remarks>
    /// Tokens of any length are supported. The previous implementation
    /// always read two characters of the token and therefore threw for the
    /// one-character operators ' and ".
    /// </remarks>
    /// <param name="tokens">the searched tokens</param>
    /// <param name="recent">the recent character array</param>
    /// <returns>true when any token matches, otherwise false</returns>
    private bool CheckToken(string[] tokens, char[] recent)
    {
        int last = recent.Length - 1;

        // Every match requires a delimiter right after the token.
        if (!IsDelimiter(recent[last]))
        {
            return false;
        }

        foreach (string token in tokens)
        {
            // Skip tokens that cannot fit in the window together with both delimiters.
            if (String.IsNullOrEmpty(token) || token.Length + 2 > recent.Length)
            {
                continue;
            }

            int start = last - token.Length;
            if (!IsDelimiter(recent[start - 1]))
            {
                continue;
            }

            bool matches = true;
            for (int k = 0; k < token.Length; k++)
            {
                if (recent[start + k] != token[k])
                {
                    matches = false;
                    break;
                }
            }

            if (matches)
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Tells whether a character separates tokens: space, CR (0x0d) or LF (0x0a).
    /// </summary>
    private static bool IsDelimiter(char value)
    {
        return value == ' ' || value == 0x0d || value == 0x0a;
    }
    #endregion
}

}

我用这种方式:

 PDFParser pdfParser = new PDFParser();
 string txtFile = Path.GetFileNameWithoutExtension(pdfFile) + ".txt";
 // ExtractText reports failure only through its return value, so check it
 // instead of assuming the text file was written.
 if (!pdfParser.ExtractText(pdfFile, txtFile))
 {
     Console.Error.WriteLine("Text extraction failed for " + pdfFile);
 }

这样PDF的内容就会被写入txt文件。它对某些PDF有效,但对于我真正需要处理的PDF文件,txt文件始终为空。我没有收到任何错误,但由于某种原因它没有写入任何内容,尽管你可以在下面的截图中看到它识别出了该PDF,并且有2页……

enter image description here

这是我需要处理的PDF,但txt始终为空。(黑线是我为截图后加的,实际提取文本时文件中并没有这些黑线。)

enter image description here

这是另一个PDF。对这个文件程序工作正常,文本被写入了txt文件。它比前一个PDF大得多,但我仍然可以从中提取文本,而前一个却不行。

enter image description here

enter image description here

你知道可能出现什么问题吗?

1 个答案:

答案 0 :(得分:1)

写成评论太长了,所以写成答案——也许不是你想听到的答案:

在PDF中,"文本看起来是什么样子"(即字体外观)和"字形代表什么含义"(即哪个字形映射到哪个UTF-8字符)是两件独立的事情。

它们存储在PDF的不同部分——完全有可能PDF看起来一切正常,但当你尝试提取文本时却什么也得不到,因为它只包含文本字形的形状,而不包含它们的"含义"。

尝试打开PDF,选中并复制你想要的文本;如果把它粘贴到编辑器后得不到任何内容(或只有乱码),那么你的PDF缺少"此字形表示哪个UTF-8字符"的信息。

OR:

也可能是你的PDF只包含文字的图像——可以说就是一张照片。你能读懂它,但iTextSharp只看到一张"图片"——没有文字。

以上是对你问题中"为什么"的几种可能解释。至于如何解决:

关于SO的损坏PDF有几个问题:

How to repair a PDF file and embed missing fonts

Embedded fonts in PDF: copy and paste problems(参见其中的 this answer)

复制和粘贴与文本解析有关,因此可能会帮助您解决问题。

你编辑后的问题显示了你是如何解析的——为什么不直接利用iTextSharp自带的文本提取功能呢?

using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;

/// <summary>
/// Extracts the text of every page of a PDF with iTextSharp's PdfTextExtractor.
/// </summary>
/// <param name="path">Path of the PDF file to read.</param>
/// <returns>The text of all pages, concatenated in page order.</returns>
public static string ExtractTextFromPdf(string path)
{
  using (PdfReader reader = new PdfReader(path))
  {
    // Fully qualified because the snippet imports only the iTextSharp namespaces.
    System.Text.StringBuilder text = new System.Text.StringBuilder();

    // Pages are addressed from 1 through NumberOfPages.
    for (int i = 1; i <= reader.NumberOfPages; i++)
    {
        text.Append(PdfTextExtractor.GetTextFromPage(reader, i));
    }

    return text.ToString();
  }
}
     

来自:http://www.squarepdf.net/parsing-pdf-files-using-itextsharp

或者像这里:parse-pdf-with-itextsharp-and-then-extract-specific-text-to-the-screen