Question

我正在评估 Winnovative 的 PdfToText 库，遇到了一个让我担心的问题。

一切运行正常：在控制台应用程序中运行时，我可以瞬间从 20k 或更小的 pdf 中提取出文本内容。但是，如果从 NUnit GUI 调用相同的代码，则需要 15-25 秒（我在提取文本的那一行设置了断点，然后按 F10 并观察前进到下一行所需的时间，由此确认耗时的正是 PdfToText）。

这让我很担心，因为我不知道原因，也就不知道问题该归咎于谁。是 NUnit 的问题，还是 PdfToText 的问题？我只是想从 pdf 中提取文本，但如果在某些条件下会出现这种行为，20 秒是完全不合理的。如果只是在 NUnit 下运行时才这样，那还可以接受；否则我就得另寻其他方案了。

使用完整的VS解决方案（2010）更容易演示问题，因此这里的链接使其更易于设置和运行（无需下载NUnit或PdfToText甚至样本pdf）： http://dl.dropbox.com/u/273037/PdfToTextProblem.zip（如果您在32位计算机上运行，则可能必须更改对PdfToText的引用以使用x86 dll。）

按 F5，NUnit GUI 运行器（runner）就会加载。

我并不是非用这个库不可，如果你有其他建议也欢迎提出。我已经试过 iTextSharp（只为了两行代码来说太贵了），也看过 Aspose（我没有试用，但 SaaS 许可证要 11K 美元）。但它们要么缺少必要的功能，要么太昂贵。

Answer 1

（评论转为答案）

您的 PDF 有多复杂？iText 的 4.1.6 版本允许用于闭源解决方案。尽管 4.1.6 没有直接提供文本提取器，但使用 PdfReader 和 GetPageContent() 自己编写一个文本提取器并不是非常困难。

Answer 2

以下是我使用iTextSharp v4.1.6从PDF中提取文本的代码。如果它看起来过于冗长，那就与我如何使用它以及所需的灵活性有关。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using iTextSharp.text.pdf;

namespace ClassLibrary1
{
    /// <summary>
    /// Read-only snapshot of a single token taken from a <c>PRTokeniser</c>
    /// at its current position. Instances are created only through
    /// <see cref="Create"/>.
    /// </summary>
    public class PdfToken
    {
        /// <summary>Token type code, copied from the tokeniser's <c>TokenType</c>.</summary>
        public int Type { get; private set; }

        /// <summary>Token text, copied from the tokeniser's <c>StringValue</c>.</summary>
        public string Value { get; private set; }

        /// <summary>
        /// True when <see cref="Type"/> equals <c>PRTokeniser.TK_OTHER</c>.
        /// Note: despite the name, the parser in this file treats such a token
        /// as the operation keyword and the tokens preceding it as its arguments.
        /// </summary>
        public bool IsOperand
        {
            get { return PRTokeniser.TK_OTHER == Type; }
        }

        /// <summary>
        /// Captures the tokeniser's current token.
        /// </summary>
        /// <param name="tokenizer">Tokeniser positioned on the token to capture.</param>
        /// <returns>A new token holding the current type and string value.</returns>
        public static PdfToken Create(PRTokeniser tokenizer)
        {
            int currentType = tokenizer.TokenType;
            string currentValue = tokenizer.StringValue;
            return new PdfToken(currentType, currentValue);
        }

        // Private so that tokens can only originate from a tokeniser (see Create).
        private PdfToken(int tokenType, string tokenValue)
        {
            Type = tokenType;
            Value = tokenValue;
        }
    }

    /// <summary>
    /// One content-stream operation: the operation name plus the tokens that
    /// preceded it as arguments.
    /// </summary>
    public class PdfOperation
    {
        /// <summary>
        /// Builds an operation from its keyword token and argument tokens.
        /// </summary>
        /// <param name="operationToken">Token whose <c>Value</c> becomes <see cref="Name"/>.</param>
        /// <param name="arguments">
        /// Argument tokens in stream order. A null sequence is treated as empty.
        /// </param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="operationToken"/> is null.
        /// </exception>
        public PdfOperation(PdfToken operationToken, IEnumerable<PdfToken> arguments)
        {
            if (operationToken == null)
            {
                throw new ArgumentNullException("operationToken");
            }

            Name = operationToken.Value;

            // Snapshot the arguments: the parser in this file reuses and clears
            // the list it passes in, which would otherwise empty this operation's
            // arguments for anyone who keeps the operation around.
            Arguments = arguments == null ? new PdfToken[0] : arguments.ToArray();
        }

        /// <summary>Operation keyword, e.g. "TJ" or "Tm".</summary>
        public string Name { get; private set; }

        /// <summary>Private copy of the argument tokens, in stream order. Never null.</summary>
        public IEnumerable<PdfToken> Arguments { get; private set; }
    }

    /// <summary>
    /// Receiver for the operations found while walking a PDF content stream.
    /// The parser in this file calls <see cref="Execute"/> once per operation,
    /// in the order the operations appear.
    /// </summary>
    public interface IPdfParsingStrategy
    {
        /// <summary>Handles a single parsed operation.</summary>
        /// <param name="op">The operation name together with its argument tokens.</param>
        void Execute(PdfOperation op);
    }

    /// <summary>
    /// Parsing strategy that accumulates the text shown by a content stream
    /// into a plain string. Line breaks and spaces are derived from text-matrix
    /// changes and next-line operators; the stroke operator "S" is rendered as
    /// a dashed separator line.
    /// </summary>
    public class PlainTextParsingStrategy : IPdfParsingStrategy
    {
        StringBuilder text = new StringBuilder();

        public PlainTextParsingStrategy()
        {

        }

        /// <summary>Returns all text accumulated so far.</summary>
        public String GetText()
        {
            return text.ToString();
        }

        #region IPdfParsingStrategy Members

        /// <summary>
        /// Dispatches one operation by name. Operations not listed here are ignored.
        /// </summary>
        /// <param name="op">Operation to handle.</param>
        /// <exception cref="ArgumentNullException"><paramref name="op"/> is null.</exception>
        public void Execute(PdfOperation op)
        {
            if (op == null)
            {
                throw new ArgumentNullException("op");
            }

            // see Adobe PDF specs for additional operations
            switch (op.Name)
            {
                case "TJ":
                case "Tj": // plain "show text"; previously skipped, losing text
                    PrintText(op);
                    break;
                case "'": // move to next line, then show text
                    PrintNewline(op);
                    PrintText(op);
                    break;
                case "T*": // move to start of next line
                    PrintNewline(op);
                    break;
                case "Tm":
                    SetMatrix(op);
                    break;
                case "Tf":
                    SetFont(op);
                    break;
                case "S":
                    PrintSection(op);
                    break;
                case "G":
                case "g":
                case "rg":
                    SetColor(op);
                    break;
            }
        }

        #endregion

        // Set by PrintSection and cleared by the next text-showing operation.
        bool newSection = false;

        // Emits a dashed separator line and flags that a new section started.
        private void PrintSection(PdfOperation op)
        {
            text.AppendLine("------------------------------------------------------------");
            newSection = true;
        }

        // Emits a line break.
        private void PrintNewline(PdfOperation op)
        {
            text.AppendLine();
        }

        // Appends the operation's text to the output and clears the
        // new-section flag.
        private void PrintText(PdfOperation op)
        {
            newSection = false;
            PrintText(op, text);
        }

        // Appends string arguments verbatim; every numeric argument (the
        // positioning adjustments inside a TJ array) becomes a single space.
        private static void PrintText(PdfOperation op, StringBuilder text)
        {
            foreach (PdfToken t in op.Arguments)
            {
                switch (t.Type)
                {
                    case PRTokeniser.TK_STRING:
                        text.Append(t.Value);
                        break;
                    case PRTokeniser.TK_NUMBER:
                        text.Append(" ");
                        break;
                }
            }
        }

        String lastFont = String.Empty;
        String lastFontSize = String.Empty;

        // Records the font name and size given by a "Tf" operation.
        private void SetFont(PdfOperation op)
        {
            var args = op.Arguments.ToList();
            if (args.Count < 2)
            {
                // Malformed "Tf": skip it rather than throw on the indexer.
                return;
            }

            lastFont = args[0].Value;
            lastFontSize = args[1].Value;
        }

        String lastX = String.Empty;
        String lastY = String.Empty;

        // Handles "Tm": a changed sixth argument (y) starts a new output line;
        // otherwise a changed fifth argument (x) inserts a space.
        private void SetMatrix(PdfOperation op)
        {
            var args = op.Arguments.ToList();
            if (args.Count < 6)
            {
                // Malformed "Tm": skip it rather than throw on the indexer.
                return;
            }

            string x = args[4].Value;
            string y = args[5].Value;

            if (lastY != y)
            {
                text.AppendLine();
            }
            else if (lastX != x)
            {
                text.Append(" ");
            }

            lastX = x;
            lastY = y;
        }

        String lastColor = String.Empty;

        // Records the colour operation as "arg_arg_..._name".
        private void SetColor(PdfOperation op)
        {
            lastColor = PrintCommand(op).Replace(" ", "_");
        }

        // Formats an operation as its argument values, each followed by a
        // space, then the operation name.
        private static string PrintCommand(PdfOperation op)
        {
            StringBuilder command = new StringBuilder();
            foreach (PdfToken t in op.Arguments)
            {
                command.AppendFormat("{0} ", t.Value);
            }
            command.Append(op.Name);
            return command.ToString();
        }

    }
}

以下是我的调用方式：

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using iTextSharp.text.pdf;

namespace ClassLibrary1
{
    /// <summary>
    /// Entry point for extracting plain text from an in-memory PDF.
    /// </summary>
    public class PdfExtractor
    {
        /// <summary>
        /// Extracts the text of every page of the given PDF.
        /// </summary>
        /// <param name="pdfBuffer">Complete PDF file contents.</param>
        /// <returns>The text produced by <c>PlainTextParsingStrategy</c>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="pdfBuffer"/> is null.</exception>
        public static string GetText(byte[] pdfBuffer)
        {
            if (pdfBuffer == null)
            {
                throw new ArgumentNullException("pdfBuffer");
            }

            PlainTextParsingStrategy strategy = new PlainTextParsingStrategy();
            ParsePdf(pdfBuffer, strategy);
            return strategy.GetText();
        }

        // Tokenises each page's content stream and feeds the strategy one
        // operation at a time: tokens are collected until a token flagged
        // IsOperand arrives, which becomes the operation with the collected
        // tokens as its arguments.
        private static void ParsePdf(byte[] pdf, IPdfParsingStrategy strategy)
        {
            PdfReader reader = new PdfReader(pdf);
            try
            {
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    byte[] page = reader.GetPageContent(i);
                    if (page == null)
                    {
                        continue;
                    }

                    PRTokeniser tokenizer = new PRTokeniser(page);
                    List<PdfToken> parameters = new List<PdfToken>();

                    while (tokenizer.NextToken())
                    {
                        var token = PdfToken.Create(tokenizer);
                        if (token.IsOperand)
                        {
                            // Pass a copy: the working list is cleared right
                            // after, and the operation must not share it.
                            strategy.Execute(new PdfOperation(token, parameters.ToArray()));
                            parameters.Clear();
                        }
                        else
                        {
                            parameters.Add(token);
                        }
                    }
                }
            }
            finally
            {
                // Release the reader even when parsing or the strategy throws.
                reader.Close();
            }
        }
    }
}

运行NUnit时，第三方Pdf库明显变慢

2 个答案: