Question

我想做的是提取PDF文件中与某些文本相关联的图像。例如，PDF中会有一张房屋正面的照片，照片上方有一行标题“Front View”（前视图）。我希望程序在PDF中搜索文本“Front View”，并提取紧随其后的照片。

我看过iTextSharp，PDFsharp和其他实用程序，但是它们全部将PDF中的文本和图像分开对待。似乎没有任何方法可以确定这行文字位于该图片之前。

我们使用iTextSharp来处理PDF。我已经用C#编写了一个方法，它可以根据给定的页码、图像在页面上的序号以及图像类型来提取图像。例如，我可以提取第3页上的第二个jpeg。代码如下。我想要的是能够在文件中搜索一行文本，然后提取该文本行之后的图像。

/// <summary>
/// iTextSharp render listener that writes the images of a single PDF page to disk.
/// Text events are ignored; every image event is filtered by file type and, optionally,
/// by its 1-based position among the matching images on the page.
/// </summary>
public class ImageExtractor : IRenderListener
{
    // Page currently being processed; used only to build output file names.
    int _currentPage = 1;
    // Number of images handled so far (incremented even when an existing file is kept).
    int _imageCount = 0;
    // 1-based position of the wanted image among matching images; 0 means "extract all".
    int _index = 0;
    // Number of matching images seen so far while looking for image number _index.
    int _count = 0;
    readonly string _outputFilePrefix;
    readonly string _outputFolder;
    readonly bool _overwriteExistingFiles;
    // Accepted image file types (e.g. "jpg"); null accepts every type.
    string[] _fileTypes;

    /// <summary>Creates a listener configured with the output location and image filters.</summary>
    /// <param name="outputFilePrefix">Prefix of every generated file name.</param>
    /// <param name="outputFolder">Folder the image files are written to.</param>
    /// <param name="overwriteExistingFiles">Whether an already existing file is replaced.</param>
    /// <param name="fileTypes">Accepted image file types, or null to accept all.</param>
    /// <param name="index">1-based position of the image to extract, or 0 to extract all.</param>
    public ImageExtractor(string outputFilePrefix, string outputFolder, bool overwriteExistingFiles, string[] fileTypes, int index)
    {
        _outputFilePrefix = outputFilePrefix;
        _outputFolder = outputFolder;
        _overwriteExistingFiles = overwriteExistingFiles;
        _fileTypes = fileTypes;
        _index = index;
    }

    /// <summary>
    /// Extracts the image at position <paramref name="index"/> (counting only images whose type
    /// is listed in <paramref name="fileTypes"/>) from page <paramref name="pageNumber"/>.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file to read.</param>
    /// <param name="outputFilePrefix">File name prefix; null falls back to the PDF's file name without extension.</param>
    /// <param name="outputFolder">Target folder; null or empty falls back to the PDF's folder.</param>
    /// <param name="overwriteExistingFiles">Whether an already existing file is replaced.</param>
    /// <param name="pageNumber">Page to process.</param>
    /// <param name="index">1-based position of the image to extract, or 0 to extract all matching images.</param>
    /// <param name="fileTypes">Accepted image file types, or null to accept all.</param>
    /// <returns>The number of images handled; 0 when the document has no pages.</returns>
    /// <exception cref="ApplicationException">The PDF is encrypted.</exception>
    public static int ExtractImageByIndex(string pdfPath, string outputFilePrefix, string outputFolder, bool overwriteExistingFiles, int pageNumber, int index, string[] fileTypes = null)
    {
        // Handle setting of any default values
        outputFilePrefix = outputFilePrefix ?? System.IO.Path.GetFileNameWithoutExtension(pdfPath);
        outputFolder = String.IsNullOrEmpty(outputFolder) ? System.IO.Path.GetDirectoryName(pdfPath) : outputFolder;

        var instance = new ImageExtractor(outputFilePrefix, outputFolder, overwriteExistingFiles, fileTypes, index);
        instance._currentPage = pageNumber;

        using (var pdfReader = new PdfReader(pdfPath))
        {
            if (pdfReader.NumberOfPages == 0)
                return 0;

            if (pdfReader.IsEncrypted())
                throw new ApplicationException(pdfPath + " is encrypted.");

            var pdfParser = new PdfReaderContentParser(pdfReader);

            pdfParser.ProcessContent(instance._currentPage, instance);
        }

        return instance._imageCount;
    }

    public void BeginTextBlock() { }
    public void EndTextBlock() { }
    public void RenderText(TextRenderInfo renderInfo) { }

    /// <summary>
    /// Called for every image on the page; writes the image to the output folder when it
    /// passes the file type filter and (if an index was requested) is the requested one.
    /// </summary>
    public void RenderImage(ImageRenderInfo renderInfo)
    {
        // If _index is greater than 0, we're looking for a specific image. If _count is
        // equal to _index, we've already found it, so don't go any farther.
        if (_index > 0 && _count == _index)
            return;

        var imageObject = renderInfo.GetImage();

        // Defensive: without an image object there is nothing to extract, so skip the event
        // instead of failing with a NullReferenceException.
        if (imageObject == null)
            return;

        var fileType = imageObject.GetFileType();
        if (!IsAcceptedFileType(fileType))
            return;

        // If _index is 0, multiple images may be extracted. If _index is greater than 0,
        // _count is incremented for every image that matches the file type filter and the
        // image is only extracted when _count equals _index.
        if (_index > 0)
        {
            _count++;
            if (_count != _index)
                return;
        }

        var imageFileName = String.Format("{0}_{1}_{2}.{3}", _outputFilePrefix, _currentPage, _imageCount, fileType);
        var imagePath = System.IO.Path.Combine(_outputFolder, imageFileName);

        if (_overwriteExistingFiles || !File.Exists(imagePath))
            File.WriteAllBytes(imagePath, imageObject.GetImageAsBytes());

        // Subtle: Always increment even if file is not written. This ensures consistency should only some
        //   of a PDF file's images actually exist.
        _imageCount++;
    }

    // True when no filter is configured or fileType equals one of the configured types.
    // The comparison is ordinal and case-insensitive so the result does not depend on the
    // current culture (ToLower() maps "I" to a dotless "ı" under Turkish cultures).
    bool IsAcceptedFileType(string fileType)
    {
        if (_fileTypes == null)
            return true;

        return Array.Exists(_fileTypes, t => String.Equals(t, fileType, StringComparison.OrdinalIgnoreCase));
    }
}

Answer 1

正如评论中已经提到的，这与问题Extraction of images present inside a paragraph的主题非常相似，主要区别在于在该问题的上下文中，使用了Java的iText而不是.Net的iTextSharp。

该问题中Java版SimpleMixedExtractionStrategy移植到C#后大致如下：

/// <summary>
/// Location-based text extraction strategy that also saves every image it encounters and
/// inserts a "[file name]" marker chunk at the image's position, so the resulting text
/// shows where each image sits relative to the surrounding text.
/// </summary>
public class SimpleMixedExtractionStrategy : LocationTextExtractionStrategy
{
    // The chunk list of the base class is private, so it is reached through reflection.
    // The lookup does not depend on the instance and is therefore done once per type.
    static readonly FieldInfo LocationalResultField = typeof(LocationTextExtractionStrategy).GetField("locationalResult", BindingFlags.Instance | BindingFlags.NonPublic);
    // Segment from (0,0) to (1,0); transformed by the image CTM below to obtain the
    // start and end points used as the marker chunk's location.
    static readonly LineSegment UnitLine = new LineSegment(new Vector(0, 0, 1), new Vector(1, 0, 1));
    readonly String outputPath;
    readonly String name;
    // Running number of the images written by this instance; part of the file name.
    int counter = 0;

    /// <summary>Creates a strategy that writes images into <paramref name="outputPath"/>.</summary>
    /// <param name="outputPath">Folder the image files are written to (with or without a trailing separator).</param>
    /// <param name="name">Base name of the generated image files.</param>
    public SimpleMixedExtractionStrategy(String outputPath, String name)
    {
        this.outputPath = outputPath;
        this.name = name;
    }

    /// <summary>
    /// Saves the image as "name-number.type" and adds a "[file name]" chunk at its position.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The private field "locationalResult" is not present in the base class.
    /// </exception>
    public override void RenderImage(ImageRenderInfo renderInfo)
    {
        PdfImageObject image = renderInfo.GetImage();
        if (image == null) return;

        // Fail with a clear message rather than a NullReferenceException if the reflected
        // field cannot be found (e.g. it was renamed in the referenced library version).
        if (LocationalResultField == null)
            throw new InvalidOperationException("LocationTextExtractionStrategy has no private field 'locationalResult'.");

        int number = counter++;
        String filename = name + "-" + number + "." + image.GetFileType();
        // Path.Combine inserts the directory separator only when it is missing, so both
        // "extract" and "extract\" work; a null folder keeps meaning "current directory".
        // System.IO.Path is fully qualified to avoid clashing with other types named Path.
        File.WriteAllBytes(System.IO.Path.Combine(outputPath ?? String.Empty, filename), image.GetImageAsBytes());

        LineSegment segment = UnitLine.TransformBy(renderInfo.GetImageCTM());
        TextChunk location = new TextChunk("[" + filename + "]", segment.GetStartPoint(), segment.GetEndPoint(), 0f);

        List<TextChunk> locationalResult = (List<TextChunk>)LocationalResultField.GetValue(this);
        locationalResult.Add(location);
    }
}

与Java实现一样，必须使用反射来访问LocationTextExtractionStrategy中的私有成员private List<TextChunk> locationalResult。如果您的项目中不允许使用反射，则可以将LocationTextExtractionStrategy的全部源代码复制到自己的类中，并在副本上进行修改。

您可以这样使用它：

// Demo: run SimpleMixedExtractionStrategy over every page of SOURCE.pdf, saving the
// images into the "extract" folder and printing each page's text to the console.
var pdfFile = @"SOURCE.pdf";
var imageFolder = @"extract\";
var namePrefix = "SOURCE-";
Directory.CreateDirectory(imageFolder);

using (var reader = new PdfReader(pdfFile))
{
    var contentParser = new PdfReaderContentParser(reader);
    var page = 1;
    while (page <= reader.NumberOfPages)
    {
        // A fresh strategy per page, so the text and image numbering restart on every page.
        var strategy = new SimpleMixedExtractionStrategy(imageFolder, namePrefix + page);
        contentParser.ProcessContent(page, strategy);

        var pageText = strategy.GetResultantText();
        Console.Write("Text of page {0}:\n---\n{1}\n\n", page, pageText);

        page++;
    }
}

对于所引用问题中的示例文件，

输出为：

Text of page 1:
---
Getting Started with Vaadin
• A version of Book of Vaadin that you can browse in the Eclipse Help system.
You can install the plugin as follows:
1. Start Eclipse.
2. Select Help   Software Updates....
3. Select the Available Software tab.
4. Add the Vaadin plugin update site by clicking Add Site....
[book-of-vaadin-page14-1-0.png]
Enter the URL of the Vaadin Update Site: http://vaadin.com/eclipse and click OK. The
Vaadin site should now appear in the Software Updates window.
5. Select all the Vaadin plugins in the tree.
[book-of-vaadin-page14-1-1.png]
Finally, click Install.
Detailed and up-to-date installation instructions for the Eclipse plugin can be found at http://vaad-
in.com/eclipse.
Updating the Vaadin Plugin
If you have automatic updates enabled in Eclipse (see Window   Preferences   Install/Update
  Automatic Updates), the Vaadin plugin will be updated automatically along with other plugins.
Otherwise, you can update the Vaadin plugin (there are actually multiple plugins) manually as
follows:
1. Select Help   Software Updates..., the Software Updates and Add-ons window will
open.
2. Select the Installed Software tab.
14 Vaadin Plugin for Eclipse

因此，要完成您的任务：

我想要的是能够在文件中搜索一行文本，然后提取该文本行之后的图像。

只需在上方的输出字符串中搜索该文本行，然后查找包含方括号中图像文件名的下一行。

（如果您的PDF正文中也使用方括号，则可以在SimpleMixedExtractionStrategy中改用其他定界符将文件名括起来，例如Unicode专用区中的某些字符。）

Answer 2

这是我找到的解决方案。原始代码包含很多与问题不直接相关的内容，因此我在帖子中对其进行了简化。

/// <summary>
/// Simplified render listener that extracts the first image rendered after a piece of
/// text containing a given caption.
/// </summary>
public class ImageExtractor : IRenderListener
{
    // Caption text to look for in the page's text.
    private string _caption;
    // True once the caption has been seen and no image has been extracted since.
    private bool _captionFound;
    private string _outputFolder;

    // ....
    // .... (the remaining members of the full class - other fields, the constructor,
    //       helpers - are omitted from this simplified listing)

    public void BeginTextBlock() { }

    public void EndTextBlock() { }

    /// <summary>Remembers that the caption was seen when the rendered text contains it.</summary>
    public void RenderText(TextRenderInfo renderInfo)
    {
        var text = renderInfo.GetText();

        // If this piece of text contains the caption, set _captionFound to true.
        // A missing text or caption simply never matches instead of throwing.
        if (text != null && _caption != null && text.Contains(_caption))
            _captionFound = true;
    }

    /// <summary>Extracts the image only when the caption was found before it.</summary>
    public void RenderImage(ImageRenderInfo renderInfo)
    {
        // Skip the image if _captionFound is false
        if (!_captionFound)
            return;

        // _captionFound is true, so extract the image

        // Code to extract image

        // Set _captionFound back to false, so that only the first image found is
        // extracted.
        _captionFound = false;

    }

    /// <summary>
    /// Processes every page of the PDF and extracts the first image following each
    /// occurrence of <paramref name="caption"/>.
    /// </summary>
    /// <param name="caption">Text that must precede the image.</param>
    /// <param name="pdfPath">Path of the PDF file to read.</param>
    /// <param name="outputFilePrefix">Prefix of every generated file name.</param>
    /// <param name="outputFolder">Folder the image files are written to.</param>
    /// <param name="overwriteExistingFiles">Whether an already existing file is replaced.</param>
    /// <param name="fileTypes">Accepted image file types, or null to accept all.</param>
    /// <returns>The value of the instance's image counter after all pages were processed.</returns>
    /// <exception cref="ApplicationException">The PDF is encrypted.</exception>
    public static int ExtractImageByCaption(string caption, string pdfPath, string outputFilePrefix, string outputFolder, bool overwriteExistingFiles, string[] fileTypes = null)
    {
        var instance = new ImageExtractor(outputFilePrefix, outputFolder, overwriteExistingFiles, fileTypes, 0);

        instance._caption = caption;
        instance._outputFolder = outputFolder;

        using (var pdfReader = new PdfReader(pdfPath))
        {
            if (pdfReader.IsEncrypted())
                throw new ApplicationException(pdfPath + " is encrypted.");

            var pdfParser = new PdfReaderContentParser(pdfReader);

            while (instance._currentPage <= pdfReader.NumberOfPages)
            {
                pdfParser.ProcessContent(instance._currentPage, instance);

                instance._currentPage++;
            }
        }

        // NOTE(review): assumes the omitted extraction code increments _imageCount for every
        // image it handles, as the full class's ExtractImageByIndex relies on - confirm.
        return instance._imageCount;
    }
}

在PDF中获取图像之前的文本

2 个答案: