使用iTextSharp以正确的顺序提取PDF图像

时间:2016-09-12 13:38:06

标签: c# pdf itext

我正在尝试从PDF文件中提取图像,但我需要按正确的顺序获取它们,这样才能取到正确的那一张图像。

    /// <summary>
    /// Entry point: opens the PDF, extracts the raw image streams via
    /// <c>ExtraiImagens</c> and previews on the console every stream that
    /// <see cref="System.Drawing.ImageConverter"/> is able to decode, in extraction order.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        string filename = "D:\\910723575_marca_coletiva.pdf";

        PdfReader pdfReader = new PdfReader(filename);
        IList<byte[]> imagemList;
        try
        {
            imagemList = ExtraiImagens(pdfReader);
        }
        finally
        {
            // The extracted byte arrays are all that is needed from here on,
            // so the reader is closed even if extraction throws.
            pdfReader.Close();
        }

        // One converter is enough for every image; it is used to turn a byte[] into an Image.
        System.Drawing.ImageConverter converter = new System.Drawing.ImageConverter();

        foreach (var imagem in imagemList)
        {
            try
            {
                // Dispose each decoded image as soon as it has been drawn.
                using (Image img = (Image)converter.ConvertFrom(imagem))
                {
                    ConsoleWriteImage(img);
                }
            }
            catch (Exception ex)
            {
                // Best effort: a stream that cannot be decoded or drawn is skipped,
                // but the reason is reported instead of being dropped silently.
                Console.Error.WriteLine("Skipping image stream that could not be displayed: " + ex.Message);
            }
        }

        System.Console.ReadLine();
    }

    /// <summary>
    /// Prints a coarse preview of <paramref name="img"/> on the console. Each output row
    /// holds two renderings side by side: a low-resolution one made of full blocks, and a
    /// double-resolution one that uses the upper-half block with separate foreground and
    /// background colours.
    /// </summary>
    /// <param name="img">The image to preview.</param>
    /// <exception cref="ArgumentNullException"><paramref name="img"/> is null.</exception>
    public static void ConsoleWriteImage(Image img)
    {
        if (img == null) throw new ArgumentNullException("img");

        int sMax = 39;
        decimal percent = Math.Min(decimal.Divide(sMax, img.Width), decimal.Divide(sMax, img.Height));

        // A very elongated image would otherwise scale to 0 pixels on its short side,
        // which makes the Bitmap constructor throw; keep at least one pixel per side.
        Size resSize = new Size(
            Math.Max(1, (int)(img.Width * percent)),
            Math.Max(1, (int)(img.Height * percent)));

        // Builds a 4-bit value matching the ConsoleColor numbering:
        // 8 = bright, 4 = red, 2 = green, 1 = blue.
        Func<System.Drawing.Color, int> ToConsoleColor = c =>
        {
            int index = (c.R > 128 || c.G > 128 || c.B > 128) ? 8 : 0;
            index |= (c.R > 64) ? 4 : 0;
            index |= (c.G > 64) ? 2 : 0;
            index |= (c.B > 64) ? 1 : 0;
            return index;
        };

        // Remember the caller's colours so they can be put back afterwards.
        ConsoleColor previousForeground = Console.ForegroundColor;
        ConsoleColor previousBackground = Console.BackgroundColor;

        try
        {
            using (Bitmap bmpMin = new Bitmap(img, resSize.Width, resSize.Height))
            using (Bitmap bmpMax = new Bitmap(img, resSize.Width * 2, resSize.Height * 2))
            {
                for (int i = 0; i < resSize.Height; i++)
                {
                    // Left rendering: one source pixel per pair of full blocks.
                    for (int j = 0; j < resSize.Width; j++)
                    {
                        Console.ForegroundColor = (ConsoleColor)ToConsoleColor(bmpMin.GetPixel(j, i));
                        Console.Write("██");
                    }

                    // Black gap between the two renderings.
                    Console.BackgroundColor = ConsoleColor.Black;
                    Console.Write("    ");

                    // Right rendering: each character cell shows two vertically stacked
                    // pixels (foreground = upper pixel, background = lower pixel).
                    for (int j = 0; j < resSize.Width; j++)
                    {
                        Console.ForegroundColor = (ConsoleColor)ToConsoleColor(bmpMax.GetPixel(j * 2, i * 2));
                        Console.BackgroundColor = (ConsoleColor)ToConsoleColor(bmpMax.GetPixel(j * 2, i * 2 + 1));
                        Console.Write("▀");

                        Console.ForegroundColor = (ConsoleColor)ToConsoleColor(bmpMax.GetPixel(j * 2 + 1, i * 2));
                        Console.BackgroundColor = (ConsoleColor)ToConsoleColor(bmpMax.GetPixel(j * 2 + 1, i * 2 + 1));
                        Console.Write("▀");
                    }

                    // Do not emit the newline with the last pixel's background colour.
                    Console.BackgroundColor = previousBackground;
                    System.Console.WriteLine();
                }
            }
        }
        finally
        {
            Console.ForegroundColor = previousForeground;
            Console.BackgroundColor = previousBackground;
        }
    }

    /// <summary>
    /// Collects the raw (still encoded) stream bytes of every image XObject referenced from
    /// the resource dictionary of each page, walking the pages from first to last.
    /// </summary>
    /// <remarks>
    /// Within one page the images come back in the order the XObject dictionary enumerates
    /// its keys. NOTE(review): that is presumably not the order in which the page content
    /// draws them — confirm before relying on it.
    /// </remarks>
    /// <param name="pdfReader">An open reader for the document to scan.</param>
    /// <returns>One byte array per image XObject found; empty when there are none.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pdfReader"/> is null.</exception>
    public static IList<byte[]> ExtraiImagens(PdfReader pdfReader) 
    {
        if (pdfReader == null) throw new ArgumentNullException("pdfReader");

        IList<byte[]> imagensList = new List<byte[]>();

        // Visit every page of the document rather than a fixed number of pages.
        for (int numPag = 1; numPag <= pdfReader.NumberOfPages; numPag++)
        {
            var pg = pdfReader.GetPageN(numPag);
            if (pg == null) continue;

            // A page without /Resources or without /XObject has no images to offer.
            var res = PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES)) as PdfDictionary;
            if (res == null) continue;

            var xobj = PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)) as PdfDictionary;
            if (xobj == null) continue;

            foreach (PdfName nome in xobj.Keys)
            {
                // Only entries stored as indirect references are considered.
                var referencia = xobj.Get(nome) as PRIndirectReference;
                if (referencia == null) continue;

                // Only stream objects carry bytes that can be extracted.
                var pdfStream = PdfReader.GetPdfObject(referencia) as PRStream;
                if (pdfStream == null) continue;

                var type = PdfReader.GetPdfObject(pdfStream.Get(PdfName.SUBTYPE)) as PdfName;
                if (!PdfName.IMAGE.Equals(type)) continue;

                imagensList.Add(PdfReader.GetStreamBytesRaw(pdfStream));
            }
        }

        return imagensList;
    }
}

方法ConsoleWriteImage只是把图像打印到控制台上;我用它来观察iTextSharp按照我的代码检索图像时的顺序。

有人能帮忙吗?

1 个答案:

答案 0 :(得分:1)

遗憾的是,OP没有说明所谓正确的顺序究竟指什么。这并非不言自明,因为PDF的某些特征对程序来说并不明显,只有人类读者在查看渲染后的PDF时才能看出来。

但至少,OP大概希望逐页获取图像。他目前的代码显然做不到这一点:它在PDF的整个对象库中扫描图像对象,因此虽然能得到图像对象,但顺序可能完全是随机的;尤其是,他甚至可能得到PDF中包含、却没有在任何页面上使用的图像……

要按页面顺序检索图像(并且只检索实际用到的图像),应该使用解析器框架,例如:

// Collects every image the parser reports, in the order it is reported.
MyImageRenderListener listener = new MyImageRenderListener();
PdfReader reader = new PdfReader(pdf);
try {
  PdfReaderContentParser parser = new PdfReaderContentParser(reader);
  // Page numbers start at 1; processing the pages in ascending order
  // keeps the collected images in page order.
  for (int i = 1; i <= reader.NumberOfPages; i++) {
    parser.ProcessContent(i, listener);
  }
}
finally {
  // Close the reader even when parsing a page fails.
  reader.Close();
}
// Process images in the List listener.MyImages
// with names in listener.ImageNames

(摘自iTextSharp示例ExtractImages.cs)

其中,MyImageRenderListener的定义如下,用于收集图像:

/**
 * Render listener that collects the images reported to RenderImage together
 * with a generated file name for each, in the order they are reported.
 * MyImages and ImageNames are parallel lists: entry N of one belongs to
 * entry N of the other.
 */
public class MyImageRenderListener : IRenderListener {
    /** the byte array of the extracted images */
    private readonly List<byte[]> _myImages;
    public List<byte[]> MyImages {
      get { return _myImages; }
    }
    /** the file names of the extracted images */
    private readonly List<string> _imageNames;
    public List<string> ImageNames { 
      get { return _imageNames; }
    } 

    public MyImageRenderListener() {
      _myImages = new List<byte[]>();
      _imageNames = new List<string>();
    }

    [...]

    /**
     * Stores the bytes and a name of the form "Image<object number>.<file type>"
     * for the reported image. JBIG2 images, images without an object reference
     * and images whose bytes cannot be obtained are skipped.
     */
    public void RenderImage(ImageRenderInfo renderInfo) {
      try {
        PdfImageObject image = renderInfo.GetImage();
        if (image == null || image.GetImageBytesType() == PdfImageObject.ImageBytesType.JBIG2) 
          return;

        // NOTE(review): GetRef() presumably returns null for inline images - confirm.
        // Such images are skipped here, as before, because the name needs the object number.
        var reference = renderInfo.GetRef();
        if (reference == null)
          return;

        // Obtain both values before touching either list: if fetching the bytes
        // fails, nothing is added and the two lists stay aligned.
        string name = string.Format("Image{0}.{1}", reference.Number, image.GetFileType());
        byte[] bytes = image.GetImageAsBytes();

        _imageNames.Add(name);
        _myImages.Add(bytes);
      }
      catch (System.Exception ex)
      {
        // Best effort: one unreadable image must not abort the whole page,
        // but leave a trace of what was skipped and why.
        System.Diagnostics.Trace.TraceWarning("Skipped an image that could not be extracted: {0}", ex.Message);
      }
    }

    [...]      
}

(摘自iTextSharp示例MyImageRenderListener.cs)

此外,ImageRenderInfo renderInfo还包含图像在相应页面上的位置和方向信息,这可能有助于推断出OP想要的正确顺序。