从PDF中的特定页面提取图像

时间:2012-05-21 16:58:01

标签: c# pdf itextsharp

我想从PDF文件中提取图像。我尝试使用以下代码,它可以从PDF中完美地提取出一个jpeg图像。问题是如何只从特定页面(例如第1页或其他某一页)提取图像。我不想为了查找图像而遍历整个PDF。

有什么建议吗?

提取图片的代码:

/// <summary>
/// Collects the images stored in a PDF by walking every object in its cross-reference table.
/// </summary>
/// <remarks>
/// The whole document is scanned; this method does not restrict itself to one page.
/// The raw (undecoded) stream bytes are handed to GDI+, so only image streams that GDI+
/// can read as-is (in practice JPEG) are returned; the others are skipped.
/// Each successfully decoded image is also assigned to pictureBox1, so the control ends
/// up showing the last one.
/// </remarks>
/// <param name="PDFSourcePath">Path of the PDF file to read.</param>
/// <returns>The decoded images in cross-reference order; empty when none could be decoded.</returns>
private List<System.Drawing.Image> ExtractImages(String PDFSourcePath)
{
    List<System.Drawing.Image> ImgList = new List<System.Drawing.Image>();

    iTextSharp.text.pdf.RandomAccessFileOrArray RAFObj = null;
    iTextSharp.text.pdf.PdfReader PDFReaderObj = null;

    try
    {
        RAFObj = new iTextSharp.text.pdf.RandomAccessFileOrArray(PDFSourcePath);
        PDFReaderObj = new iTextSharp.text.pdf.PdfReader(RAFObj, null);

        for (int i = 0; i < PDFReaderObj.XrefSize; i++)
        {
            // Free or non-stream xref entries come back as null here and are skipped.
            iTextSharp.text.pdf.PRStream PDFStremObj =
                PDFReaderObj.GetPdfObject(i) as iTextSharp.text.pdf.PRStream;
            if (PDFStremObj == null)
            {
                continue;
            }

            // Resolve /Subtype in case it is stored as an indirect reference.
            iTextSharp.text.pdf.PdfObject subtype = iTextSharp.text.pdf.PdfReader.GetPdfObject(
                PDFStremObj.Get(iTextSharp.text.pdf.PdfName.SUBTYPE));
            if (!iTextSharp.text.pdf.PdfName.IMAGE.Equals(subtype))
            {
                continue;
            }

            byte[] bytes = iTextSharp.text.pdf.PdfReader.GetStreamBytesRaw(PDFStremObj);
            if (bytes == null)
            {
                continue;
            }

            try
            {
                using (System.IO.MemoryStream MS = new System.IO.MemoryStream(bytes))
                using (System.Drawing.Image decoded = System.Drawing.Image.FromStream(MS))
                {
                    // An Image created by FromStream needs its stream kept open for as long
                    // as the image is used, so hand out an independent Bitmap copy instead.
                    System.Drawing.Image ImgPDF = new System.Drawing.Bitmap(decoded);
                    ImgList.Add(ImgPDF);
                    pictureBox1.Image = ImgPDF;
                }
            }
            catch (ArgumentException)
            {
                // The raw bytes are not an image format GDI+ recognises; skip this stream.
            }
            catch (System.Runtime.InteropServices.ExternalException)
            {
                // GDI+ failed on this particular stream; keep extracting the remaining ones.
            }
        }
    }
    finally
    {
        // Release the file even when parsing fails. Exceptions propagate unchanged so the
        // caller keeps the original type and stack trace.
        if (PDFReaderObj != null)
        {
            PDFReaderObj.Close();
        }
        else if (RAFObj != null)
        {
            RAFObj.Close();
        }
    }

    return ImgList;
}

3 个答案:

答案 0 :(得分:9)

我目前手头没有iTextSharp 4.0,所以这段代码是针对5.2编写的,但它应该也适用于较旧的版本。这段代码几乎是直接照搬自 this post here,因此请查看该帖子以及其中针对其他问题的回复。正如我在上面的评论中所说,您的代码是从整个文档的角度查看所有图像,而我链接的代码则是逐页处理。

请阅读那篇帖子中的所有评论,尤其是 this one,其中说明此代码**仅**适用于JPG图片。PDF支持很多不同类型的图像,除非您确定只处理JPG,否则需要添加更多代码。有关提示,请参阅 this post 和 this post。

        // Saves every image XObject of one page that GDI+ can read from its raw stream bytes
        // (in practice JPEG) into the output folder.
        string testFile = System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "Doc1.pdf");
        string outputPath = Environment.GetFolderPath(Environment.SpecialFolder.Desktop);
        int pageNum = 1;

        // The JPEG encoder is the same for every image, so look it up once.
        ImageCodecInfo jpegEncoder = Array.Find(ImageCodecInfo.GetImageEncoders(), x => x.FormatID == ImageFormat.Jpeg.Guid);
        int savedCount = 0;

        PdfReader pdf = new PdfReader(testFile);
        try {
            PdfDictionary pg = pdf.GetPageN(pageNum);
            if (pg == null) { return; }
            PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
            if (res == null) { return; }
            PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
            if (xobj == null) { return; }
            foreach (PdfName name in xobj.Keys) {
                PdfObject obj = xobj.Get(name);
                if (!obj.IsIndirect()) { continue; }

                // Resolving the reference already yields the stream object; there is no need
                // to look it up a second time by object number.
                PRStream stream = PdfReader.GetPdfObject(obj) as PRStream;
                if (stream == null) { continue; }

                // PdfName.Equals tolerates null, so a missing /Subtype is simply skipped.
                if (!PdfName.IMAGE.Equals(PdfReader.GetPdfObject(stream.Get(PdfName.SUBTYPE)))) { continue; }

                byte[] bytes = PdfReader.GetStreamBytesRaw(stream);
                if (bytes == null) { continue; }

                using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes)) {
                    System.Drawing.Image img;
                    try {
                        img = System.Drawing.Image.FromStream(memStream);
                    }
                    catch (ArgumentException) {
                        // Raw bytes are not a format GDI+ understands (not a JPEG); skip it.
                        continue;
                    }

                    using (img)
                    using (System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1)) {
                        // CreateDirectory does nothing when the folder already exists.
                        Directory.CreateDirectory(outputPath);

                        // The first image keeps the original "<page>.jpg" name; later images on the
                        // same page get a numeric suffix so they no longer overwrite each other.
                        string fileName = savedCount == 0
                            ? String.Format(@"{0}.jpg", pageNum)
                            : String.Format(@"{0}_{1}.jpg", pageNum, savedCount + 1);
                        string path = Path.Combine(outputPath, fileName);

                        parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(System.Drawing.Imaging.Encoder.Compression, 0);
                        img.Save(path, jpegEncoder, parms);
                        savedCount++;
                    }
                }
            }
        }
        finally {
            // Release the PDF file even when extraction stops early or throws.
            pdf.Close();
        }

答案 1 :(得分:2)

以下是我用于从PDF中提取图像的代码。它对我来说完全没问题。

//   Required: iTextSharp.dll

using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using iTextSharp.text.pdf.parser;
using Dotnet = System.Drawing.Image;
using iTextSharp.text.pdf;

namespace PDF_Parsing {
    /// <summary>
    /// Extracts the image XObjects of one PDF page and writes them to disk.
    /// </summary>
    partial class ExtractPdfImage
    {
        // Destination of the first extracted image; further images on the page are written
        // next to it with a numeric suffix (see BuildImagePath).
        string imgPath = @"c:\extractedImg.png";

        /// <summary>
        /// Saves every image XObject found on page 1 of the given PDF.
        /// </summary>
        /// <param name="pdfFile">Path of the PDF file to read.</param>
        private void ExtractImage(string pdfFile)
        {
            const int pageNumber = 1;
            PdfReader pdf = new PdfReader(pdfFile);
            try
            {
                PdfDictionary pg = pdf.GetPageN(pageNumber);
                if (pg == null) { return; }
                PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
                if (res == null) { return; }
                PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
                if (xobj == null) { return; }

                int savedCount = 0;
                foreach (PdfName name in xobj.Keys)
                {
                    PdfObject obj = xobj.Get(name);
                    if (!obj.IsIndirect()) { continue; }

                    PdfDictionary tg = PdfReader.GetPdfObject(obj) as PdfDictionary;
                    if (tg == null) { continue; }

                    // Only image XObjects are wanted; other XObject kinds need not carry
                    // /Width and /Height and would fail below.
                    if (!PdfName.IMAGE.Equals(PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE)))) { continue; }

                    PdfObject widthObj = PdfReader.GetPdfObject(tg.Get(PdfName.WIDTH));
                    PdfObject heightObj = PdfReader.GetPdfObject(tg.Get(PdfName.HEIGHT));
                    if (widthObj == null || heightObj == null) { continue; }

                    // PDF numbers always use '.' as the decimal separator, so parse them
                    // independently of the current thread culture.
                    float width = float.Parse(widthObj.ToString(), System.Globalization.CultureInfo.InvariantCulture);
                    float height = float.Parse(heightObj.ToString(), System.Globalization.CultureInfo.InvariantCulture);

                    ImageRenderInfo imgRI = ImageRenderInfo.CreateForXObject(new Matrix(width, height),
                        (PRIndirectReference)obj, tg);
                    if (RenderImage(imgRI, BuildImagePath(savedCount)))
                    {
                        savedCount++;
                    }
                }
            }
            finally
            {
                // Release the PDF file even when extraction stops early or throws.
                pdf.Close();
            }
        }

        /// <summary>
        /// Writes the image described by <paramref name="renderInfo"/> to <c>imgPath</c>.
        /// </summary>
        /// <param name="renderInfo">Render information of the image XObject.</param>
        private void RenderImage(ImageRenderInfo renderInfo)
        {
            RenderImage(renderInfo, imgPath);
        }

        /// <summary>
        /// Writes the image described by <paramref name="renderInfo"/> to the given file.
        /// </summary>
        /// <param name="renderInfo">Render information of the image XObject.</param>
        /// <param name="targetPath">File the image is saved to.</param>
        /// <returns>true when a file was written; false when no drawable image was available.</returns>
        private bool RenderImage(ImageRenderInfo renderInfo, string targetPath)
        {
            PdfImageObject image = renderInfo.GetImage();
            if (image == null) { return false; }

            using (Dotnet dotnetImg = image.GetDrawingImage())
            {
                if (dotnetImg == null) { return false; }

                // Save through a Bitmap copy, as before, and dispose it afterwards.
                using (Bitmap copy = new Bitmap(dotnetImg))
                {
                    copy.Save(targetPath);
                }
                return true;
            }
        }

        /// <summary>
        /// Returns the file path for the image with the given zero-based index: the first
        /// image uses <c>imgPath</c> unchanged, later ones get "_2", "_3", ... before the extension.
        /// </summary>
        private string BuildImagePath(int index)
        {
            if (index == 0) { return imgPath; }

            string fileName = Path.GetFileNameWithoutExtension(imgPath)
                + "_" + (index + 1).ToString(System.Globalization.CultureInfo.InvariantCulture)
                + Path.GetExtension(imgPath);
            string directory = Path.GetDirectoryName(imgPath);
            return string.IsNullOrEmpty(directory) ? fileName : Path.Combine(directory, fileName);
        }
    }
}

答案 2 :(得分:0)

以下代码可以从特定页面中提取图像。

using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using iTextSharp.text.pdf.parser;
using Dotnet = System.Drawing.Image;
using iTextSharp.text.pdf;
namespace PDF_Parsing
{
  /// <summary>
  /// Extracts the image XObjects of one PDF page and writes them to disk.
  /// </summary>
  partial class PDF_ImgExtraction
  {
    // Destination of the first extracted image. It has no initializer here, so it must be
    // assigned before ExtractImage is called. Further images on the page are written next
    // to it with a numeric suffix (see BuildImagePath).
    string imgPath;

    /// <summary>
    /// Saves every image XObject found on page 1 of the given PDF.
    /// </summary>
    /// <param name="pdfFile">Path of the PDF file to read.</param>
    /// <exception cref="System.InvalidOperationException">imgPath has not been set.</exception>
    private void ExtractImage(string pdfFile)
    {
      // Fail before opening the PDF rather than at Save time after all the parsing work.
      if (string.IsNullOrEmpty(imgPath))
      {
        throw new System.InvalidOperationException("imgPath must be set before extracting images.");
      }

      const int pageNumber = 1;//Page number to extract the image from
      PdfReader pdf = new PdfReader(pdfFile);
      try
      {
        PdfDictionary pg = pdf.GetPageN(pageNumber);
        if (pg == null) { return; }
        PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
        if (res == null) { return; }
        PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
        if (xobj == null) { return; }

        int savedCount = 0;
        foreach (PdfName name in xobj.Keys)
        {
          PdfObject obj = xobj.Get(name);
          if (!obj.IsIndirect()) { continue; }

          PdfDictionary tg = PdfReader.GetPdfObject(obj) as PdfDictionary;
          if (tg == null) { continue; }

          // Only image XObjects are wanted; other XObject kinds need not carry
          // /Width and /Height and would fail below.
          if (!PdfName.IMAGE.Equals(PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE)))) { continue; }

          PdfObject widthObj = PdfReader.GetPdfObject(tg.Get(PdfName.WIDTH));
          PdfObject heightObj = PdfReader.GetPdfObject(tg.Get(PdfName.HEIGHT));
          if (widthObj == null || heightObj == null) { continue; }

          // PDF numbers always use '.' as the decimal separator, so parse them
          // independently of the current thread culture.
          float width = float.Parse(widthObj.ToString(), System.Globalization.CultureInfo.InvariantCulture);
          float height = float.Parse(heightObj.ToString(), System.Globalization.CultureInfo.InvariantCulture);

          ImageRenderInfo imgRI = ImageRenderInfo.CreateForXObject(new Matrix(width, height), (PRIndirectReference)obj, tg);
          if (RenderImage(imgRI, BuildImagePath(savedCount)))
          {
            savedCount++;
          }
        }
      }
      finally
      {
        // Release the PDF file even when extraction stops early or throws.
        pdf.Close();
      }
    }

    /// <summary>
    /// Writes the image described by <paramref name="renderInfo"/> to <c>imgPath</c>.
    /// </summary>
    /// <param name="renderInfo">Render information of the image XObject.</param>
    private void RenderImage(ImageRenderInfo renderInfo)
    {
      RenderImage(renderInfo, imgPath);
    }

    /// <summary>
    /// Writes the image described by <paramref name="renderInfo"/> to the given file.
    /// </summary>
    /// <param name="renderInfo">Render information of the image XObject.</param>
    /// <param name="targetPath">File the image is saved to.</param>
    /// <returns>true when a file was written; false when no drawable image was available.</returns>
    private bool RenderImage(ImageRenderInfo renderInfo, string targetPath)
    {
      PdfImageObject image = renderInfo.GetImage();
      if (image == null) { return false; }

      using (Dotnet dotnetImg = image.GetDrawingImage())
      {
        if (dotnetImg == null) { return false; }

        // Save through a Bitmap copy, as before, and dispose it afterwards.
        using (Bitmap copy = new Bitmap(dotnetImg))
        {
          copy.Save(targetPath);
        }
        return true;
      }
    }

    /// <summary>
    /// Returns the file path for the image with the given zero-based index: the first
    /// image uses <c>imgPath</c> unchanged, later ones get "_2", "_3", ... before the extension.
    /// </summary>
    private string BuildImagePath(int index)
    {
      if (index == 0) { return imgPath; }

      string fileName = Path.GetFileNameWithoutExtension(imgPath)
        + "_" + (index + 1).ToString(System.Globalization.CultureInfo.InvariantCulture)
        + Path.GetExtension(imgPath);
      string directory = Path.GetDirectoryName(imgPath);
      return string.IsNullOrEmpty(directory) ? fileName : Path.Combine(directory, fileName);
    }
  }
}