我一直在使用 iTextSharp 从 PDF 中提取文本，现在还想提取其中的所有图像，这些图像可能是灰度 JPG、CMYK JPG 和 PNG（根据 iTextSharp 的过滤器判断）。提取图像后，灰度 JPG 和 PNG 显示正确；然而 CMYK JPG 看起来很奇怪，就像它们的颜色在提取后被反转了一样。这是我的代码：
/// <summary>
/// Render listener that collects every image it is shown during content
/// parsing as a <see cref="System.Drawing.Image"/>. Text callbacks are ignored.
/// </summary>
internal class ImageRenderListener : IRenderListener
{
    #region Fields
    // Backing store for Images; only ever appended to by RenderImage.
    private readonly List<System.Drawing.Image> images = new List<System.Drawing.Image>();
    #endregion Fields

    #region Properties
    /// <summary>
    /// Images collected so far, in the order RenderImage received them.
    /// </summary>
    public List<System.Drawing.Image> Images
    {
        get { return images; }
    }
    #endregion Properties

    #region Methods
    #region Public Methods
    /// <summary>Text is not of interest to this listener; intentionally empty.</summary>
    public void BeginTextBlock() { }

    /// <summary>Text is not of interest to this listener; intentionally empty.</summary>
    public void EndTextBlock() { }

    /// <summary>
    /// Converts the reported image to a <see cref="System.Drawing.Image"/> and
    /// appends it to <see cref="Images"/>. Images without a /Filter entry, and
    /// images that cannot be converted, are skipped.
    /// </summary>
    /// <param name="renderInfo">Render information for the image being drawn.</param>
    public void RenderImage(ImageRenderInfo renderInfo)
    {
        PdfImageObject image = renderInfo.GetImage();
        if (image == null)
        {
            return;
        }

        /* The original author's observation: when the image has no /Filter entry,
         * PdfImageObject does not know how to decode it to a System.Drawing.Image
         * (width, height and bits per component were seen to be zero as well).
         *
         * The entry is only tested for presence and deliberately NOT cast to
         * PdfName: /Filter may also be an array of names, and a cast would then
         * throw InvalidCastException and abort the whole extraction. */
        if (image.Get(PdfName.FILTER) == null)
        {
            return;
        }

        /* Rather than walking the raw image stream and handling bitmap scan lines
         * by hand, let PdfImageObject.GetDrawingImage() do the conversion.
         *
         * NOTE(review): CMYK JPEGs have been reported to come out with inverted
         * colours via this path; presumably the image's colour-space/Decode
         * handling is not applied during conversion - confirm and post-process
         * such images if needed. */
        System.Drawing.Image drawingImage = image.GetDrawingImage();

        // Never hand callers a null entry; skip images that produced nothing.
        if (drawingImage != null)
        {
            images.Add(drawingImage);
        }
    }

    /// <summary>Text is not of interest to this listener; intentionally empty.</summary>
    public void RenderText(TextRenderInfo renderInfo) { }
    #endregion Public Methods
    #endregion Methods
}
我认为问题出在流的解码方式上，但我不知道该如何解决。如果有任何想法，非常感谢。
这是提取的图像(已更新):
这是PDF的链接 http://docdro.id/ZoHmiAd