我参与编写的C#软件有一个组件,涉及从扫描文档中读取条形码。 PDF文件本身使用PDFSharp打开。
不幸的是,当涉及到Flate Decoding of PDF时,我们遇到了该过程的问题。基本上,我们得到的是一堆模糊,这意味着没有条形码可以检查,文件无法识别。
我们的代码(我们从另一个Stack Overflow案例中无耻地“借用”)如下:
private FileInfo ExportAsPngImage(PdfDictionary image, string sourceFileName, ref int count)
{
//This code basically comes from http://forum.pdfsharp.net/viewtopic.php?f=2&t=2338#p6755
//and http://stackoverflow.com/questions/10024908/how-to-extract-flatedecoded-images-from-pdf-with-pdfsharp
string tempFile = string.Format("{0}_Image{1}.png", sourceFileName, count);
int width = image.Elements.GetInteger(PdfImage.Keys.Width);
int height = image.Elements.GetInteger(PdfImage.Keys.Height);
int bitsPerComponent = image.Elements.GetInteger(PdfImage.Keys.BitsPerComponent);
var pixelFormat = new PixelFormat();
switch (bitsPerComponent)
{
case 1:
pixelFormat = System.Drawing.Imaging.PixelFormat.Format1bppIndexed;
break;
case 8:
pixelFormat = System.Drawing.Imaging.PixelFormat.Format8bppIndexed;
break;
case 24:
pixelFormat = System.Drawing.Imaging.PixelFormat.Format24bppRgb;
break;
default:
throw new Exception("Unknown pixel format " + bitsPerComponent);
}
var fd = new FlateDecode();
byte[] decodedBytes = fd.Decode(image.Stream.Value);
byte[] resultBytes = null;
int newWidth = width;
int alignment = 4;
if (newWidth % alignment != 0)
//Image data in BMP files always starts at a DWORD boundary, in PDF it starts at a BYTE boundary.
//Most images have a width that is a multiple of 4, so there is no problem with them.
//You must copy the image data line by line and start each line at the DWORD boundary.
{
while (newWidth % alignment != 0)
{
newWidth++;
}
var copy_dword_boundary = new byte[height, newWidth];
for (int y = 0; y < height; y++)
{
for (int x = 0; x < newWidth; x++)
{
if (x <= width && (x + (y * width) < decodedBytes.Length))
// while not at end of line, take orignal array
copy_dword_boundary[y, x] = decodedBytes[x + (y * width)];
else //fill new array with ending 0
copy_dword_boundary[y, x] = 0;
}
}
resultBytes = new byte[newWidth * height];
int counter = 0;
for (int x = 0; x < copy_dword_boundary.GetLength(0); x++)
{
for (int y = 0; y < copy_dword_boundary.GetLength(1); y++)
{ //put 2dim array back in 1dim array
resultBytes[counter] = copy_dword_boundary[x, y];
counter++;
}
}
}
else
{
resultBytes = new byte[decodedBytes.Length];
decodedBytes.CopyTo(resultBytes, 0);
}
//Create a new bitmap and shove the bytes into it
var bitmap = new Bitmap(newWidth, height, pixelFormat);
BitmapData bitmapData = bitmap.LockBits(new Rectangle(0, 0, bitmap.Width, bitmap.Height), ImageLockMode.WriteOnly, bitmap.PixelFormat);
int length = (int)Math.Ceiling(width * bitsPerComponent / 8.0);
for (int i = 0; i < height; i++)
{
int offset = i * length;
int scanOffset = i * bitmapData.Stride;
Marshal.Copy(resultBytes, offset, new IntPtr(bitmapData.Scan0.ToInt32() + scanOffset), length);
}
bitmap.UnlockBits(bitmapData);
//Now save the bitmap to memory
using (var fs = new FileStream(String.Format(tempFile, count++), FileMode.Create, FileAccess.Write))
{
bitmap.Save(fs, ImageFormat.Png);
}
return new FileInfo(tempFile);
}
不幸的是,我们得到的就是http://i.stack.imgur.com/FwatQ.png
我们非常感谢任何关于我们哪里出错的想法,或对我们可能会尝试的事情的建议。
干杯
答案 0 :(得分:0)
感谢你们的建议。其中一个开发人员设法破解了它 - 它(正如Jongware建议的那样)是一个JPEG,但它实际上也是拉链的!解压缩后,可以将其处理并识别为正常。