Question

PDF 文件基本上有 4 种类型：• 仅图像的 PDF • 可搜索图像的 PDF（精确） • 可搜索图像的 PDF（紧凑） • 带格式文本和图形的 PDF。为了执行 OCR，我需要知道如何判断给定的 PDF 文档属于哪一种类型。

我尝试过下面这样的代码，但对于被判定为仅图像（在我的代码中称为 Scanned）的 PDF，使用 Image.FromStream 方法无法将其转换为图像。

 /// <summary>
 /// Classifies a PDF document by looking at the resource dictionary of its first page only.
 /// </summary>
 /// <param name="sourcePdf">Stream positioned at the start of the PDF document.</param>
 /// <returns>
 /// <c>PdfType.Text</c> when the first page has no /XObject dictionary (including the case
 /// where it has no /Resources dictionary at all); <c>PdfType.ScannedText</c> when it has
 /// both /XObject and /Font dictionaries; <c>PdfType.Scanned</c> when it has /XObject but no /Font.
 /// </returns>
 public static PdfType GetPdfType(Stream sourcePdf)
    {
        PdfDocument pdf = PdfReader.Open(sourcePdf);

        // Only the first page is inspected; later pages are assumed to be of the same kind.
        PdfDictionary pg = pdf.Pages[0];

        PdfDictionary res = pg.Elements.GetDictionary("/Resources");

        // Without a /Resources dictionary there is neither an /XObject nor a /Font entry
        // to look at, so treat the page the same way as one without XObjects instead of
        // dereferencing a null dictionary.
        if (res == null)
        {
            return PdfType.Text;
        }

        PdfDictionary xobj = res.Elements.GetDictionary("/XObject");

        if (xobj == null)
        {
            return PdfType.Text;
        }

        PdfDictionary font = res.Elements.GetDictionary("/Font");

        // XObjects together with fonts: image content accompanied by a text layer.
        if (font != null)
        {
            return PdfType.ScannedText;
        }

        // XObjects without any font: image-only page.
        return PdfType.Scanned;
    }

编辑：下面是我尝试从 PDF 文件中提取图像并加入集合的代码片段：

/// <summary>
/// Collects at most one image per page from a PDF document: the image XObject returned by
/// <c>FindImageInPDFDictionary</c> for that page, provided its stream can be decoded directly.
/// </summary>
/// <param name="sourcePdf">Stream positioned at the start of the PDF document.</param>
/// <param name="outputPath">Not used by this method; kept so existing callers keep compiling.</param>
/// <returns>
/// The decoded images in page order. Pages without an image XObject, and images whose
/// stream is not a plain /DCTDecode (JPEG) stream, contribute nothing to the collection.
/// </returns>
public static Collection<Image> GetExtractedImagesFromPDF(Stream sourcePdf, string outputPath)
    {
        Collection<Image> extractedImages = new Collection<Image>();

        PdfDocument pdf = PdfReader.Open(sourcePdf);

        for (int pageNumber = 0; pageNumber < pdf.Pages.Count; pageNumber++)
        {
            PdfDictionary pg = pdf.Pages[pageNumber];

            // Look up an image XObject in this page's resources (null when there is none).
            PdfDictionary obj = FindImageInPDFDictionary(pg);

            if (obj == null || obj.Stream == null)
            {
                continue;
            }

            // Only a /DCTDecode stream is a complete JPEG file that GDI+ can open as-is.
            // Streams using other filters (/FlateDecode, /CCITTFaxDecode, /JBIG2Decode, filter
            // arrays, ...) hold raw or differently encoded samples without an image header;
            // handing those bytes to Image.FromStream fails with "Parameter is not valid".
            PdfName filter = obj.Elements["/Filter"] as PdfName;
            if (filter == null || filter.Value != "/DCTDecode")
            {
                continue;
            }

            // The MemoryStream is intentionally not disposed here: Image.FromStream requires
            // the stream to stay open for the lifetime of the returned Image.
            MemoryStream ms = new MemoryStream(obj.Stream.Value);
            Image img = Image.FromStream(ms, false, true);
            extractedImages.Add(img);
        }

        return extractedImages;
    }

下面是 FindImageInPDFDictionary 方法：

/// <summary>
/// Returns the first entry of the page's /Resources → /XObject dictionary that is an
/// indirect reference to a dictionary whose /Subtype is "/Image".
/// </summary>
/// <param name="pg">Page (or other dictionary) whose /Resources entry is searched.</param>
/// <returns>The matching image dictionary, or <c>null</c> when no such entry exists.</returns>
static PdfDictionary FindImageInPDFDictionary(PdfDictionary pg)
    {
        PdfDictionary resources = pg.Elements.GetDictionary("/Resources");
        if (resources == null)
        {
            return null;
        }

        PdfDictionary xObjects = resources.Elements.GetDictionary("/XObject");
        if (xObjects == null)
        {
            return null;
        }

        ICollection<PdfItem> entries = xObjects.Elements.Values;
        foreach (PdfItem entry in entries)
        {
            // Only indirect references are considered; direct objects are skipped.
            PdfReference indirect = entry as PdfReference;
            if (indirect == null)
            {
                continue;
            }

            PdfDictionary candidate = indirect.Value as PdfDictionary;
            if (candidate == null)
            {
                continue;
            }

            if (candidate.Elements.GetString("/Subtype") == "/Image")
            {
                return candidate;
            }
        }

        return null;
    }

如何使用PDFSharp区分pdf文档类型？

0 个答案: