我知道 PDFKit 允许将文本及其格式提取为 NSAttributedString,但是我找不到任何关于如何使用 Swift 从 PDF 文档中提取每个单独图形的信息。
任何帮助将不胜感激,谢谢!
edit:https://stackoverflow.com/a/40788449/2303865 解释了如何将整个页面转换为图像,但是我需要在事先不知道图像位置的情况下,解析一系列 PDF 文档中已经存在的所有图像,因此该解决方案并不适用于我的问题。
答案 0 :(得分:0)
这是一个Swift函数,可从pdf页面提取图像,更具体地说是提取所有子类型为“图像”的对象:
import PDFKit
/// Runs `extractor` for every image found on every page of `pdf`.
///
/// Pages are visited in index order, starting at `0` and ending before `pdf.pageCount`.
///
/// - Parameters:
///   - pdf: The document whose pages are scanned.
///   - extractor: Callback forwarded to the page-level `extractImages(from:extractor:)`.
/// - Throws: `PDFReadError.couldNotOpenPageNumber` when `pdf.page(at:)` returns `nil`
///   for an index, plus any error thrown while processing an individual page.
func extractImages(from pdf: PDFDocument, extractor: @escaping (ImageInfo)->Void) throws {
    for index in 0..<pdf.pageCount {
        if let currentPage = pdf.page(at: index) {
            try extractImages(from: currentPage, extractor: extractor)
        } else {
            throw PDFReadError.couldNotOpenPageNumber(index)
        }
    }
}
/// Runs `extractor` for every XObject of subtype "Image" listed in the resources of `page`.
///
/// - Parameters:
///   - page: The PDFKit page to scan.
///   - extractor: Callback invoked once per image stream whose data could be copied.
/// - Throws:
///   - `PDFReadError.couldNotOpenPage` when the page has no underlying `pageRef`.
///   - `PDFReadError.couldNotOpenDictionaryOfPage` when the page dictionary is missing.
///   - `PDFReadError.couldNotReadResources` when the "Resources" dictionary is missing.
///   - `PDFReadError.cannotReadXObjectStream` when the data of an image stream cannot be
///     copied. Images delivered to `extractor` before the failure are not rolled back.
func extractImages(from page: PDFPage, extractor: @escaping (ImageInfo)->Void) throws {
    let pageNumber = page.label ?? "unknown page"
    guard let pageRef = page.pageRef else {
        throw PDFReadError.couldNotOpenPage(pageNumber)
    }
    guard let pageDictionary = pageRef.dictionary else {
        throw PDFReadError.couldNotOpenDictionaryOfPage(pageNumber)
    }
    guard let resources = pageDictionary[CGPDFDictionaryGetDictionary, "Resources"] else {
        throw PDFReadError.couldNotReadResources(pageNumber)
    }
    // No "XObject" dictionary means there is nothing to enumerate on this page.
    guard let xObject = resources[CGPDFDictionaryGetDictionary, "XObject"] else {
        return
    }
    print("reading resources of page", pageNumber)

    // The applier cannot throw, so a read failure is parked here and rethrown afterwards.
    var failure: PDFReadError?

    // Returning `true` moves on to the next entry; returning `false` stops the enumeration.
    func extractImage(key: UnsafePointer<Int8>, object: CGPDFObjectRef, info: UnsafeMutableRawPointer?) -> Bool {
        // Entries that are not image streams are skipped, not treated as errors.
        guard let stream: CGPDFStreamRef = object[CGPDFObjectGetValue, .stream] else { return true }
        guard let streamDictionary = CGPDFStreamGetDictionary(stream) else { return true }
        guard streamDictionary.getName("Subtype", CGPDFDictionaryGetName) == "Image" else { return true }

        let colorSpaces = streamDictionary.getNameArray(for: "ColorSpace") ?? []
        let filter = streamDictionary.getNameArray(for: "Filter") ?? []
        var format = CGPDFDataFormat.raw
        guard let data = CGPDFStreamCopyData(stream, &format) as Data? else {
            // Previously this stopped the enumeration silently; now the caller is told.
            failure = .cannotReadXObjectStream(xObject: String(cString: key), page: pageNumber)
            return false
        }
        extractor(
            ImageInfo(
                name: String(cString: key),
                colorSpaces: colorSpaces,
                filter: filter,
                format: format,
                data: data
            )
        )
        return true
    }

    CGPDFDictionaryApplyBlock(xObject, extractImage, nil)

    if let failure = failure {
        throw failure
    }
}
/// Value describing one image stream handed to the extractor callback.
struct ImageInfo: CustomDebugStringConvertible {
    /// Key of the image entry inside the XObject dictionary.
    let name: String
    /// Names read from the stream's "ColorSpace" entry (empty when none could be read).
    let colorSpaces: [String]
    /// Names read from the stream's "Filter" entry (empty when none could be read).
    let filter: [String]
    /// Data format reported by `CGPDFStreamCopyData` for `data`.
    let format: CGPDFDataFormat
    /// Bytes copied out of the image stream.
    let data: Data

    /// Multi-line summary listing name, color spaces, format, filters and byte size.
    var debugDescription: String {
        let formatLabel: String
        switch format {
        case .JPEG2000:
            formatLabel = "JPEG2000"
        case .jpegEncoded:
            formatLabel = "jpeg"
        default:
            // Every other format is reported as "raw".
            formatLabel = "raw"
        }
        let byteSize = ByteCountFormatter.string(fromByteCount: Int64(data.count), countStyle: .binary)
        return """
        Image "\(name)"
        - color spaces: \(colorSpaces)
        - format: \(formatLabel)
        - filters: \(filter)
        - size: \(byteSize)
        """
    }
}
/// Helpers that wrap the out-parameter style CoreGraphics PDF getters in optional-returning calls.
extension CGPDFObjectRef {
    /// Looks up a name through a getter that writes a C-string pointer and returns it as a `String`.
    ///
    /// - Returns: `nil` when the getter reports failure.
    func getName<K>(_ key: K, _ getter: (OpaquePointer, K, UnsafeMutablePointer<UnsafePointer<Int8>?>)->Bool) -> String? {
        if let cString = self[getter, key] {
            return String(cString: cString)
        }
        return nil
    }

    /// Looks up a name through a getter that writes into an untyped pointer and returns it as a `String`.
    ///
    /// - Returns: `nil` when the getter reports failure.
    func getName<K>(_ key: K, _ getter: (OpaquePointer, K, UnsafeMutableRawPointer?)->Bool) -> String? {
        if let cString: UnsafePointer<UInt8> = self[getter, key] {
            return String(cString: cString)
        }
        return nil
    }

    /// Calls a typed out-parameter getter on `self` and returns the value it produced,
    /// or `nil` when the getter returns `false`.
    subscript<R, K>(_ getter: (OpaquePointer, K, UnsafeMutablePointer<R?>)->Bool, _ key: K) -> R? {
        var slot: R? = nil
        return getter(self, key, &slot) ? slot : nil
    }

    /// Calls an untyped out-parameter getter on `self`; the caller picks `R` through the
    /// expected result type. Returns `nil` when the getter returns `false`.
    subscript<R, K>(_ getter: (OpaquePointer, K, UnsafeMutableRawPointer?)->Bool, _ key: K) -> R? {
        var slot: R? = nil
        return getter(self, key, &slot) ? slot : nil
    }

    /// Reads the entry stored under `key`, which may be a single name or an array of names.
    ///
    /// - Returns: A one-element array for a single name, the readable names of an array
    ///   (elements that are not names are skipped), or `nil` when the key is absent or
    ///   the entry is neither a name nor an array.
    func getNameArray(for key: String) -> [String]? {
        var entry: CGPDFObjectRef!
        guard CGPDFDictionaryGetObject(self, key, &entry) else { return nil }

        // Single-name form.
        if let single = entry.getName(.name, CGPDFObjectGetValue) {
            return [single]
        }

        // Array form.
        guard let list: CGPDFArrayRef = entry[CGPDFObjectGetValue, .array] else { return nil }
        var collected = [String]()
        for position in 0..<CGPDFArrayGetCount(list) {
            if let element = list.getName(position, CGPDFArrayGetName) {
                collected.append(element)
            }
        }
        return collected
    }
}
/// Errors thrown while scanning a PDF document for embedded images.
///
/// Conforms to `LocalizedError` so that `localizedDescription` carries a readable
/// message instead of the generic text produced for plain `Error` enums.
enum PDFReadError: Error, LocalizedError {
    /// `PDFDocument.page(at:)` returned `nil` for the given page index.
    case couldNotOpenPageNumber(Int)
    /// The page (identified by its label) has no underlying `pageRef`.
    case couldNotOpenPage(String)
    /// The page (identified by its label) has no page dictionary.
    case couldNotOpenDictionaryOfPage(String)
    /// The page (identified by its label) has no "Resources" dictionary.
    case couldNotReadResources(String)
    /// The stream data of the named XObject on the given page could not be read.
    case cannotReadXObjectStream(xObject: String, page: String)

    /// Human-readable description of the failure.
    var errorDescription: String? {
        switch self {
        case .couldNotOpenPageNumber(let index):
            return "Could not open the page at index \(index)."
        case .couldNotOpenPage(let label):
            return "Could not open the page labeled \"\(label)\"."
        case .couldNotOpenDictionaryOfPage(let label):
            return "Could not open the dictionary of the page labeled \"\(label)\"."
        case .couldNotReadResources(let label):
            return "Could not read the resources of the page labeled \"\(label)\"."
        case .cannotReadXObjectStream(let xObject, let page):
            return "Could not read the stream of XObject \"\(xObject)\" on the page labeled \"\(page)\"."
        }
    }
}
您应该知道,PDF中的图像可以用不同的方式表示。它们可以作为自包含的JPG嵌入,也可以作为原始像素数据(无论是否进行无损压缩)嵌入,其中包含有关压缩,颜色空间,宽度,高度等的元信息。
因此,如果您只想导出嵌入的 JPG,这段代码可以正常工作。但是,如果您还想把原始像素图像显示出来,则需要更多的解析代码。作为起点,您可以查看 PDF 2.0 规范(或该规范较旧的免费版本),以及这个 gist——它可以解析任何颜色配置文件的 JPG,以及采用以下任一颜色配置文件的原始图像: