我之前在使用 unoconv 时遇到了一个(与本问题无关的)问题:它在尝试将某些文档转换为 PDF 时会无限期地挂起,所以我写了一个小的 Python 脚本,将文档上传到 Google Drive,再以 PDF 格式下载回来,以绕过这个问题。
我现在遇到的问题是:Google Drive 会自动尝试对我上传的图像进行 OCR(文字识别),我不希望这种情况发生,但到目前为止我一直找不到有关如何禁用 OCR 的文档。
我注意到一件事:我使用的是 v3 版 API 的 create 函数,而在 v2 版 API 中,有一个带有 OCR 标志的 insert 函数。v3 版 API 是否也能做到这一点?
这是我的代码:
from __future__ import print_function
import httplib2
import magic
import io
import sys
import argparse
import subprocess as sp
from apiclient import discovery
from oauth2client.service_account import ServiceAccountCredentials
from httplib2 import Http
from googleapiclient.http import MediaFileUpload
from googleapiclient.http import MediaIoBaseDownload
from settings import *
"""
This script exists to mask unoconv for JUST pdf conversion. If it gets flags for anything else, it will fall back on unoconv.
Otherwise, it uploads the document to google drive, downloads it as a pdf, and then deletes the file from the drive.
"""
# Maps the MIME type detected for a local file to the Google-native MIME type
# the file should be converted to on upload. drive_upload() falls back to a
# Google document for any type not listed here.
#
# Google's native types are singular: "document", "spreadsheet",
# "presentation". The previous "...google-apps.spreadsheets" value is not a
# valid Drive MIME type.
MIMETYPE_MAPPING = {
    # Word-processing formats -> Google Docs
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "application/vnd.google-apps.document",
    "application/rtf": "application/vnd.google-apps.document",
    "text/richtext": "application/vnd.google-apps.document",
    "text/plain": "application/vnd.google-apps.document",
    "text/html": "application/vnd.google-apps.document",
    "application/vnd.oasis.opendocument.text": "application/vnd.google-apps.document",
    "application/x-iwork-pages-sffpages": "application/vnd.google-apps.document",
    "application/msword": "application/vnd.google-apps.document",
    # Spreadsheet formats -> Google Sheets
    "application/vnd.ms-excel": "application/vnd.google-apps.spreadsheet",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "application/vnd.google-apps.spreadsheet",
    "text/csv": "application/vnd.google-apps.spreadsheet",
    "text/tab-separated-values": "application/vnd.google-apps.spreadsheet",
    # Legacy key kept for backward compatibility (the registered ODS type is the singular form below).
    "application/vnd.oasis.opendocument.spreadsheets": "application/vnd.google-apps.spreadsheet",
    "application/vnd.oasis.opendocument.spreadsheet": "application/vnd.google-apps.spreadsheet",
    # Presentation formats -> Google Slides
    "application/vnd.ms-powerpoint": "application/vnd.google-apps.presentation",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "application/vnd.google-apps.presentation",
    # Legacy misspelled key kept for backward compatibility; real .pptx files use the key above.
    "application/vnd.openxmlformats-officedocument.presentationml.presentationml": "application/vnd.google-apps.presentation",
    "application/vnd.oasis.opendocument.presentation": "application/vnd.google-apps.presentation",
    # Images and PDFs -> Google Docs
    "image/png": "application/vnd.google-apps.document",
    "image/x-citrix-png": "application/vnd.google-apps.document",
    "image/x-png": "application/vnd.google-apps.document",
    "image/jpeg": "application/vnd.google-apps.document",
    "image/x-citrix-jpeg": "application/vnd.google-apps.document",
    "image/gif": "application/vnd.google-apps.document",
    "image/bmp": "application/vnd.google-apps.document",
    "application/pdf": "application/vnd.google-apps.document",
}
# Module-wide cache for the Drive API client; filled in by get_service().
SERVICE = None


def get_service():
    """
    Return the shared Google Drive v3 API client, creating it on first call.

    The client is built from the service-account credentials in JSON_KEY and
    stored in the module-level SERVICE, so later calls reuse the same object.
    """
    global SERVICE
    if SERVICE is not None:
        return SERVICE
    # NOTE(review): from_json() is given JSON_KEY directly; confirm JSON_KEY holds
    # serialized credentials rather than a path to a key file.
    creds = ServiceAccountCredentials.from_json(JSON_KEY)
    authorized_http = creds.authorize(Http())
    SERVICE = discovery.build('drive', 'v3', http=authorized_http)
    return SERVICE
def drive_upload(fp, fn):
    """
    Upload the file at ``fp`` to Google Drive under the name ``fn``.

    The local file's MIME type is detected with ``magic`` and looked up in
    MIMETYPE_MAPPING to pick the Google-native type to request; types that are
    not in the mapping are requested as a Google document. No parent folder is
    specified in the request metadata.

    Returns the id of the newly created Drive file.
    """
    source_mimetype = magic.from_file(fp, mime=True)
    drive_service = get_service()
    file_metadata = {
        'name': fn,
        'mimeType': MIMETYPE_MAPPING.get(source_mimetype, 'application/vnd.google-apps.document'),
    }
    media = MediaFileUpload(fp, mimetype=source_mimetype, resumable=True)
    # The former inspect.getargspec() debug print was removed: it wrote to the
    # stdout this wrapper shares with unoconv callers, and getargspec no
    # longer exists on Python 3.11+.
    created = drive_service.files().create(
        body=file_metadata,
        media_body=media,
        fields='id',
    ).execute()
    return created.get('id')
def download_pdf(file_id, dlp):
    """
    Export the Drive file ``file_id`` as a PDF and write it to the path ``dlp``.

    The export is requested with mimeType 'application/pdf' and the response
    body is written to ``dlp`` in one piece, overwriting any existing file.
    """
    drive_service = get_service()
    request = drive_service.files().export_media(fileId=file_id,
                                                 mimeType='application/pdf')
    pdf_bytes = request.execute()
    # PDF content is binary: write in 'wb' so the bytes are not newline-translated
    # (or rejected by a text-mode handle), and close the file even on error.
    with open(dlp, 'wb') as out:
        out.write(pdf_bytes)
def convert_to_pdf(inputf, outputf):
    """
    Convert ``inputf`` to a PDF at ``outputf`` by round-tripping it through Google Drive.

    The file is uploaded under its base name (the part of ``inputf`` after the
    last '/'), exported as a PDF to ``outputf``, and then deleted from Drive.
    Any exception from the upload, download or delete propagates to the caller.
    """
    fid = drive_upload(inputf, inputf.split('/')[-1])
    try:
        download_pdf(fid, outputf)
    finally:
        # Always remove the temporary Drive copy, even when the export failed,
        # so failed conversions do not accumulate files in the account.
        get_service().files().delete(fileId=fid).execute()
def pass_through():
    """
    Run unoconv with the same arguments this script received, then exit.

    unoconv's captured stdout and stderr are relayed to this process's stdout
    and stderr, and the process exits with unoconv's return code. This
    function never returns.
    """
    print("PASSING THROUGH", file=sys.stderr)
    # Only PATH_TO_UNOCONV is split on whitespace (it may hold a launcher plus
    # script path). The user's arguments are forwarded untouched so values
    # containing spaces, such as file paths, stay single arguments.
    cmd = PATH_TO_UNOCONV.split() + sys.argv[1:]
    child = sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.PIPE)
    stdout, stderr = child.communicate()
    for stream, data in ((sys.stdout, stdout), (sys.stderr, stderr)):
        # Flush pending text first so ordering is preserved, then relay the raw
        # bytes: via .buffer on Python 3, or the stream itself where no
        # .buffer exists (Python 2).
        stream.flush()
        raw = getattr(stream, 'buffer', stream)
        raw.write(data)
        raw.flush()
    sys.exit(child.returncode)
class ArgParse(argparse.ArgumentParser):
    """
    ArgumentParser whose failure path defers to unoconv instead of exiting.

    When exit() is called with a non-zero status (typically because the
    command line used flags this wrapper does not declare), pass_through() is
    called so unoconv handles the invocation. A zero status keeps the stock
    ArgumentParser.exit() behaviour.
    """

    def exit(self, status=0, message=None):
        # Successful exits (e.g. after --help) go through the normal argparse path.
        if status == 0:
            return super(ArgParse, self).exit(status=status, message=message)
        # Anything else: hand the original command line to unoconv.
        pass_through()
if __name__ == '__main__':
    # Entry point: handle "-f pdf -o <out> <in>" via Google Drive and defer every
    # other invocation (or any failure) to unoconv through pass_through().
    parser = ArgParse(description="Wrapper for unoconv that farms pdf conversions to google drive, using any args other than the supplied will cause it to fallback on unoconv")
    parser.add_argument('-f', metavar='format', help='Desired ouput format')
    parser.add_argument('-o', metavar='output_file', help='Path to output file')
    parser.add_argument('fname', metavar='inputf', type=str, nargs=1, help='Path to file to convert')
    args = parser.parse_args()
    fmt = args.f
    output_file = args.o
    input_file = args.fname[0]
    # -f and -o are optional, so either may be None. Without both there is
    # nothing this wrapper can do itself, so skip the upload and let unoconv
    # apply its own defaults.
    wants_drive_pdf = (
        fmt is not None
        and output_file is not None
        and fmt.upper() == "PDF"
    )
    if wants_drive_pdf:
        try:
            convert_to_pdf(input_file, output_file)
        except Exception as exc:
            # Catch Exception rather than everything so Ctrl-C / SystemExit are
            # not swallowed; report why the Drive path failed before falling back.
            print("Drive conversion failed: %r" % (exc,), file=sys.stderr)
            pass_through()
    else:
        # if we aren't converting the file to a PDF, let unoconv handle it
        pass_through()