通过API上传到Google云端硬盘时禁用OCR

时间:2018-05-29 15:46:33

标签: python google-drive-api

我遇到了一个unoconv的问题:它在尝试把某些文档转换为PDF时会无限期挂起。为了绕过这个问题,我写了一个小的Python脚本,先把文档上传到Google云端硬盘,再以PDF格式下载回来。

我遇到的问题是:Google云端硬盘会自动对上传的图像进行OCR(文字识别),我不希望这种情况发生,但到目前为止我一直没能找到有关如何禁用OCR的文档。

我注意到的一件事:我使用的是v3版API的create函数,而在v2版API中,有一个带有OCR标志的insert函数。在v3版API中能否实现同样的效果?

这是我的代码:

    from __future__ import print_function
    import httplib2
    import magic
    import io
    import sys
    import argparse
    import subprocess as sp

    from apiclient import discovery
    from oauth2client.service_account import ServiceAccountCredentials
    from httplib2 import Http

    from googleapiclient.http import MediaFileUpload
    from googleapiclient.http import MediaIoBaseDownload

    from settings import *

    """
    This script exists to mask unoconv for JUST pdf conversion. If it gets flags for anything else, it will fallback on unoconv.

    Otherwise, it uploads the document to google drive, download it as a pdf, and then delete the file out of the drive.
    """

    # Maps the MIME type detected for a local file to the Google-native MIME
    # type requested as the upload target. Types that are not listed are
    # handled by the caller's own default.
    #
    # The Google-native targets are singular: ".document", ".spreadsheet",
    # ".presentation".
    MIMETYPE_MAPPING = {
        # Text documents -> Google Docs
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document":"application/vnd.google-apps.document",
        "application/rtf":"application/vnd.google-apps.document",
        "text/richtext":"application/vnd.google-apps.document",
        "text/plain":"application/vnd.google-apps.document",
        "text/html":"application/vnd.google-apps.document",
        "application/vnd.oasis.opendocument.text":"application/vnd.google-apps.document",
        "application/x-iwork-pages-sffpages":"application/vnd.google-apps.document",
        "application/msword":"application/vnd.google-apps.document",

        # Spreadsheets -> Google Sheets
        "application/vnd.ms-excel":"application/vnd.google-apps.spreadsheet",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":"application/vnd.google-apps.spreadsheet",
        "text/csv":"application/vnd.google-apps.spreadsheet",
        "text/tab-separated-values":"application/vnd.google-apps.spreadsheet",
        # Non-standard plural spelling kept so existing lookups keep working.
        "application/vnd.oasis.opendocument.spreadsheets":"application/vnd.google-apps.spreadsheet",
        "application/vnd.oasis.opendocument.spreadsheet":"application/vnd.google-apps.spreadsheet",

        # Presentations -> Google Slides
        "application/vnd.ms-powerpoint":"application/vnd.google-apps.presentation",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation":"application/vnd.google-apps.presentation",
        # Misspelled legacy key kept so existing lookups keep working.
        "application/vnd.openxmlformats-officedocument.presentationml.presentationml":"application/vnd.google-apps.presentation",
        "application/vnd.oasis.opendocument.presentation":"application/vnd.google-apps.presentation",

        # Images -> Google Docs
        "image/png":"application/vnd.google-apps.document",
        "image/x-citrix-png":"application/vnd.google-apps.document",
        "image/x-png":"application/vnd.google-apps.document",
        "image/jpeg":"application/vnd.google-apps.document",
        "image/x-citrix-jpeg":"application/vnd.google-apps.document",
        "image/gif":"application/vnd.google-apps.document",
        "image/bmp":"application/vnd.google-apps.document",

        # PDF -> Google Docs
        "application/pdf":"application/vnd.google-apps.document",
    }

    # Cache for the Drive API client; filled on the first get_service() call.
    SERVICE = None

    def get_service():
        """
        Return the Google Drive v3 API client, building it on first use.

        The client is stored in the module-level SERVICE variable, so the
        credentials are authorized and the API is discovered only once per
        process; later calls return the cached object.
        """
        global SERVICE
        if SERVICE is None:
            # JSON_KEY is not defined in this file -- presumably it comes from
            # the `settings` star-import; verify its expected format there.
            credentials = ServiceAccountCredentials.from_json(JSON_KEY)
            http_auth = credentials.authorize(Http())
            SERVICE = discovery.build('drive', 'v3', http=http_auth)
        return SERVICE

    def drive_upload(fp, fn):
        """
        Uploads the file found at fp to google drive under the name fn.

        The file's MIME type is detected with the `magic` module. The upload
        metadata requests the Google-native type that MIMETYPE_MAPPING lists
        for it, defaulting to a Google Doc when the detected type is not in
        the mapping. No parent folder is specified in the request.

        Returns the id of the new file
        """
        mimetype = magic.from_file(fp, mime=True)
        drive_service = get_service()
        file_metadata = {
            'name': fn,
            'mimeType': MIMETYPE_MAPPING.get(mimetype, 'application/vnd.google-apps.document'),
        }
        media = MediaFileUpload(fp,
                                mimetype=mimetype,
                                resumable=True)

        # Only the id is needed by callers, so restrict the response fields.
        created = drive_service.files().create(body=file_metadata,
                                               media_body=media,
                                               fields='id').execute()
        return created.get('id')

    def download_pdf(file_id,dlp):
        """
        Downloads file from google drive specified by file_id to the filepath in dlp

        The file is exported from Drive as application/pdf and the response
        body is written to dlp unchanged.
        """
        drive_service = get_service()
        request = drive_service.files().export_media(fileId=file_id,
                                                     mimeType='application/pdf')
        pdf_bytes = request.execute()
        # PDF data is binary: open in 'wb' so nothing is newline-translated or
        # rejected as non-text, and use `with` so the handle is closed even if
        # the write fails.
        with open(dlp, 'wb') as out:
            out.write(pdf_bytes)

    def convert_to_pdf(inputf, outputf):
        """
        Converts input file to pdf located at output file and cleans up file from google drive

        The input is uploaded under its own file name (the part after the last
        '/'), downloaded again as a PDF to outputf, and then deleted from
        Drive. Any exception from the upload, download or delete propagates to
        the caller.
        """
        fid = drive_upload(inputf,inputf.split('/')[-1])
        try:
            download_pdf(fid,outputf)
        finally:
            # Always remove the temporary upload, even when the download
            # failed, so failed conversions do not pile up in the Drive account.
            get_service().files().delete(fileId=fid).execute()

    def pass_through():
        """
        Calls unoconv with same args that were passed to this script

        The child's stdout and stderr are relayed to this process's stdout and
        stderr, and this process then exits with the child's return code, so
        this function never returns normally.
        """
        print("PASSING THROUGH",file=sys.stderr)
        # PATH_TO_UNOCONV is still split on whitespace, as before. The script's
        # own arguments are passed through untouched so that arguments
        # containing spaces (e.g. file paths) are not broken apart.
        cmd = PATH_TO_UNOCONV.split() + sys.argv[1:]
        child = sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.PIPE)
        stdout, stderr = child.communicate()

        # The captured output is raw bytes: write it to the binary layer when
        # the stream has one (Python 3) instead of printing a b'...' repr.
        for stream, data in ((sys.stdout, stdout), (sys.stderr, stderr)):
            stream.flush()
            getattr(stream, 'buffer', stream).write(data)
        sys.exit(child.returncode)

    class ArgParse(argparse.ArgumentParser):
        """
        ArgumentParser variant whose exit() hands failed parses to unoconv.

        A zero status behaves exactly like the stock parser. Any other status
        (usually caused by flags this wrapper does not declare) calls
        pass_through so that unoconv handles the invocation instead.
        """

        def exit(self, status=0,message=None):
            # Clean exit: defer to the normal ArgumentParser behaviour.
            if status == 0:
                return super(ArgParse,self).exit(status=status,message=message)
            # Anything else: let unoconv deal with the original arguments.
            pass_through()

    if __name__ == '__main__':
        # Only -f, -o and one input path are declared. Any other flag makes
        # argparse exit with a non-zero status, which ArgParse.exit turns into
        # a fallback to unoconv.
        parser = ArgParse(description="Wrapper for unoconv that farms pdf conversions to google drive, using any args other than the supplied will cause it to fallback on unoconv")
        parser.add_argument('-f', metavar='format', help='Desired ouput format')
        parser.add_argument('-o', metavar='output_file',  help='Path to output file')
        parser.add_argument('fname', metavar='inputf', type=str, nargs=1, help='Path to file to convert')

        args = parser.parse_args()

        fmt = args.f
        output_file = args.o
        input_file = args.fname[0]

        # -f and -o are optional, so either may be None. Drive is only used
        # when PDF output was requested explicitly and an output path was
        # given; every other combination is left to unoconv.
        wants_pdf = fmt is not None and fmt.upper() == "PDF"
        if wants_pdf and output_file is not None:
            try:
                convert_to_pdf(input_file, output_file)
            except Exception as exc:
                # Catch Exception rather than using a bare except so Ctrl-C and
                # SystemExit are not turned into a fallback conversion.
                print("Drive conversion failed: %s" % (exc,), file=sys.stderr)
                pass_through()
        else:
            #if we aren't converting the file to a PDF, let unoconv handle it
            pass_through()

0 个答案:

没有答案