Google Dataflow pipeline executes code locally when the runner is DataflowRunner

Date: 2019-08-21 14:57:12

Tags: python sqlalchemy apache-beam

I have a Dataflow Python script:

from __future__ import print_function

import argparse
import logging
import apache_beam as beam
import subprocess
import sys
from pycloudsqlproxy import connect as proxy_connect
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from apache_beam.io.gcp.internal.clients import bigquery
from beam_nuggets.io import relational_db
from sqlalchemy import create_engine

"""
full Command to Run Locally:

python load_sql_to_bq.py \
        --user <user> \
        --client <client> \
        --table <table> \
        --password <password> \
        --noauth_local_webserver \
        --project my-project \
        --job_name load_sql_to_bq-local-$USER \
        --runner DirectRunner \
        --requirements_file requirements.txt \
        --setup_file ./setup.py \

Full Command to Run in DataFlow:

python load_sql_to_bq.py \
    --user <user> \
    --client <client> \
    --table <table> \
    --password <password> \
    --project my-project \
    --job_name load_sql_to_bq \
    --staging-location gs://my-project-dataflow/staging \
    --temp-location gs://icentris-ml-dataflow/tmp \
    --runner DataflowRunner \
    --requirements_file requirements.txt \
    --setup_file ./setup.py
"""

"""
Custom SourceConfiguration class because we need to use a socket for CloudSql
"""
class mysql_cfg(object):
    def __init__(self, username, password, database, cloud_sql_instance=None, host=None, port=None):
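        # Mimic a beam_nuggets SourceConfiguration: expose just the `url` and
        # `create_if_missing` attributes that the relational_db transforms read.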
        self.create_if_missing = False
        if cloud_sql_instance:
            self.url = 'mysql+pymysql://{username}:{password}@/{database}?unix_socket=/cloudsql/{cloud_sql_instance}'.format(
                    username=username,
                    password=password,
                    database=database,
                    cloud_sql_instance=cloud_sql_instance)
        else:
            self.url = 'mysql+pymysql://{username}:{password}@{host}:{port}/{database}'.format(
                    username=username,
                    password=password,
                    host=host,
                    port=port,
                    database=database)

class Options(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument('--client', nargs=1, help='monat, idlife, stampinup')
        parser.add_argument('--user', nargs=1, help='mysql username')
        parser.add_argument('--password', nargs=1, help='mysql password')
        parser.add_argument('--table', nargs=1, help='mysql table')

class LoadSqlToBQ():
    def __init__(self, pipeline_options, database, table):
        self._pipeline_options = pipeline_options
        self._database = database
        self._table = table

    def field_schema(self, field):
        schema = bigquery.TableFieldSchema()

        schema.name = field['COLUMN_NAME']

        """
        Simple attempt to do some basic datatypes.
        Fallback to String for unknowns and then you can fix it later in bigquery.
        """
        datatype = field['DATA_TYPE'].upper()
        if "DATETIME" in datatype:
           t = "DATETIME"
        elif "DATE" in datatype:
            t = "DATE"
        elif "INT" in datatype:
            t = "INTEGER"
        elif "FLOAT" in datatype or "DOUBLE" in datatype or "DECIMAL" in datatype:
            t =  "FLOAT"
        else:
            t = "STRING"

        schema.type = t
        """
        Simple check for mode.
        Valid options include nullable, required, repeated
        """
        if field['IS_NULLABLE'] == 'NO':
            schema.mode = 'required'
        else:
            schema.mode = 'nullable'

        return schema

    def create_bq_schema(self):
        engine = create_engine(self._source_config.url)

        sql = 'SELECT * FROM information_schema.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s'
        rs = engine.execute(sql, self._database, self._table)
        table_schema = bigquery.TableSchema()
        for r in rs:
            table_schema.fields.append(self.field_schema(r))

        return table_schema

    def execute_pipeline(self, pipeline, table_schema):
        dataset = self._database.replace('-', '_')
        table_spec = dataset+'.'+self._table
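        # WriteToBigQuery below uses WRITE_EMPTY (fail if the destination table
        # already contains data) and CREATE_IF_NEEDED (create it from table_schema).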
        output = pipeline | "Reading {} records from {} in CloudSql".format(self._table, self._database) >> relational_db.ReadFromDB(
            source_config=self._source_config,
            table_name=self._table
        ) | 'Writing {} to {} BigQuery'.format(self._table, dataset) >> beam.io.WriteToBigQuery(
            table_spec,
            schema=table_schema,
            write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)

    def run(self, mysql_config):
        with beam.Pipeline(options=self._pipeline_options) as p:
            self._source_config = mysql_config

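            # NOTE: this executes right here, at pipeline-construction time,
            # on the machine that builds the job graph -- not on the workers.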
            schema = self.create_bq_schema()
            self.execute_pipeline(p, schema)

            # No explicit p.run() needed: the with-block runs the pipeline on exit.

def run(argv=None):
    log = logging.getLogger()
    log.info('>> Running cloudsql to bigquery pipeline')

    options = Options(flags=argv)

    client = options.client[0]
    database = 'pyr-{}-prod'.format(client)
    user = options.user[0]
    password = options.password[0]
    table = options.table[0]

    standard_options = options.view_as(StandardOptions)
    if hasattr(standard_options, 'runner') and standard_options.runner == 'DataflowRunner':
        mysql_config = mysql_cfg(
            username=user,
            password=password,
            database=database,
            cloud_sql_instance='my-project:us-west1:db-instance')
    else:
        proxy_connect()
        mysql_config = mysql_cfg(
                username=user,
                password=password,
                database=database,
                host='127.0.0.1',
                port=3306)

    load_sql_to_bq = LoadSqlToBQ(options, database, table)
    load_sql_to_bq.run(mysql_config)

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    logging.getLogger().info('> Starting DataFlow Pipeline Runner')
    run()

The code runs fine when I use --runner DirectRunner, but when I run it with --runner DataflowRunner I get the following error:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/pymysql/connections.py", line 571, in connect
    sock.connect(self.unix_socket)
FileNotFoundError: [Errno 2] No such file or directory

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 2265, in _wrap_pool_connect
    return fn()
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 363, in connect
    return _ConnectionFairy._checkout(self)
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 760, in _checkout
    fairy = _ConnectionRecord.checkout(pool)
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 492, in checkout
    rec = pool._do_get()
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/impl.py", line 139, in _do_get
    self._dec_overflow()
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/util/langhelpers.py", line 68, in __exit__
    compat.reraise(exc_type, exc_value, exc_tb)
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 153, in reraise
    raise value
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/impl.py", line 136, in _do_get
    return self._create_connection()
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 308, in _create_connection
    return _ConnectionRecord(self)
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 437, in __init__
    self.__connect(first_connect_check=True)
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 639, in __connect
    connection = pool._invoke_creator(self)
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/strategies.py", line 114, in connect
    return dialect.connect(*cargs, **cparams)
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/default.py", line 453, in connect
    return self.dbapi.connect(*cargs, **cparams)
  File "/opt/conda/lib/python3.7/site-packages/pymysql/__init__.py", line 94, in Connect
    return Connection(*args, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/pymysql/connections.py", line 325, in __init__
    self.connect()
  File "/opt/conda/lib/python3.7/site-packages/pymysql/connections.py", line 630, in connect
    raise exc
pymysql.err.OperationalError: (2003, "Can't connect to MySQL server on 'localhost' ([Errno 2] No such file or directory)")

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "load_sql_to_bq.py", line 179, in <module>
    run()
  File "load_sql_to_bq.py", line 173, in run
    load_sql_to_bq.run(mysql_config)
  File "load_sql_to_bq.py", line 139, in run
    schema = self.create_bq_schema()
  File "load_sql_to_bq.py", line 116, in create_bq_schema
    rs = engine.execute(sql, self._database, self._table)
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 2168, in execute
    connection = self._contextual_connect(close_with_result=True)
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 2229, in _contextual_connect
    self._wrap_pool_connect(self.pool.connect, None),
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 2269, in _wrap_pool_connect
    e, dialect, self
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1536, in _handle_dbapi_exception_noconnection
    util.raise_from_cause(sqlalchemy_exception, exc_info)
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 398, in raise_from_cause
    reraise(type(exception), exception, tb=exc_tb, cause=cause)
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 152, in reraise
    raise value.with_traceback(tb)
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 2265, in _wrap_pool_connect
    return fn()
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 363, in connect
    return _ConnectionFairy._checkout(self)
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 760, in _checkout
    fairy = _ConnectionRecord.checkout(pool)
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 492, in checkout
    rec = pool._do_get()
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/impl.py", line 139, in _do_get
    self._dec_overflow()
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/util/langhelpers.py", line 68, in __exit__
    compat.reraise(exc_type, exc_value, exc_tb)
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 153, in reraise
    raise value
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/impl.py", line 136, in _do_get
    return self._create_connection()
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 308, in _create_connection
    return _ConnectionRecord(self)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 437, in __init__
    self.__connect(first_connect_check=True)
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 639, in __connect
    connection = pool._invoke_creator(self)
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/strategies.py", line 114, in connect
    return dialect.connect(*cargs, **cparams)
  File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/default.py", line 453, in connect
    return self.dbapi.connect(*cargs, **cparams)
  File "/opt/conda/lib/python3.7/site-packages/pymysql/__init__.py", line 94, in Connect
    return Connection(*args, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/pymysql/connections.py", line 325, in __init__
    self.connect()
  File "/opt/conda/lib/python3.7/site-packages/pymysql/connections.py", line 630, in connect
    raise exc
sqlalchemy.exc.OperationalError: (pymysql.err.OperationalError) (2003, "Can't connect to MySQL server on 'localhost' ([Errno 2] No such file or directory)")
(Background on this error at: http://sqlalche.me/e/e3q8)

I had assumed that if I moved things around so that the query only executes after the with Pipeline statement, it would not actually run until the code had been deployed to Cloud Dataflow.
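
To illustrate what I mean: as far as I understand, only code inside a transform (e.g. a DoFn) executes on the Dataflow workers; everything else in the script runs while the job graph is being built, on the machine that launches the job. A minimal sketch of that distinction (ConnectOnWorkerFn and the connection URL are hypothetical placeholders, and this assumes a Beam version that supports DoFn.setup()):

import apache_beam as beam

class ConnectOnWorkerFn(beam.DoFn):
    """Hypothetical DoFn: setup() and process() execute on the workers,
    not on the machine that submits the job."""

    def __init__(self, db_url):
        # __init__ runs locally while the graph is built;
        # keep it cheap and picklable -- no connections here.
        self._db_url = db_url
        self._engine = None

    def setup(self):
        # Runs once per DoFn instance, on the worker itself.
        from sqlalchemy import create_engine
        self._engine = create_engine(self._db_url)

    def process(self, element):
        # Also runs on the workers.
        yield element

with beam.Pipeline() as p:
    # Anything out here runs at construction time, on the launching
    # machine -- which is where my schema query currently executes.
    _ = (p
         | beam.Create(['placeholder'])
         | beam.ParDo(ConnectOnWorkerFn('mysql+pymysql://user:pw@host/db')))

If that model is right, the schema query would have to move into something like setup() above before it could run anywhere other than my local machine.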

How do I fix this?

0 Answers:

No answers yet.