I have a Dataflow Python script:
from __future__ import print_function
import argparse
import logging
import apache_beam as beam
import subprocess
import sys
from pycloudsqlproxy import connect as proxy_connect
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from apache_beam.io.gcp.internal.clients import bigquery
from beam_nuggets.io import relational_db
from sqlalchemy import create_engine
"""
Full command to run locally:
python load_sql_to_bq.py \
--user <user> \
--client <client> \
--table <table> \
--password <password> \
--noauth_local_webserver \
--project my-project \
--job_name load_sql_to_bq-local-$USER \
--runner DirectRunner \
--requirements_file requirements.txt \
--setup_file ./setup.py
Full command to run on Dataflow:
python load_sql_to_bq.py \
--user <user> \
--client <client> \
--table <table> \
--password <password> \
--project my-project \
--job_name load-sql-to-bq \
--staging_location gs://my-project-dataflow/staging \
--temp_location gs://my-project-dataflow/tmp \
--runner DataflowRunner \
--requirements_file requirements.txt \
--setup_file ./setup.py
"""
"""
Custom SourceConfiguration class because we need to use a socket for CloudSql
"""
class mysql_cfg(object):
def __init__(self, username, password, database, cloud_sql_instance=None, host=None, port=None):
self.create_if_missing = False
if cloud_sql_instance:
self.url = 'mysql+pymysql://{username}:{password}@/{database}?unix_socket=/cloudsql/{cloud_sql_instance}'.format(
username=username,
password=password,
database=database,
cloud_sql_instance=cloud_sql_instance)
else:
self.url = 'mysql+pymysql://{username}:{password}@{host}:{port}/{database}'.format(
username=username,
password=password,
host=host,
port=port,
database=database)
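# For reference, the two URL shapes MySqlConfig builds look like this
# (hypothetical credentials and names, shown only to illustrate the formats):
#   mysql+pymysql://user:secret@/pyr-client-prod?unix_socket=/cloudsql/my-project:us-west1:db-instance
#   mysql+pymysql://user:secret@127.0.0.1:3306/pyr-client-prod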
class Options(PipelineOptions):
@classmethod
def _add_argparse_args(cls, parser):
        parser.add_argument('--client', nargs=1, help='monat, idlife, stampinup')
parser.add_argument('--user', nargs=1, help='mysql username')
parser.add_argument('--password', nargs=1, help='mysql password')
parser.add_argument('--table', nargs=1, help='mysql table')
class LoadSqlToBQ():
def __init__(self, pipeline_options, database, table):
self._pipeline_options = pipeline_options
self._database = database
self._table = table
def field_schema(self, field):
schema = bigquery.TableFieldSchema()
schema.name = field['COLUMN_NAME']
"""
Simple attempt to do some basic datatypes.
Fallback to String for unknowns and then you can fix it later in bigquery.
"""
datatype = field['DATA_TYPE'].upper()
if "DATETIME" in datatype:
t = "DATETIME"
elif "DATE" in datatype:
t = "DATE"
elif "INT" in datatype:
t = "INTEGER"
elif "FLOAT" in datatype or "DOUBLE" in datatype or "DECIMAL" in datatype:
t = "FLOAT"
else:
t = "STRING"
schema.type = t
"""
Simple check for mode.
Valid options include nullable, required, repeated
"""
if field['IS_NULLABLE'] == 'NO':
schema.mode = 'required'
else:
schema.mode = 'nullable'
return schema
def create_bq_schema(self):
engine = create_engine(self._source_config.url)
sql = 'SELECT * FROM information_schema.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s'
rs = engine.execute(sql, self._database, self._table)
table_schema = bigquery.TableSchema()
for r in rs:
table_schema.fields.append(self.field_schema(r))
return table_schema
def execute_pipeline(self, pipeline, table_schema):
dataset = self._database.replace('-', '_')
table_spec = dataset+'.'+self._table
output = pipeline | "Reading {} records from {} in CloudSql".format(self._table, self._database) >> relational_db.ReadFromDB(
source_config=self._source_config,
table_name=self._table
) | 'Writing {} to {} BigQuery'.format(self._table, dataset) >> beam.io.WriteToBigQuery(
table_spec,
schema=table_schema,
write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
    def run(self, mysql_config):
        with beam.Pipeline(options=self._pipeline_options) as p:
            self._source_config = mysql_config
            schema = self.create_bq_schema()
            self.execute_pipeline(p, schema)
            # No explicit p.run() here: the with-block runs the pipeline on
            # exit, so calling run() as well would start it a second time.
def run(argv=None):
log = logging.getLogger()
log.info('>> Running cloudsql to bigquery pipeline')
options = Options(flags=argv)
client = options.client[0]
database = 'pyr-{}-prod'.format(client)
user = options.user[0]
password = options.password[0]
table = options.table[0]
standard_options = options.view_as(StandardOptions)
    if standard_options.runner == 'DataflowRunner':
        mysql_config = MySqlConfig(
username=user,
password=password,
database=database,
cloud_sql_instance='my-project:us-west1:db-instance')
else:
proxy_connect()
        mysql_config = MySqlConfig(
username=user,
password=password,
database=database,
host='127.0.0.1',
port=3306)
load_sql_to_bq = LoadSqlToBQ(options, database, table)
load_sql_to_bq.run(mysql_config)
if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
logging.getLogger().info('> Starting DataFlow Pipeline Runner')
run()
When I run the code with --runner DirectRunner, it works fine, but when I run it with --runner DataflowRunner, I get the following error:
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/pymysql/connections.py", line 571, in connect
sock.connect(self.unix_socket)
FileNotFoundError: [Errno 2] No such file or directory
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 2265, in _wrap_pool_connect
return fn()
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 363, in connect
return _ConnectionFairy._checkout(self)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 760, in _checkout
fairy = _ConnectionRecord.checkout(pool)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 492, in checkout
rec = pool._do_get()
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/impl.py", line 139, in _do_get
self._dec_overflow()
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/util/langhelpers.py", line 68, in __exit__
compat.reraise(exc_type, exc_value, exc_tb)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 153, in reraise
raise value
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/impl.py", line 136, in _do_get
return self._create_connection()
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 308, in _create_connection
return _ConnectionRecord(self)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 437, in __init__
self.__connect(first_connect_check=True)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 639, in __connect
connection = pool._invoke_creator(self)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/strategies.py", line 114, in connect
return dialect.connect(*cargs, **cparams)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/default.py", line 453, in connect
return self.dbapi.connect(*cargs, **cparams)
File "/opt/conda/lib/python3.7/site-packages/pymysql/__init__.py", line 94, in Connect
return Connection(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pymysql/connections.py", line 325, in __init__
self.connect()
File "/opt/conda/lib/python3.7/site-packages/pymysql/connections.py", line 630, in connect
raise exc
pymysql.err.OperationalError: (2003, "Can't connect to MySQL server on 'localhost' ([Errno 2] No such file or directory)")
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "load_sql_to_bq.py", line 179, in <module>
run()
File "load_sql_to_bq.py", line 173, in run
load_sql_to_bq.run(mysql_config)
File "load_sql_to_bq.py", line 139, in run
schema = self.create_bq_schema()
File "load_sql_to_bq.py", line 116, in create_bq_schema
rs = engine.execute(sql, self._database, self._table)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 2168, in execute
connection = self._contextual_connect(close_with_result=True)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 2229, in _contextual_connect
self._wrap_pool_connect(self.pool.connect, None),
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 2269, in _wrap_pool_connect
e, dialect, self
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1536, in _handle_dbapi_exception_noconnection
util.raise_from_cause(sqlalchemy_exception, exc_info)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 398, in raise_from_cause
reraise(type(exception), exception, tb=exc_tb, cause=cause)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 152, in reraise
raise value.with_traceback(tb)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 2265, in _wrap_pool_connect
return fn()
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 363, in connect
return _ConnectionFairy._checkout(self)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 760, in _checkout
fairy = _ConnectionRecord.checkout(pool)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 492, in checkout
rec = pool._do_get()
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/impl.py", line 139, in _do_get
self._dec_overflow()
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/util/langhelpers.py", line 68, in __exit__
compat.reraise(exc_type, exc_value, exc_tb)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 153, in reraise
raise value
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/impl.py", line 136, in _do_get
return self._create_connection()
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 308, in _create_connection
return _ConnectionRecord(self)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 437, in __init__
self.__connect(first_connect_check=True)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 639, in __connect
connection = pool._invoke_creator(self)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/strategies.py", line 114, in connect
return dialect.connect(*cargs, **cparams)
File "/opt/conda/lib/python3.7/site-packages/sqlalchemy/engine/default.py", line 453, in connect
return self.dbapi.connect(*cargs, **cparams)
File "/opt/conda/lib/python3.7/site-packages/pymysql/__init__.py", line 94, in Connect
return Connection(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pymysql/connections.py", line 325, in __init__
self.connect()
File "/opt/conda/lib/python3.7/site-packages/pymysql/connections.py", line 630, in connect
raise exc
sqlalchemy.exc.OperationalError: (pymysql.err.OperationalError) (2003, "Can't connect to MySQL server on 'localhost' ([Errno 2] No such file or directory)")
(Background on this error at: http://sqlalche.me/e/e3q8)
I had assumed that if I moved things around so that this query only executed after the with beam.Pipeline statement, it would not actually run until the code was deployed to Cloud Dataflow.
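To illustrate the kind of deferral I was assuming: as far as I understand, only code inside a DoFn actually executes on the Dataflow workers, so the only pattern I can think of that would really postpone the MySQL connection is a sketch like the following (not my actual code; QueryInformationSchemaFn is a hypothetical name, and it assumes the /cloudsql/ socket is reachable from the workers, which is the same assumption MySqlConfig already makes):

import apache_beam as beam
from sqlalchemy import create_engine

class QueryInformationSchemaFn(beam.DoFn):
    def __init__(self, url, database, table):
        # Keep only picklable state here; no connection is opened yet.
        self._url = url
        self._database = database
        self._table = table

    def setup(self):
        # setup() runs on the Dataflow worker after deployment,
        # not on the machine that submits the job.
        self._engine = create_engine(self._url)

    def process(self, element):
        sql = ('SELECT * FROM information_schema.COLUMNS '
               'WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s')
        for row in self._engine.execute(sql, self._database, self._table):
            yield dict(row)

# Would be used as, e.g.:
#   p | beam.Create([None]) | beam.ParDo(QueryInformationSchemaFn(cfg.url, db, table))

But even with that pattern, WriteToBigQuery needs the table schema while the pipeline is being constructed, before any worker code runs, so I do not see how to wire the schema back in that way. How can I fix this?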