I want to build a simple wrapper around SQLAlchemy (Python 3.6), but I'm confused at the point of mapping the session and the table classes. What should I do now? Should I implement a DBManager that provides interaction with all tables (and how do I implement that correctly?), or should it return objects that each interact with a specific table/class? I'm not sure which approach is correct, and this is also my first time working with an ORM. Thank you for your time.
class Singleton(type):
_instances = {}
def __call__(cls, *args, **kwargs):
if cls not in cls._instances:
cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
return cls._instances[cls]
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from sqlalchemy.ext.automap import automap_base
class DBManager(metaclass=Singleton):
def __init__(self, config_path=''):
self._db_operator = None
self._db_config = {}
self._db_tables = {}
self._error = ''
if config_path:
self.load_config(config_path)
self.connect_to_database()
def connect_to_database(self):
self._clean()
DB_USER = ''
DB_PASS = ''
DB_HOST = ''
DATABASE = ''
DB_PORT = 3306
try:
DB_USER = self._db_config['DB_USER']
DB_PASS = self._db_config['DB_PASSWORD']
DB_HOST = self._db_config['DB_HOST']
DATABASE = self._db_config['DB_NAME']
        except KeyError:
            # Fall back to the empty default credentials above if the config is missing keys.
            pass
connection_string = 'mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4'.format(DB_USER, DB_PASS, DB_HOST, DB_PORT,
DATABASE)
engine = create_engine(connection_string, echo=True)
session = Session(engine)
Base = automap_base()
Base.prepare(engine, reflect=True)
user = Base.classes.users
article = Base.classes.article
session.add(user(username='ohyeah'))
session.commit()
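For reference, the second option I'm asking about (handing back objects that each wrap a single table) could look roughly like the sketch below. It builds on the automapped classes and session from the code above; the names TableRepository and the add/all methods are only hypothetical placeholders, not an existing API.

class TableRepository:
    """Wraps one automapped table class together with a session."""
    def __init__(self, session, model):
        self.session = session
        self.model = model

    def add(self, **fields):
        # Create a row object for this table and persist it.
        obj = self.model(**fields)
        self.session.add(obj)
        self.session.commit()
        return obj

    def all(self):
        return self.session.query(self.model).all()

# DBManager could then expose repositories instead of the raw session, e.g.:
#     users_repo = TableRepository(session, Base.classes.users)
#     users_repo.add(username='ohyeah')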
Answer 0 (Score: 0)
import math
import time
import numpy as np
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine,text
from functools import wraps
import datetime
from io import StringIO
import csv
class Connector(object):
"""
con.init_engine()
with con.engine.begin() as conn:
        response = con.exec_query(_query, _transaction_connection=conn)
con.push_df('tablename',df,is_replace=True,schema='',con=conn,method='postgrescopy')
con.engine.dispose()
if response.cursor:
result = response.fetchone()
else:
result = response.rowcount
"""
def __init__(self, engine_args, logger=None):
"""
        This is the class constructor.
        :param engine_args: a dict or a connection string, e.g.
engine_args = {"db_type": 'postgresql',
"address":'Name:Port',
"user": "",
"password": "",
"db_name": "databasename" }
or engine_args ='sqlite:///my_db.sqlite'
:param logger: pass any logger object
"""
self.logger = logger
self.engine_args = engine_args
        if isinstance(engine_args, str):
self.connection_string = engine_args
else:
self.connection_string = "{db_type}://{user}:{password}@{address}/{db_name}".format(**self.engine_args)
def init_engine(self,echo=False, **kwargs):
"""
        Initialise the SQLAlchemy engine via sqlalchemy.create_engine(); any keyword arguments are passed through. For PostgreSQL, pass use_batch_mode=True for faster query execution, etc.
"""
kwargs['echo']=echo
if self.logger:
self.logger.debug("initiating engine with these options {}".format(kwargs))
self.engine = create_engine(self.connection_string, **kwargs)
if self.logger:
            if isinstance(self.engine_args, str):
self.logger.info('initiated engine with {}'.format(self.engine))
else:
self.logger.info('initiated engine on address: {} with database: {} username: {}'.format(self.engine_args['address'],
self.engine_args['db_name'],
self.engine_args['user']))
    def __remove_non_ascii(self, text):
        """Strip non-ASCII characters from a string value; numeric, datetime and null values pass through."""
        if isinstance(text, (pd.Timestamp, datetime.datetime, float, int)):
            return text
        if pd.isnull(text):
            return np.nan
        return ''.join(i for i in text if ord(i) < 128)
    def __remove_non_ascii_df(self, df) -> pd.DataFrame:
        """Apply __remove_non_ascii to every column of the DataFrame."""
        for column in df.columns:
            df[column] = df[column].apply(self.__remove_non_ascii)
        return df
def __psql_insert_copy(self,table, conn, keys, data_iter):
"""
Execute SQL statement inserting data
Parameters
----------
table : pandas.io.sql.SQLTable
conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
keys : list of str
Column names
data_iter : Iterable that iterates the values to be inserted
"""
# gets a DBAPI connection that can provide a cursor
dbapi_conn = conn.connection
with dbapi_conn.cursor() as cur:
s_buf = StringIO()
writer = csv.writer(s_buf)
writer.writerows(data_iter)
s_buf.seek(0)
columns = ', '.join('"{}"'.format(k) for k in keys)
if table.schema:
table_name = '{}.{}'.format(table.schema, '"'+table.name+'"')
else:
table_name = '"'+table.name+'"'
sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(
table_name, columns)
cur.copy_expert(sql=sql, file=s_buf)
def __write_df(self, table_name, df, **kwargs):
"""
        Private helper, called only by the public methods of this class.
        Writes a DataFrame to the database by calling df.to_sql().
        :param table_name: table in which to save the data
        :param df: the DataFrame object
        :param **kwargs: any additional parameters to pass through to df.to_sql()
"""
conn = self.engine
if 'con' in kwargs.keys():
conn = kwargs['con']
kwargs.pop('con', None)
index = False
if 'index' in kwargs.keys():
index = kwargs['index']
kwargs.pop('index',None)
df.to_sql(table_name, con=conn, index=index, **kwargs)
return True
def __write_split_df(self, table_name, dfs, **kwargs):
"""
        Private helper, called only by the public methods of this class.
        Saves a list of DataFrames to the database by calling the private
        function __write_df for each individual DataFrame.
        :param table_name: table in which to save the data
        :param dfs: list of DataFrame objects
        :param **kwargs: any additional parameters to pass through to df.to_sql()
"""
self.__write_df(table_name, dfs[0], **kwargs)
if self.logger:
self.logger.info("Pushed {} rows in table : {}".format(len(dfs[0]), table_name))
kwargs.pop('if_exists', None)
for df in dfs[1:]:
self.__write_df(table_name, df, if_exists='append', **kwargs)
if self.logger:
self.logger.info("Pushed {} rows in table : {}".format(len(df), table_name))
return True
def __split_df(self, df, chunksize):
"""
        Private helper, called only by the public methods of this class.
        Splits a large DataFrame into chunks.
        :param df: the DataFrame object
        :param chunksize: number of rows per chunk
"""
chunk_count = int(math.ceil(len(df) / chunksize))
return np.array_split(df, chunk_count)
def __df_cleanup(self, df):
"""
        Private helper, called only by the public methods of this class.
        Cleans column names and data in the DataFrame.
        :param df: the DataFrame object
"""
        # Strip characters that commonly break column names.
        df.columns = df.columns.str.replace("(", "", regex=False)
        df.columns = df.columns.str.replace(")", "", regex=False)
        df.columns = df.columns.str.replace("%", "per", regex=False)
        df.columns = df.columns.str.replace(r"\\t|\\n|\\r|\t|\n|\r", "", regex=True)
        df = self.__remove_non_ascii_df(df)
        # Remove special characters (tabs, newlines, quotes) from the cell values as well.
        df.replace(to_replace=[r"\\t|\\n|\\r|'|\t|\n|\r"], value=[""], regex=True, inplace=True)
return df
@staticmethod
def df_col_detect(dfparam:pd.DataFrame, is_all_data_string) -> dict:
"""
df_col_detect is a static function and can be called as Connector.df_col_detect()
Determines data types from dataframe and returns a dictionary of the database data types.
:param dfparam: the dataframe.
:param is_all_data_string : Treat all columns as string type.
:return: Dict
"""
dtypedict = {}
for i, j in zip(dfparam.columns, dfparam.dtypes):
if (is_all_data_string):
if (i == 'date'):
dtypedict.update({i: sqlalchemy.types.DateTime()})
else:
dtypedict.update({i: sqlalchemy.types.VARCHAR()})
else:
if "object" in str(j):
dtypedict.update({i: sqlalchemy.types.VARCHAR()})
if "date" in str(j):
dtypedict.update({i: sqlalchemy.types.DateTime()})
if "float" in str(j):
dtypedict.update({i: sqlalchemy.types.Float()})
if "int" in str(j):
dtypedict.update({i: sqlalchemy.types.INT()})
return dtypedict
def __check_engine(func):
"""
        This is a private decorator. It checks for the engine attribute on the instance and,
        if not found, calls init_engine() before running the wrapped function.
"""
@wraps(func)
def inner(*args, **kwargs):
if not hasattr(args[0], 'engine'):
if 'use_batch_mode' in kwargs.keys():
args[0].init_engine(use_batch_mode=kwargs['use_batch_mode'])
else:
args[0].init_engine()
kwargs.pop('use_batch_mode',None)
return func(*args, **kwargs)
return inner
@__check_engine
def push_df_large(self, table_name, df, is_replace=False, chunksize=10 ** 5,call_df_cleanup=False,method=None,schema=None, **kwargs):
"""
        This function pushes large datasets to the database, splitting the DataFrame into chunks when needed.
:param table_name: Table to save data.
:param df: Dataframe to be saved in database.
        :param is_replace: Pass False if you want to append data to an existing table. Pass True if you want to create
            or replace the table with the new data.
        :param call_df_cleanup: If True, the DataFrame is cleaned up first: non-ASCII characters and special characters
            such as tabs or newlines are removed from the data.
        :param chunksize: Number of rows in each batch to be written at a time. By default, 100000 rows are written per batch.
        :param **kwargs: This function calls df.to_sql() internally; any additional parameters are passed through as
            keyword arguments, e.g. push_df_large(table_name, df, is_replace=False, chunksize=10 ** 5, schema='raw',
            dtype=df_col_detect(df, True), use_batch_mode=True). dtype and schema are forwarded to df.to_sql() via **kwargs.
:return : True or False
        :param method: default None. Accepted values are 'postgrescopy', None or 'multi'.
            Controls the SQL insertion clause used:
            'postgrescopy': for PostgreSQL or PostgreSQL-flavoured databases such as Redshift; a very fast insert method for Postgres.
            None: uses the standard SQL INSERT clause (one per row).
            'multi': passes multiple values in a single INSERT clause.
        con : Assign a value only when you want to maintain a transaction,
example :
con.init_engine()
with con.engine.begin() as conn:
            response = con.exec_query(_query, _transaction_connection=conn)
con.push_df('tablename',df,is_replace=True,schema='czuc',con=conn,method='postgrescopy')
con.engine.dispose()
if response.cursor:
result = response.fetchone()
else:
result = response.rowcount
schema : string, optional
Specify the schema (if database flavor supports this). If None, use default schema.
index : bool, default True
Write DataFrame index as a column. Uses index_label as the column name in the table.
index_label : string or sequence, default None
Column label for index column(s). If None is given (default) and index is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex.
chunksize : int, optional
Rows will be written in batches of this size at a time. By default, all rows will be written at once.
dtype : dict, optional
Specifying the datatype for columns. The keys should be the column names and the values should be the SQLAlchemy types or strings for the sqlite3 legacy mode.
:return : True or False
        use_batch_mode : Optional, True or False.
            Initialises the db engine with the use_batch_mode option if it was not initialised earlier.
            See SQLAlchemy's create_engine() documentation for when to use use_batch_mode.
"""
if df.empty:
raise Exception ("Error : Empty DataFrame.")
if df.columns.duplicated().any():
raise Exception ("Error : Duplicate columns in DataFrame.")
dispose_engine = True
if 'con' in kwargs.keys():
dispose_engine = False
dfsize = chunksize
if (is_replace):
if_exists = 'replace'
else:
if_exists = 'append'
kwargs['schema'] = schema
if call_df_cleanup:
df = self.__df_cleanup(df)
table_name = table_name.replace("'", "").replace('"', '').lower()
status = False
if dfsize is None:
dfsize = 10 ** 5
kwargs.pop('chunksize', None)
        if method is not None:
            if method == 'postgrescopy':
                method = self.__psql_insert_copy
kwargs['method'] = method
s = time.time()
if len(df) > dfsize:
dfs = self.__split_df(df, dfsize)
status = self.__write_split_df(table_name, dfs, if_exists=if_exists, **kwargs)
if self.logger:
self.logger.info("Total {} rows pushed in table : {} within: {}s".format(len(df), table_name,
round(time.time() - s, 4)))
else:
status = self.__write_df(table_name, df, if_exists=if_exists, chunksize=dfsize, **kwargs)
if self.logger:
self.logger.info("Pushed {} rows in table : {} within: {}s".format(len(df), table_name,
round(time.time() - s, 4)))
#if self.logger:
#self.logger.info('Pushed name: {} dataframe shape: {} within: {}s'.format(table_name,
#df.shape,
#round(time.time() - s, 4)))
if dispose_engine:
self.__db_dispose()
return status
@__check_engine
def push_df(self, table_name, df, is_replace=False, call_df_cleanup=False, method=None,schema=None, **kwargs):
"""
        This function pushes a dataset (DataFrame) to the database.
:param table_name: Table to save data.
:param df: Dataframe to be saved in database.
        :param is_replace: Pass False if you want to append data to an existing table. Pass True if you want to create
            or replace the table with the new data.
        :param call_df_cleanup: If True, the DataFrame is cleaned up first: non-ASCII characters and special characters
            such as tabs or newlines are removed from the data.
        :param **kwargs: This function calls df.to_sql() internally; any additional parameters are passed through as
            keyword arguments, e.g. push_df(table_name, df, is_replace=False, chunksize=10 ** 5, schema='raw',
            dtype=df_col_detect(df, True), use_batch_mode=True). chunksize, dtype and schema are forwarded to df.to_sql() via **kwargs.
        :param method: default None. Accepted values are 'postgrescopy', None or 'multi'.
            Controls the SQL insertion clause used:
            'postgrescopy': for PostgreSQL or PostgreSQL-flavoured databases such as Redshift; a very fast insert method for Postgres.
            None: uses the standard SQL INSERT clause (one per row).
            'multi': passes multiple values in a single INSERT clause.
        con : Optional. Assign a value only when you want to maintain a transaction,
example :
con.init_engine()
with con.engine.begin() as conn:
            response = con.exec_query(_query, _transaction_connection=conn)
con.push_df('tablename',df,is_replace=True,schema='czuc',con=conn,method='postgrescopy')
con.engine.dispose()
if response.cursor:
result = response.fetchone()
else:
result = response.rowcount
schema : string, optional
Specify the schema (if database flavor supports this). If None, use default schema.
index : bool, default True
Write DataFrame index as a column. Uses index_label as the column name in the table.
index_label : string or sequence, default None
Column label for index column(s). If None is given (default) and index is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex.
chunksize : int, optional
Rows will be written in batches of this size at a time. By default, all rows will be written at once.
dtype : dict, optional
Specifying the datatype for columns. The keys should be the column names and the values should be the SQLAlchemy types or strings for the sqlite3 legacy mode.
:return : True or False
        use_batch_mode : Optional, True or False.
            Initialises the db engine with the use_batch_mode option if it was not initialised earlier.
            See SQLAlchemy's create_engine() documentation for when to use use_batch_mode.
"""
if df.empty:
raise Exception ("Error : Empty DataFrame.")
if df.columns.duplicated().any():
raise Exception ("Error : Duplicate columns in DataFrame.")
dispose_engine = True
if 'con' in kwargs.keys():
dispose_engine = False
if (is_replace):
if_exists = 'replace'
else:
if_exists = 'append'
kwargs['schema'] = schema
if call_df_cleanup:
df = self.__df_cleanup(df)
table_name = table_name.replace("'", "").replace('"', '').lower()
# Format the Dataframe in preparation
# hit it up
status = False
        if method is not None:
            if method == 'postgrescopy':
                method = self.__psql_insert_copy
kwargs['method'] = method
s = time.time()
status = self.__write_df(table_name, df, if_exists=if_exists, **kwargs)
if self.logger:
self.logger.info("Pushed {} rows in table : {} within: {}s".format(len(df), table_name,
round(time.time() - s, 4)))
if dispose_engine:
self.__db_dispose()
return status
@__check_engine
def get_df_large_table(self, table_name, **kwargs)-> pd.DataFrame:
"""
        This function performs a SELECT * FROM the given table, reading in batches of 100000 rows by default.
        :param **kwargs: any additional parameters
"""
s = time.time()
if 'chunksize' not in kwargs.keys():
kwargs['chunksize'] = 10 ** 5
dfs = pd.read_sql_table(table_name, self.engine, **kwargs)
try:
df = pd.concat(dfs, axis=0)
        except ValueError:  # No objects to concatenate; dfs is a generator object, so it has no len().
            if self.logger:
                self.logger.warning("No objects to concatenate for table_name: {}".format(table_name))
return None
length = 0
if df is not None:
length = len(df)
if self.logger:
self.logger.info('fetched {} rows from {} within: {}'.format(length,table_name,
round(time.time() - s, 4)))
self.__db_dispose()
return df
@__check_engine
    def get_df_table(self, table_name, **kwargs) -> pd.DataFrame:
        """Read an entire table into a DataFrame via pd.read_sql_table (no chunking)."""
s = time.time()
kwargs.pop('chunksize', None)
df = pd.read_sql_table(table_name, self.engine, **kwargs)
length = 0
if df is not None:
length = len(df)
if self.logger:
self.logger.info('fetched {} rows from {} within: {}'.format(length,table_name,
round(time.time() - s, 4)))
self.__db_dispose()
return df
@__check_engine
def exec_query(self, _query,_transaction_connection=None,**kwargs):
"""
        This function executes any query against the database, e.g. DDL queries.
        :param _query: SQL query.
        :param _transaction_connection: pass a connection if a transaction needs to be maintained; in that case
            the caller is responsible for engine disposal and connection closure.
"""
if self.logger:
self.logger.info('running query: "{}"'.format(_query))
_query = text(_query)
result = None
if _transaction_connection is not None:
con = _transaction_connection
result = con.execute(_query)
else:
            if isinstance(self.engine, sqlalchemy.engine.base.Engine):
with self.engine.connect() as con:
response = con.execute(_query)
if response.cursor:
result = response.fetchone()
else:
result = response.rowcount
else:
"""Below work around is for SQLite if some how we use native sqlite engine for excute query"""
result = self.engine.execute(_query)
if _transaction_connection is None:
self.__db_dispose()
return result
@__check_engine
def get_df_query(self, _query,sqlite_text_factory=None,**kwargs) -> pd.DataFrame:
"""
        This function executes any DML query against the database and returns the result as a DataFrame.
        It wraps pd.read_sql_query(sql, con, index_col=None, coerce_float=True, params=None, parse_dates=None, chunksize=None),
        which reads a SQL query into a DataFrame corresponding to the result set of the query string. Optionally provide an
        index_col parameter to use one of the columns as the index; otherwise the default integer index is used.
        :return: DataFrame
"""
if self.logger:
self.logger.info('running query: "{}"'.format(_query))
s = time.time()
if sqlite_text_factory is None:
con = self.engine
else:
con = self.engine.raw_connection()
con.connection.text_factory = sqlite_text_factory
result = pd.read_sql_query(_query, con=con)
length = 0
if result is not None:
length = len(result)
if self.logger:
self.logger.info('Finished running query with rows fetched {} within: {}'.format(
length,
round(time.time() - s, 4)))
self.__db_dispose()
return result
def __db_dispose(self):
"""This function dispose database"""
        if isinstance(self.engine, sqlalchemy.engine.base.Engine):
self.engine.dispose()
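To tie the pieces together, here is a minimal usage sketch of the Connector class above. The connection details, table name and DataFrame are placeholder values, and it assumes a PostgreSQL database reachable through the psycopg2 driver (needed for the 'postgrescopy' method).

import logging
import pandas as pd

# Placeholder connection details; adjust for a real database.
engine_args = {"db_type": "postgresql",
               "address": "localhost:5432",
               "user": "demo_user",
               "password": "demo_password",
               "db_name": "demo_db"}

con = Connector(engine_args, logger=logging.getLogger(__name__))
con.init_engine()

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
# Fast COPY-based insert; leaving method=None falls back to plain INSERT statements.
con.push_df('demo_table', df, is_replace=True, method='postgrescopy')
result = con.get_df_query("select count(*) as n from demo_table")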