I want to build a simple wrapper around SQLAlchemy (Python 3.6), but I'm confused at the point of mapping the session and the table classes. What should I do now? Should I implement a DBManager that provides interaction with all tables (and how do I implement that correctly?), or should it return objects that each interact with a specific table/class? I'm not sure which approach is correct, and this is also my first time working with an ORM. Thank you for your time.
class Singleton(type):
_instances = {}
def __call__(cls, *args, **kwargs):
if cls not in cls._instances:
cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
return cls._instances[cls]
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from sqlalchemy.ext.automap import automap_base
class DBManager(metaclass=Singleton):
def __init__(self, config_path=''):
self._db_operator = None
self._db_config = {}
self._db_tables = {}
self._error = ''
if config_path:
self.load_config(config_path)
self.connect_to_database()
def connect_to_database(self):
self._clean()
DB_USER = ''
DB_PASS = ''
DB_HOST = ''
DATABASE = ''
DB_PORT = 3306
try:
DB_USER = self._db_config['DB_USER']
DB_PASS = self._db_config['DB_PASSWORD']
DB_HOST = self._db_config['DB_HOST']
DATABASE = self._db_config['DB_NAME']
        except KeyError:
            # Fall back to the empty default credentials above if the config is missing keys.
            pass
connection_string = 'mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4'.format(DB_USER, DB_PASS, DB_HOST, DB_PORT,
DATABASE)
engine = create_engine(connection_string, echo=True)
session = Session(engine)
Base = automap_base()
Base.prepare(engine, reflect=True)
user = Base.classes.users
article = Base.classes.article
session.add(user(username='ohyeah'))
session.commit()
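For reference, the second option I'm asking about (handing back objects that each wrap a single table) could look roughly like the sketch below. It builds on the automapped classes and session from the code above; the names TableRepository and the add/all methods are only hypothetical placeholders, not an existing API.

class TableRepository:
    """Wraps one automapped table class together with a session."""
    def __init__(self, session, model):
        self.session = session
        self.model = model

    def add(self, **fields):
        # Create a row object for this table and persist it.
        obj = self.model(**fields)
        self.session.add(obj)
        self.session.commit()
        return obj

    def all(self):
        return self.session.query(self.model).all()

# DBManager could then expose repositories instead of the raw session, e.g.:
#     users_repo = TableRepository(session, Base.classes.users)
#     users_repo.add(username='ohyeah')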
Answer 0 (Score: 0)
import math
import time
import numpy as np
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine,text
from functools import wraps
import datetime
from io import StringIO
import csv
class Connector(object):
"""
con.init_engine()
with con.engine.begin() as conn:
        response = con.exec_query(_query, _transaction_connection=conn)
con.push_df('tablename',df,is_replace=True,schema='',con=conn,method='postgrescopy')
con.engine.dispose()
if response.cursor:
result = response.fetchone()
else:
result = response.rowcount
"""
def __init__(self, engine_args, logger=None):
"""
        This is the class constructor.
        :param engine_args: a dict or a connection string, e.g.
engine_args = {"db_type": 'postgresql',
"address":'Name:Port',
"user": "",
"password": "",
"db_name": "databasename" }
or engine_args ='sqlite:///my_db.sqlite'
:param logger: pass any logger object
"""
self.logger = logger
self.engine_args = engine_args
        if isinstance(engine_args, str):
self.connection_string = engine_args
else:
self.connection_string = "{db_type}://{user}:{password}@{address}/{db_name}".format(**self.engine_args)
def init_engine(self,echo=False, **kwargs):
"""
        Initialise the SQLAlchemy engine via sqlalchemy.create_engine(); any keyword arguments are passed through. For PostgreSQL, pass use_batch_mode=True for faster query execution, etc.
"""
kwargs['echo']=echo
if self.logger:
self.logger.debug("initiating engine with these options {}".format(kwargs))
self.engine = create_engine(self.connection_string, **kwargs)
if self.logger:
            if isinstance(self.engine_args, str):
self.logger.info('initiated engine with {}'.format(self.engine))
else:
self.logger.info('initiated engine on address: {} with database: {} username: {}'.format(self.engine_args['address'],
self.engine_args['db_name'],
self.engine_args['user']))
    def __remove_non_ascii(self, text):
        """Strip non-ASCII characters from a string value; numeric, datetime and null values pass through."""
        if isinstance(text, (pd.Timestamp, datetime.datetime, float, int)):
            return text
        if pd.isnull(text):
            return np.nan
        return ''.join(i for i in text if ord(i) < 128)
    def __remove_non_ascii_df(self, df) -> pd.DataFrame:
        """Apply __remove_non_ascii to every column of the DataFrame."""
        for column in df.columns:
            df[column] = df[column].apply(self.__remove_non_ascii)
        return df
def __psql_insert_copy(self,table, conn, keys, data_iter):
"""
Execute SQL statement inserting data
Parameters
----------
table : pandas.io.sql.SQLTable
conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
keys : list of str
Column names
data_iter : Iterable that iterates the values to be inserted
"""
# gets a DBAPI connection that can provide a cursor
dbapi_conn = conn.connection
with dbapi_conn.cursor() as cur:
s_buf = StringIO()
writer = csv.writer(s_buf)
writer.writerows(data_iter)
s_buf.seek(0)
columns = ', '.join('"{}"'.format(k) for k in keys)
if table.schema:
table_name = '{}.{}'.format(table.schema, '"'+table.name+'"')
else:
table_name = '"'+table.name+'"'
sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(
table_name, columns)
cur.copy_expert(sql=sql, file=s_buf)
def __write_df(self, table_name, df, **kwargs):
"""
        Private helper, called only by the public methods of this class.
        Writes a DataFrame to the database by calling df.to_sql().
        :param table_name: table in which to save the data
        :param df: the DataFrame object
        :param **kwargs: any additional parameters to pass through to df.to_sql()
"""
conn = self.engine
if 'con' in kwargs.keys():
conn = kwargs['con']
kwargs.pop('con', None)
index = False
if 'index' in kwargs.keys():
index = kwargs['index']
kwargs.pop('index',None)
df.to_sql(table_name, con=conn, index=index, **kwargs)
return True
def __write_split_df(self, table_name, dfs, **kwargs):
"""
        Private helper, called only by the public methods of this class.
        Saves a list of DataFrames to the database by calling the private
        function __write_df for each individual DataFrame.
        :param table_name: table in which to save the data
        :param dfs: list of DataFrame objects
        :param **kwargs: any additional parameters to pass through to df.to_sql()
"""
self.__write_df(table_name, dfs[0], **kwargs)
if self.logger:
self.logger.info("Pushed {} rows in table : {}".format(len(dfs[0]), table_name))
kwargs.pop('if_exists', None)
for df in dfs[1:]:
self.__write_df(table_name, df, if_exists='append', **kwargs)
if self.logger:
self.logger.info("Pushed {} rows in table : {}".format(len(df), table_name))
return True
def __split_df(self, df, chunksize):
"""
        Private helper, called only by the public methods of this class.
        Splits a large DataFrame into chunks.
        :param df: the DataFrame object
        :param chunksize: number of rows per chunk
"""
chunk_count = int(math.ceil(len(df) / chunksize))
return np.array_split(df, chunk_count)
def __df_cleanup(self, df):
"""
        Private helper, called only by the public methods of this class.
        Cleans column names and data in the DataFrame.
        :param df: the DataFrame object
"""
        # Strip characters that commonly break column names.
        df.columns = df.columns.str.replace("(", "", regex=False)
        df.columns = df.columns.str.replace(")", "", regex=False)
        df.columns = df.columns.str.replace("%", "per", regex=False)
        df.columns = df.columns.str.replace(r"\\t|\\n|\\r|\t|\n|\r", "", regex=True)
        df = self.__remove_non_ascii_df(df)
        # Remove special characters (tabs, newlines, quotes) from the cell values as well.
        df.replace(to_replace=[r"\\t|\\n|\\r|'|\t|\n|\r"], value=[""], regex=True, inplace=True)
return df
@staticmethod
def df_col_detect(dfparam:pd.DataFrame, is_all_data_string) -> dict:
"""
df_col_detect is a static function and can be called as Connector.df_col_detect()
Determines data types from dataframe and returns a dictionary of the database data types.
:param dfparam: the dataframe.
:param is_all_data_string : Treat all columns as string type.
:return: Dict
"""
dtypedict = {}
for i, j in zip(dfparam.columns, dfparam.dtypes):
if (is_all_data_string):
if (i == 'date'):
dtypedict.update({i: sqlalchemy.types.DateTime()})
else:
dtypedict.update({i: sqlalchemy.types.VARCHAR()})
else:
if "object" in str(j):
dtypedict.update({i: sqlalchemy.types.VARCHAR()})
if "date" in str(j):
dtypedict.update({i: sqlalchemy.types.DateTime()})
if "float" in str(j):
dtypedict.update({i: sqlalchemy.types.Float()})
if "int" in str(j):
dtypedict.update({i: sqlalchemy.types.INT()})
return dtypedict
def __check_engine(func):
"""
        This is a private decorator. It checks for the engine attribute on the instance and,
        if not found, calls init_engine() before running the wrapped function.
"""
@wraps(func)
def inner(*args, **kwargs):
if not hasattr(args[0], 'engine'):
if 'use_batch_mode' in kwargs.keys():
args[0].init_engine(use_batch_mode=kwargs['use_batch_mode'])
else:
args[0].init_engine()
kwargs.pop('use_batch_mode',None)
return func(*args, **kwargs)
return inner
@__check_engine
def push_df_large(self, table_name, df, is_replace=False, chunksize=10 ** 5,call_df_cleanup=False,method=None,schema=None, **kwargs):
"""
        This function pushes large datasets to the database, splitting the DataFrame into chunks when needed.
:param table_name: Table to save data.
:param df: Dataframe to be saved in database.
        :param is_replace: Pass False if you want to append data to an existing table. Pass True if you want to create
            or replace the table with the new data.
        :param call_df_cleanup: If True, the DataFrame is cleaned up first: non-ASCII characters and special characters
            such as tabs or newlines are removed from the data.
        :param chunksize: Number of rows in each batch to be written at a time. By default, 100000 rows are written per batch.
        :param **kwargs: This function calls df.to_sql() internally; any additional parameters are passed through as
            keyword arguments, e.g. push_df_large(table_name, df, is_replace=False, chunksize=10 ** 5, schema='raw',
            dtype=df_col_detect(df, True), use_batch_mode=True). dtype and schema are forwarded to df.to_sql() via **kwargs.
:return : True or False
        :param method: default None. Accepted values are 'postgrescopy', None or 'multi'.
            Controls the SQL insertion clause used:
            'postgrescopy': for PostgreSQL or PostgreSQL-flavoured databases such as Redshift; a very fast insert method for Postgres.
            None: uses the standard SQL INSERT clause (one per row).
            'multi': passes multiple values in a single INSERT clause.
        con : Assign a value only when you want to maintain a transaction,
example :
con.init_engine()
with con.engine.begin() as conn:
            response = con.exec_query(_query, _transaction_connection=conn)
con.push_df('tablename',df,is_replace=True,schema='czuc',con=conn,method='postgrescopy')
con.engine.dispose()
if response.cursor:
result = response.fetchone()
else:
result = response.rowcount
schema : string, optional
Specify the schema (if database flavor supports this). If None, use default schema.
index : bool, default True
Write DataFrame index as a column. Uses index_label as the column name in the table.
index_label : string or sequence, default None
Column label for index column(s). If None is given (default) and index is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex.
chunksize : int, optional
Rows will be written in batches of this size at a time. By default, all rows will be written at once.
dtype : dict, optional
Specifying the datatype for columns. The keys should be the column names and the values should be the SQLAlchemy types or strings for the sqlite3 legacy mode.
:return : True or False
        use_batch_mode : Optional, True or False.
            Initialises the db engine with the use_batch_mode option if it was not initialised earlier.
            See SQLAlchemy's create_engine() documentation for when to use use_batch_mode.
"""
if df.empty:
raise Exception ("Error : Empty DataFrame.")
if df.columns.duplicated().any():
raise Exception ("Error : Duplicate columns in DataFrame.")
dispose_engine = True
if 'con' in kwargs.keys():
dispose_engine = False
dfsize = chunksize
if (is_replace):
if_exists = 'replace'
else:
if_exists = 'append'
kwargs['schema'] = schema
if call_df_cleanup:
df = self.__df_cleanup(df)
table_name = table_name.replace("'", "").replace('"', '').lower()
status = False
if dfsize is None:
dfsize = 10 ** 5
kwargs.pop('chunksize', None)
        if method is not None:
            if method == 'postgrescopy':
                method = self.__psql_insert_copy
kwargs['method'] = method
s = time.time()
if len(df) > dfsize:
dfs = self.__split_df(df, dfsize)
status = self.__write_split_df(table_name, dfs, if_exists=if_exists, **kwargs)
if self.logger:
self.logger.info("Total {} rows pushed in table : {} within: {}s".format(len(df), table_name,
round(time.time() - s, 4)))
else:
status = self.__write_df(table_name, df, if_exists=if_exists, chunksize=dfsize, **kwargs)
if self.logger:
self.logger.info("Pushed {} rows in table : {} within: {}s".format(len(df), table_name,
round(time.time() - s, 4)))
#if self.logger:
#self.logger.info('Pushed name: {} dataframe shape: {} within: {}s'.format(table_name,
#df.shape,
#round(time.time() - s, 4)))
if dispose_engine:
self.__db_dispose()
return status
@__check_engine
def push_df(self, table_name, df, is_replace=False, call_df_cleanup=False, method=None,schema=None, **kwargs):
"""
        This function pushes a dataset (DataFrame) to the database.
:param table_name: Table to save data.
:param df: Dataframe to be saved in database.
        :param is_replace: Pass False if you want to append data to an existing table. Pass True if you want to create
            or replace the table with the new data.
        :param call_df_cleanup: If True, the DataFrame is cleaned up first: non-ASCII characters and special characters
            such as tabs or newlines are removed from the data.
        :param **kwargs: This function calls df.to_sql() internally; any additional parameters are passed through as
            keyword arguments, e.g. push_df(table_name, df, is_replace=False, chunksize=10 ** 5, schema='raw',
            dtype=df_col_detect(df, True), use_batch_mode=True). chunksize, dtype and schema are forwarded to df.to_sql() via **kwargs.
        :param method: default None. Accepted values are 'postgrescopy', None or 'multi'.
            Controls the SQL insertion clause used:
            'postgrescopy': for PostgreSQL or PostgreSQL-flavoured databases such as Redshift; a very fast insert method for Postgres.
            None: uses the standard SQL INSERT clause (one per row).
            'multi': passes multiple values in a single INSERT clause.
        con : Optional. Assign a value only when you want to maintain a transaction,
example :
con.init_engine()
with con.engine.begin() as conn:
            response = con.exec_query(_query, _transaction_connection=conn)
con.push_df('tablename',df,is_replace=True,schema='czuc',con=conn,method='postgrescopy')
con.engine.dispose()
if response.cursor:
result = response.fetchone()
else:
result = response.rowcount
schema : string, optional
Specify the schema (if database flavor supports this). If None, use default schema.
index : bool, default True
Write DataFrame index as a column. Uses index_label as the column name in the table.
index_label : string or sequence, default None
Column label for index column(s). If None is given (default) and index is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex.
chunksize : int, optional
Rows will be written in batches of this size at a time. By default, all rows will be written at once.
dtype : dict, optional
Specifying the datatype for columns. The keys should be the column names and the values should be the SQLAlchemy types or strings for the sqlite3 legacy mode.
:return : True or False
        use_batch_mode : Optional, True or False.
            Initialises the db engine with the use_batch_mode option if it was not initialised earlier.
            See SQLAlchemy's create_engine() documentation for when to use use_batch_mode.
"""
if df.empty:
raise Exception ("Error : Empty DataFrame.")
if df.columns.duplicated().any():
raise Exception ("Error : Duplicate columns in DataFrame.")
dispose_engine = True
if 'con' in kwargs.keys():
dispose_engine = False
if (is_replace):
if_exists = 'replace'
else:
if_exists = 'append'
kwargs['schema'] = schema
if call_df_cleanup:
df = self.__df_cleanup(df)
table_name = table_name.replace("'", "").replace('"', '').lower()
# Format the Dataframe in preparation
# hit it up
status = False
        if method is not None:
            if method == 'postgrescopy':
                method = self.__psql_insert_copy
kwargs['method'] = method
s = time.time()
status = self.__write_df(table_name, df, if_exists=if_exists, **kwargs)
if self.logger:
self.logger.info("Pushed {} rows in table : {} within: {}s".format(len(df), table_name,
round(time.time() - s, 4)))
if dispose_engine:
self.__db_dispose()
return status
@__check_engine
def get_df_large_table(self, table_name, **kwargs)-> pd.DataFrame:
"""
        This function performs a SELECT * FROM the given table, reading in batches of 100000 rows by default.
        :param **kwargs: any additional parameters
"""
s = time.time()
if 'chunksize' not in kwargs.keys():
kwargs['chunksize'] = 10 ** 5
dfs = pd.read_sql_table(table_name, self.engine, **kwargs)
try:
df = pd.concat(dfs, axis=0)
        except ValueError:  # No objects to concatenate; dfs is a generator object, so it has no len().
            if self.logger:
                self.logger.warning("No objects to concatenate for table_name: {}".format(table_name))
return None
length = 0
if df is not None:
length = len(df)
if self.logger:
self.logger.info('fetched {} rows from {} within: {}'.format(length,table_name,
round(time.time() - s, 4)))
self.__db_dispose()
return df
@__check_engine
    def get_df_table(self, table_name, **kwargs) -> pd.DataFrame:
        """Read an entire table into a DataFrame via pd.read_sql_table (no chunking)."""
s = time.time()
kwargs.pop('chunksize', None)
df = pd.read_sql_table(table_name, self.engine, **kwargs)
length = 0
if df is not None:
length = len(df)
if self.logger:
self.logger.info('fetched {} rows from {} within: {}'.format(length,table_name,
round(time.time() - s, 4)))
self.__db_dispose()
return df
@__check_engine
def exec_query(self, _query,_transaction_connection=None,**kwargs):
"""
        This function executes any query against the database, e.g. DDL queries.
        :param _query: SQL query.
        :param _transaction_connection: pass a connection if a transaction needs to be maintained; in that case
            the caller is responsible for engine disposal and connection closure.
"""
if self.logger:
self.logger.info('running query: "{}"'.format(_query))
_query = text(_query)
result = None
if _transaction_connection is not None:
con = _transaction_connection
result = con.execute(_query)
else:
            if isinstance(self.engine, sqlalchemy.engine.base.Engine):
with self.engine.connect() as con:
response = con.execute(_query)
if response.cursor:
result = response.fetchone()
else:
result = response.rowcount
else:
"""Below work around is for SQLite if some how we use native sqlite engine for excute query"""
result = self.engine.execute(_query)
if _transaction_connection is None:
self.__db_dispose()
return result
@__check_engine
def get_df_query(self, _query,sqlite_text_factory=None,**kwargs) -> pd.DataFrame:
"""
        This function executes any DML query against the database and returns the result as a DataFrame.
        It wraps pd.read_sql_query(sql, con, index_col=None, coerce_float=True, params=None, parse_dates=None, chunksize=None),
        which reads a SQL query into a DataFrame corresponding to the result set of the query string. Optionally provide an
        index_col parameter to use one of the columns as the index; otherwise the default integer index is used.
        :return: DataFrame
"""
if self.logger:
self.logger.info('running query: "{}"'.format(_query))
s = time.time()
if sqlite_text_factory is None:
con = self.engine
else:
con = self.engine.raw_connection()
con.connection.text_factory = sqlite_text_factory
result = pd.read_sql_query(_query, con=con)
length = 0
if result is not None:
length = len(result)
if self.logger:
self.logger.info('Finished running query with rows fetched {} within: {}'.format(
length,
round(time.time() - s, 4)))
self.__db_dispose()
return result
def __db_dispose(self):
"""This function dispose database"""
        if isinstance(self.engine, sqlalchemy.engine.base.Engine):
self.engine.dispose()
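To tie the pieces together, here is a minimal usage sketch of the Connector class above. The connection details, table name and DataFrame are placeholder values, and it assumes a PostgreSQL database reachable through the psycopg2 driver (needed for the 'postgrescopy' method).

import logging
import pandas as pd

# Placeholder connection details; adjust for a real database.
engine_args = {"db_type": "postgresql",
               "address": "localhost:5432",
               "user": "demo_user",
               "password": "demo_password",
               "db_name": "demo_db"}

con = Connector(engine_args, logger=logging.getLogger(__name__))
con.init_engine()

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
# Fast COPY-based insert; leaving method=None falls back to plain INSERT statements.
con.push_df('demo_table', df, is_replace=True, method='postgrescopy')
result = con.get_df_query("select count(*) as n from demo_table")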