我正在尝试使用 Pandas 和 SQLAlchemy 对 Oracle 数据库中的数据做一些自动化分析。但是,Pandas(或者 SQLAlchemy、cx_Oracle,我不确定是哪一个)在读取 Oracle 中定义的数据类型时,结果似乎并不一致。
例如,以下是表中的一些列(如Oracle中所定义):
列名,数据类型
PRINCIPAL_PHOTOGRAPHY_END_DATE DATE
PRINCIPAL_PHOTO_START_DATE DATE
PRODUCER VARCHAR2(26 BYTE)
RELEASE_AIR_DATE VARCHAR2(26 BYTE)
RUNNING_TIME_MINUTES NUMBER(38,0)
RUNNING_TIME_SECONDS NUMBER(38,0)
EFFECTIVE_TIMESTAMP DATE
EXPIRE_TIMESTAMP DATE
以下是我用来连接数据库、从表中读取数据并将其加载到 pandas DataFrame 中的代码:
# Reproduction script: load the whole INT_PROP table into a DataFrame and
# print the dtype pandas assigned to each column.
# Import only the name that is used instead of a wildcard import.
from sqlalchemy import create_engine
import pandas as pd
# SQLAlchemy URL for the cx_Oracle driver; the Oracle service name is passed
# as a query argument.
orcl_connect_string = 'oracle+cx_oracle://system:oracle@192.168.56.101:1521/?service_name=orcl'
engine = create_engine(orcl_connect_string)
df = pd.read_sql_query('SELECT * FROM INT_PROP', engine)
# The function-call form of print runs on both Python 2 and Python 3.
print(df.dtypes)
我得到的输出让我很困惑,因为结果并不一致:同样是 DATE 类型的列,其中一列被识别为 datetime 类型,其他几列却没有!
production_start_date object
principal_photography_end_date object
producer object
release_air_date object
running_time_minutes float64
running_time_seconds float64
effective_timestamp datetime64[ns]
expire_timestamp object
为什么 Pandas 能为 effective_timestamp 列识别出正确的数据类型,却不能为其他列识别?有人知道原因吗?
编辑(某些版本信息):
答案 0 :(得分:0)
# -*- coding: utf-8 -*-
"""
Description: Autonomous Database data uploader for different kinds of source files (CSV, JSON, Excel)
Date: Oct 10, 2010
Python: 3.7
Packages: cx_Oracle, sqlalchemy, pandas, xlrd
"""
import cx_Oracle
import sqlalchemy
import pandas as pd
import os
import json
import argparse
import re
import sys
def preCheck():
    """
    Verify the Oracle client environment and the saved connection settings.

    The checks run in this order:
      1. ORACLE_HOME must be set; otherwise the instant client install steps
         are printed and the program exits.
      2. TNS_ADMIN must be set; otherwise the wallet setup steps are printed
         and the program exits.
      3. If ./.config.json does not exist, configADB() is called to create it.

    Raises:
        SystemExit: with code -1 when ORACLE_HOME or TNS_ADMIN is unset or empty.
    """
    # Check the existence of ORACLE_HOME (the client installation).
    if not os.environ.get('ORACLE_HOME'):
        print(
            "\n"
            "First, download the instant client driver from:\n"
            "https://www.oracle.com/database/technologies/instant-client/downloads.html\n"
            "Secondly, set the environment variable ORACLE_HOME.\n"
            "Finally, run this program!\n"
        )
        # sys.exit is used because the bare exit() helper is only installed
        # by the site module and is meant for interactive sessions.
        sys.exit(-1)
    # Check the existence of TNS_ADMIN (the wallet / sqlnet.ora location).
    if not os.environ.get('TNS_ADMIN'):
        print(
            "\n"
            "First, download the Autonomous Database Wallet from OCI into the instant client directory.\n"
            "Secondly, set the environment variable TNS_ADMIN.\n"
            "Third, modify the sqlnet.ora to contain the correct wallet file path.\n"
            "Finally, run this program!\n"
        )
        sys.exit(-1)
    # Check the existence of the configuration file; create it on first run.
    if not os.path.exists("./.config.json"):
        configADB()
def configADB():
    """
    Interactively collect the connection settings and save them to ./.config.json.

    Prompts for the username, the password and the TNS name (in that order)
    and writes them as one JSON object with the keys 'user', 'passwd' and 'TNS'.

    NOTE: the password is read with input(), so it is echoed while typed,
    and it is written to the file in plain text.
    """
    # Dict displays evaluate left to right, so the prompts keep their order.
    cfg = {
        'user': input("Enter your ADB username:"),
        'passwd': input("Enter your ADB user's password:"),
        'TNS': input("Enter your ADB TNS's name:"),
    }
    # The context manager closes the file even if serialisation fails.
    with open("./.config.json", 'w') as cfgFile:
        json.dump(cfg, cfgFile)
def adbConnect():
    """
    Create a SQLAlchemy engine from the settings saved in ./.config.json.

    Reads the 'user', 'passwd' and 'TNS' keys from the JSON file, sets the
    NLS_LANG environment variable, and builds an oracle+cx_oracle URL.

    Returns:
        The engine object returned by sqlalchemy.create_engine().

    Raises:
        OSError: if ./.config.json cannot be opened.
        KeyError: if one of the expected keys is missing from the file.
    """
    from urllib.parse import quote

    # The context manager closes the file even if the JSON is malformed.
    with open("./.config.json", 'r') as cfgFile:
        cfg = json.load(cfgFile)
    os.environ['NLS_LANG'] = "SIMPLIFIED CHINESE_CHINA.UTF8"
    # Percent-encode the credentials: characters such as '@', ':' or '/' in a
    # username or password would otherwise be parsed as URL delimiters.
    connstr = "oracle+cx_oracle://{}:{}@{}".format(
        quote(str(cfg['user']), safe=''),
        quote(str(cfg['passwd']), safe=''),
        cfg['TNS'],
    )
    return sqlalchemy.create_engine(connstr, max_identifier_length=128)
def loadCSV(conn, destSchema, destTable, dataType, srcFile):
    """
    Read a UTF-8 encoded CSV file and append its rows to a database table.

    Args:
        conn: connection object handed to DataFrame.to_sql().
        destSchema: schema that contains the destination table.
        destTable: name of the destination table.
        dataType: value passed as the ``dtype`` argument of to_sql().
        srcFile: path of the CSV file to read.
    """
    frame = pd.read_csv(srcFile, encoding='utf-8')
    # Rows are appended without the DataFrame index, 10000 rows per batch.
    write_options = dict(
        schema=destSchema,
        if_exists='append',
        index=False,
        chunksize=10000,
        dtype=dataType,
    )
    frame.to_sql(destTable, conn, **write_options)
def loadJSON(conn, destSchema, destTable, dataType, srcFile):
    """
    Read a JSON file and append its rows to a database table.

    Args:
        conn: connection object handed to DataFrame.to_sql().
        destSchema: schema that contains the destination table.
        destTable: name of the destination table.
        dataType: value passed as the ``dtype`` argument of to_sql().
        srcFile: path of the JSON file to read.
    """
    frame = pd.read_json(srcFile)
    # Rows are appended without the DataFrame index, 10000 rows per batch.
    write_options = dict(
        schema=destSchema,
        if_exists='append',
        index=False,
        chunksize=10000,
        dtype=dataType,
    )
    frame.to_sql(destTable, conn, **write_options)
def loadExcel(conn, destSchema, destTable, dataType, srcFile):
    """
    Read an Excel workbook and append its rows to a database table.

    Args:
        conn: connection object handed to DataFrame.to_sql().
        destSchema: schema that contains the destination table.
        destTable: name of the destination table.
        dataType: value passed as the ``dtype`` argument of to_sql().
        srcFile: path of the Excel file to read.
    """
    frame = pd.read_excel(srcFile)
    # Rows are appended without the DataFrame index, 10000 rows per batch.
    write_options = dict(
        schema=destSchema,
        if_exists='append',
        index=False,
        chunksize=10000,
        dtype=dataType,
    )
    frame.to_sql(destTable, conn, **write_options)
if __name__ == '__main__':
    # Command-line entry point: pick a loader from the source file name,
    # reflect the destination table's column types, and append the file's
    # rows to that table.
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--schema", help="Autonomous Database's schema")
    parser.add_argument("-t", "--table", help="Schema's table")
    parser.add_argument("-d", "--srcFile", help="Source data file to been loaded")
    args = vars(parser.parse_args())
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(-1)
    # Without a table and a source file there is nothing to load; report it
    # here instead of failing later inside re.search() or table reflection.
    if not args['table'] or not args['srcFile']:
        parser.error("both --table and --srcFile are required")
    # Map file-name patterns to loaders. The first matching pattern wins, so
    # a file is never loaded more than once.
    loaders = (
        (r"\.csv", loadCSV),
        (r"\.json", loadJSON),
        (r"\.(xls|xlsx)", loadExcel),
    )
    loader = next(
        (func for pattern, func in loaders if re.search(pattern, args['srcFile'])),
        None,
    )
    if loader is None:
        parser.error("unsupported source file type: {}".format(args['srcFile']))
    preCheck()
    adbConn = adbConnect()
    try:
        # Generate the dtype mapping (column name -> reflected column type).
        metadata = sqlalchemy.MetaData(schema=args['schema'])
        # autoload_with alone triggers reflection; the separate autoload flag
        # is deprecated in newer SQLAlchemy releases.
        tabDef = sqlalchemy.Table(args['table'], metadata, autoload_with=adbConn)
        colType = {col.name: col.type for col in tabDef.columns}
        loader(adbConn, args['schema'], args['table'], colType, args['srcFile'])
    finally:
        # Close the connection pool even when reflection or loading fails.
        adbConn.dispose()