I'm writing multiple TB of data into a Teradata database, but to_sql() is painfully slow. It currently takes about 60 seconds per 100 rows, which means the data is growing faster than I can write it! I have roughly 450 columns and hundreds of millions of rows. I'd like to keep the whole process in Python because of all the data cleaning that happens along the way (left out here to keep things simple) and because it's easy to automate.
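To put the numbers in perspective, here is the rough arithmetic (the 300 million row count below is just a placeholder for "hundreds of millions"):

rows_per_sec = 100 / 60              # ~1.7 rows/s observed with to_sql()
total_rows = 300_000_000             # placeholder for "hundreds of millions" of rows
days = total_rows / rows_per_sec / 86400
print(f"~{days:,.0f} days at the current rate")   # on the order of 2,000 days

Here is the current script: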
import pandas as pd
import glob
import sqlalchemy
from sqlalchemy import create_engine, text, update, insert
from datetime import datetime
import numpy as np
from timeit import default_timer as timer
keys = pd.read_csv('C:/dictionary.csv')
fwidths = [4, 6, 8]
for n in range(441):
    # each of the 441 repeating groups adds four more fixed-width fields
    fwidths.append(4)
    fwidths.append(2)
    fwidths.append(2)
    fwidths.append(8)
cols = ['ACT_NUMBER', 'Subject_Code', 'Segment']
t = 1
for n in range(441):
    cols.append('KEY{0}'.format(t))
    cols.append('DATA_TYPE{0}'.format(t))
    cols.append('SIGN{0}'.format(t))
    # look up the descriptive name for this key; .item() raises ValueError
    # if the dictionary has no (or more than one) match
    try:
        cols.append(keys.loc[keys.Key_Num == t, 'Name'].item())
    except ValueError:
        cols.append('N/A')
    t += 1
conn = create_engine('teradata://username:password')
chunksize = 100
for file_path in glob.glob('C:/files/*.txt'):
    for chunk in pd.read_fwf(file_path, widths=fwidths, names=cols, iterator=True,
                             skiprows=1, chunksize=chunksize):
        start = timer()
        # append each fixed-width chunk to the Teradata table and time the insert
        chunk.to_sql('new_table', con=conn, if_exists='append', schema='mySchema', index=False,
                     dtype={'ACT_NUMBER': sqlalchemy.types.VARCHAR(length=255),
                            'Subject_Code': sqlalchemy.types.VARCHAR(length=255),
                            'Segment': sqlalchemy.types.VARCHAR(length=255)})
        end = timer()
        print('writing chunk time {0}'.format(end - start))
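One variant I've been considering but haven't benchmarked yet (sketch only; it assumes the teradata SQLAlchemy dialect accepts the multi-row INSERT statements that pandas' method='multi' emits) is to raise the chunk size so each round trip to the database carries far more rows:

chunksize = 10000   # assumption: a much larger batch than the current 100 rows

for file_path in glob.glob('C:/files/*.txt'):
    for chunk in pd.read_fwf(file_path, widths=fwidths, names=cols,
                             skiprows=1, chunksize=chunksize):
        # method='multi' packs many rows into a single INSERT ... VALUES statement
        chunk.to_sql('new_table', con=conn, if_exists='append', schema='mySchema',
                     index=False, method='multi',
                     dtype={'ACT_NUMBER': sqlalchemy.types.VARCHAR(length=255),
                            'Subject_Code': sqlalchemy.types.VARCHAR(length=255),
                            'Segment': sqlalchemy.types.VARCHAR(length=255)})

Is that the right direction, or should I be looking at a bulk-load path such as Teradata FastLoad instead of row-by-row INSERTs?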