我在 Teradata 中创建了一个表,并把某个文件夹中每个文件里的多条记录加载到该表中。其中有些文件可能包含重复的记录。有没有办法在把记录插入表之前先检查重复项,使每组重复记录只保留一条唯一的记录?
# Load the rows of the DataFrame `df` into db.table in batches, dropping
# duplicate rows first so that each distinct record is inserted only once.
# NOTE(review): `df` and the `teradata` / `np` imports are expected to be
# defined earlier in the file -- they are not visible in this section.

num_of_chunks = 1000  # number of pieces the data is split into for batching

# Parameterized insert statement; one "?" placeholder per inserted column.
insert_query = "INSERT INTO db.table VALUES(?,?,?,?,?)"

# Host, user and password parameters (placeholders).
host, username, password = 'x', 'y', "z"

# Connect to the database using UdaExec.
udaExec = teradata.UdaExec(appName="IMC", version="1.0", logConsole=False)
with udaExec.connect(method="odbc", system=host, username=username,
                     password=password, driver="Teradata") as session:
    # Run the table-creation script. Error 3803 is ignored -- presumably the
    # Teradata "table already exists" error, so re-runs do not fail; confirm.
    file_exist = session.execute(
        file=r"\\path to a fastload script to create a table in a db in teradata",
        fileType="bteq",
        ignoreErrors=[3803],
    )

    # Keep only one copy of each fully identical row before inserting.
    # This removes duplicates within `df` only; rows that already exist in
    # the target table are not detected here.
    unique_df = df.drop_duplicates()

    df_chunks = np.array_split(unique_df, num_of_chunks)
    for chunk in df_chunks:
        # array_split yields empty pieces when there are fewer rows than
        # num_of_chunks; there is nothing to send for those.
        if chunk.empty:
            continue
        data = [tuple(x) for x in chunk.to_records(index=False)]
        rows = session.executemany(insert_query, data, batch=True)