I am using a Python script to insert or update roughly 3 to 4 million rows in PostgreSQL. See the code below. If a key does not exist yet it has to be inserted; if it already exists, its value has to be updated by appending the new value. But the code below makes a separate round trip to the database for every row, and inserting 3 million records takes about 35-45 minutes, which is very slow. How can I avoid the round trips and insert or update faster?
Any help would be greatly appreciated.
Thanks in advance.
InputFile.txt - this file has roughly 3 to 4 million lines of items
productKey1 printer1,printerModel1,printerPrice1,printerDesc1|
productKey2 sacnner2,scannerModel2,scannerPrice2,scannerDesc2|
productKey3 mobile3,mobileModel3,mobilePrice3,mobileDesc3|
productKey4 tv4,tvModel4,tvPrice4,tvDescription4|
productKey2 sacnner22,scannerModel22,scannerPrice22,scannerDesc22|
insert.py
import psycopg2

def insertProduct(filename, conn):
    seen = set()
    cursor = conn.cursor()
    qi = "INSERT INTO productTable (key, value) VALUES (%s, %s);"
    qu = "UPDATE productTable SET value = CONCAT(value, %s) WHERE key = %s;"
    with open(filename) as f:
        for line in f:
            if line.strip():
                key, value = line.split(' ', 1)
                if key not in seen:
                    # first time this key appears in the file: insert it
                    seen.add(key)
                    cursor.execute(qi, (key, value))
                else:
                    # key already seen: append the new value to the existing row
                    cursor.execute(qu, (value, key))
    conn.commit()

conn = psycopg2.connect("dbname='productDB' user='myuser' host='localhost'")
insertProduct('InputFile.txt', conn)
Answer (score: 2)
Use batched prepared statements: http://initd.org/psycopg/docs/extras.html#fast-execution-helpers
import psycopg2, psycopg2.extras

def insertProduct(filename, conn):
    # Read the whole file into memory as (key, value) pairs.
    data = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line:
                key, value = line.split(' ', 1)
                data.append((key, value))
    cursor = conn.cursor()
    # Server-side prepared statement: insert the key if it does not exist,
    # otherwise append the new value to the existing row.
    cursor.execute("""
        prepare upsert (text, text) as
        with i as (
            insert into productTable (key, value)
            select $1, $2
            where not exists (select 1 from productTable where key = $1)
            returning *
        )
        update productTable p
        set value = concat(p.value, $2)
        where p.key = $1 and not exists (select 1 from i)
    """)
    # execute_batch groups many EXECUTEs into each server round trip
    # (500 rows per trip here) instead of one round trip per row.
    psycopg2.extras.execute_batch(cursor, "execute upsert (%s, %s)", data, page_size=500)
    cursor.execute("deallocate upsert")
    conn.commit()

conn = psycopg2.connect(database='cpn')
insertProduct('InputFile.txt', conn)
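If the table has a unique constraint on key and the server is PostgreSQL 9.5 or newer, another option is INSERT ... ON CONFLICT DO UPDATE combined with psycopg2.extras.execute_values, which folds many rows into each statement. The sketch below is only an illustration under those assumptions; the upsertProduct name is made up, and duplicate keys from the file are merged in Python first because ON CONFLICT DO UPDATE cannot update the same row twice within one statement.

import psycopg2, psycopg2.extras

def upsertProduct(filename, conn):
    # Collapse duplicate keys from the file in Python, concatenating their
    # values, so each key appears at most once per INSERT statement.
    merged = {}
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line:
                key, value = line.split(' ', 1)
                merged[key] = merged.get(key, '') + value
    cursor = conn.cursor()
    # Assumes a unique constraint/index on productTable.key (PostgreSQL 9.5+);
    # EXCLUDED refers to the row that failed to insert because of the conflict.
    sql = """
        INSERT INTO productTable (key, value) VALUES %s
        ON CONFLICT (key)
        DO UPDATE SET value = concat(productTable.value, EXCLUDED.value)
    """
    # execute_values expands a page of rows into a single multi-row INSERT,
    # so each page of 500 rows costs one network round trip.
    psycopg2.extras.execute_values(cursor, sql, list(merged.items()), page_size=500)
    conn.commit()

conn = psycopg2.connect(database='cpn')
upsertProduct('InputFile.txt', conn)

Whether this or the prepared-statement batch above is faster depends on row size and network latency, so it is worth timing both on a sample of the file.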