I have a script that reads files one by one, cleans each line, and inserts the results into a Postgres database.
I tried Python multiprocessing with a Pool, but CPU usage only occasionally reaches 30% and sits around 6% most of the time, so it is still really slow.
Any suggestions on how to speed this up?
Thanks
import os
import multiprocessing

import psycopg2
path = 'data/'
arr = os.listdir(path)
connection = psycopg2.connect(
    user="postgres", password="blabla", host="127.0.0.1", port="5432", database="test"
)
cursor = connection.cursor()
# Upsert: on a duplicate data1 key, overwrite data2 with the incoming value
postgres_insert_query = """ INSERT INTO mobile (data1, data2) VALUES (%s,%s)
ON CONFLICT (data1)
DO
UPDATE SET data2 = EXCLUDED.data2 ;"""
def insert_data(key, record_to_insert, item):
    print(key)
    try:
        # Send the current batch of rows, then commit
        cursor.executemany(postgres_insert_query, record_to_insert)
        connection.commit()
        count = cursor.rowcount
        print(count, "Record inserted successfully into mobile table", item)
    except (Exception, psycopg2.Error) as error:
        print("Failed to insert record into mobile table", error)
i = 1  # file counter; note that a global like this is not shared between pool workers

def process_data(item):
    print(item)
    global i
    records = []
    i += 1
    with open(path + item, 'r') as file:
        for line in file:
            line = dataCleansing(line)  # cleansing helper defined elsewhere in the full script
            records.append((line + '-' + str(i), 'data2-' + str(i) + line))
            # Flush a batch of 50,000 rows at a time
            if len(records) == 50000:
                insert_data(i, records, item)
                records = []
    # Flush whatever is left over after the file ends
    insert_data(i, records, item)
    records = []
if __name__ == '__main__':
    a_pool = multiprocessing.Pool(6)
    result = a_pool.map(process_data, arr)
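
For reference, here is a minimal sketch of two changes that usually help in this situation, assuming the same mobile table and credentials and the same dataCleansing helper defined elsewhere: give each pool worker its own connection through a Pool initializer (a single psycopg2 connection opened in the parent is not safe to share across forked worker processes, and funneling every worker through it would keep the CPUs idle), and batch the upserts with psycopg2.extras.execute_values, which sends multi-row INSERTs in far fewer round trips than executemany. The global i counter is dropped here, since module-level globals are not shared between worker processes anyway, so the record tuples are built from the line alone.

import os
import multiprocessing

import psycopg2
from psycopg2.extras import execute_values

path = 'data/'

# Single %s placeholder: execute_values expands it into a multi-row VALUES list.
upsert_query = """INSERT INTO mobile (data1, data2) VALUES %s
ON CONFLICT (data1) DO UPDATE SET data2 = EXCLUDED.data2"""

worker_connection = None  # set in each worker process by the pool initializer

def init_worker():
    # Runs once in every pool process, so each worker gets a private connection
    # instead of all of them sharing the one opened in the parent.
    global worker_connection
    worker_connection = psycopg2.connect(
        user="postgres", password="blabla", host="127.0.0.1",
        port="5432", database="test",
    )

def process_data(item):
    records = []
    with worker_connection.cursor() as cursor:
        with open(path + item, 'r') as file:
            for line in file:
                line = dataCleansing(line)  # same cleansing helper as above, defined elsewhere
                records.append((line, 'data2-' + line))
                if len(records) == 50000:
                    execute_values(cursor, upsert_query, records, page_size=10000)
                    records = []
        if records:  # flush the final partial batch
            execute_values(cursor, upsert_query, records, page_size=10000)
    worker_connection.commit()

if __name__ == '__main__':
    with multiprocessing.Pool(6, initializer=init_worker) as pool:
        pool.map(process_data, os.listdir(path))

One caveat with multi-row upserts: if the same data1 value appears twice inside a single batch, Postgres raises "ON CONFLICT DO UPDATE command cannot affect row a second time", so this sketch assumes keys are unique within each batch.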