I have to bulk-import a large number of files, roughly 1 TB in total. Each file is about 150 MB.
I need to import all of these files into a PostgreSQL database using Python.
This is my solution. Is there a better way to accomplish this task?
import csv
import psycopg2
import os
from multiprocessing.pool import ThreadPool as Pool

conn = psycopg2.connect(
    host="localhost",
    database="database",
    user="user",
    password="password",
    port="5432"
)
cur = conn.cursor()

def searchFiles(directory='.', extension=''):
    filelist = []
    extension = extension.lower()
    for dirpath, dirnames, files in os.walk(directory):
        for name in files:
            if extension and name.lower().endswith(extension):
                filelist.append(os.path.join(dirpath, name))
            elif not extension:
                print(os.path.join(dirpath, name))
    return filelist

def importData(file):
    with open(file, 'r') as f:
        reader = csv.reader(f, delimiter=":")
        for row in reader:
            print(row)
            cur.execute("INSERT INTO database VALUES (%s, %s)", row)
        conn.commit()

fileList = searchFiles('.', '.txt')
threadPool = Pool(processes=8)
for file in fileList:
    threadPool.map(importData, (file,))
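For comparison, here is a rough sketch of a COPY-based variant I am also considering, where PostgreSQL parses each file server-side instead of doing row-by-row INSERTs. It assumes the target table really is named database, that each line of the .txt files is colon-delimited and matches the table's columns, and that every worker opens its own connection (a single psycopg2 connection and cursor should not be shared across workers):

import os
import psycopg2
from multiprocessing import Pool

def search_files(directory='.', extension='.txt'):
    # Collect every file under `directory` whose name ends with `extension`.
    extension = extension.lower()
    matches = []
    for dirpath, _dirnames, files in os.walk(directory):
        for name in files:
            if name.lower().endswith(extension):
                matches.append(os.path.join(dirpath, name))
    return matches

def import_file(path):
    # Each worker opens its own connection; connections are not shared across processes.
    # Table name, credentials, and the ':' delimiter are assumptions carried over from my code above.
    conn = psycopg2.connect(host="localhost", database="database",
                            user="user", password="password", port="5432")
    try:
        with conn, conn.cursor() as cur, open(path, 'r') as f:
            # Stream the whole file through COPY; PostgreSQL parses the
            # colon-delimited lines itself, which avoids per-row INSERT overhead.
            cur.copy_expert(
                "COPY database FROM STDIN WITH (FORMAT csv, DELIMITER ':')", f)
    finally:
        conn.close()

if __name__ == '__main__':
    files = search_files('.', '.txt')
    with Pool(processes=8) as pool:
        # A single map call spreads the whole file list across the workers.
        pool.map(import_file, files)

Would this direction be preferable for data of this size, or is there a still better approach?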
Thank you for your suggestions!