我需要将大约 13,000 个文件导入到 PostgreSQL 数据库中。问题在于这些文件使用了不同的定界符,有些文件甚至在同一个文件内也混用了不同的定界符。我一直在尝试使用 Python 的 re(正则表达式)模块,但不确定用它能否实现我的目标。
想象一下,某一行的定界符是(例如)`:`,但该行中的某处还出现了另一个定界符(例如 `;`)。我需要做的是:只在定界符列表中最先出现的那个定界符处拆分该行。
解决第一个问题后,我需要将这些行插入数据库中。这是一个简单的两列表,如果这样会使生活更轻松...
代码:
import sys
import csv
import os
import re
import psycopg2
# Open the PostgreSQL connection as module-level state. The "XXX" values
# are presumably placeholders for the real credentials -- TODO confirm.
conn = psycopg2.connect(
    port="5432",
    password="XXX",
    user="XXX",
    database="XXX",
    host="localhost",
)

# A single module-level cursor created from that connection.
cur = conn.cursor()
# Raise the csv module's field size limit as far as the platform allows:
# start from sys.maxsize and shrink the candidate by a factor of 10 every
# time csv.field_size_limit() rejects it with OverflowError.
maxInt = sys.maxsize
while True:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt / 10)
    else:
        # The limit was accepted, so stop shrinking.
        break
def searchFiles(directory='', extension=''):
    """Recursively collect the files under *directory* ending in *extension*.

    The extension comparison is case-insensitive. Progress messages are
    printed before and after the walk.

    Args:
        directory: Root directory handed to ``os.walk``.
        extension: File-name suffix to match (e.g. ``'.txt'``).

    Returns:
        A list of joined ``dirpath``/``name`` paths for every matching file.

    NOTE(review): when *extension* is empty no file is collected; every
    file is instead recorded in ``history.txt`` as ``FAILED TO READ``.
    That behaviour is kept from the original -- confirm it is intended.
    """
    print('SEARCHING IN: ' + directory)
    extension = extension.lower()
    filelist = []
    skipped = []
    for dirpath, _dirnames, files in os.walk(directory):
        for name in files:
            path = os.path.join(dirpath, name)
            if not extension:
                skipped.append(path)
            elif name.lower().endswith(extension):
                filelist.append(path)
    if skipped:
        # Open the log once, and terminate each entry with a newline so
        # that entries do not run together on a single line.
        with open("history.txt", "a+") as history:
            history.writelines(
                'FAILED TO READ: ' + path + '\n' for path in skipped
            )
    print('FINISHED FILE SEARCH AND FOUND ' + str(len(filelist)) + ' FILES')
    return filelist
# Characters that may separate the two columns of a line.
_DELIMITERS = (":", ";", "|")
# Compiled once: matches any single delimiter character.
_DELIMITER_RE = re.compile('|'.join(map(re.escape, _DELIMITERS)))


def _split_on_first_delimiter(line):
    """Split *line* into at most two fields at the earliest delimiter."""
    # maxsplit=1 stops after the first match, so any delimiter character
    # appearing later in the line stays inside the second field.
    return _DELIMITER_RE.split(line, maxsplit=1)


def importData(fileToImport):
    """Insert every line of *fileToImport* into the ``pwned`` table.

    Each non-empty line is split into two fields at the first occurrence
    of ``:``, ``;`` or ``|`` and inserted through the module-level ``cur``
    and ``conn``. Each row is committed on its own. Lines that do not
    yield two fields, or that the database rejects, are reported with a
    ``FAILED TO IMPORT`` message and skipped. Blank lines are ignored.

    Args:
        fileToImport: Path of the text file to read.

    Returns:
        None.
    """
    # The file is read as plain lines rather than through csv.reader:
    # csv.reader yields lists, which re.split() cannot take.
    with open(fileToImport, 'r') as f:
        for raw_line in f:
            # Strip only the line terminator so spaces inside the fields
            # are preserved.
            line = raw_line.rstrip('\r\n')
            if not line:
                continue
            fields = _split_on_first_delimiter(line)
            if len(fields) != 2:
                print('FAILED TO IMPORT ' + str(line))
                continue
            try:
                cur.execute("INSERT INTO pwned VALUES (%s, %s)", fields)
                conn.commit()
            except psycopg2.Error:
                # A failed statement aborts the open transaction; roll it
                # back so the following rows can still be inserted.
                conn.rollback()
                print('FAILED TO IMPORT ' + str(line))
# Script driver: report the connection, find the dataset files, import
# each one, and release the database resources whatever happens.
try:
    print(conn.get_dsn_parameters())
    cur.execute("SELECT version();")
    record = cur.fetchone()
    print("You are connected to - ", record)

    fileList = searchFiles('/opt/bdlc/dataset/', '.txt')
    length = len(fileList)
    counter = 0
    # Count from 1 so the progress line runs 1/N .. N/N rather than
    # 0/N .. N-1/N. Progress is printed for every file.
    for counter, file in enumerate(fileList, start=1):
        print('Processing File: ' + str(counter) + '/' + str(length))
        importData(file)
finally:
    # Close the cursor and connection even if an import raised.
    cur.close()
    conn.close()
我非常感谢任何想法或帮助 谢谢!