使用不同的分隔符将数千个文件插入数据库

时间:2019-04-24 14:31:42

标签: python-3.x python-regex

我必须将大约13'000个文件导入到PostgreSQL数据库中。问题在于这些文件具有不同的定界符,有些甚至在同一文件中没有不同的定界符。我一直在尝试使用Python Regex软件包,但不确定使用该软件包是否可以实现我的目标。

想象一下,一行的分隔符为:,但该行中的某处还有:。我需要做的就是只看定界符列表的第一个字符。

解决第一个问题后,我需要将这些行插入数据库中。这是一个简单的两列表,如果这样会使生活更轻松...

代码:

import sys
import csv
import os
import re

import psycopg2

conn = psycopg2.connect(
    host="localhost",
    database="XXX",
    user="XXX",
    password="XXX",
    port="5432"
)

cur = conn.cursor()

maxInt = sys.maxsize

while True:
    # decrease the maxInt value by factor 10
    # as long as the OverflowError occurs.

    try:
        csv.field_size_limit(maxInt)
        break
    except OverflowError:
        maxInt = int(maxInt / 10)


def searchFiles(directory='', extension=''):
    print('SEARCHING IN: ' + directory)
    filelist = []
    extension = extension.lower()
    for dirpath, dirnames, files in os.walk(directory):
        for name in files:
            if extension and name.lower().endswith(extension):
                filelist.append(os.path.join(dirpath, name))
            elif not extension:
                with open("history.txt", "a+") as fileToImport:
                    fileToImport.write('FAILED TO READ: ' + (os.path.join(dirpath, name)))
    print('FINISHED FILE SEARCH AND FOUND ' + str(len(filelist)) + ' FILES')
    return filelist


def importData(fileToImport):
    with open(fileToImport, 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            try:
                delimiters = ":", ";", "|"
                regexPattern = '|'.join(map(re.escape, delimiters))
                re.split(regexPattern, row)
                '''print('IMPORTING ' + str(row))
                cur.execute("INSERT INTO pwned VALUES (%s, %s)", row)
                conn.commit()'''
            except:
                print('FAILED TO IMPORT ' + str(row))
                '''with open("output.log", "a+") as logFile:
                    logFile.write('FAILED FILE: ' + file + 'FAILED TO IMPORT ' + str(row))
                with open("history.txt", "a+") as fileToImport:
                    fileToImport.write(str(row) + '\n')'''


print(conn.get_dsn_parameters())
cur.execute("SELECT version();")
record = cur.fetchone()
print("You are connected to - ", record)

fileList = searchFiles('/opt/bdlc/dataset/', '.txt')


counter = 0
length = len(fileList)
for file in fileList:
    # if counter % 0 == 0:
    print('Processing File: ' + str(counter) + '/' + str(length))
    importData(file)
    counter += 1

我非常感谢任何想法或帮助 谢谢!

0 个答案:

没有答案