I am trying to run a Python script that uses an ID and the column headers to cross-reference data, pulling values from an input .csv file and writing them into an output .csv file.
These are very large files, so I added multiprocessing; the first step is to split the output file into multiple parts so they can be processed in parallel.
When I try this on Computer A, the disk speed while splitting the output file sits at 0 MB/s (or a very slow trickle) and the program never finishes. Meanwhile, on Computer B the program runs correctly and splits the output file at a disk speed of ~40 MB/s.
Same files, same code, so why doesn't Computer A behave correctly? Is it an OS error? Am I missing a driver on Computer A? Something else?
The input and output files live on a WD 4TB external hard drive.
Computer A specs: Windows 10 Pro 64-bit, Intel i9 7920X 12-core, 8 x 8GB Geil Evo Potenza RAM, Samsung 850 Evo 500GB local SSD, WD 4TB external hard drive, ASRock X299 Killer motherboard
Computer B specs: Windows 10 Pro 64-bit, Intel i7 6700K 4-core, 2 x 16GB Geil Evo Forza RAM, PNY CS1311 240GB SSD, WD 4TB external hard drive, MSI B250M Gaming Pro motherboard
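For reference, this is how I invoke the script below (the script name, paths, and identifier column name are just placeholders):
python csv_vlookup.py E:\input.csv E:\output.csv record_id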
# std lib imports
import csv
import multiprocessing
import os
import os.path
import shutil
import sqlite3
import sys
import tempfile
import timeit
# third party imports
# our imports
buffer_size = 8192000 # 8.192 MB, 1000x the default (io.DEFAULT_BUFFER_SIZE)
# working_dir = tempfile.gettempdir()
working_dir = "E:\\temp_files"
def return_csv_header(filename):
"""
Returns the first row (the header) of the csv file filename
as a list.
"""
with open(filename, "r", newline="", encoding="utf-8") as csvfile:
reader = csv.reader(csvfile)
return next(reader)
def return_input_mapping(input_file, identifier):
"""
Returns a {identifier:record} dictionary where
identifier is the value of the identifier column
for each row in the input file.
record is a dictionary of {column:value}
representing a row in a csv file.
"""
to_ret = dict()
with open(input_file, "r", newline="", encoding="utf-8") as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
key = row[identifier]
to_ret[key] = row
return to_ret
def return_csv_contents(input_file):
"""
Returns a list of lists representing the rows
in a csv file.
"""
to_ret = list()
with open(input_file, "r", newline="", encoding="utf-8") as csvfile:
reader = csv.reader(csvfile)
for row in reader:
to_ret.append(row)
return to_ret
def create_db_and_table(csv_contents, identifier):
"""
Creates a sqlite3 database and table.
Creates the identifier table column along with more
table columns named from col0 to colN. We do this because
the csv column headers can be all sorts of weird stuff. And
we only really care about the order of the columns, and the
identifier so we can set it as the primary key.
No rows are added.
Returns the database path and table name as a tuple.
"""
# assert that the identifier is in the csv_contents header
header = csv_contents[0]
assert(identifier in header)
db_path = os.path.join(working_dir, "csv_input.sqlite")
tablename = "data"
# delete the database if it already exists
if os.path.exists(db_path):
os.remove(db_path)
# create the database, table, and columns
with sqlite3.connect(db_path) as conn:
cursor = conn.cursor()
# prepare the table's columns - it will look like this
# (col1 text, col2 text, col3 text primary key, ...)
name = "col"
counter = 0
column_names = "("
for column in header:
if column == identifier:
column_names += "%s text primary key," % identifier
else:
column_names += "%s text," % (name + str(counter))
counter += 1
# remove the last comma and space
if column_names.endswith(","):
column_names = column_names[0:-1]
column_names += ")"
# prepare the sql statement
sql = "CREATE TABLE %s %s" % (tablename, column_names)
# some performance tweaks for the database
cursor.execute("PRAGMA synchronous=OFF")
cursor.execute("PRAGMA cache_size=20000") # 20,000*1024 = 20.48MB
cursor.execute("PRAGMA journal_mode=off") # dont keep journal of operations
cursor.execute("PRAGMA temp_store=memory") # store temp files in memory
# execute the statement
cursor.execute(sql)
conn.commit()
return db_path, tablename
def delete_db(db_path):
"""
Deletes the sqlite3 database file at the given db_path.
"""
assert(os.path.exists(db_path) is True)
os.remove(db_path)
assert(os.path.exists(db_path) is False)
def load_db_content(db_path, table, contents):
"""
Loads the database table with the given contents.
Skips the first element in contents as that is the
header aka the database column names.
"""
header = contents[0]
num_of_cols = len(header)
assert(num_of_cols != 0)
contents = contents[1:] # remove the header from the contents
# connect to the database
with sqlite3.connect(db_path) as conn:
# only commit once versus after every statement
cursor = conn.cursor()
cursor.execute("BEGIN IMMEDIATE")
# insert into the database in chunks if needed
limit = 999
remaining = num_of_cols
beginning = 0
while remaining > limit: # sqlite column limit
# prepare the sql statement
# this makes the string (?, ?, ?, ...)
columns = "(" + "?," * limit
columns = columns[:-1] # remove last comma
columns += ")"
# prepare the columns to insert
to_insert = []
for i in range(len(contents)):
to_insert.append(contents[i][beginning:beginning+limit])
sql = "INSERT INTO %s VALUES %s" % (table, columns)
cursor.executemany(sql, to_insert)
remaining -= 999
beginning += 999
columns = "(" + "?," * remaining
columns = columns[:-1] # remove last comma
columns += ")"
to_insert = []
for i in range(len(contents)):
to_insert.append(contents[i][beginning:beginning+remaining])
sql = "INSERT INTO %s VALUES %s" % (table, columns)
cursor.executemany(sql, to_insert)
# commit the changes
conn.commit()
def get_na_dict(columns, identifier):
"""
Returns a dict with the given columns as keys, and
"n/a" as the values.
Skip over the identifier because we want to keep
that piece of data as it is.
"""
to_ret = dict()
for column in columns:
if column == identifier:
continue
else:
to_ret[column] = "n/a"
return to_ret
def run_vlookup(input_file, output_file, identifier, db_path, table):
"""
Completes the output file with data from the input file
that matches the record identifier and the header columns.
See the description at the top of this file for an example.
Returns the path of the new output file.
"""
# header of input file
input_header = return_csv_header(input_file)
# fill in the output file with data from the input file
output_file_name = os.path.basename(output_file)
temp_file = os.path.join(working_dir, output_file_name + ".tmp")
with open(output_file, "r", newline="", buffering=buffer_size, encoding="utf-8") as inputcsv:
with open(temp_file, "w", newline="", buffering=buffer_size, encoding="utf-8") as tempcsv:
reader = csv.DictReader(inputcsv)
# set restval to "" which will write empty values for columns
# in the output file that are not in the input file
# set extrasaction to "ignore" which will skip over columns
# from the input file that are not in the output file
writer = csv.DictWriter(tempcsv,
fieldnames=reader.fieldnames,
restval="",
extrasaction="ignore")
writer.writeheader()
# open database connection
with sqlite3.connect(db_path) as conn:
cursor = conn.cursor()
for row in reader:
key = row[identifier] # get the value for the lookup
# fetch the key from the database
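# note: the key is interpolated directly into the SQL string, so a key
# containing a quote character would break this query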
sql = "SELECT * FROM %s where %s = '%s'" % (table, identifier, key)
cursor.execute(sql)
result = cursor.fetchone()
# update the output file with "n/a" if the item
# does not exist in the database
if result is None:
lookup_values = get_na_dict(input_header, identifier)
# otherwise update it with the values from the database
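# (the SELECT returns columns in the same order as the input csv header,
# so zipping with input_header rebuilds a {column: value} dictionary)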
else:
lookup_values = dict(zip(input_header, result))
row.update(lookup_values) # merge the two dictionaries
writer.writerow(row)
return temp_file
def split_file(output_file, cpu_count):
"""
Splits the output_file into cpu_count number of
temporary files.
Returns the names of the temporary files as a list.
"""
# generate the temp file names
output_file_name = os.path.basename(output_file)
tempdir = working_dir
temp_files = []
for i in range(cpu_count):
temp_file = os.path.join(tempdir, "%s_%s" % (output_file_name, i))
temp_files.append(temp_file)
# create the files
with open(output_file, "rb", buffering=0) as outfile:
# calculate size of the file
size = outfile.seek(0, 2)
outfile.seek(0, 0)
# read the header in (at the same time moving the file pointer forward)
header = return_header_bytes(outfile)
# calculate the size of the smaller files (excluding the header)
chunk_size = (size - len(header)) / cpu_count
for file in temp_files:
create_temp_file(file, chunk_size, outfile, header)
return temp_files
def create_temp_file(file, chunk_size, outfile, header):
"""
Creates file with the given header plus chunk_size data
from the given outfile.
Header is a byte string.
If chunk_size is bigger than 100MB we read/write it in
chunks of 100MB.
After reading chunk_size amount of data, if the last byte
does not end with a newline, we keep reading until
it does. That way we don't write a file with truncated data.
If we reach the end of outfile then we stop reading and we
finish our last write.
"""
# start the file with the header
data = header
memory_safe_chunk = 100000000 # bytes, so 100MB
# read in chunk_size data from outfile
if isinstance(chunk_size, float):
chunk_size = int(chunk_size)
# write the data to the temp file
with open(file, "wb", buffering=0) as fp:
# safe to read/write chunk in one go
if chunk_size < memory_safe_chunk:
data += outfile.read(chunk_size)
# need to read/write chunk in chunks, go figure
else:
remaining = chunk_size
while remaining > memory_safe_chunk:
data += outfile.read(memory_safe_chunk)
fp.write(data)
data = b""
remaining -= memory_safe_chunk
data += outfile.read(remaining)
# keep reading 1 byte until we reach a newline
# or the end of the file
while not data.endswith(b"\n"):
char = outfile.read(1)
# reached EOF
if char == b"":
break
else:
data += char
fp.write(data)
del data # free up memory
def return_header_bytes(file_pointer):
"""
Returns the bytes from the file_pointer up to and including
the first newline character.
"""
to_ret = file_pointer.read(1)
while not to_ret.endswith(b"\n"):
to_ret += file_pointer.read(1)
return to_ret
def merge_files(files):
"""
Returns a file that has the contents of files merged
together in one.
Keeps only the header from the first file, and discards
the rest as they are duplicates.
"""
chunk_size = 100000000 # bytes, so 100MB
master_file = os.path.join(working_dir, "temp.csv")
with open(files[0], "rb") as fpointer:
header = return_header_bytes(fpointer)
# open master file for writing
with open(master_file, "wb", buffering=0) as master_fp:
master_fp.write(header)
# loop through each file copying over the contents minus
# the header
for file in files:
# read the temp file in chunks
# and write it to the master file
with open(file, "rb", buffering=0) as temp_fp:
temp_fp.seek(len(header))
data = temp_fp.read(chunk_size)
while data != b"":
master_fp.write(data)
data = temp_fp.read(chunk_size)
del data # free up memory
return master_file
def launch_processes(input_file, output_file, identifier):
"""
Splits the output file into N temporary files.
Launches a process to run the vlookup on each temp file.
Merges the temp files back into one.
Moves the final temp file to the output_file location.
Deletes the N temp files.
"""
# create temporary files equal to the amount of cpu cores
cpu_count = multiprocessing.cpu_count()
files = split_file(output_file, cpu_count)
temp_files = []
# load the input file into memory; this is a memory-hungry operation,
# see note at top of file
contents = return_csv_contents(input_file)
# create sqlite3 database to store input mapping
db_path, table = create_db_and_table(contents, identifier)
load_db_content(db_path, table, contents)
del contents # free up memory
# run vlookup with N processes equal to CPU count
with multiprocessing.Pool(processes=cpu_count) as pool:
results = []
# launch asynchronous processing of each file
for file in files:
res = pool.apply_async(run_vlookup, (input_file, file, identifier, db_path, table))
results.append(res)
# wait for the processes to finish
for result in results:
res = result.get()
temp_files.append(res)
# collect the processes
pool.close()
pool.join()
# delete input mapping db
delete_db(db_path)
# delete the small files
for i in range(len(files)):
os.remove(files[i])
# merge temp files
temp_file = merge_files(temp_files)
# delete temp files
for i in range(len(temp_files)):
os.remove(temp_files[i])
# replace original output file with merged temp file
shutil.move(temp_file, output_file)
if __name__ == "__main__":
print(timeit.default_timer())
input_file = sys.argv[1]
output_file = sys.argv[2]
identifier = sys.argv[3]
launch_processes(input_file, output_file, identifier)
print(timeit.default_timer())