I am trying to run a Python script that uses an ID and the column headers to cross-reference data, pulling values from an input .csv file and writing them into an output .csv file.
These are very large files, so I added multiprocessing; the first step is to split the output file into multiple parts so they can be processed in parallel.
When I try this on Computer A, the disk speed while splitting the output file sits at 0 MB/s (or a very slow trickle) and the program never finishes. Meanwhile, on Computer B the program runs correctly and splits the output file at a disk speed of ~40 MB/s.
Same files, same code, so why doesn't Computer A behave correctly? Is it an OS error? Am I missing a driver on Computer A? Something else?
The input and output files live on a WD 4TB external hard drive.
Computer A specs: Windows 10 Pro 64-bit, Intel i9 7920X 12-core, 8 x 8GB Geil Evo Potenza RAM, Samsung 850 Evo 500GB local SSD, WD 4TB external hard drive, ASRock X299 Killer motherboard
Computer B specs: Windows 10 Pro 64-bit, Intel i7 6700K 4-core, 2 x 16GB Geil Evo Forza RAM, PNY CS1311 240GB SSD, WD 4TB external hard drive, MSI B250M Gaming Pro motherboard
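For reference, this is how I invoke the script below (the script name, paths, and identifier column name are just placeholders):
python csv_vlookup.py E:\input.csv E:\output.csv record_id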
# std lib imports
import csv
import multiprocessing
import os
import os.path
import shutil
import sqlite3
import sys
import tempfile
import timeit
# third party imports
# our imports
buffer_size = 8192000 # 8.192 MB, 1000x the default (io.DEFAULT_BUFFER_SIZE)
# working_dir = tempfile.gettempdir()
working_dir = "E:\\temp_files"
def return_csv_header(filename):
"""
Returns the first row (the header) of the csv file filename
as a list.
"""
with open(filename, "r", newline="", encoding="utf-8") as csvfile:
reader = csv.reader(csvfile)
return next(reader)
def return_input_mapping(input_file, identifier):
"""
Returns a {identifier:record} dictionary where
identifier is the value of the identifier column
for each row in the input file.
record is a dictionary of {column:value}
representing a row in a csv file.
"""
to_ret = dict()
with open(input_file, "r", newline="", encoding="utf-8") as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
key = row[identifier]
to_ret[key] = row
return to_ret
def return_csv_contents(input_file):
"""
Returns a list of lists representing the rows
in a csv file.
"""
to_ret = list()
with open(input_file, "r", newline="", encoding="utf-8") as csvfile:
reader = csv.reader(csvfile)
for row in reader:
to_ret.append(row)
return to_ret
def create_db_and_table(csv_contents, identifier):
"""
Creates a sqlite3 database and table.
Creates the identifier table column along with more
table columns named from col0 to colN. We do this because
the csv column headers can be all sorts of weird stuff. And
we only really care about the order of the columns, and the
identifier so we can set it as the primary key.
No rows are added.
Returns the database path and table name as a tuple.
"""
# assert that the identifier is in the csv_contents header
header = csv_contents[0]
assert(identifier in header)
db_path = os.path.join(working_dir, "csv_input.sqlite")
tablename = "data"
# delete the database if it already exists
if os.path.exists(db_path):
os.remove(db_path)
# create the database, table, and columns
with sqlite3.connect(db_path) as conn:
cursor = conn.cursor()
# prepare the table's columns - it will look like this
# (col1 text, col2 text, col3 text primary key, ...)
name = "col"
counter = 0
column_names = "("
for column in header:
if column == identifier:
column_names += "%s text primary key," % identifier
else:
column_names += "%s text," % (name + str(counter))
counter += 1
# remove the last comma and space
if column_names.endswith(","):
column_names = column_names[0:-1]
column_names += ")"
# prepare the sql statement
sql = "CREATE TABLE %s %s" % (tablename, column_names)
# some performance tweaks for the database
cursor.execute("PRAGMA synchronous=OFF")
cursor.execute("PRAGMA cache_size=20000") # 20,000*1024 = 20.48MB
cursor.execute("PRAGMA journal_mode=off") # dont keep journal of operations
cursor.execute("PRAGMA temp_store=memory") # store temp files in memory
# execute the statement
cursor.execute(sql)
conn.commit()
return db_path, tablename
def delete_db(db_path):
"""
Deletes the sqlite3 database file at the given db_path.
"""
assert(os.path.exists(db_path) is True)
os.remove(db_path)
assert(os.path.exists(db_path) is False)
def load_db_content(db_path, table, contents):
"""
Loads the database table with the given contents.
Skips the first element in contents as that is the
header aka the database column names.
"""
header = contents[0]
num_of_cols = len(header)
assert(num_of_cols != 0)
contents = contents[1:] # remove the header from the contents
# connect to the database
with sqlite3.connect(db_path) as conn:
# only commit once versus after every statement
cursor = conn.cursor()
cursor.execute("BEGIN IMMEDIATE")
# insert into the database in chunks if needed
limit = 999
remaining = num_of_cols
beginning = 0
while remaining > limit: # sqlite column limit
# prepare the sql statement
# this makes the string (?, ?, ?, ...)
columns = "(" + "?," * limit
columns = columns[:-1] # remove last comma
columns += ")"
# prepare the columns to insert
to_insert = []
for i in range(len(contents)):
to_insert.append(contents[i][beginning:beginning+limit])
sql = "INSERT INTO %s VALUES %s" % (table, columns)
cursor.executemany(sql, to_insert)
remaining -= 999
beginning += 999
columns = "(" + "?," * remaining
columns = columns[:-1] # remove last comma
columns += ")"
to_insert = []
for i in range(len(contents)):
to_insert.append(contents[i][beginning:beginning+remaining])
sql = "INSERT INTO %s VALUES %s" % (table, columns)
cursor.executemany(sql, to_insert)
# commit the changes
conn.commit()
def get_na_dict(columns, identifier):
"""
Returns a dict with the given columns as keys, and
"n/a" as the values.
Skip over the identifier because we want to keep
that piece of data as it is.
"""
to_ret = dict()
for column in columns:
if column == identifier:
continue
else:
to_ret[column] = "n/a"
return to_ret
def run_vlookup(input_file, output_file, identifier, db_path, table):
"""
Completes the output file with data from the input file
that matches the record identifier and the header columns.
See the description at the top of this file for an example.
Returns the path of the new output file.
"""
# header of input file
input_header = return_csv_header(input_file)
# fill in the output file with data from the input file
output_file_name = os.path.basename(output_file)
temp_file = os.path.join(working_dir, output_file_name + ".tmp")
with open(output_file, "r", newline="", buffering=buffer_size, encoding="utf-8") as inputcsv:
with open(temp_file, "w", newline="", buffering=buffer_size, encoding="utf-8") as tempcsv:
reader = csv.DictReader(inputcsv)
# set restval to "" which will write empty values for columns
# in the output file that are not in the input file
# set extrasaction to "ignore" which will skip over columns
# from the input file that are not in the output file
writer = csv.DictWriter(tempcsv,
fieldnames=reader.fieldnames,
restval="",
extrasaction="ignore")
writer.writeheader()
# open database connection
with sqlite3.connect(db_path) as conn:
cursor = conn.cursor()
for row in reader:
key = row[identifier] # get the value for the lookup
# fetch the key from the database
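# note: the key is interpolated directly into the SQL string, so a key
# containing a quote character would break this query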
sql = "SELECT * FROM %s where %s = '%s'" % (table, identifier, key)
cursor.execute(sql)
result = cursor.fetchone()
# update the output file with "n/a" if the item
# does not exist in the database
if result is None:
lookup_values = get_na_dict(input_header, identifier)
# otherwise update it with the values from the database
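# (the SELECT returns columns in the same order as the input csv header,
# so zipping with input_header rebuilds a {column: value} dictionary)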
else:
lookup_values = dict(zip(input_header, result))
row.update(lookup_values) # merge the two dictionaries
writer.writerow(row)
return temp_file
def split_file(output_file, cpu_count):
"""
Splits the output_file into cpu_count number of
temporary files.
Returns the names of the temporary files as a list.
"""
# generate the temp file names
output_file_name = os.path.basename(output_file)
tempdir = working_dir
temp_files = []
for i in range(cpu_count):
temp_file = os.path.join(tempdir, "%s_%s" % (output_file_name, i))
temp_files.append(temp_file)
# create the files
with open(output_file, "rb", buffering=0) as outfile:
# calculate size of the file
size = outfile.seek(0, 2)
outfile.seek(0, 0)
# read the header in (at the same time moving the file pointer forward)
header = return_header_bytes(outfile)
# calculate the size of the smaller files (excluding the header)
chunk_size = (size - len(header)) / cpu_count
for file in temp_files:
create_temp_file(file, chunk_size, outfile, header)
return temp_files
def create_temp_file(file, chunk_size, outfile, header):
"""
Creates file with the given header plus chunk_size data
from the given outfile.
Header is a byte string.
If chunk_size is bigger than 100MB we read/write it in
chunks of 100MB.
After reading chunk_size amount of data, if the last byte
does not end with a newline, we keep reading until
it does. That way we don't write a file with truncated data.
If we reach the end of outfile then we stop reading and we
finish our last write.
"""
# start the file with the header
data = header
memory_safe_chunk = 100000000 # bytes, so 100MB
# read in chunk_size data from outfile
if isinstance(chunk_size, float):
chunk_size = int(chunk_size)
# write the data to the temp file
with open(file, "wb", buffering=0) as fp:
# safe to read/write chunk in one go
if chunk_size < memory_safe_chunk:
data += outfile.read(chunk_size)
# need to read/write chunk in chunks, go figure
else:
remaining = chunk_size
while remaining > memory_safe_chunk:
data += outfile.read(memory_safe_chunk)
fp.write(data)
data = b""
remaining -= memory_safe_chunk
data += outfile.read(remaining)
# keep reading 1 byte until we reach a newline
# or the end of the file
while not data.endswith(b"\n"):
char = outfile.read(1)
# reached EOF
if char == b"":
break
else:
data += char
fp.write(data)
del data # free up memory
def return_header_bytes(file_pointer):
"""
Returns the bytes from the file_pointer up to and including
the first newline character.
"""
to_ret = file_pointer.read(1)
while not to_ret.endswith(b"\n"):
to_ret += file_pointer.read(1)
return to_ret
def merge_files(files):
"""
Returns a file that has the contents of files merged
together in one.
Keeps only the header from the first file, and discards
the rest as they are duplicates.
"""
chunk_size = 100000000 # bytes, so 100MB
master_file = os.path.join(working_dir, "temp.csv")
with open(files[0], "rb") as fpointer:
header = return_header_bytes(fpointer)
# open master file for writing
with open(master_file, "wb", buffering=0) as master_fp:
master_fp.write(header)
# loop through each file copying over the contents minus
# the header
for file in files:
# read the temp file in chunks
# and write it to the master file
with open(file, "rb", buffering=0) as temp_fp:
temp_fp.seek(len(header))
data = temp_fp.read(chunk_size)
while data != b"":
master_fp.write(data)
data = temp_fp.read(chunk_size)
del data # free up memory
return master_file
def launch_processes(input_file, output_file, identifier):
"""
Splits the output file into N temporary files.
Launches a process to run the vlookup on each temp file.
Merges the temp files back into one.
Moves the final temp file to the output_file location.
Deletes the N temp files.
"""
# create temporary files equal to the amount of cpu cores
cpu_count = multiprocessing.cpu_count()
files = split_file(output_file, cpu_count)
temp_files = []
# load the input file into memory; this is a memory-hungry operation,
# see note at top of file
contents = return_csv_contents(input_file)
# create sqlite3 database to store input mapping
db_path, table = create_db_and_table(contents, identifier)
load_db_content(db_path, table, contents)
del contents # free up memory
# run vlookup with N processes equal to CPU count
with multiprocessing.Pool(processes=cpu_count) as pool:
results = []
# launch asynchronous processing of each file
for file in files:
res = pool.apply_async(run_vlookup, (input_file, file, identifier, db_path, table))
results.append(res)
# wait for the processes to finish
for result in results:
res = result.get()
temp_files.append(res)
# collect the processes
pool.close()
pool.join()
# delete input mapping db
delete_db(db_path)
# delete the small files
for i in range(len(files)):
os.remove(files[i])
# merge temp files
temp_file = merge_files(temp_files)
# delete temp files
for i in range(len(temp_files)):
os.remove(temp_files[i])
# replace original output file with merged temp file
shutil.move(temp_file, output_file)
if __name__ == "__main__":
print(timeit.default_timer())
input_file = sys.argv[1]
output_file = sys.argv[2]
identifier = sys.argv[3]
launch_processes(input_file, output_file, identifier)
print(timeit.default_timer())