Question

我最近不得不为一家公司写一个挑战，即根据每个文件的第一个属性将3个CSV文件合并为一个（属性在所有文件中重复）。

我编写代码并将其发送给他们，但他们说需要2分钟才能运行。这很有趣，因为它在我的机器上跑了10秒钟。我的机器有相同的处理器，16GB的RAM，还有一个SSD。非常相似的环境。

我尝试优化它并重新提交。这次他们说他们在Ubuntu机器上运行了11秒，而代码在Windows 10上运行了100秒。

另一个奇怪的事情是，当我尝试使用Profile模块进行分析时，它会永远持续下去，不得不在450秒后终止。我移动到cProfiler并记录了7秒钟。

编辑：问题的确切表述是

编写控制台程序以合并及时中提供的文件   高效的方式。文件路径应作为参数提供，以便   可以在不同的数据集上评估程序。合并的文件   应保存为CSV格式;使用id列作为唯一键   合并;程序应该做任何必要的数据清理和错误   检查。

随意使用您认为合适的任何语言   限制是没有外部图书馆，因为这违背了目的   考试。如果该语言提供CSV解析库（如   Python），请避免使用它们以及这是它的一部分   测试

这里没有进一步的代码：

#!/usr/bin/python3

import sys
from multiprocessing import Pool

HEADERS = ['id']

def csv_tuple_quotes_valid(a_tuple):
    """
    checks if a quotes in each attribute of a entry (i.e. a tuple) agree with the csv format

    returns True or False
    """
    for attribute in a_tuple:
        in_quotes = False
        attr_len = len(attribute)
        skip_next = False

        for i in range(0, attr_len):
            if not skip_next and attribute[i] == '\"':
                if i < attr_len - 1 and attribute[i + 1] == '\"':
                    skip_next = True
                    continue
                elif i == 0 or i == attr_len - 1:
                    in_quotes = not in_quotes
                else:
                    return False
            else:
                skip_next = False

        if in_quotes:
            return False
    return True

def check_and_parse_potential_tuple(to_parse):
    """
    receives a string and returns an array of the attributes of the csv line
    if the string was not a valid csv line, then returns False
    """
    a_tuple = []
    attribute_start_index = 0
    to_parse_len = len(to_parse)
    in_quotes = False
    i = 0

    #iterate through the string (line from the csv)
    while i < to_parse_len:
        current_char = to_parse[i]

        #this works the following way: if we meet a quote ("), it must be in one
        #of five cases: "" | ", | ," | "\0 | (start_of_string)"
        #in case we are inside a quoted attribute (i.e. "123"), then commas are ignored
        #the following code also extracts the tuples' attributes 

        if current_char == '\"':
            if i == 0 or (to_parse[i - 1] == ',' and not in_quotes): # (start_of_string)" and ," case
                #not including the quote in the next attr
                attribute_start_index = i + 1

                #starting a quoted attr
                in_quotes = True
            elif i + 1 < to_parse_len:
                if to_parse[i + 1] == '\"': # "" case
                    i += 1 #skip the next " because it is part of a ""
                elif to_parse[i + 1] == ',' and in_quotes: # ", case
                    a_tuple.append(to_parse[attribute_start_index:i].strip())

                    #not including the quote and comma in the next attr
                    attribute_start_index = i + 2

                    in_quotes = False #the quoted attr has ended

                    #skip the next comma - we know what it is for
                    i += 1
                else:
                    #since we cannot have a random " in the middle of an attr
                    return False 
            elif i == to_parse_len - 1: # "\0 case
                a_tuple.append(to_parse[attribute_start_index:i].strip())

                #reached end of line, so no more attr's to extract
                attribute_start_index = to_parse_len

                in_quotes = False
            else:
                return False
        elif current_char == ',':
            if not in_quotes:
                a_tuple.append(to_parse[attribute_start_index:i].strip())
                attribute_start_index = i + 1

        i += 1

    #in case the last attr was left empty or unquoted
    if attribute_start_index < to_parse_len or (not in_quotes and to_parse[-1] == ','):
        a_tuple.append(to_parse[attribute_start_index:])

    #line ended while parsing; i.e. a quote was openned but not closed 
    if in_quotes:
        return False

    return a_tuple


def parse_tuple(to_parse, no_of_headers):
    """
    parses a string and returns an array with no_of_headers number of headers

    raises an error if the string was not a valid CSV line
    """

    #get rid of the newline at the end of every line
    to_parse = to_parse.strip()

    # return to_parse.split(',') #if we assume the data is in a valid format

    #the following checking of the format of the data increases the execution
    #time by a factor of 2; if the data is know to be valid, uncomment 3 lines above here

    #if there are more commas than fields, then we must take into consideration
    #how the quotes parse and then extract the attributes
    if to_parse.count(',') + 1 > no_of_headers:
        result = check_and_parse_potential_tuple(to_parse)
        if result:
            a_tuple = result
        else:
            raise TypeError('Error while parsing CSV line %s. The quotes do not parse' % to_parse)
    else:
        a_tuple = to_parse.split(',')
        if not csv_tuple_quotes_valid(a_tuple):
            raise TypeError('Error while parsing CSV line %s. The quotes do not parse' % to_parse)

    #if the format is correct but more data fields were provided
    #the following works faster than an if statement that checks the length of a_tuple
    try:
        a_tuple[no_of_headers - 1]
    except IndexError:
        raise TypeError('Error while parsing CSV line %s. Unknown reason' % to_parse)

    #this replaces the use my own hashtables to store the duplicated values for the attributes
    for i in range(1, no_of_headers):
        a_tuple[i] = sys.intern(a_tuple[i])

    return a_tuple


def read_file(path, file_number):
    """
    reads the csv file and returns (dict, int)

    the dict is the mapping of id's to attributes

    the integer is the number of attributes (headers) for the csv file
    """
    global HEADERS

    try:
        file = open(path, 'r');
    except FileNotFoundError as e:
        print("error in %s:\n%s\nexiting...")
        exit(1)

    main_table = {}
    headers = file.readline().strip().split(',')
    no_of_headers = len(headers)

    HEADERS.extend(headers[1:]) #keep the headers from the file

    lines = file.readlines()
    file.close()

    args = []
    for line in lines:
        args.append((line, no_of_headers))

    #pool is a pool of worker processes parsing the lines in parallel
    with Pool() as workers:
        try:
            all_tuples = workers.starmap(parse_tuple, args, 1000)
        except TypeError as e:
            print('Error in file %s:\n%s\nexiting thread...' % (path, e.args))
            exit(1)

    for a_tuple in all_tuples:
        #add quotes to key if needed
        key = a_tuple[0] if a_tuple[0][0] == '\"' else ('\"%s\"' % a_tuple[0])
        main_table[key] = a_tuple[1:]

    return (main_table, no_of_headers)

def merge_files():
    """
    produces a file called merged.csv 
    """
    global HEADERS

    no_of_files = len(sys.argv) - 1
    processed_files = [None] * no_of_files

    for i in range(0, no_of_files):
        processed_files[i] = read_file(sys.argv[i + 1], i)

    out_file = open('merged.csv', 'w+')

    merged_str = ','.join(HEADERS)

    all_keys = {}
    #this is to ensure that we include all keys in the final file.
    #even those that are missing from some files and present in others
    for processed_file in processed_files:
        all_keys.update(processed_file[0])

    for key in all_keys:
        merged_str += '\n%s' % key
        for i in range(0, no_of_files):
            (main_table, no_of_headers) = processed_files[i]

            try:
                for attr in main_table[key]:
                    merged_str += ',%s' % attr
            except KeyError:
                print('NOTE: no values found for id %s in file \"%s\"' % (key, sys.argv[i + 1]))
                merged_str += ',' * (no_of_headers - 1)

    out_file.write(merged_str)
    out_file.close()

if __name__ == '__main__':
    # merge_files()
    import cProfile
    cProfile.run('merge_files()')

# import time
# start = time.time()

# print(time.time() - start);

Here是我在Windows上获得的分析器报告。

编辑：提供的其余csv数据是here。 Pastebin处理文件的时间太长，所以......

它可能不是最好的代码，我知道，但我的问题是什么减慢了Windows，以至于不会减慢Ubuntu的速度？ merge_files（）函数占用时间最长，仅为自己94秒，不包括对其他函数的调用。对我来说似乎没有什么太明显的，为什么它如此缓慢。

由于

编辑：注意：我们都使用相同的数据集来运行代码。

Answer 1

当我使用三个给定文件在Ubuntu 16.04上运行您的解决方案时，似乎需要大约8秒才能完成。我做的唯一修改是取消注释底部的时间码并使用它。

$ python3 dimitar_merge.py file1.csv file2.csv file3.csv
NOTE: no values found for id "aaa5d09b-684b-47d6-8829-3dbefd608b5e" in file "file2.csv"
NOTE: no values found for id "38f79a49-4357-4d5a-90a5-18052ef03882" in file "file2.csv"
NOTE: no values found for id "766590d9-4f5b-4745-885b-83894553394b" in file "file2.csv"
8.039648056030273
$ python3 dimitar_merge.py file1.csv file2.csv file3.csv
NOTE: no values found for id "38f79a49-4357-4d5a-90a5-18052ef03882" in file "file2.csv"
NOTE: no values found for id "766590d9-4f5b-4745-885b-83894553394b" in file "file2.csv"
NOTE: no values found for id "aaa5d09b-684b-47d6-8829-3dbefd608b5e" in file "file2.csv"
7.78482985496521

我在没有使用标准库中的csv的情况下重写了我的第一次尝试，现在时间大约为4.3秒。

$ python3 lettuce_merge.py file1.csv file2.csv file3.csv
4.332579612731934
$ python3 lettuce_merge.py file1.csv file2.csv file3.csv
4.305467367172241
$ python3 lettuce_merge.py file1.csv file2.csv file3.csv
4.27345871925354

这是我的解决方案代码（lettuce_merge.py）：

from collections import defaultdict


def split_row(csv_row):
    return [col.strip('"') for col in csv_row.rstrip().split(',')]


def merge_csv_files(files):
    file_headers = []
    merged_headers = []
    for i, file in enumerate(files):
        current_header = split_row(next(file))
        unique_key, *current_header = current_header
        if i == 0:
            merged_headers.append(unique_key)
        merged_headers.extend(current_header)
        file_headers.append(current_header)

    result = defaultdict(lambda: [''] * (len(merged_headers) - 1))
    for file_header, file in zip(file_headers, files):
        for line in file:
            key, *values = split_row(line)
            for col_name, col_value in zip(file_header, values):
                result[key][merged_headers.index(col_name) - 1] = col_value
        file.close()

    quotes = '"{}"'.format
    with open('lettuce_merged.csv', 'w') as f:
        f.write(','.join(quotes(a) for a in merged_headers) + '\n')
        for key, values in result.items():
            f.write(','.join(quotes(b) for b in [key] + values) + '\n')


if __name__ == '__main__':
    from argparse import ArgumentParser, FileType
    from time import time

    parser = ArgumentParser()
    parser.add_argument('files', nargs='*', type=FileType('r'))
    args = parser.parse_args()

    start_time = time()
    merge_csv_files(args.files)
    print(time() - start_time)

我确信此代码可以进一步优化，但有时只是看到另一种解决问题的方法可以帮助激发新想法。

Answer 2

事实证明，Windows和Linux处理非常长字符串的方式不同。当我将out_file.write(merged_str)移到外部for循环（for key in all_keys:）内并停止追加到merged_str时，它按预期运行了11秒。我对OS的任何一个内存管理系统都没有足够的知识来预测它为何如此不同。

但是我会说第二个（Windows one）的方式是更加自动防故障的方法，因为在内存中保留30 MB的字符串是不合理的。事实证明Linux看到了并且并不总是试图将字符串保留在缓存中，或者每次都重建它。

有趣的是，最初我在我的Linux机器上使用相同的写入策略运行了几次，并且带有大字符串的那个似乎更快，所以我坚持使用它。我想你永远不会知道。

这是修改后的代码

    for key in all_keys:
        merged_str = '%s' % key
        for i in range(0, no_of_files):
            (main_table, no_of_headers) = processed_files[i]

            try:
                for attr in main_table[key]:
                    merged_str += ',%s' % attr
            except KeyError:
                print('NOTE: no values found for id %s in file \"%s\"' % (key, sys.argv[i + 1]))
                merged_str += ',' * (no_of_headers - 1)
        out_file.write(merged_str + '\n')

    out_file.close()

Python 3.6脚本在Windows 10上出乎意料地慢，但在Ubuntu 17.10上却没有

2 个答案: