I am trying to find out what makes reading a CSV so slow.
I have tried multiple approaches. After processing, the 8 csv files come to about 6 GB in size and have 10 columns.
My idea was to read the file in one thread and process it in another thread, so that no bandwidth is wasted; basically something I found in another Stack Overflow thread.
Right now just reading the file takes about 1112 s, which equals roughly 7 MB/s. I can get read speeds of about 380 MB/s from that drive via SQL, so there must be some bottleneck or something else going on.
I am not worried about the processing or the rest of it; I just want to read the file into memory as quickly as possible and then process it. There is probably some issue with my code, because pandas is much faster (though still nowhere near the disk speed), see the bottom.
Maybe that is just how it is, but I am not happy with it.
import os, csv, time, math
from queue import Queue
from threading import Thread
file = r'local_disk_file.csv'
out = r'network_location'
_sentinel = object()
def convert10(x10, y10):
    # some processing
    return gridcellid10

def read_file(file, q):
    # producer: push raw csv lines onto the queue
    start = time.monotonic()
    with open(file, 'r', newline='') as inFile:
        next(inFile)  # skip the header row
        for row in inFile:
            q.put(row)
    q.put(_sentinel)
    print('File read in {}s'.format(time.monotonic() - start))

def post_process(in_q):
    # consumer: parse, convert and filter rows, then write them out
    with open(os.path.join(out, "output_on_network.csv"), 'w', newline='') as outFile:
        writer = csv.writer(outFile)
        row = ['AreaID', 'CellID', 'Case1', 'Case2', 'Case3', 'Case4', 'Case5', 'Case6', 'Case7', 'Case8']
        writer.writerow(row)
        for row in iter(in_q.get, _sentinel):
            reader = csv.reader([row])
            for row in reader:
                cellid = convert10(int(row[1]), int(row[2]))
                final_row = [row[0], cellid]
                switch = False
                for item in row[6:]:
                    if int(item) > 15000:
                        switch = True
                        print('Broken row, skipping')
                        print('\t' + ' '.join(row))
                final_row.extend(row[6:])
                if not switch:
                    writer.writerow(final_row)

def main():
    q = Queue()
    t1 = Thread(target=read_file, args=(file, q))
    t2 = Thread(target=post_process, args=(q,))
    t1.start()
    t2.start()

if __name__ == '__main__':
    main()
I tried pandas as well, and it is much faster. The code below takes about 92 s, which equals roughly 81 MB/s.
import pandas as pd, time
file = r'local_disk_file.csv'
start = time.monotonic()
df = pd.read_csv(file)
print(time.monotonic()-start)
EDIT: I also tried just reading the file without doing anything with it. That takes 45 s, which equals 177 MB/s, and I am happy with that.
import time
file = r'local_disk_file.csv'
start = time.monotonic()
with open(file, 'r', newline='') as in_file:
    for row in in_file:
        pass
print(time.monotonic() - start)
Answer 0 (score 0):
So the best option for me was to read the csv with pandas and then apply parallel processing. That way I reach a read speed of 65 MB/s. It is nowhere near the drive's maximum speed, but it speeds up my problem considerably.
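A minimal sketch of that first option, with a hypothetical process_chunk placeholder standing in for the real per-row work (the chunk size and pool size are only illustrative):
import pandas as pd
import multiprocessing as mp

def process_chunk(chunk):
    # hypothetical placeholder for the real per-chunk processing
    return chunk

if __name__ == '__main__':
    df = pd.read_csv(r'local_disk_file.csv')  # one fast sequential read
    # split into row chunks and hand them to a pool of worker processes
    chunks = [df[i:i + 100000] for i in range(0, df.shape[0], 100000)]
    with mp.Pool(8) as pool:
        df = pd.concat(pool.map(process_chunk, chunks))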
A good option is to read the csv once, save it as a parquet file, and work with that. That saves a lot of space, and reading it is very fast because the file is read in parallel; the more columns, the better it gets!
Also, if we read the csv, process it and save the result as parquet, we get a big speedup.
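A minimal sketch of that conversion step, which the code below assumes has already been done (file names taken from the question and from the code further down):
import pandas as pd

# read the csv once and write it back out as parquet for all later runs
df = pd.read_csv(r'local_disk_file.csv')
df.to_parquet(r'in_parquet.pqt', engine='pyarrow', index=False)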
By converting to parquet and processing the parquet file, I can process the whole file in about 140 s. If I read the csv with pandas, the read alone takes about the same amount of time, and the other bottleneck is writing the file back to disk.
For me this means: stop using csv files!
import time, math, pandas as pd
import multiprocessing as mp
import pyarrow.parquet as pq
file = r'in_parquet.pqt'
out = r'out_parquet.pqt'
def gridcellid(x, y, xmin, ymin, xshift, yshift, m, n, r, mtidshift):
    ## some processing
    return gridcellid

def read(file):
    start = time.monotonic()
    df = pd.read_parquet(file, engine='pyarrow')
    print(f'File read in {time.monotonic()-start}s')
    return df

def calculate(df):
    # compute the cell id for every row, then select and rename the output columns
    df['CellID'] = 0
    df['CellID'] = [gridcellid(x, y, 97170, 274320, 0, 0, 0, 6, 10, 0) for x, y in zip(df['x'], df['y'])]
    cols = ['Domain', 'CellID', 'RP00005', 'RP00010', 'RP00050', 'RP00100', 'RP00200', 'RP00500', 'RP01000', 'RP10000']
    df = df.drop(columns=['uwPop', 'uwInd', 'a01_5dPC', 'x', 'y'])
    df = df.reindex(columns=cols)
    df.rename(columns={"Domain": "AreaID", "RP00005": "Case1",
                       "RP00010": "Case2", "RP00050": "Case3", "RP00100": "Case4",
                       "RP00200": "Case5", "RP00500": "Case6", "RP01000": "Case7", "RP10000": "Case8"}, inplace=True)
    return df

def parallelize_dataframe(df, func, n_cores=16):
    # split the frame into row chunks and process them on a pool of workers
    n = 100000
    df_split = [df[i:i+n] for i in range(0, df.shape[0], n)]
    pool = mp.Pool(n_cores)
    li_df = []
    for i in pool.imap(func, df_split):
        li_df.append(i)
    df = pd.concat(li_df)
    pool.close()
    pool.join()
    return df

if __name__ == '__main__':
    start = time.monotonic()
    df_input = read(file)
    df_merged = parallelize_dataframe(df_input, calculate)
    df_merged.to_parquet(out, engine='pyarrow', index=False)
    print(f'File completely processed in {time.monotonic()-start}s')