I am using multiprocessing to try to speed up processing roughly 1,000 CSV files (around 500 MB each) with Pandas. I am applying a simple string regex to one column. The program works, but it does not seem to release memory correctly, and each process eventually grows to 40-80 GB even though no single file is larger than 10 GB. Any idea why this happens? I have tried a lot of things to free the memory, but nothing has helped. The full script is below, and a stripped-down sketch of the per-file operation follows it.
import pandas as pd
import numpy as np
import os
import multiprocessing
import gc
from ctypes import cdll, CDLL
from random import shuffle
oldc = ""
newc = ""
NUMPROC = 8
rep = None
cdll.LoadLibrary("libc.so.6")
libc = CDLL("libc.so.6")
def main(filename, oldcol, newcol):
    global oldc
    global newc
    global rep
    names = np.empty([1, 1])
    oldc = oldcol
    newc = newcol
    # Read the lookup file and build a {regex pattern: replacement} mapping for the new column
    df = pd.read_csv(filename)
    names = df.as_matrix()
    del df
    rep = {}
    rep[newc] = {}
    for row in names[1:]:
        oldname = r"^" + str(row[0]) + r"( .*|$)"
        newname = str(row[1]) + r"\1"
        rep[newc][oldname] = newname
    if not os.path.exists("./standardized/"):
        print("Making dir!")
        os.makedirs("./standardized/")
    # Collect the CSVs that still need processing (skip the lookup file, household files,
    # and anything already standardized)
    files = [f for f in os.listdir('.') if (os.path.isfile(f) and ".csv" in f and not (f == filename or "household" in str(f) or os.path.exists("./standardized/" + f[:-4] + "_stnd.csv")))]
    shuffle(files)
    # Re-add any file whose standardized output is smaller than its input
    allfiles = [f for f in os.listdir('.') if ".csv" in f]
    for f in allfiles:
        if os.path.exists("./standardized/" + f[:-4] + "_stnd.csv"):
            if os.path.getsize(f) > os.path.getsize("./standardized/" + f[:-4] + "_stnd.csv"):
                files.append(f)
    print(len(files))
    bundle = [(idx, f) for idx, f in enumerate(files)]
    pool = multiprocessing.Pool(processes=NUMPROC, maxtasksperchild=1)
    r = pool.map_async(process, bundle)
    pool.close()
    pool.join()
def process(bundle):
    global oldc
    global rep
    global newc
    fname = bundle[1]
    idx = bundle[0]
    try:
        print(idx)
        libc.malloc_trim(0)
        curfile = pd.read_csv(fname, dtype="str")
        # Lowercase the source column into the new column, then apply the regex replacements
        curfile[newc] = curfile[oldc].str.lower()
        curfile.replace(to_replace=rep, regex=True, inplace=True)
        curfile.to_csv("./standardized/" + fname[:-4] + "_stnd.csv")
        del curfile
    except:
        print("error on: " + str(fname))
    finally:
        # Attempts to force memory back to the OS after each file
        gc.collect()
        libc.malloc_trim(0)

main("lookup.csv", "namefrst", "stndfrst")