I have a large csv file (~5-10 GB) that I am converting to a pandas dataframe, processing, and then converting back to a csv file.
Here is my code:
import os
import json
import numpy as np
import pandas as pd
from more_itertools import unique_everseen  # unique_everseen comes from the more_itertools package

# Convert csv to dataframe
df = pd.read_csv('A.csv')
# Normalize dataframe column names
df.columns = df.columns.str.lower()
df.columns = df.columns.str.strip()
df.columns = df.columns.str.replace(' ','_')
df.columns = df.columns.str.replace('-','_')
df.columns = df.columns.str.replace(':',"_")
#Remove duplicate columns in the dataframe
df = df.loc[:,~df.columns.duplicated()]
#Drop rows with null date which are aggregate values
df = df[pd.notnull(df['date'])]
## Formatting and handling headers
# Normalize the headers the same way as the columns above (plus '.')
headers_new_unsorted = [
    h.lower().strip().replace(" ", "_").replace("-", "_").replace(".", "_").replace(":", "_")
    for h in df.columns
]
# Deduplicate while preserving order
headers_new = list(unique_everseen(headers_new_unsorted))
# If the headers text file is not present in the local folder, create one
if not os.path.exists('head.txt'):
    open('head.txt', 'w').close()
with open('head.txt', 'r') as f:
    baseline_headers = f.read().split('\n')
Dfrn_header = set(baseline_headers).symmetric_difference(headers_new)
Dfrn_header = list(filter(None, Dfrn_header))
# Add baseline columns missing from this file as empty columns
for i in Dfrn_header:
    if i in baseline_headers:
        df[i] = np.nan
organize_headers = baseline_headers
# Append newly added columns to the organize_headers list
for i in headers_new:
    if i not in baseline_headers:
        organize_headers = organize_headers + [i]
organize_headers = list(filter(None, organize_headers))
print(organize_headers)
new_headers_added = set(organize_headers) - set(baseline_headers)
new_headers_added = [o for o in organize_headers if o in new_headers_added]
# Order the dataframe columns to match the organize_headers list
df = df[organize_headers]
#** Start Data processing **#
# Fill null values with the string "None"
df["A"].fillna("None", inplace=True)
df["P"].fillna("None", inplace=True)
df["C"].fillna("None", inplace=True)
df["D"].fillna("None", inplace=True)
# Set State based on City
df.loc[df.C.str.startswith('New York'), 'State'] = "NY"
df.loc[df.C.str.startswith('San Jose'), 'State'] = "California"
df.loc[df.C.str.startswith('Portland'), 'State'] = "Oregon"
df.loc[df.C.str.startswith('Arlington'), 'State'] = "Texas"
df.loc[df.C.str.startswith('San Diego'), 'State'] = "California"
df.loc[df.C.str.startswith('LA'), 'State'] = "California"
df.loc[df.C.str.startswith('Rolla'), 'State'] = "Missouri"
df.loc[df.C.str.startswith('Detroit'), 'State'] = "MI"
df.loc[df.C.str.startswith('Chicago'), 'State'] = "IL"
df.loc[df.C.str.startswith('Louisville'), 'State'] = "Kentucky"
df.loc[df.C.str.startswith('San Francisco'), 'State'] = "California"
df["State"].fillna("None", inplace = True)
with open('test.json') as jf:
    ref_data = json.load(jf)
# Fill null cells from the reference data where column "A" matches
for entry in ref_data:
    for key, value in entry.items():
        if str(key) != "A":
            mask = df[str(key)].isnull() & df.A.astype(str).str.contains(str(ref_data[0]["A"]))
            df.loc[mask, str(key)] = str(value)
#** End of Data Processing **#
df.to_csv('processed.csv', sep=',', index=False)
I run out of memory when processing large files. I have already increased the RAM on the machine running this code, and increasing it further is not feasible. How can I reduce the memory usage?
Answer 0 (score: 1)
Pandas' read_csv has a skiprows parameter that lets you choose which rows to read into memory, so you can load the huge file one chunk at a time and process each chunk separately.
An example is below. For simplicity, say all of your processing is done in a process() function, and your csv has 1,000,000 rows.
import pandas as pd

chunk_size = 100000  # 1/10 of the 1,000,000-row file
for i in range(0, 10):
    # Skip every row except the header (row 0) and the rows in the current chunk
    skipfunc = lambda x: x != 0 and not (i * chunk_size < x <= (i + 1) * chunk_size)
    df = pd.read_csv('A.csv', skiprows=skipfunc)  # only process 1/10 of the csv at a time
    processed_df = process(df)
    # Write the header once, then append the remaining chunks
    processed_df.to_csv('processed.csv', sep=',', index=False,
                        mode='w' if i == 0 else 'a', header=(i == 0))
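As an aside not in the original answer: read_csv also accepts a chunksize argument, which returns an iterator of dataframes and avoids re-scanning the whole file for every chunk. A minimal sketch, assuming the same hypothetical process() function:

import pandas as pd

# Stream the csv 100,000 rows at a time; each chunk is an ordinary dataframe
reader = pd.read_csv('A.csv', chunksize=100000)
for i, chunk in enumerate(reader):
    processed = process(chunk)  # process() is the same placeholder as above
    processed.to_csv('processed.csv', sep=',', index=False,
                     mode='w' if i == 0 else 'a', header=(i == 0))

Because only one chunk is in memory at a time, peak memory stays roughly proportional to the chunk size rather than the full file.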