我希望在下面的代码中,for循环将遍历文件夹中的所有csv,并且df数据框将在从每个csv读取后追加。但是,这里的df永远不会附加,但只包含第一个csv的内容。有什么想法吗?谢谢!
我们在python 3.6和pandas 0.21
path = "/home/ubuntu/QA/client_" + CLIENT_ID + "_raw_data_" + year + "/_ACTUAL_*_Accrual*.xls"
if CLIENT_ID in ('7')
df_columns=pd.DataFrame(columns=['PropID','PROPERTY_CODE','TreeNodeID','ACCOUNT_CODE','TreeNodeName','ReportYear','Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
OUTPUT_CSV="Client_"+CLIENT_ID+"_"+year+"_"+ACCOUNTING_TYPE+"_QA.csv"
df_columns.to_csv(OUTPUT_CSV, header=True, index=False, encoding='utf-8',na_rep="NA", mode='w')
df = pd.DataFrame()
for fname in glob.iglob(path):
print (fname)
df2 = pd.DataFrame()
df2=pd.read_excel(fname,skiprows=4,converters={'TreeNodeCode':np.int64,'PropCode':np.str}).dropna(subset=['TreeNodeCode'],how='any') ## convert the account code in the raw data into strings. dropna drops the raw of the column 4 ,which is the IAM account code, if the column 4 is NA
print (df2)
df=df.append(df2)
df=df.rename(columns={'TreeNodeCode':'ACCOUNT_CODE'})
df=df.rename(columns={'PropCode':'PROPERTY_CODE'})
df['PROPERTY_CODE'] = df_QA['PROPERTY_CODE'].astype(np.str)
df['ACCOUNT_CODE'] = df_QA['ACCOUNT_CODE'].astype(np.str)
df_QA['PROPERTY_CODE'] = df_QA['PROPERTY_CODE'].astype(np.str)
df_QA['ACCOUNT_CODE'] = df_QA['ACCOUNT_CODE'].astype(np.str)
print ("this is df")
print (df)
print ("this is df_QA")
print (df_QA)
df_check=pd.merge(df,df_QA, how='inner',on=['PROPERTY_CODE','ACCOUNT_CODE'])
#print (df_check)
# tricks in this ticket: https://stackoverflow.com/questions/384192823/subtracting-multiple-columns-and-appending-results-in-pandas-dataframe
df_check[['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']] = df_check[['Jan_x','Feb_x','Mar_x','Apr_x','May_x','Jun_x','Jul_x','Aug_x','Sep_x','Oct_x','Nov_x','Dec_x']] - df_check[['Jan_y','Feb_y','Mar_y','Apr_y','May_y','Jun_y','Jul_y','Aug_y','Sep_y','Oct_y','Nov_y','Dec_y']].values
#print (df_check)
df_check2=df_check[['PropID','PROPERTY_CODE','TreeNodeID','ACCOUNT_CODE','TreeNodeName','ReportYear','Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']]
#print (df_check2)
# tricks of panda query: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.query.html#pandas-dataframe-query
df_check3=df_check2.query('Jan > 0 | Jan < 0 | Feb > 0 | Feb < 0 | Mar > 0 | Mar < 0 | Apr > 0 | Apr < 0 | May > 0 | May < 0 | Jun > 0 | Jun < 0 | Jul > 0 | Jul < 0 | Aug > 0 | Aug < 0 | Sep > 0 | Sep < 0 | Oct > 0 | Oct < 0 | Nov > 0 | Nov < 0 | Dec > 0 | Dec < 0')
#print (df_check3)
#print (df_check3.info())
df_check3.to_csv(OUTPUT_CSV, header=False, index=False,
na_rep="NA", mode='a')
答案 0 :(得分:2)
我认为您需要先将每个DataFrame附加到列表中,然后再添加concat
:
dfs = []
for fname in glob.iglob(path):
print (fname)
df2=pd.read_excel(fname,skiprows=4,converters={'TreeNodeCode':np.int64,'PropCode':np.str}).dropna(subset=['TreeNodeCode'],how='any') ## convert the account code in the raw data into strings. dropna drops the raw of the column 4 ,which is the IAM account code, if the column 4 is NA
print (df2)
dfs.append(df2)
df = pd.concat(dfs, ignore_index=True)
还有你的代码:
df=df.rename(columns={'TreeNodeCode':'ACCOUNT_CODE'})
df=df.rename(columns={'PropCode':'PROPERTY_CODE'})
df['PROPERTY_CODE'] = df_QA['PROPERTY_CODE'].astype(np.str)
df['ACCOUNT_CODE'] = df_QA['ACCOUNT_CODE'].astype(np.str)
df_QA['PROPERTY_CODE'] = df_QA['PROPERTY_CODE'].astype(np.str)
df_QA['ACCOUNT_CODE'] = df_QA['ACCOUNT_CODE'].astype(np.str)
应简化为:
df=df.rename(columns={'TreeNodeCode':'ACCOUNT_CODE', 'PropCode':'PROPERTY_CODE'})
cols = ['PROPERTY_CODE','ACCOUNT_CODE']
df_QA[cols] = df[cols] = df_QA[cols].astype(str)