所以我在下面有这个代码:
cols = ['Col001','Col002','Col003','Col004','Col005','Col006','Col007','Col008','Col009',]
import pandas as pd
dataA = [
['AB1', 'A', 100, 'NY', 0.01, 23, 'PQR', 1003, 0.002,],
['AB2', 'B', 201, 'NY', 0.03, 13, 'MNO', 1232, 0.004,],
['AB3', 'A', 234, 'NJ', 0.05, 54, 'ABC', 3443, 0.003,],
['AB4', 'V', 221, 'DE', 0.05, 67, 'ABC', 2345, 0.023,],
['AB5', 'B', 342, 'CT', 0.04, 89, 'MNO', 3457, 0.023,],
['AB6', 'N', 222, 'NY', 0.02, 67, 'PQR', 7665, 0.032,],
['AB7', 'F', 342, 'PA', 0.03, 56, 'ABC', 5767, 0.067,],
['AB8', 'C', 453, 'CA', 0.04, 34, 'PQR', 7563, 0.045,],
['AB9', 'B', 123, 'CT', 0.03, 65, 'PQR', 3465, 0.034,],
['AB10','C', 443, 'NJ', 0.03, 66, 'MNO', 3433, 0.087,],]
dataB = [
['AB1', 'A', 100, 'NY', 0.01, 23, 'PQR', 1003, 0.002,],
['AB2', 'B', 201, 'NY', 0.03, 13, 'MNO', 1232, 0.004,],
['AB3', 'A', 234, 'NJ', 0.05, 54, 'ABC', 3443, 0.003,],
['AB4', 'V', 221, 'DE', 0.08, 67, 'ABC', 2345, 0.023,],
['AB5', 'B', 342, 'NJ', 0.04, 89, 'MNO', 3457, 0.023,],
['AB6', 'N', 222, 'NY', 0.02, 67, 'PQR', 7665, 0.032,],
['AB7', 'F', 342, 'PA', 0.03, 56, 'MNO', 5767, 0.067,],
['AB8', 'C', 453, 'CA', 0.04, 34, 'PQR', 7563, 0.048,],
['AB9', 'B', 123, 'CT', 0.03, 65, 'PQR', 2353, 0.034,],
['AB10','C', 443, 'NJ', 0.03, 66, 'MNO', 3433, 0.087,],]
def getDataFrame(source,sourceName):
df = pd.DataFrame(source,columns=cols)
df['DataSource'] = sourceName
return df
def compareDataFrames(sourceDataFrame,newDataFrame):
targetDF = pd.concat([sourceDataFrame, newDataFrame])
targetDF = targetDF.reset_index(drop=True)
columnsGroup = list(targetDF.columns)
columnsGroup.remove('DataSource')
targetDF_GroupBy = targetDF.groupby(columnsGroup)
idx = [x[0] for x in targetDF_GroupBy.groups.values() if len(x) == 1]
targetDF = targetDF.reindex(idx)
targetDF = targetDF.sort_values(by=['Col001'], ascending=[True])
return targetDF
def getDiff():
sourceData = getDataFrame(dataA,'Legacy')
newData = getDataFrame(dataB,'New')
comparedData = compareDataFrames(sourceData,newData)
return comparedData
它完全按预期工作。输出是
print (getDiff()[cols])
Col001 Col002 Col003 Col004 Col005 Col006 Col007 Col008 Col009
3 AB4 V 221 DE 0.05 67 ABC 2345 0.023
13 AB4 V 221 DE 0.08 67 ABC 2345 0.023
4 AB5 B 342 CT 0.04 89 MNO 3457 0.023
14 AB5 B 342 NJ 0.04 89 MNO 3457 0.023
16 AB7 F 342 PA 0.03 56 MNO 5767 0.067
6 AB7 F 342 PA 0.03 56 ABC 5767 0.067
7 AB8 C 453 CA 0.04 34 PQR 7563 0.045
17 AB8 C 453 CA 0.04 34 PQR 7563 0.048
8 AB9 B 123 CT 0.03 65 PQR 3465 0.034
18 AB9 B 123 CT 0.03 65 PQR 2353 0.034
Press any key to continue . . .
到目前为止,这么好。但我不喜欢输出。即使它找到了diff,它也会显示每个diff行的所有列。
所以,我写了另一种方法,只给我差异:
def createDiffDataFrame(diffDataframe,ignoreCols):
diffData = []
compareCols = diffDataframe.columns
for eachContract in (diffDataframe.Col001.unique()):
legacyRow = diffDataframe[(diffDataframe['DataSource']=='Legacy') & (diffDataframe['Col001']==eachContract)]
newRow = diffDataframe[(diffDataframe['DataSource']=='New') & (diffDataframe['Col001']==eachContract)]
if len(legacyRow) == 0:
diffRow = [eachContract,'MISSING','MISSING','New']
elif len(newRow) == 0:
diffRow = [eachContract,'MISSING','Legacy','MISSING']
else:
for eachCol in compareCols:
if eachCol not in ignoreCols and legacyRow.iloc[0][eachCol] != newRow.iloc[0][eachCol]:
diffRow = [eachContract,eachCol,legacyRow.iloc[0][eachCol],newRow.iloc[0][eachCol]]
diffData.append(diffRow)
diffDF = pd.DataFrame(diffData,columns=['Col001','ColumnName','LegacyValue','NewValue'])
diffDF = diffDF.set_index('Col001')
return diffDF
现在,我的输出很完美:
x = getDiff()
print (createDiffDataFrame(x,['DataSource']))
ColumnName LegacyValue NewValue
Col001
AB4 Col005 0.05 0.08
AB5 Col004 CT NJ
AB7 Col007 ABC MNO
AB8 Col009 0.045 0.048
AB9 Col008 3465 2353
Press any key to continue . . .
我的问题是: 即使我得到了预期的输出,但在114列和超过50K行的现实世界中,运行createDiffDataFrame()模块需要永远。是否有更好的方法来取消createDiffDataFrame?大熊猫应该用于此吗?
答案 0 :(得分:2)
为什么不尝试仅使用pandas
dataA['New']=dataA.apply(lambda x :','.join(x.astype(str)),axis=1)
dataB['New']=dataB.apply(lambda x :','.join(x.astype(str)),axis=1)
DF=pd.concat([dataA,dataB],axis=0)
DF['DIFF']=DF.groupby('New')['New'].transform('count')
DF=DF[DF.DIFF==1]
DF.sort_values('Col001')
Out[1927]:
Col001 Col002 Col003 Col004 Col005 Col006 Col007 Col008 Col009 New DIFF
3 AB4 V 221 DE 0.05 67 ABC 2345 0.023 AB4,V,221,DE,0.05,67,ABC,2345,0.023,AB4,V,221,... 1
3 AB4 V 221 DE 0.08 67 ABC 2345 0.023 AB4,V,221,DE,0.08,67,ABC,2345,0.023,AB4,V,221,... 1
4 AB5 B 342 CT 0.04 89 MNO 3457 0.023 AB5,B,342,CT,0.04,89,MNO,3457,0.023,AB5,B,342,... 1
4 AB5 B 342 NJ 0.04 89 MNO 3457 0.023 AB5,B,342,NJ,0.04,89,MNO,3457,0.023,AB5,B,342,... 1
6 AB7 F 342 PA 0.03 56 ABC 5767 0.067 AB7,F,342,PA,0.03,56,ABC,5767,0.067,AB7,F,342,... 1
6 AB7 F 342 PA 0.03 56 MNO 5767 0.067 AB7,F,342,PA,0.03,56,MNO,5767,0.067,AB7,F,342,... 1
7 AB8 C 453 CA 0.04 34 PQR 7563 0.045 AB8,C,453,CA,0.04,34,PQR,7563,0.045,AB8,C,453,... 1
7 AB8 C 453 CA 0.04 34 PQR 7563 0.048 AB8,C,453,CA,0.04,34,PQR,7563,0.048,AB8,C,453,... 1
8 AB9 B 123 CT 0.03 65 PQR 3465 0.034 AB9,B,123,CT,0.03,65,PQR,3465,0.034,AB9,B,123,... 1
8 AB9 B 123 CT 0.03 65 PQR 2353 0.034 AB9,B,123,CT,0.03,65,PQR,2353,0.034,AB9,B,123,... 1
对于你的第二次出局(使用之前的结果)
DF1=DF.groupby('Col001').agg(lambda x:sorted(set(x), key=list(x).index)).reset_index()
DF1=DF1.set_index('Col001')
DF2=DF1.ix[:,2:10].reset_index()
DF3=pd.melt(DF2,id_vars=['Col001'])
DF3['select']=DF3.value.apply(lambda x : len(x))
DF4=DF3.loc[(DF3['select']>1)]
pd.concat([DF4.reset_index(drop=True),pd.DataFrame(DF4.value.values.tolist())],axis=1).\
rename(columns={0:'LegacyValue',1:'NewValue'}).drop(['value','select'],axis=1)
Out[1909]:
Col001 variable LegacyValue NewValue
0 AB5 Col004 CT NJ
1 AB4 Col005 0.05 0.08
2 AB7 Col007 ABC MNO
3 AB9 Col008 3465 2353
4 AB8 Col009 0.045 0.048
已更新请勿使用.ix
.loc
DF1=DF1.set_index('Col001')
cols = ['Col001','Col002','Col003','Col004','Col005','Col006','Col007','Col008','Col009',]
DF2=DF1.loc[:,DF1.columns.isin(cols)].reset_index()
DF3=pd.melt(DF2,id_vars=['Col001'])
DF3['select']=DF3.value.apply(lambda x : len(x))
DF4=DF3.loc[(DF3['select']>1)]
pd.concat([DF4.reset_index(drop=True),pd.DataFrame(DF4.value.values.tolist())],axis=1).\
rename(columns={0:'LegacyValue',1:'NewValue'}).drop(['value','select'],axis=1)