我有一个类似下面这样的 df:
uid services
000c80b7d2b3643689b1e516918ec193 ['A']
001b292c588ec6cc11f57324d40e422d ['B','A','C']
006696f65899fdd87ba4894c784716f9 ['C','B']
(services 列中的列表没有排序)
我想把列表重新映射为列:
uid services A B C
000c80b7d2b3643689b1e516918ec193 ['A'] 1 0 0
001b292c588ec6cc11f57324d40e422d ['B','A','C'] 1 1 1
006696f65899fdd87ba4894c784716f9 ['C','B'] 0 1 1
谢谢
答案 0 :(得分:0)
您可以先使用 MultiLabelBinarizer,然后再使用 join:
from sklearn.preprocessing import MultiLabelBinarizer

# Fit the binarizer on the list column; each distinct label found in the lists
# becomes one indicator column, labelled from mlb.classes_.
mlb = MultiLabelBinarizer()
indicator_matrix = mlb.fit_transform(df['services'])
print(pd.DataFrame(indicator_matrix, columns=mlb.classes_, index=df.index))
A B C
0 1 0 0
1 1 1 1
2 0 1 1
# Build the indicator frame on df's own index so that join() lines the rows up,
# then attach the indicator columns to the original frame.
indicator_matrix = mlb.fit_transform(df['services'])
df1 = pd.DataFrame(indicator_matrix, columns=mlb.classes_, index=df.index)
df = df.join(df1)
print(df)
uid services A B C
0 000c80b7d2b3643689b1e516918ec193 [A] 1 0 0
1 001b292c588ec6cc11f57324d40e422d [B, A, C] 1 1 1
2 006696f65899fdd87ba4894c784716f9 [C, B] 0 1 1
纯 pandas 的替代方案:先用 DataFrame 构造函数把列表展开,再使用 get_dummies,最后按列 groupby 并用 max 聚合:
# Spread every list into positional columns, one-hot encode them with an empty
# prefix so the dummy columns are named after the values themselves, then merge
# the columns that share a name by taking the max across them.
# The expression is parenthesised so the chained .groupby(...) call can sit on
# its own line; without the parentheses the second line is a syntax error.
# NOTE(review): groupby(axis=1) is deprecated in recent pandas releases --
# confirm against the pandas version in use.
df1 = (pd.get_dummies(pd.DataFrame(df['services'].values.tolist()), prefix='', prefix_sep='')
         .groupby(axis=1, level=0).max())
print (df1)
A B C
0 1 0 0
1 1 1 1
2 0 1 1
df = df.join(df1)
print (df)
uid services A B C
0 000c80b7d2b3643689b1e516918ec193 [A] 1 0 0
1 001b292c588ec6cc11f57324d40e422d [B, A, C] 1 1 1
2 006696f65899fdd87ba4894c784716f9 [C, B] 0 1 1
**计时**:
#3k rows
df = pd.concat([df]*1000).reset_index(drop=True)
#John Galt solution
In [255]: %timeit (df.join(df.services.apply(lambda x: pd.Series({y:1 for y in x})).fillna(0).astype(int)))
1 loop, best of 3: 658 ms per loop
#user1717828 solution
In [256]: %timeit (df.join(df['services'].apply(lambda x: "|".join(x)).str.get_dummies()))
100 loops, best of 3: 16.8 ms per loop
#Jez solution1
In [257]: %timeit (df.join(pd.DataFrame(mlb.fit_transform(df['services']),columns=mlb.classes_, index=df.index)))
100 loops, best of 3: 4.66 ms per loop
#Jez solution2
In [258]: %timeit (df.join(pd.get_dummies(pd.DataFrame(df['services'].values.tolist()), prefix='', prefix_sep='').groupby(axis=1, level=0).max()))
100 loops, best of 3: 7.04 ms per loop
#30k rows
df = pd.concat([df]*10000).reset_index(drop=True)
#John Galt solution
In [260]: %timeit (df.join(df.services.apply(lambda x: pd.Series({y:1 for y in x})).fillna(0).astype(int)))
1 loop, best of 3: 6.68 s per loop
#user1717828 solution
In [261]: %timeit (df.join(df['services'].apply(lambda x: "|".join(x)).str.get_dummies()))
10 loops, best of 3: 138 ms per loop
#Jez solution1
In [262]: %timeit (df.join(pd.DataFrame(mlb.fit_transform(df['services']),columns=mlb.classes_, index=df.index)))
10 loops, best of 3: 39.8 ms per loop
#Jez solution2
In [263]: %timeit (df.join(pd.get_dummies(pd.DataFrame(df['services'].values.tolist()), prefix='', prefix_sep='').groupby(axis=1, level=0).max()))
10 loops, best of 3: 20.6 ms per loop
答案 1 :(得分:0)
In [1158]: df.join(df.services.apply(lambda x: pd.Series({y:1 for y in x})).fillna(0))
Out[1158]:
uid services A B C
0 000c80b7d2b3643689b1e516918ec193 [A] 1.0 0.0 0.0
1 001b292c588ec6cc11f57324d40e422d [B, A, C] 1.0 1.0 1.0
2 006696f65899fdd87ba4894c784716f9 [C, B] 0.0 1.0 1.0
答案 2 :(得分:0)
df['A'] = list(map(lambda x: 1 if 'A' in x else 0, df['Services'].tolist()))
df['B'] = list(map(lambda x: 1 if 'B' in x else 0, df['Services'].tolist()))
df['C'] = list(map(lambda x: 1 if 'C' in x else 0, df['Services'].tolist()))
答案 3 :(得分:0)
快速回答:
df.join(df['services'].apply(lambda x: "|".join(x)).str.get_dummies())
一种方法是将字符列表转换为带分隔符的字符串(这里使用管道符号 |),然后使用 pd.Series.str.get_dummies:
# Three-row example frame: a single 'services' column holding lists of labels,
# indexed by an index named 'UID'.
service_lists = [['A'], ['A', 'B', 'C'], ['B', 'C']]
uid_index = pd.Index(['abc', 'def', 'ghi'], name='UID')
df = pd.DataFrame({'services': service_lists}, index=uid_index)
df
services
UID
abc [A]
def [A, B, C]
ghi [B, C]
(df['services']
.apply(lambda x: "|".join(x))
.str.get_dummies())
A B C
UID
abc 1 0 0
def 1 1 1
ghi 0 1 1
再与原始 DataFrame 合并,就得到一行代码的写法:
df.join(df['services'].apply(lambda x: "|".join(x)).str.get_dummies())
services A B C
UID
abc [A] 1 0 0
def [A, B, C] 1 1 1
ghi [B, C] 0 1 1
答案 4 :(得分:0)
import pandas as pd
# Load the raw table and split it into one sub-frame per student_id.
# NOTE(review): Python only recognises 'ANSI' as a codec alias (for 'mbcs') on
# Windows, so this read is expected to fail elsewhere -- confirm the file's
# actual encoding.
d = pd.read_csv('TestingAccommodations.csv', encoding = 'ANSI')
q = dict(tuple(d.groupby('student_id')))
#%%
# Fields copied from the first row of each student's group.
first_row_fields = ['last_nam', 'first_nam', 'school_dbn', 'official_class', 'grade_level']
# Fields whose values are gathered into one list per student.
list_fields = ['Accommodation', 'Description', 'SpecificImpRecoms', 'OtherAccommodation']

student_ids = []
gathered = {field: [] for field in first_row_fields + list_fields}
for student_id, group in q.items():
    student_ids.append(student_id)
    for field in first_row_fields:
        gathered[field].append(group[field].iloc[0])
    for field in list_fields:
        gathered[field].append(list(group[field]))

# The output columns are named by position from the CSV header, so this assumes
# the first ten CSV columns are student_id, then first_row_fields, then
# list_fields, in that order -- TODO confirm against the file.
cols = d.columns.to_list()
column_values = [student_ids] + [gathered[field] for field in first_row_fields + list_fields]
new = pd.DataFrame({cols[position]: values for position, values in enumerate(column_values)})

# Persist the one-row-per-student table.
new.to_excel('new.xlsx',index = False)
#%%
new.columns

# Spread each list-valued column of `new` into numbered string columns
# (<prefix>_1, <prefix>_2, ...). fillna('') blanks the cells pandas leaves
# missing when a row's list is shorter than the longest one.
# Each expansion must be renamed with its OWN labelling function: using `f`
# for all four would label every expanded column 'Accomodation_N' and leave
# `new2` with duplicate column names.
# NOTE(review): the 'Accomodation_' prefix spelling is kept exactly as it was.
f = lambda x: 'Accomodation_{}'.format(x + 1)
op = pd.DataFrame(
    new.Accommodation.values.tolist(),
    new.index, dtype=str).fillna('').rename(columns=f)

f2 = lambda x: 'Description_{}'.format(x + 1)
op2 = pd.DataFrame(
    new.Description.values.tolist(),
    new.index, dtype=str).fillna('').rename(columns=f2)

f3 = lambda x: 'SpecificImpRecoms_{}'.format(x + 1)
op3 = pd.DataFrame(
    new.SpecificImpRecoms.values.tolist(),
    new.index, dtype=str).fillna('').rename(columns=f3)

f4 = lambda x: 'OtherAccommodation_{}'.format(x + 1)
op4 = pd.DataFrame(
    new.OtherAccommodation.values.tolist(),
    new.index, dtype=str).fillna('').rename(columns=f4)

# Place the expanded columns side by side with the original per-student table.
new2 = pd.concat([new,op,op2,op3,op4], axis = 1)