我有一个这样的数据框:
In [1]: df = pd.DataFrame([['jon snow', 'jon-snow'], ['jon snow', 'jon+snow'], [jon snow, 'jonsnow']], columns=['name', 'name_variation'])
我想要的是:
df_want = pd.DataFrame([['jon snow', 'jon-snow', 'jon-snow'],
['jon snow', 'jon-snow', 'jon+snow'],
['jon snow', 'jon-snow', 'jonsnow'],
['jon snow', 'jon-snow', np.nan],
['jon snow', 'jon+snow', 'jon-snow'],
['jon snow', 'jon+snow', 'jon+snow'],
['jon snow', 'jon+snow', 'jonsnow'],
['jon snow', 'jon+snow', np.nan],
['jon snow', 'jonsnow', 'jon-snow'],
['jon snow', 'jonsnow', 'jon+snow'],
['jon snow', 'jonsnow', 'jon-snow'],
['jon snow', 'jonsnow', np.nan],
['jon snow', np.nan, 'jon-snow'],
['jon snow', np.nan, 'jon+snow'],
['jon snow', np.nan, 'jonsnow'],
['jon snow', np.nan, np.nan]], columns=['name', 'name_variation', 'name_variation_2'])
我正在尝试这个有效,但感觉很长时间啰嗦:
def combinations(df):
df = df.drop_duplicates()
df = df.dropna()
df['k'] = df['brand_variation']
df['val'] = 1
df_final = pd.DataFrame(columns=['brand', 'k', 'brand_variation',])
for res in df['brand'].unique():
#print(res, len(df[df['brand'] == res]))
dfm = df[df['brand'] == res]
dfk = pd.pivot_table(dfm, index=['brand', 'k'], columns=['brand_variation'], values=['val'], fill_value=0, aggfunc=[np.sum]).stack().reset_index()
dfk.columns = dfk.columns.get_level_values(level=0)
dfk = dfk[['brand', 'k', 'brand_variation']]
df_final = df_final.append(dfk)
df_final = df_final.reset_index(drop=True)
return df_final
更好的方法吗?
答案 0 :(得分:2)
numpy
u = pd.unique(df.values.ravel())
r = np.arange(u.size)
i, j = r.repeat(u.size), np.tile(r, u.size)
pd.DataFrame(dict(
name=['jon snow' for _ in range(i.size)],
name_variation=u[i],
name_variation2=u[j]
))
name name_variation name_variation2
0 jon snow jon snow jon snow
1 jon snow jon snow jon-snow
2 jon snow jon snow jon+snow
3 jon snow jon snow jonsnow
4 jon snow jon-snow jon snow
5 jon snow jon-snow jon-snow
6 jon snow jon-snow jon+snow
7 jon snow jon-snow jonsnow
8 jon snow jon+snow jon snow
9 jon snow jon+snow jon-snow
10 jon snow jon+snow jon+snow
11 jon snow jon+snow jonsnow
12 jon snow jonsnow jon snow
13 jon snow jonsnow jon-snow
14 jon snow jonsnow jon+snow
15 jon snow jonsnow jonsnow
pandas
u = pd.unique(df.values.ravel())
pd.Series(
'jon snow',
pd.MultiIndex.from_product(
[u, u], names=['name_variation', 'name_variation2']
),
name='name'
).reset_index()
name_variation name_variation2 name
0 jon snow jon snow jon snow
1 jon snow jon-snow jon snow
2 jon snow jon+snow jon snow
3 jon snow jonsnow jon snow
4 jon-snow jon snow jon snow
5 jon-snow jon-snow jon snow
6 jon-snow jon+snow jon snow
7 jon-snow jonsnow jon snow
8 jon+snow jon snow jon snow
9 jon+snow jon-snow jon snow
10 jon+snow jon+snow jon snow
11 jon+snow jonsnow jon snow
12 jonsnow jon snow jon snow
13 jonsnow jon-snow jon snow
14 jonsnow jon+snow jon snow
15 jonsnow jonsnow jon snow
答案 1 :(得分:0)
实际上有一个非常简单的解决方案,我只是迟到了:
df = pd.merge(df, df, left_on='brand', right_on='brand', how='inner').drop_duplicates()
arrrrgh!