Question

我有一个如下所示的数据框

df

     school1    game1   game2   game3
     school2    game1       
     school3    game2   game3   
     school4            game2

输出

       game1    school1 school2 
       game2    school1 school4 school3
       game3    school1         school3

任何人都可以建议我如何获得输出我是熊猫的新手请帮助我

谢谢

Answer 1

这是一种依赖于 melt() 方法首先从原始表格中创建一个长表，然后使用 pivot() 方法将其转换为新的宽格式的方法：

import pandas as pd
import numpy as np

# Code that creates your input dataframe (replace column names as needed)
df = pd.DataFrame(
    {'school':['school1', 'school2', 'school3', 'school4'],
     'g1':['game1', 'game1', 'game2', None],
     'g2':['game2', None, 'game3', 'game2'],
     'g3':['game3', None, None, None],
    }
)

# Convert to long format (one row per school-game)
long_df = df.set_index('school').melt(ignore_index=False).reset_index()
# Remove null (non-existing) school-game combinations
# Also, convert index to column for next step
long_df = long_df[long_df.value.notnull()].reset_index(drop=True).reset_index()

# Convert to dataframe with one row per game ID
by_game_df = long_df.pivot(index='value',columns='index',values='school')

此时，数据框将如下所示：

index  value        0        1        2        3        4        5        6
0      game1  school1  school2      NaN      NaN      NaN      NaN      NaN
1      game2      NaN      NaN  school3  school1      NaN  school4      NaN
2      game3      NaN      NaN      NaN      NaN  school3      NaN  school1

您可以执行这些额外的步骤来将非空学校值向左移动并删除只剩下 NaN 的列：

# per https://stackoverflow.com/a/65596853:
idx = pd.isnull(by_game_df.values).argsort(axis=1)
squeezed_df = pd.DataFrame(
    by_game_df.values[np.arange(by_game_df.shape[0])[:,None], idx],
    index=by_game_df.index,
    columns=by_game_df.columns
)
result = squeezed_df.dropna(axis=1, how='all')
result
# index  value        0        1        2
# 0      game1  school1  school2      NaN
# 1      game2  school3  school1  school4
# 2      game3  school3  school1      NaN

Answer 2

或者使用一系列列表和一个饱受诟病的循环：

d = {'School': ['s1','s2','s3','s4'], 'c1': ['g1','g1','g2',np.nan], 'c2': ['g2',np.nan,'g3','g2'], 'c3': ['g3',np.nan,np.nan,np.nan]}
df = pd.DataFrame(d)
df

  School   c1   c2   c3
0     s1   g1   g2   g3
1     s2   g1  NaN  NaN
2     s3   g2   g3  NaN
3     s4  NaN   g2  NaN

gg = pd.Series(dtype=object)

def add_gs(game, sch):
    if type(game) is str:
        if game in gg.keys():
            gg[game] += [sch]
        else:
            gg[game] = [sch]

cols = df.filter(regex='c[0-9]').columns

for i in range(len(df)):
    for col in cols:
        add_gs(df.loc[i,col],df.loc[i,'School'])

gg

g1        [s1, s2]
g2    [s1, s3, s4]
g3        [s1, s3]

Answer 3

依赖于 defaultdict() 来重塑数据的解决方案：

from collections import defaultdict
import pandas as pd

# Code that creates your input dataframe (replace column names as needed)
df = pd.DataFrame(
    {'school':['school1', 'school2', 'school3', 'school4'],
     'g1':['game1', 'game1', 'game2', None],
     'g2':['game2', None, 'game3', 'game2'],
     'g3':['game3', None, None, None],
    }
)

# convert df to dictionary
d = df.set_index('school').to_dict(orient='index')

# reshape the dictionary
def_d = defaultdict(list)
for k, v in d.items():
    for i in v.values():
        if i is not None:
            def_d[i].append(k)
d_rs = dict(def_d)

# prepare dictionary for converting back to dataframe
dict_for_df = {
    k: pd.Series(
        v + [None] * (len(max(d_rs.values(), key=lambda x: len(x))) - len(v))
    ) for k, v in d_rs.items()
}

# convert dictionary to dataframe
final_df = pd.DataFrame.from_dict(dict_for_df, orient='index')
}
#              0        1        2
# game1  school1  school2     None
# game2  school1  school3  school4
# game3  school1  school3     None

熊猫将数据框形状列值更改为行

3 个答案: