我有一个 pandas 数据框(DataFrame):
state action reward absorb
0 [1.0, 2.0, 0.0, 0.0, 0.0, 0.0] 0.0 0.0 False
1 [0.0, 0.0, 4.0, 4.0, 5.0, 0.0] 3.0 1.0 False
2 [0.0, 0.0, 0.0, 2.0, 0.0, 1.0] 5.0 1.0 False
...
我想将此数据框转换为:
s1 s2 s3 s4 s5 s6 action reward
0 1.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 4.0 4.0 5.0 0.0 3.0 1.0
...
在这里,我将第一列分解为几列。我将如何轻松地做到这一点?
谢谢!
答案 0 :(得分:2)
为避免使用 apply(在大型数据框上可能会很慢),可以这样写:
# Expand each list stored in `state` into its own s1..s6 column and place
# those columns next to the remaining scalar columns.
# The expanded frame reuses df.index on purpose: pd.concat(axis=1) aligns
# rows on index labels, so a default RangeIndex here would misalign rows
# (and introduce NaN) whenever df carries a non-default index.
new_df = pd.concat(
    [
        df[['action', 'reward', 'absorb']],
        pd.DataFrame(
            df.state.tolist(),
            index=df.index,
            columns=[f's{i}' for i in range(1, 7)],
        ),
    ],
    axis=1,
)
>>> new_df
action reward absorb s1 s2 s3 s4 s5 s6
0 0.0 0.0 False 1.0 2.0 0.0 0.0 0.0 0.0
1 3.0 1.0 False 0.0 0.0 4.0 4.0 5.0 0.0
2 5.0 1.0 False 0.0 0.0 0.0 2.0 0.0 1.0
在中等大小的数据框上,与 apply 相比,您会看到明显的速度提升。我还加入了 @piRSquared(在评论中)给出的另外 2 个矢量化解决方案进行比较
# Create a dataframe of 1000 values
# The `state` rows are resampled from the existing example frame, so `df`
# must already be defined when this statement runs.
df = pd.DataFrame({
    'state': np.random.choice(df.state.values, size=1000),
    'action': np.random.randint(0, 10, 1000),
    'reward': np.random.randint(0, 10, 1000),
    # The sample size must be passed as `size`, not placed inside the list
    # of choices: with 1000 inside the list, a single scalar was drawn from
    # [True, False, 1000] and broadcast to every row.
    'absorb': np.random.choice([True, False], size=1000),
})
>>> df.head()
absorb action reward state
0 1 6 8 [0.0, 0.0, 0.0, 2.0, 0.0, 1.0]
1 1 3 2 [0.0, 0.0, 4.0, 4.0, 5.0, 0.0]
2 1 8 3 [1.0, 2.0, 0.0, 0.0, 0.0, 0.0]
3 1 4 2 [0.0, 0.0, 0.0, 2.0, 0.0, 1.0]
4 1 6 3 [0.0, 0.0, 4.0, 4.0, 5.0, 0.0]
def concat_method(df1=df.copy()):
    """Expand the list-valued `state` column via DataFrame construction + concat.

    Args:
        df1: Frame with `action`, `reward`, `absorb` columns and a `state`
            column whose entries are six-element lists. Defaults to a copy
            of the module-level `df` taken when the function is defined.

    Returns:
        A new frame with columns action, reward, absorb, s1..s6.
    """
    # Reuse df1.index so that concat(axis=1), which aligns on index labels,
    # pairs every state row with its own action/reward/absorb row even when
    # df1 does not have a default RangeIndex.
    state_cols = pd.DataFrame(
        df1.state.tolist(),
        index=df1.index,
        columns=[f's{i}' for i in range(1, 7)],
    )
    return pd.concat([df1[['action', 'reward', 'absorb']], state_cols], axis=1)
def apply_method(df1=df.copy()):
    """Expand the list-valued `state` column using `Series.apply(pd.Series)`.

    Args:
        df1: Frame with a `state` column whose entries are six-element
            lists. Defaults to a copy of the module-level `df` taken when
            the function is defined.

    Returns:
        A new frame holding all original columns (including `state`) plus
        s1..s6. The input frame is left untouched.
    """
    # Work on a copy: the default argument is one shared frame created at
    # definition time, and assigning columns to it directly would make the
    # first call behave differently from every later call.
    out = df1.copy()
    out[['s1', 's2', 's3', 's4', 's5', 's6']] = out['state'].apply(pd.Series)
    return out
def piR_method(df1=df.copy()):
    """Expand the list-valued `state` column with `zip` + `DataFrame.assign`.

    Args:
        df1: Frame with a `state` column of equal-length lists. Defaults to
            a copy of the module-level `df` taken when the function is
            defined.

    Returns:
        A new frame without `state` and with one column per list position,
        named s1, s2, ... to match the other methods in this file.
    """
    # zip(*...) transposes the per-row lists into one tuple per position.
    # Numbering starts at 1 so the names agree with s1..s6 used elsewhere.
    state_columns = {
        f"s{i}": values for i, values in enumerate(zip(*df1.state), start=1)
    }
    # `columns=` instead of a positional axis: pandas 2.0 no longer accepts
    # the axis as a second positional argument to drop().
    return df1.assign(**state_columns).drop(columns='state')
def piR_method2(df1=df.copy()):
    """Expand the list-valued `state` column with DataFrame construction + join.

    Args:
        df1: Frame with a `state` column of lists. Defaults to a copy of
            the module-level `df` taken when the function is defined.

    Returns:
        A new frame without `state`, joined with columns s1, s2, ... built
        from the list positions.
    """
    # The expanded frame shares df1.index, so join() lines rows up by label.
    # Its default integer column labels 0, 1, ... are renamed to s1, s2, ...
    expanded = pd.DataFrame(df1.state.tolist(), df1.index).rename(
        columns=lambda x: f"s{x + 1}"
    )
    # `columns=` instead of a positional axis: pandas 2.0 no longer accepts
    # the axis as a second positional argument to drop().
    return df1.drop(columns='state').join(expanded)
def pir3(df=df):
    """Expand the list-valued `state` column by stacking raw NumPy arrays.

    Args:
        df: Frame containing a `state` column of lists. Defaults to the
            module-level `df` as it exists when the function is defined.

    Returns:
        A new frame, indexed like `df`, holding the non-state columns in
        their original order followed by s1..sN, where N is the length of
        the longest state list.
    """
    column_labels = df.columns.values
    keep = column_labels != 'state'
    raw = df.values

    # Position of the first column labelled 'state'.
    state_pos = np.flatnonzero(~keep)[0]
    state_rows = raw[:, state_pos].tolist()

    # Glue the remaining columns and the expanded state values side by side.
    stacked = np.column_stack([raw[:, keep], state_rows])

    kept_names = column_labels[keep].tolist()
    widest = max(len(row) for row in state_rows)
    state_names = [f"s{n}" for n in range(1, widest + 1)]
    return pd.DataFrame(stacked, df.index, kept_names + state_names)
import timeit
>>> timeit.timeit(concat_method, number = 100) / 100
0.0020290906500304118
>>> timeit.timeit(apply_method, number = 100) / 100
0.19950388665980426
>>> timeit.timeit(piR_method, number = 100) / 100
0.003522267839871347
>>> timeit.timeit(piR_method2, number = 100) / 100
0.002374379680259153
>>> timeit.timeit(pir3, number = 100) / 100
0.0017464107400155626