操作:先按 hash 分组(groupby),再对每个分组调用 apply;在函数内部从另一个数据帧中取出该分组对应的参考点,并计算分组内每一行到该参考点的距离。
现象:结果中出现了无法解释的 NaN 值,而且每次运行得到的结果都不相同。
尝试:不经过 groupby、直接执行函数中的计算时一切正常,因此问题似乎不在距离计算本身。
疑问:这些 NaN 值是什么原因造成的?为什么多次运行的计算结果会不同?
示例
以下示例能够通过所有断言,但仍然会产生意料之外的结果:
import dask.dataframe as dd
import pandas as pd
import numpy as np
# Sample input: five points with coordinates (x, y, z), a 'hash' column that
# is used as the grouping key further down, and a 'label' column.
pdf = pd.DataFrame(
    {
        'x': [232126.703, 232126.674, 232126.650, 232126.644, 232126.966],
        'y': [
            579530.01599999995,
            579530.05099999998,
            579530.09100000001,
            579530.15099999995,
            579530.23199999996,
        ],
        'z': [16858.0, 16878.0, 16904.0, 16950.0, 16973.0],
        'hash': [1, 2, 2, 1, 1],
        'label': [3, 5, 3, 5, 3],
    }
)

# Wrap the pandas frame in a dask DataFrame split into two partitions.
df = dd.from_pandas(pdf, npartitions=2)
# Reference positions: one (x_c, y_c) point per 'hash' value.
df_pos = pd.DataFrame(
    {
        'x_c': [232124.703, 232127.674, 232126.650, 232126.644, 232126.966],
        'y_c': [
            579533.01599999995,
            579531.05099999998,
            579530.09100000001,
            579530.15099999995,
            579530.23199999996,
        ],
        'hash': [1, 2, 3, 4, 5],
    }
)


def add_distance(df, df_pos=df_pos):
    """Append a ``d_traj`` column with each row's distance to its reference.

    Meant to be called through ``groupby(...).apply``: the group key is read
    from ``df.name`` and used to select the matching row(s) of *df_pos*.

    Args:
        df: One group of rows with ``x`` and ``y`` columns and a ``name``
            attribute holding the group's ``hash`` value.
        df_pos: Table with ``x_c``, ``y_c`` and ``hash`` columns. Defaults to
            the module-level ``df_pos`` as it exists when this function is
            defined.

    Returns:
        A copy of *df* with an additional ``d_traj`` column.
    """
    ref = df_pos[df_pos['hash'] == df.name].copy()
    df = df.copy()

    # Sanity checks: both coordinate blocks must be two columns wide.
    assert df[['x', 'y']].values.shape[1] == ref[['x_c', 'y_c']].values.shape[1]
    assert ref[['x_c', 'y_c']].values.shape[1] == 2

    # Row-wise Euclidean distance; the reference row is broadcast over the group.
    d_traj = np.linalg.norm(
        df[['x', 'y']].values - ref[['x_c', 'y_c']].values, axis=1
    )
    assert not np.isnan(d_traj).any()

    # NOTE: this Series gets a fresh 0..n-1 index, whereas ``df`` still carries
    # the index labels of the original rows. Column assignment aligns on index
    # labels, so rows whose label is absent from ``d_traj`` end up as NaN even
    # though every assertion here passes. This is the behaviour the
    # surrounding question asks about; it is deliberately left in place.
    d_traj = pd.Series(d_traj)
    assert len(d_traj) == len(df)
    df['d_traj'] = d_traj
    return df
# Run add_distance once per 'hash' group. `meta` is the empty frame handed to
# dask to describe the columns of the result.
df_traj = df.groupby('hash').apply(
    add_distance,
    meta=pd.DataFrame(columns=['hash', 'label', 'x', 'y', 'z', 'd_traj']),
)
# Trigger the actual computation and materialise the result.
df_traj.compute()
答案 0 :(得分:0)
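本例中的问题出在 `df` 的原始索引上:每个分组仍保留原始行的索引标签,而 `pd.Series(d_traj)` 的索引是从 0 重新编号的,赋值时 pandas 会按索引标签对齐。为了避免 `d_traj` 的值因此写到错误的行上(甚至相互覆盖)、并使其余记录留下 NaN 值,请在赋值之前先对分组调用 `reset_index()`。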
示例
import dask.dataframe as dd
import pandas as pd
import numpy as np
# Sample input: five points with coordinates (x, y, z), a 'hash' column that
# is used as the grouping key further down, and a 'label' column.
pdf = pd.DataFrame(
    {
        'x': [232126.703, 232126.674, 232126.650, 232126.644, 232126.966],
        'y': [
            579530.01599999995,
            579530.05099999998,
            579530.09100000001,
            579530.15099999995,
            579530.23199999996,
        ],
        'z': [16858.0, 16878.0, 16904.0, 16950.0, 16973.0],
        'hash': [1, 2, 2, 1, 1],
        'label': [3, 5, 3, 5, 3],
    }
)

# Wrap the pandas frame in a dask DataFrame split into two partitions.
df = dd.from_pandas(pdf, npartitions=2)
# Reference positions: one (x_c, y_c) point per 'hash' value.
df_pos = pd.DataFrame(
    {
        'x_c': [232124.703, 232127.674, 232126.650, 232126.644, 232126.966],
        'y_c': [
            579533.01599999995,
            579531.05099999998,
            579530.09100000001,
            579530.15099999995,
            579530.23199999996,
        ],
        'hash': [1, 2, 3, 4, 5],
    }
)


def add_distance(df, df_pos=df_pos):
    """Return a re-indexed copy of *df* with a ``d_traj`` distance column.

    Meant to be called through ``groupby(...).apply``: the group key is read
    from ``df.name`` and used to look up the group's reference point in
    *df_pos*. ``d_traj`` is the Euclidean distance in the (x, y) plane between
    each row and that reference point.

    Args:
        df: One group of rows with ``x`` and ``y`` columns and a ``name``
            attribute holding the group's ``hash`` value.
        df_pos: Table with ``x_c``, ``y_c`` and ``hash`` columns. Defaults to
            the module-level ``df_pos`` as it exists when this function is
            defined.

    Returns:
        A new frame with the rows of *df*, renumbered ``0..n-1``, plus a
        ``d_traj`` column. *df* itself is not modified.

    Raises:
        ValueError: If *df_pos* does not hold exactly one row for the group.
    """
    # Read the group key first: ``df`` is rebound to a new frame below.
    key = df.name
    ref = df_pos.loc[df_pos['hash'] == key, ['x_c', 'y_c']]
    if len(ref) != 1:
        # Zero or several reference rows would either fail to broadcast or
        # silently yield NaN / mis-paired distances.
        raise ValueError(
            'expected exactly one reference row for hash={!r}, found {}'.format(
                key, len(ref)
            )
        )

    # Renumber the rows so the distances computed below line up with them;
    # reset_index also returns a new frame, so no separate copy is needed.
    df = df.reset_index(drop=True)  # added this line!

    # Row-wise distance; the single reference row is broadcast over the group.
    d_traj = np.linalg.norm(df[['x', 'y']].values - ref.values, axis=1)

    # Give the Series the frame's own index so assignment cannot misalign.
    df['d_traj'] = pd.Series(d_traj, index=df.index)
    return df
# Run add_distance once per 'hash' group.
#
# `meta` tells dask what the result looks like. It is derived from the input
# frame so that column names, column order and dtypes are those of the input,
# followed by a float64 `d_traj` column, instead of a hand-written column list
# whose order and dtypes may not match what add_distance returns.
df_traj = df.groupby('hash').apply(
    add_distance,
    meta=pdf.iloc[:0].assign(d_traj=pd.Series(dtype='float64')),
)
# Trigger the actual computation and materialise the result.
df_traj.compute()