# Sample map with a single chromosome (1A) carrying five markers.
df1 = pd.DataFrame(
    {
        'Chromosome': ['1A'] * 5,
        'Marker': ['M1', 'M2', 'M3', 'M4', 'M5'],
        'Position': [0, 1.2, 3.5, 6, 7.3],
    }
)

# Sample map with two chromosomes: the five 1A markers followed by three
# markers on 1B whose positions restart at 0.
df2 = pd.DataFrame(
    {
        'Chromosome': ['1A'] * 5 + ['1B'] * 3,
        'Marker': ['M1', 'M2', 'M3', 'M4', 'M5', 'mk1', 'mk2', 'mk3'],
        'Position': [0, 1.2, 3.5, 6, 7.3, 0, 2.3, 3.2],
    }
)
#Expected result for df1
#'1A 5 M1 1.2 M2 2.3 M3 2.5 M4 1.3 M5'
#Expected result for df2
#'1A 5 M1 1.2 M2 2.3 M3 2.5 M4 1.3 M5'
#'1B 3 mk1 2.3 mk2 0.9 mk3'
#My function for computing intermarker distance
def position_interval(df):
    """Summarise one chromosome's markers and the gaps between them.

    The result is a single space-separated string of the form
    ``'<chromosome> <n_markers> <marker1> <gap1> <marker2> ... <markerN>'``,
    where each gap is the distance from a marker to the next one, rounded
    to one decimal place.

    Args:
        df: DataFrame with 'Chromosome', 'Marker' and 'Position' columns.
            Rows are used in the order given (no sorting is performed) and
            are read by position, so the index labels do not matter; a
            sub-frame produced by ``groupby`` works as well as a frame
            indexed from 0. The chromosome label is taken from the first
            row. The frame is not modified.

    Returns:
        The summary string, or ``''`` when ``df`` has no rows.
    """
    if df.empty:
        return ''

    markers = df['Marker'].tolist()
    # diff() yields NaN for the first row; dropping it leaves exactly one
    # gap per adjacent marker pair, so no 'nan' text has to be stripped
    # from the final string afterwards.
    gaps = df['Position'].diff().iloc[1:].round(1).tolist()

    # .iloc[0] is positional, unlike [0], which looks up the *label* 0 and
    # fails on frames whose index does not start at 0.
    parts = [str(df['Chromosome'].iloc[0]), str(len(markers))]
    for marker, gap in zip(markers, gaps):
        parts.append(str(marker))
        parts.append(str(gap))
    # zip() stops one short of the marker list; the last marker has no gap.
    parts.append(str(markers[-1]))
    return ' '.join(parts)
将该函数应用于df1的效果很好:
# df1 holds a single chromosome, so one direct call covers the whole frame.
position_interval(df1)
但是我不确定如何将它分别应用于每个groupby分组:
# NOTE: a direct call passes every row of df2 as one table; nothing here
# splits the rows by 'Chromosome'.
position_interval(df2)
答案 0 :(得分:0)
由于该函数需要读取'Chromosome'列,因此需要在调用groupby时
传入as_index=False参数:
# Call position_interval once per 'Chromosome' group of df2.
# NOTE(review): recent pandas releases may exclude the grouping column from
# the frames handed to apply() -- confirm against the installed version.
df2.groupby('Chromosome', as_index=False).apply(position_interval)
如果函数内部使用按标签的索引(例如df['Marker'][i]或df['Chromosome'][0]),这将引发KeyError,因为“1B”组中不存在标签为0的行。
在函数中改用iloc按位置取值,
代替按标签的Series索引,即可解决此问题:
def position_interval(df):
    """Summarise one chromosome's markers and the gaps between them.

    The result is a single space-separated string of the form
    ``'<chromosome> <n_markers> <marker1> <gap1> <marker2> ... <markerN>'``,
    where each gap is the distance from a marker to the next one, rounded
    to one decimal place.

    Args:
        df: DataFrame with 'Chromosome', 'Marker' and 'Position' columns.
            Rows are used in the order given (no sorting is performed) and
            are read by position, so the index labels do not matter; a
            sub-frame produced by ``groupby`` works as well as a frame
            indexed from 0. The chromosome label is taken from the first
            row. The frame is not modified.

    Returns:
        The summary string, or ``''`` when ``df`` has no rows.
    """
    if df.empty:
        return ''

    # Every marker is listed in row order; pairing each one with its own
    # gap (rather than always reading row 0) keeps the names distinct.
    markers = df['Marker'].tolist()
    # diff() yields NaN for the first row; dropping it leaves exactly one
    # gap per adjacent marker pair, so no 'nan' text has to be stripped
    # from the final string afterwards.
    gaps = df['Position'].diff().iloc[1:].round(1).tolist()

    parts = [str(df['Chromosome'].iloc[0]), str(len(markers))]
    for marker, gap in zip(markers, gaps):
        parts.append(str(marker))
        parts.append(str(gap))
    # zip() stops one short of the marker list; the last marker has no gap.
    parts.append(str(markers[-1]))
    return ' '.join(parts)


# Result for each 'Chromosome' group of df2:
# 1A 5 M1 1.2 M2 2.3 M3 2.5 M4 1.3 M5
# 1B 3 mk1 2.3 mk2 0.9 mk3
可以遍历groupby对象:
# Iterating a groupby yields (group key, sub-DataFrame) pairs. The key is
# ignored here because position_interval is given the whole sub-frame.
for _, sub_df in df2.groupby('Chromosome'):
    print(position_interval(sub_df))