我尝试定义一个函数 `calc` 来执行一个 `for` 循环。该循环对数据帧调用几个已定义的函数(`weightedMean`、`sd_pooled` 和 `summation`),并把计算结果追加到输出中。最后,预期结果将保存到 `output2`。不把 `for` 循环封装成函数时,代码可以正常工作。但是,我的问题是:执行 `calc` 后,`output2` 为空,而且没有任何报错,因此我无法据此排查问题。完整的代码如下:
import pandas as pd
import numpy as np
from dplython import X, sift, DplyFrame, mutate, select
from plydata import define, group_by, summarize
def weightedMean(data):
    """Fold the rows of ``data`` into one running weighted mean of ``Var1``.

    Args:
        data: Mapping-like object (e.g. a DataFrame with a 0-based index)
            whose ``'Var1'`` entry holds the per-row means and whose
            ``'Var3'`` entry holds the per-row weights. Rows are read by
            the labels ``0 .. len - 1``.

    Returns:
        The combined mean. For a single-row input this is that row's
        ``Var1`` value.
    """
    length = len(data['Var1'])
    mx = data['Var1'][0]
    if length == 1:
        # A lone observation is its own mean. The previous code returned
        # the row count here, which is not a mean at all.
        return mx

    nx = data['Var3'][0]
    for i in range(1, length):
        my = data['Var1'][i]
        ny = data['Var3'][i]
        # NOTE(review): the running weight is increased *before* it is used
        # in the update below, so ``ny`` effectively counts twice in the
        # denominator. This order is kept on purpose because it reproduces
        # the published expected output; confirm whether a textbook
        # weighted mean was intended instead.
        nx = nx + ny
        mx = (mx * nx + my * ny) / (nx + ny)
    return mx
def summation(data):
    """Add up every entry of ``data['Var3']``, read by labels ``0 .. len - 1``.

    The total starts from the first entry, so the input must contain at
    least one row.
    """
    counts = data['Var3']
    total = counts[0]
    remaining = (counts[position] for position in range(1, len(counts)))
    for value in remaining:
        total = total + value
    return total
def sd_c(x_m, x_s, x_n, y_m, y_s, y_n):
    """Combine two (mean, sd, size) summaries into one standard deviation.

    Args:
        x_m, x_s, x_n: Mean, standard deviation and size of the first sample.
        y_m, y_s, y_n: Mean, standard deviation and size of the second sample.

    Returns:
        ``np.sqrt`` of the combined variance computed from both summaries.
    """
    total_n = x_n + y_n
    mean_gap = x_m - y_m
    # Spread inside each sample, scaled by its degrees of freedom.
    within = (x_n - 1) * (x_s * x_s) + (y_n - 1) * (y_s * y_s)
    # Extra spread caused by the two sample means being apart.
    between = y_n * x_n * mean_gap * mean_gap
    numerator = total_n * within + between
    return np.sqrt(numerator / (total_n * (total_n - 1)))
def sd_pooled(data):
    """Fold the rows of ``data`` into one combined standard deviation.

    Each row is a (mean, sd, size) summary taken from ``'Var1'``,
    ``'Var2'`` and ``'Var3'``. Rows are merged one at a time with ``sd_c``,
    carrying a running mean, sd and size forward.

    Args:
        data: Mapping-like object (e.g. a DataFrame with a 0-based index)
            providing ``'Var1'``, ``'Var2'`` and ``'Var3'``, read by the
            labels ``0 .. len - 1``.

    Returns:
        The combined standard deviation. For a single-row input this is
        that row's ``Var2`` value.
    """
    length = len(data['Var1'])
    if length == 1:
        # Nothing to pool: the lone row's own sd is the answer. The
        # previous code returned the row count here.
        return data['Var2'][0]

    mx = data['Var1'][0]
    sx = data['Var2'][0]
    nx = data['Var3'][0]
    for i in range(1, length):
        my = data['Var1'][i]
        sy = data['Var2'][i]
        ny = data['Var3'][i]
        # sd_c must see the running size from *before* this row is added.
        sx = sd_c(mx, sx, nx, my, sy, ny)
        # NOTE(review): as in weightedMean, the size is bumped before the
        # mean update, so ``ny`` effectively counts twice in the
        # denominator. Kept as-is to match the published expected output;
        # confirm intent.
        nx = nx + ny
        mx = (mx * nx + my * ny) / (nx + ny)
    return sx
# NOTE(review): the frame read from "input.txt" is discarded right away,
# because `dat` is rebound to the inline sample dict below. The read still
# runs, so the file has to exist for the script to get past this line.
dat = pd.read_csv("input.txt",sep="\t")
# Inline sample data: ten rows, six of type1 and four of type2.
dat = {
'Group': ['A','A','A','A','A','A','A','A','A','A'],
'Process': [3,3,3,3,3,3,841,841,841,841],
'Category': ['cat1','cat1','cat1','cat1','cat1','cat1','cat2','cat2','cat2','cat2'],'Type': ['type1','type1','type1','type1','type1','type1','type2','type2','type2','type2'],
'Var1': [86.84,103.39,109.00,107.30,123.09,111.98,87.62,87.40,88.53,85.84],
'Var2': [2.913,2.835,1.478,2.979,2.424,7.462,3.049,4.781,3.025,2.703],
'Var3': [0.01096,0.00564,0.00365,0.00631,0.00531,0.00332,0.01195,0.00930,0.00697,0.00697]
}
dat = pd.DataFrame(dat)
# Distinct values of the Type column; passed to calc, which filters on each one.
dat_name = dat.loc[:,'Type'].unique()
# Wrap in dplython's DplyFrame; calc applies `>>` pipelines to this object
# (presumably the wrapper is what enables them — confirm against dplython docs).
dat = DplyFrame(dat)
# Empty frame handed to calc as the starting point for the results.
output = pd.DataFrame([])
def calc(dat_name, dat, output):
    """Summarise ``dat`` per group and return the results added to ``output``.

    For every entry of ``dat_name`` the rows of that Type are selected,
    ``Var3`` is multiplied by 3021 and rows with ``Var2 >= 50`` are dropped.
    The selected rows are stacked, grouped by
    (Group, Type, Process, Category), and each group is reduced with
    ``weightedMean`` (Var1), ``sd_pooled`` (Var2) and ``summation`` (Var3).

    Args:
        dat_name: Iterable of Type values to process.
        dat: DplyFrame holding the Group, Type, Process, Category, Var1,
            Var2 and Var3 columns.
        output: DataFrame the summary rows are concatenated onto; it is
            not modified in place.

    Returns:
        A new DataFrame: ``output`` followed by one summary row per group.
        If ``dat_name`` is empty, ``output`` is returned unchanged.
    """
    # Filter and rescale once per type. Each piece is converted back to a
    # plain DataFrame so the stacked result is a plain DataFrame as well.
    pieces = [
        pd.DataFrame(
            dat >>
            sift(X.Type == type_name) >>
            mutate(Var3=X.Var3 * 3021) >>
            sift(X.Var2 < 50)
        )
        for type_name in dat_name
    ]
    if not pieces:
        # Nothing selected: there are no grouping columns to group on.
        return output

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    out = pd.concat(pieces)

    key_columns = ['Group', 'Type', 'Process', 'Category']
    rows = []
    for name, group in out.groupby(key_columns):
        # The reducers read rows by the labels 0..n-1, so the index must
        # be reset for every group.
        group = group.reset_index()
        rows.append(
            tuple(name)
            + (weightedMean(group), sd_pooled(group), summation(group))
        )

    combined = pd.DataFrame(rows, columns=key_columns + ['Var1', 'Var2', 'Var3'])

    # The missing return was the reported bug: without it the caller
    # receives None, because rebinding the local ``output`` never reaches
    # the caller's variable.
    return pd.concat([output, combined])
# Run the aggregation over every Type value; output2 receives whatever calc returns.
output2 = calc(dat_name, dat, output)
此预期输出如下:
Group Type Process Category Var1 Var2 Var3
0 A type1 3 cat1 101.207332 13.997181 106.30899
1 A type2 841 cat2 87.431341 3.584393 106.30899
我想知道在这种情况下如何让 `calc` 正常工作。谢谢。
答案 0 :(得分:1)
我可能是错的,但是我想您应该在 `calc` 函数的末尾添加 `return output`。