我正在尝试应用一个函数,对 data.frame 的每一行拟合加权线性模型。其中一个 data.frame 存放待回归的 y 值,另一个 data.frame 存放各自对应的权重。
DF1:
structure(list(CHR = c("2L", "2L", "2L", "2L", "2L", "2L"), POS = c(3519797L, 3519829L, 3519966L, 3519979L, 3519994L, 3520020L), REF = c("A", "T", "A", "T", "G", "C"), ALT = c("G", "A", "C", "A", "T", "T"), VA_11_2012 = c(0.332630651766266, 0.425793409044713, 0.272594219127124, 0.780596992222118, 0.333415910208956, 0.675131532937032), VA_07_2012 = c(0.209870592262738, 0.476119060911796, 0.280783378372373, 0.653675236445761, 0.329457139617922, 0.646710104121747), WI_06_2012 = c(0, 0.269932795833403, 0.105458525399318, 0.167448079219689, 0.0588575059470812, 0.289751701436047), MA_11_2012 = c(0.304692654015397, 0.317823703927881, 0.252680255142079, 0.627294544147551, 0.217394592267678, 0.664501358502862), MA_07_2012 = c(0.454036339512524, 0.511357677446223, 0.266294017118183, 0.439985035570606, 0.193513192510785, 0.460553991681322), NY_09_2012 = c(0.10016742116156, 0.576931345236457, 0.462770117792092, 0.729727656226966, 0.6048363315283, 0.638192502884691), NY_07_2012 = c(0.297471240638356, 0.57241857209282, 0.442911044073639, 0.711979054912567, 0.430936118152113, 0.571079469455595), PA_10_2012 = c(0.339836909454122, 0.399425239880595, 0.538466990906738, 0.800850984867155, 0.509573386691872, 0.771162998894462), PA_07_2012 = c(0.299541304682017, 0.495429820608484, 0.209066860366776, 0.626368955231887, 0.165133038995688, 0.55066707918282), WI_09_2012 = c(0.133731589409942, 0.32381101527355, 0.35757110364551, 0.487427384811993, 0.339836909454122, 0.531525906496347), PA_9_2012 = c(0.201357920790331, 0.333415910208956, 0.298532201184876, 0.402048533098003, 0.295968803358931, 0.469067604658962), ME2 = c(0.201357920790331, 0.252680255142079, 0.143347568905365, 0.411516846067488, 0.150568272776686, 0.654415562249177), PA_7_2010 = c(0.252680255142079, 0.523598775598299, 0.10363367951473, 0.372168533960326, 0.146868899528658, 0.339836909454122), ME1 = c(0.175659250884277, 0.415929551740757, 0.22289367094341, 0.529818048835127, 0.305820042106789, 0.478754603456347), PA_11_2011 = 
c(0.411516846067488, 0.377310032359749, 0.180986451246548, 0.35346838657747, 0.143347568905365, 0.534956853726549), PA_10_2011 = c(0.163774627712399, 0.249780225703982, 0.188616386175404, 0.442911044073639, 0.178363581929643, 0.440320601815601), PA_7_2011 = c(0.242365851038963, 0.172732678164473, 0.0690203040380642, 0.429775431304528, 0.0794486340851372, 0.366348842513037), PA_11_2010 = c(0.320528821968808, 0.339836909454122, 0.25631857618652, 0.567248398528934, 0.285555125754008, 0.567248398528934), PA_11_2009 = c(0.1585582806377, 0.394791119699762, 0.339836909454122, 0.60556995037205, 0.356723998691994, 0.575029622147029), PA_7_2009 = c(0.20481888149852, 0.459144998077838, 0.315044892907812, 0.411516846067488, 0.222521503104622, 0.512165725029403), NC = c(0.150568272776686, 0.304692654015397, 0.219141058997532, 0.365207221290367, 0.224093092301371, 0.482827962745019), SC = c(0.325143155970891, 0.540174570306572, 0.452816594744926, 0.74096470220302, 0.411516846067488, 0.74188086214973), GA = c(0.289751701436047, 0.409497198778823, 0.305820042106789, 0.506862820843914, 0.333415910208956, 0.417422271327919), FL2 = c(0.228395614287769, 0.394791119699762, 0.360712720363525, 0.907356892557472, 0.327237730359141, 0.703720508083224), FL1 = c(0.545950634023683, 0.811746380766976, 0.509573386691872, 0.848062078981481, 0.421457568467301, 0.825215328030001), START = c(3519677L, 3519677L, 3519677L, 3519677L, 3519677L, 3519677L), END = c(3522555L, 3522555L, 3522555L, 3522555L, 3522555L, 3522555L), GENE_ID = c("FBgn0031546", "FBgn0031546", "FBgn0031546", "FBgn0031546", "FBgn0031546", "FBgn0031546"), GENE_SYMBOL = c("CG8851", "CG8851", "CG8851", "CG8851", "CG8851", "CG8851")), .Names = c("CHR", "POS", "REF", "ALT", "VA_11_2012", "VA_07_2012", "WI_06_2012", "MA_11_2012", "MA_07_2012", "NY_09_2012", "NY_07_2012", "PA_10_2012", "PA_07_2012", "WI_09_2012", "PA_9_2012", "ME2", "PA_7_2010", "ME1", "PA_11_2011", "PA_10_2011", "PA_7_2011", "PA_11_2010", "PA_11_2009", "PA_7_2009", "NC", 
"SC", "GA", "FL2", "FL1", "START", "END", "GENE_ID", "GENE_SYMBOL"), row.names = c(NA, 6L), class = "data.frame")
DF2:
structure(list(CHR = c("2L", "2L", "2L", "2L", "2L", "2L"), POS = c(3519797L, 3519829L, 3519966L, 3519979L, 3519994L, 3520020L), REF = c("A", "T", "A", "T", "G", "C"), ALT = c("G", "A", "C", "A", "T", "T"), VA_11_2012 = c(24.7373737373737, 23.9479166666667, 25.4803921568627, 25.9519230769231, 26.1809523809524, 26.4056603773585), VA_07_2012 = c(47.7434554973822, 47.7434554973822, 44.2921348314607, 41.5917159763314, 39.6257668711656, 42.8265895953757), WI_06_2012 = c(9.70212765957447, 12.6938775510204, 15.4509803921569, 14.7821782178218, 14.1, 16.75), MA_11_2012 = c(20.3118279569892, 21.2105263157895, 25.8971962616822, 26.5779816513761, 28.1754385964912, 30.7235772357724), MA_07_2012 = c(34.891156462585, 30.8686131386861, 34.891156462585, 33.7430555555556, 32.9507042253521, 37.0522875816993), NY_09_2012 = c(19.7613636363636, 21.021978021978, 28.4824561403509, 27.1284403669725, 27.1284403669725, 25.952380952381), NY_07_2012 = c(19.7613636363636, 21.1666666666667, 21.5794392523364, 21.4190476190476, 21.7339449541284, 21.3365384615385), PA_10_2012 = c(24.8045112781955, 26.4632352941176, 28.0503597122302, 28.0503597122302, 29.0709219858156, 24.8045112781955), PA_07_2012 = c(39.8522727272727, 39.8522727272727, 36.2738095238095, 38.5491329479769, 44.6489361702128, 49.1990049751244), WI_09_2012 = c(31.0275862068966, 30.5486111111111, 44.4388888888889, 44.1284916201117, 41.8546511627907, 42.1907514450867), PA_9_2012 = c(24.99, 26.1809523809524, 28.8050847457627, 28.9831932773109, 29.5, 29.6666666666667), ME2 = c(18.74, 20.378640776699, 16.3958333333333, 18.74, 15.7789473684211, 17.5918367346939), PA_7_2010 = c(10.8671875, 9.1984126984127, 23.1931034482759, 25.6845637583893, 30.2866242038217, 30.8291139240506), ME1 = c(46.8624338624339, 46.0162162162162, 45.1325966850829, 44.6759776536313, 44.6759776536313, 43.7314285714286), PA_11_2011 = c(20.5764705882353, 21.5795454545455, 24.99, 25.4803921568627, 24.7373737373737, 25.2376237623762 ), PA_10_2011 = c(31.1007194244604, 
30.75, 27.0900900900901, 26.9090909090909, 26.7247706422018, 26.537037037037), PA_7_2011 = c(37.4933333333333, 34.5251798561151, 32.6992481203008, 33.3259259259259, 34.231884057971, 35.3802816901408), PA_11_2010 = c(22.7169811320755, 22.6190476190476, 22.5192307692308, 22.1, 22.5192307692308, 22.1), PA_11_2009 = c(32.1908396946565, 34.5971223021583, 34.8785714285714, 34.5971223021583, 34.021897810219, 32.1908396946565), PA_7_2009 = c(41.9568965517241, 41.9004329004329, 39.6395939086294, 39.4820512820513, 39.87, 40.3106796116505 ), NC = c(28.3985507246377, 28.3985507246377, 18.6198347107438, 21.7698412698413, 21.16, 21.7698412698413), SC = c(28.7916666666667, 28.4661016949153, 23.9895833333333, 21.8068181818182, 21.8068181818182, 20.8823529411765), GA = c(35.0368098159509, 34.6352201257862, 32.9305555555556, 34.1038961038961, 34.8385093167702, 34.9382716049383 ), FL2 = c(25.1782178217822, 24.95, 19.890243902439, 19.5432098765432, 17.6710526315789, 19.890243902439), FL1 = c(22.2747252747253, 22.0888888888889, 19.975, 20.6626506024096, 20.6626506024096, 21.7045454545455), START = c(3519677L, 3519677L, 3519677L, 3519677L, 3519677L, 3519677L), END = c(3522555L, 3522555L, 3522555L, 3522555L, 3522555L, 3522555L), GENE_ID = c("FBgn0031546", "FBgn0031546", "FBgn0031546", "FBgn0031546", "FBgn0031546", "FBgn0031546"), GENE_SYMBOL = c("CG8851", "CG8851", "CG8851", "CG8851", "CG8851", "CG8851")), .Names = c("CHR", "POS", "REF", "ALT", "VA_11_2012", "VA_07_2012", "WI_06_2012", "MA_11_2012", "MA_07_2012", "NY_09_2012", "NY_07_2012", "PA_10_2012", "PA_07_2012", "WI_09_2012", "PA_9_2012", "ME2", "PA_7_2010", "ME1", "PA_11_2011", "PA_10_2011", "PA_7_2011", "PA_11_2010", "PA_11_2009", "PA_7_2009", "NC", "SC", "GA", "FL2", "FL1", "START", "END", "GENE_ID", "GENE_SYMBOL"), row.names = c(NA, 6L), class = "data.frame")
我创建了以下函数以应用于DF1并获得每行的回归系数:
# Fit an ordinary least-squares line of one row's values against `lat`
# and return the slope.
#
# d:   a single row; elements 5 to 14 are converted with as.numeric() and
#      used as the response.
# lat: not an argument -- it is looked up in the enclosing environment and
#      supplies one predictor value per response element.
#
# Returns the estimate found in the second row of the coefficient table,
# i.e. the coefficient of `lat`.
lms <- function(d) {
  response <- as.numeric(d[5:14])
  fit <- lm(response ~ lat)
  coef(summary(fit))[2, "Estimate"]
}
# Predictor values for the regression, one per response column (5 to 14).
# NOTE(review): presumably the latitudes of the sampled populations, listed
# in column order -- confirm.
lat <- c(25.47, 30.99, 33.40, 35.78, 38.03, 39.53, 42.44, 42.45, 44.02, 44.02)

# Fit one model per row. apply() is deliberately avoided: it converts df1 to
# a matrix first, and because df1 also holds character columns the numeric
# columns are passed through format(), so lms() would regress on values
# rounded for printing instead of the stored ones. Handing each row over as
# a one-row data.frame keeps full precision, and vapply() guarantees a plain
# numeric vector even when df1 has zero rows.
df1$reg_lat <- vapply(
  seq_len(nrow(df1)),
  function(i) lms(df1[i, ]),
  numeric(1)
)
我想以某种方式把 lms 同时应用于 DF1 和 DF2,从而对每一行拟合加权线性模型。lms 函数大致如下:
# Weighted variant: regress one row's values on `lat`, weighting every
# observation by the matching entry of a second row, and return the slope.
#
# df1: row holding the responses in elements 5 to 14.
# df2: row holding the weights in elements 5 to 14.
# `lat` is not an argument; it is looked up in the enclosing environment.
#
# Returns the estimate in row 2, column 1 of the coefficient table, i.e.
# the coefficient of `lat`.
lms <- function(df1, df2) {
  cols <- 5:14
  y <- as.numeric(df1[cols])
  w <- as.numeric(df2[cols])
  fit <- lm(y ~ lat, weights = w)
  estimates <- summary(fit)$coefficients
  estimates[2, 1]
}
感谢您的帮助!
略
答案 0 :(得分:1)
由于两个 data.frame 的行数相同,可以按行索引逐行调用函数(例如用 sapply)。
# Slope of the weighted regression for row `i`.
#
# i: row index used for both df1 (responses) and df2 (weights). The two
#    data frames and `lat` are not arguments; they are read from the
#    enclosing environment.
#
# Columns 5 to 14 of row i in df1 are regressed on `lat`, weighted by the
# same cells of df2, and the estimate for `lat` (second row of the
# coefficient table) is returned.
lms <- function(i) {
  value_cols <- 5:14
  obs <- as.numeric(df1[i, value_cols])
  wts <- as.numeric(df2[i, value_cols])
  fit <- lm(obs ~ lat, weights = wts)
  coef(summary(fit))[2, "Estimate"]
}
# One slope per row of df1. vapply() is used instead of sapply() so the
# result is always a numeric vector (numeric(0) when df1 has no rows, where
# sapply() would return an empty list), and a non-scalar or non-numeric
# value from lms() raises an error instead of silently changing the shape.
vapply(seq_len(nrow(df1)), lms, numeric(1))