非线性模型预测离开一个ID出交叉验证模式中的NA值

时间:2015-10-08 08:59:15

标签: r prediction cross-validation nonlinear-functions

我有一个数据框df

df<-structure(list(ID = structure(c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 19L, 19L, 
19L, 19L, 19L, 19L, 24L, 24L, 24L, 25L, 25L, 26L, 26L, 36L, 36L, 
36L, 36L, 36L, 36L, 41L, 41L, 41L, 41L, 41L, 41L, 42L, 45L, 45L, 
45L, 45L, 45L, 47L, 55L, 64L, 64L, 65L, 71L, 71L, 71L, 71L, 71L, 
43L, 43L, 43L, 43L, 43L, 43L, 43L, 60L, 60L, 60L, 60L, 60L, 60L, 
60L, 60L, 60L), .Label = c("AU-Tum", "BE-Bra", "BR-Sa3", "CA-Ca1", 
"CA-Ca2", "CA-Ca3", "CA-Gro", "Ca-Man", "CA-NS1", "CA-NS2", "CA-NS3", 
"CA-NS4", "CA-NS5", "CA-NS6", "CA-NS7", "CA-Oas", "CA-Obs", "CA-Ojp", 
"CA-Qcu", "CA-Qfo", "CA-SF1", "CA-SF2", "CA-SF3", "CA-SJ1", "CA-SJ2", 
"CA-SJ3", "CA-TP1", "CA-TP2", "CA-TP4", "CZ-Bk1", "DE-Har", "DE-Wet", 
"DK-Sor", "FI-Hyy", "FR-Hes", "FR-Pue", "ID-Pag", "IT-Ro1", "IT-Ro2", 
"IT-Sro", "JP-Tak", "JP-Tef", "NL-Loo", "SE-Abi", "SE-Fla", "SE-Nor", 
"SE-Sk1", "SE-Sk2", "SE-St1", "UK-Gri", "US-Blo", "US-Bn1", "US-Bn2", 
"Us-Bn3", "US-Dk3", "US-Fmf", "US-Fwf", "US-Ha1", "US-Ha2", "US-Ho1", 
"US-Ho2", "US-Lph", "US-Me1", "US-Me3", "US-Nc2", "US-NR1", "US-Sp1", 
"US-Sp2", "US-Sp3", "US-Umb", "US-Wcr", "US-Wi0", "US-Wi1", "US-Wi2", 
"US-Wi4", "US-Wi8"), class = "factor"), y = c(380.654850683175, 
467.840079978108, 358.497598990798, 431.528439198621, 442.010991849005, 
351.189937948249, 343.098349086009, 357.122478995472, -471.194518864155, 
-514.390561270528, -518.123427070677, -554.919201846235, -614.669225180172, 
-391.545088194311, -124.396037524566, 32.3765077856369, 43.199114789255, 
-88.3050879996736, 58.9395819107303, 162.441016515717, 116.965395963751, 
-108.997818851843, -144.127755056645, -126.626824281528, -175.750439967494, 
-151.262252734334, -135.830685457215, 28.9567997518461, -68.2299433113076, 
-37.1677788909292, -167.045088054205, -108.258462657337, 100.907804159913, 
90.3369144331664, 233.031065647025, 287.956774678081, 189.082761215046, 
390.740067397826, 89.1989531565923, 155.527563805692, 224.442115622107, 
315.516411969438, 283.912847682368, 390.026366345584, 322.790248586796, 
312.50101460889, -638.973101716489, 132.601979068451, -42.7843619789928, 
82.2957233709167, 8.36848279205151, 115.376620422816, -186.42650026083, 
577.658561848104, 188.342473105964, 95.6089326666552, 936.236027855426, 
70.9266221858561, -5.91938436031342, 338.1149700284, 185.940875658067, 
198.121665383659, 254.551377562806, 45.4501812993549, 187.152575587854, 
152.183998291846, 226.360116416588, 225.67982583819, -0.0398367510642856, 
217.845216980437, 241.779151081573, 214.481376983225, 219.953942558961, 
315.959296110785, 263.547381375218, 194.449290025979, 305.158690313809, 
326.318877183832), x = c(49, 50, 51, 52, 53, 54, 55, 56, 0, 1, 
2, 3, 4, 5, 13, 14, 15, 16, 17, 71, 72, 1, 2, 3, 4, 5, 6, 9, 
10, 11, 1, 3, 29, 30, 54, 55, 56, 57, 58, 59, 64, 65, 66, 67, 
68, 69, 1, 34, 35, 37, 38, 39, 2, 19, 17, 18, 16, 67, 69, 70, 
72, 73, 101, 105, 106, 107, 108, 109, 110, 131, 132, 133, 134, 
135, 136, 137, 138, 139)), .Names = c("ID", "y", "x"), row.names = c(712L, 
713L, 714L, 715L, 716L, 717L, 718L, 719L, 720L, 721L, 722L, 723L, 
724L, 725L, 726L, 727L, 728L, 729L, 730L, 731L, 732L, 784L, 785L, 
786L, 787L, 788L, 789L, 793L, 794L, 795L, 796L, 797L, 798L, 799L, 
841L, 842L, 843L, 844L, 845L, 846L, 866L, 867L, 868L, 869L, 870L, 
871L, 872L, 880L, 881L, 882L, 883L, 884L, 889L, 892L, 916L, 917L, 
918L, 936L, 937L, 938L, 939L, 940L, 873L, 874L, 875L, 876L, 877L, 
878L, 879L, 905L, 906L, 907L, 908L, 909L, 910L, 911L, 912L, 913L
), class = "data.frame")

我在交叉验证模式下使用非线性模型来预测值。为此,我计算了一个函数stat。它首先基于数据帧创建非线性模型,然后将其用于预测。所有过程都以LOOCV模式完成。

stat<- function(dat) {
  id<-nrow(dat)
  Out<-c()
  for (i in 1:id){
    fit <- try(nls(y~A*(1-exp(k*x)), data = dat[-i,], 
                    start = list(A=1000, k= -0.224)), silent=TRUE); 
    Out[i]<- if (inherits(fit, "nls")) sim = predict(fit, newdata=dat[i,]) else NA; 
  }
  Out
}

然后我在我的数据框df

上应用此功能
stat(df)

但是,模型无法预测数据框df的所有y值的值。它提供NA个值。

[1]        NA        NA        NA        NA        NA        NA 208.10405        NA        NA        NA        NA        NA        NA  38.21535        NA
[16]  90.76849  95.81436 103.35304 105.50550        NA        NA        NA  15.39989        NA        NA        NA  44.12078        NA        NA        NA
[31]        NA        NA 153.22284 156.78703        NA        NA        NA        NA        NA        NA        NA        NA        NA 222.45487        NA
[46]        NA        NA        NA 175.06730        NA 181.72692 181.68818        NA        NA        NA        NA        NA        NA        NA        NA
[61]        NA        NA        NA        NA        NA 252.93564        NA        NA 259.31112        NA        NA        NA        NA 252.88759        NA
[76] 258.52018 253.69781 252.87979

有人有解释吗?

1 个答案:

答案 0 :(得分:2)

您使用的是错误的型号。让我们绘制您的数据:

plot(y ~ x, data = df)

你显然不能使用通过原点的模型。您可以使用带偏移量的渐近模型。

您还应该使用自我启动模型。

fit <- nls(y ~ SSasympOff(x, A, lrc, c0), data = df)
lines(predict(fit, newdata = data.frame(x = 0:140)))

resulting plot

现在你的功能:

stat<- function(dat) {
  id<-nrow(dat)
  Out<-c()
  for (i in 1:id){
    fit <- try(nls(y ~ SSasympOff(x, A, lrc, c0), data = dat[-i,]), silent = TRUE) 
    Out[i]<- if (inherits(fit, "nls")) predict(fit, newdata=dat[i,]) else NA; 
  }
  Out
}

stat(df)
#[1]  231.6054079  230.2816332  232.7219831  231.5793604  231.6226385 ...