使用K折交叉验证选择模型

时间:2019-11-24 21:22:50

标签: r csv k-fold

因此,我目前正在处理CSV文件中的数据,并且尝试使用K折交叉验证方法确定我的2个变量(between = ?0 + ?1?或between)之间的线性关系较好还是二次关系= ?0 + ?1? + ?2? ^ 2)。在这种情况下,我的变量是X:Hour.Meter.Reading”和Y:Cumalative.Cost。我的CSV数据集称为CombinedDataset;我附加了以下数据的说明以使其更加清楚:

structure(list(Unit.ID = c(925L, 967L, 1054L, 967L, 1054L, 967L, 
1160L, 1054L, 1160L, 967L, 967L, 1054L, 1160L, 967L, 1054L, 1160L, 
967L, 1160L, 1054L, 1054L, 967L, 1160L, 1054L, 967L, 1160L, 1054L, 
967L, 1160L, 1054L, 164L, 967L, 967L, 1160L, 1054L, 164L, 967L, 
164L, 1160L, 164L, 1054L, 967L, 164L, 1054L, 967L, 1054L, 164L, 
967L, 164L, 164L, 1054L, 967L, 164L, 967L, 164L, 1054L, 164L, 
925L, 164L, 967L, 1054L, 1054L, 925L, 925L, 164L, 165L, 164L, 
1054L, 967L, 164L, 165L, 967L, 164L, 164L, 165L, 1054L, 967L, 
967L, 165L, 164L, 1054L, 967L, 165L, 967L, 165L, 164L, 967L, 
164L, 967L, 164L, 967L, 164L, 967L, 164L, 1054L, 164L, 164L, 
164L, 164L, 164L, 164L, 164L), Hour.Meter.Reading = c(34L, 381L, 
532L, 600L, 732L, 783L, 796L, 947L, 1016L, 1038L, 1200L, 1282L, 
1290L, 1388L, 1481L, 1528L, 1579L, 1671L, 1704L, 1728L, 1755L, 
1906L, 1926L, 1936L, 2031L, 2063L, 2136L, 2205L, 2293L, 2321L, 
2342L, 2382L, 2425L, 2505L, 2524L, 2576L, 2704L, 2731L, 2777L, 
2811L, 2873L, 2960L, 2997L, 3080L, 3170L, 3175L, 3264L, 3371L, 
3386L, 3425L, 3485L, 3570L, 3690L, 3740L, 3746L, 3854L, 3863L, 
3976L, 3990L, 3991L, 4078L, 4103L, 4106L, 4138L, 4138L, 4216L, 
4249L, 4253L, 4305L, 4326L, 4353L, 4483L, 4489L, 4489L, 4500L, 
4580L, 4581L, 4652L, 4721L, 4742L, 4784L, 4805L, 4828L, 4943L, 
4947L, 4954L, 4968L, 5298L, 5316L, 5407L, 5533L, 5628L, 5712L, 
5747L, 5951L, 6165L, 6194L, 6439L, 6636L, 6702L, 6918L), Labour.Cost = c(1102.5, 
4270, 542.5, 2730, 682.5, 3097.5, 336, 871.5, 525, 2695, 1837.5, 
1092, 1995, 2572.5, 1092, 924, 840, 1575, 693, 693, 560, 2100, 
7959, 2747.5, 1092, 1764, 2030, 5355, 7434, 315, 1890, 2688, 
504, 3024, 805, 1701, 577.5, 777, 6440, 1281, 588, 4910, 1470, 
1911, 3738, 4140, 9219, 525, 1995, 1239, 1491, 2292.5, 4389, 
2012.5, 1134, 945, 490, 3307.5, 714, 756, 1302, 297.5, 875, 1872.5, 
1435, 1767.5, 2037, 3108, 1645, 1067.5, 3087, 1452.5, 11777.5, 
5670, 4872, 2916, 4158, 5350, 2817.5, 84, 1596, 3865, 714, 910, 
4112.5, 1197, 3622.5, 714, 3675, 4767, 3150, 2142, 2436, 210, 
1974, 3843, 14532, 2373, 2919, 7098, 2205), Parts.Cost = c(657.6733, 
6451.9113, 2235.8885, 6729.7326, 8357.0427, 9224.9012, 1957.0181, 
6890.5315, 3156.4815, 2009.3578, 4555.0977, 3458.6842, 1546.2183, 
6249.232, 4430.8058, 3835.5721, 3415.2062, 4868.2379, 2151.4558, 
2233.2055, 2554.7489, 7433.8141, 2563.289, 3348.7162, 2173.6179, 
1940.2806, 4404.6421, 5626.8595, 10553.4599, 12.62, 11405.5704, 
2554.2787, 1907.3543, 12625.7525, 243.5735, 6104.7416, 405.959, 
3609.1684, 4647.767, 12842.3638, 489.477, 9961.5883, 1706.0572, 
2381.7686, 15177.0692, 5416.7948, 16538.1428, 253.3975, 1390.5058, 
8699.7549, 7759.8042, 5128.0276, 8556.2625, 5760.523, 1923.699, 
628.643, 158.4313, 14481.7111, 3796.3243, 11671.4333, 7140.2504, 
1326.837, 441.0999, 2866.2141, 4229.31, 2935.825, 7452.8686, 
11683.7093, 2644.1532, 418.679, 11665.8066, 523.9236, 18247.2776, 
8115.265, 25011.6846, 13727.0801, 31786.6422, 6064.3123, 10599.0455, 
119.4423, 1228.3541, 3587.7566, 3666.517, 472.1537, 1968.7669, 
1417.8506, 8023.1254, 5831.6884, 14873.8008, 10193.2736, 6442.1719, 
7525.4562, 4378.1336, 1691.4286, 12144.6891, 13094.8609, 20582.1682, 
2544.103, 16934.6748, 17344.5551, 8912.7088), Total.Cost = c(1760.1733, 
10721.9113, 2778.3885, 9459.7326, 9039.5427, 12322.4012, 2293.0181, 
7762.0315, 3681.4815, 4704.3578, 6392.5977, 4550.6842, 3541.2183, 
8821.732, 5522.8058, 4759.5721, 4255.2062, 6443.2379, 2844.4558, 
2926.2055, 3114.7489, 9533.8141, 10522.289, 6096.2162, 3265.6179, 
3704.2806, 6434.6421, 10981.8595, 17987.4599, 327.62, 13295.5704, 
5242.2787, 2411.3543, 15649.7525, 1048.5735, 7805.7416, 983.459, 
4386.1684, 11087.767, 14123.3638, 1077.477, 14871.5883, 3176.0572, 
4292.7686, 18915.0692, 9556.7948, 25757.1428, 778.3975, 3385.5058, 
9938.7549, 9250.8042, 7420.5276, 12945.2625, 7773.023, 3057.699, 
1573.643, 648.4313, 17789.2111, 4510.3243, 12427.4333, 8442.2504, 
1624.337, 1316.0999, 4738.7141, 5664.31, 4703.325, 9489.8686, 
14791.7093, 4289.1532, 1486.179, 14752.8066, 1976.4236, 30024.7776, 
13785.265, 29883.6846, 16643.0801, 35944.6422, 11414.3123, 13416.5455, 
203.4423, 2824.3541, 7452.7566, 4380.517, 1382.1537, 6081.2669, 
2614.8506, 11645.6254, 6545.6884, 18548.8008, 14960.2736, 9592.1719, 
9667.4562, 6814.1336, 1901.4286, 14118.6891, 16937.8609, 35114.1682, 
4917.103, 19853.6748, 24442.5551, 11117.7088), Cumulative.Cost = c(1760.1733, 
12482.0846, 15260.4731, 24720.2057, 33759.7484, 46082.1496, 48375.1677, 
56137.1992, 59818.6807, 64523.0385, 70915.6362, 75466.3204, 79007.5387, 
87829.2707, 93352.0765, 98111.6486, 102366.8548, 108810.0927, 
111654.5485, 114580.754, 117695.5029, 127229.317, 137751.606, 
143847.8222, 147113.4401, 150817.7207, 157252.3628, 168234.2223, 
186221.6822, 186549.3022, 199844.8726, 205087.1513, 207498.5056, 
223148.2581, 224196.8316, 232002.5732, 232986.0322, 237372.2006, 
248459.9676, 262583.3314, 263660.8084, 278532.3967, 281708.4539, 
286001.2225, 304916.2917, 314473.0865, 340230.2293, 341008.6268, 
344394.1326, 354332.8875, 363583.6917, 371004.2193, 383949.4818, 
391722.5048, 394780.2038, 396353.8468, 397002.2781, 414791.4892, 
419301.8135, 431729.2468, 440171.4972, 441795.8342, 443111.9341, 
447850.6482, 453514.9582, 458218.2832, 467708.1518, 482499.8611, 
486789.0143, 488275.1933, 503027.9999, 505004.4235, 535029.2011, 
548814.4661, 578698.1507, 595341.2308, 631285.873, 642700.1853, 
656116.7308, 656320.1731, 659144.5272, 666597.2838, 670977.8008, 
672359.9545, 678441.2214, 681056.072, 692701.6974, 699247.3858, 
717796.1866, 732756.4602, 742348.6321, 752016.0883, 758830.2219, 
760731.6505, 774850.3396, 791788.2005, 826902.3687, 831819.4717, 
851673.1465, 876115.7016, 887233.4104)), class = "data.frame", row.names = c(NA, 
-101L))

下面是两个变量的数据图:

Graph

这是我为K验证方法尝试的代码:


set.seed(123)
idx <- sample(1:nrow(combinedDataset), nrow(combinedDataset))
test_size <- floor(nrow(combinedDataset)*0.2)
test <- combinedDataset[idx[1:test_size],]
train <- combinedDataset[-idx[1:test_size],]
train_X <- train$Hour.Meter.Reading
train_y <- train$Cumalative.Cost
test_X <- test$Hour.Meter.Reading
test_y <- test$Cumalative.Cost
X <- train_X
y <- train_y

poly_order <- 1
model <- lm(y~poly(X, poly_order))
print(model)

这是我得到的错误:

  

model.frame.default(formula = y〜poly(X,poly_order),drop.unused.levels = TRUE)中的错误:     变量“ y”的类型无效(NULL)   打印(型号)   print(model)错误:找不到对象'model'

我不确定此错误的确切含义。另外,我知道我的K值为5,是否应该使用某种for循环将我的K值输入代码?

0 个答案:

没有答案