我想编写一个将数据帧拆分为训练,交叉验证和测试集的函数。
我的代码如下,以小数据集为例:
# Minimal reproduction: draw train/test row ids from the ISLR `Auto` data set.
library(ISLR)
library(data.table)
data <- Auto
seed <- 12
# Fraction of all rows sampled into the training set.
train <- 0.7
# Fraction of the remaining (non-training) rows sampled into the test set.
test <- 0.6
# Function_split_test_train_regression <- function(data, train, test, seed){
set.seed(seed)
# setDT() converts `data` to a data.table by reference (no copy is returned).
setDT(data)
# Keep the row names in an `index` column so rows can be referred to by id.
data[, index := row.names(data)]
train_index <- sample(data$index, train * nrow(data))
# NOTE(review): ifelse() is vectorised and returns a value with the shape of
# its condition. `test == 1` has length 1, so only the first element of the
# selected branch is kept and test_index ends up holding a single id.
# For a scalar condition use `if (test == 1) ... else ...` instead.
test_index <- ifelse(test == 1, setdiff(data$index, train_index),
sample(setdiff(data$index, train_index), test * length(setdiff(data$index, train_index))))
# etc
#}
此时我做了一些检查,结果令我感到惊讶:
> test == 1
[1] FALSE
> sample(setdiff(data$index, train_index),
test * length(setdiff(data$index, train_index)))
[1] "225" "186" "41" "381" "356" "178" "147" "158" "21" "259" "207" "159" "250" "167" "128" "218" "271" "197" "376" "19" "77"
[22] "205" "46" "3" "212" "238" "61" "11" "68" "130" "200" "274" "127" "305" "201" "32" "48" "184" "290" "349" "155" "370"
[43] "366" "333" "243" "161" "108" "65" "125" "306" "357" "189" "337" "118" "364" "6" "149" "87" "252" "194" "362" "383" "93"
[64] "38" "18" "322" "220" "307" "60" "353"
> test_index <- ifelse(test == 1, setdiff(data$index, train_index),
sample(setdiff(data$index, train_index),
test * length(setdiff(data$index, train_index))))
> test_index
[1] "219"
为什么ifelse返回"219"而不是第二个参数的完整取值(条件test == 1的计算结果为FALSE)?
您的建议将不胜感激。
=============================================== =================================
根据评论中提出的建议,我修改了代码,用名称test_fraction替换了名称test(train也相应改为train_fraction),但问题仍然存在。新代码:
# Same reproduction with the fractions renamed to train_fraction/test_fraction.
library(ISLR)
library(data.table)
data <- Auto
seed <- 12
# Fraction of all rows sampled into the training set.
train_fraction <- 0.7
# Fraction of the remaining (non-training) rows sampled into the test set.
test_fraction <- 0.6
# Function_split_test_crossval_train_regr <- function(data, train, test, seed){
set.seed(seed)
# setDT() converts `data` to a data.table by reference (no copy is returned).
setDT(data)
# Keep the row names in an `index` column so rows can be referred to by id.
data[, index := row.names(data)]
train_index <- sample(data$index, train_fraction * nrow(data))
# NOTE(review): renaming the variable does not change the outcome. ifelse()
# returns a value with the shape of its condition; `test_fraction == 1` has
# length 1, so only the first element of the selected branch is returned.
# For a scalar condition use `if (test_fraction == 1) ... else ...` instead.
test_index <- ifelse(test_fraction == 1, setdiff(data$index, train_index), sample(setdiff(data$index, train_index),
test_fraction * length(setdiff(data$index, train_index))))
#}
结果:
> train_index
[1] "119" "118" "143" "344" "293" "341" "305" "95" "82" "58" "226" "35" "363" "111" "84" "137" "24" "151" "381" "110" "93"
[22] "198" "133" "6" "112" "228" "62" "36" "165" "353" "271" "385" "322" "291" "316" "268" "333" "37" "377" "176" "343" "281"
[43] "245" "75" "238" "183" "215" "68" "274" "64" "224" "391" "26" "83" "66" "308" "1" "372" "161" "170" "300" "52" "30"
[64] "15" "57" "148" "312" "311" "194" "367" "27" "342" "260" "181" "163" "171" "193" "210" "327" "248" "172" "263" "47" "351"
[85] "166" "292" "278" "61" "116" "204" "309" "200" "96" "330" "383" "346" "249" "368" "41" "38" "235" "4" "77" "273" "191"
[106] "212" "99" "31" "286" "79" "184" "284" "267" "374" "355" "358" "124" "114" "335" "70" "203" "379" "14" "287" "67" "34"
[127] "340" "127" "91" "222" "240" "387" "357" "242" "310" "347" "142" "103" "105" "117" "189" "361" "177" "126" "392" "5" "317"
[148] "174" "352" "87" "234" "147" "202" "261" "277" "214" "290" "339" "109" "43" "120" "169" "318" "56" "94" "115" "314" "320"
[169] "276" "237" "296" "307" "23" "186" "360" "146" "313" "152" "206" "328" "60" "195" "69" "107" "97" "92" "325" "20" "362"
[190] "157" "101" "10" "192" "134" "251" "259" "2" "29" "265" "331" "144" "63" "384" "81" "338" "364" "213" "380" "150" "48"
[211] "54" "354" "187" "283" "356" "389" "72" "32" "121" "376" "33" "359" "349" "239" "241" "232" "196" "74" "156" "201" "390"
[232] "326" "285" "51" "131" "304" "85" "45" "336" "280" "178" "128" "98" "275" "246" "65" "39" "188" "55" "90" "197" "9"
[253] "173" "40" "295" "149" "230" "140" "135" "236" "21" "369" "301" "220" "122" "253" "208" "388" "159" "282" "88" "158" "167"
[274] "257"
> sample(setdiff(data$index, train_index),
+ test_fraction * length(setdiff(data$index, train_index)))
[1] "337" "378" "164" "225" "16" "44" "221" "179" "25" "28" "324" "175" "139" "154" "17" "252" "211" "155" "233" "162" "130"
[22] "216" "255" "190" "365" "373" "73" "207" "42" "3" "348" "227" "49" "12" "53" "315" "199" "256" "129" "375" "205" "18"
[43] "289" "168" "264" "160" "145" "382" "136" "302" "185" "323" "100" "270" "113" "294" "247" "345" "209" "104" "321" "7" "138"
[64] "78" "386" "366" "298" "231" "86" "19"
> test_fraction == 1
[1] FALSE
> test_index <- ifelse(test_fraction == 1, setdiff(data$index, train_index), sample(setdiff(data$index, train_index),
+ test_fraction * length(setdiff(data$index, train_index))))
> test_index
[1] "28"
答案 0 :(得分:0)
我不知道为什么会这样,我希望有人来解释。
但我找到了解决问题的方法。您需要在ifelse()的参数内部进行赋值:
# Select the test-set row ids from the rows that are not in the training set.
#
# ifelse() is vectorised: its result has the shape of the condition, so a
# length-1 condition keeps only the first element of the selected branch.
# Assigning inside the ifelse() arguments sidesteps that only because the
# unselected branch is never evaluated; for a scalar condition a plain
# if/else returns the whole vector directly.
remaining_index <- setdiff(data$index, train_index)
test_index <- if (test_fraction == 1) {
  # Use every row that is not part of the training set.
  remaining_index
} else {
  # Draw the requested fraction of the remaining rows; floor() makes the
  # truncation of the fractional sample size explicit.
  sample(remaining_index, floor(test_fraction * length(remaining_index)))
}
我不知道这是否属于不好的做法,但它确实有效。这种写法也可用于在一个条件中同时为多个变量赋值,参见我的回答here。