我有一个不平衡的时间序列数据集,其中较老的群组比较新的群组拥有更多的观测值。我正在尝试使用R中randomForest
函数的strata
和sampsize
参数来对较老的群组进行下采样,以便构建回归模型。但是,我的尝试导致了错误“sampsize should be of length one”(sampsize的长度应为1)。
以下是我的代码。我首先构建一个不平衡的示例数据集,其中包含两个预测变量,并添加了少量噪声。
# Create the date and cohort columns; older cohorts have more observations.
# Cohort k enters in month k of 2016 (January..June) and every cohort is
# observed monthly through 2016-10-01, giving 10, 9, 8, 7, 6 and 5 rows.
last_month <- as.Date('2016-10-01')
first_months <- seq.Date(as.Date('2016-01-01'), as.Date('2016-06-01'), "month")
date <- do.call(
  c,
  lapply(
    seq_along(first_months),
    function(k) seq.Date(first_months[k], last_month, "month")
  )
)
# One factor level per cohort, repeated once per observed month.
cohort <- factor(rep(1:6, times = 10:5))
# Baseline series for the oldest cohort (10 observations each).
# The draw order after set.seed(10) -- x1, then x2, then y -- is significant.
set.seed(10)
x1 <- 1:10 + rnorm(10)
x2 <- sin(10) + rnorm(10)  # sin(10) is a single constant offset
y <- x1 + x2 + rnorm(10)

# Extend a 10-value baseline to all six cohorts. Cohort k (k = 2..6) reuses
# baseline elements k..10 plus N(0, 0.1) noise. The RNG is reseeded once and
# the noise is drawn cohort by cohort, in order (9, 8, 7, 6, then 5 values).
jittered_tails <- function(base, seed) {
  set.seed(seed)
  tails <- lapply(2:6, function(from) {
    idx <- from:10
    base[idx] + rnorm(length(idx), 0, .1)
  })
  c(base, unlist(tails))
}

x1 <- jittered_tails(x1, 1)
x2 <- jittered_tails(x2, 2)
y <- jittered_tails(y, 3)
# Assemble the simulated columns into one data frame, one row per
# (cohort, month) observation.
df <- data.frame(date = date, cohort = cohort, x1 = x1, x2 = x2, y = y)
# Plot the response against date, coloured by cohort.
plot(x = df$date, y = df$y, col = df$cohort)
# Build random forest models
library(randomForest)

# Baseline: regression forest on both predictors using every row.
# (The formula previously read y ~ x1 + x1, which silently dropped x2.)
set.seed(4)
rf1 <- randomForest(y ~ x1 + x2, data = df)

# Downsampled model. randomForest() only accepts a per-stratum `sampsize`
# vector (together with `strata`) for classification; with a numeric response
# it stops with 'sampsize should be of length one'. So the balanced sample is
# drawn explicitly instead: at most `n_per_cohort` rows from each cohort,
# without replacement, and the regression forest is fitted on that subset.
set.seed(4)
n_per_cohort <- 4L
rows_by_cohort <- split(seq_len(nrow(df)), df$cohort)
balanced_rows <- unlist(
  lapply(rows_by_cohort, function(rows) {
    # Index via sample.int() so a one-row cohort is not misread by sample()
    # as "sample from 1:rows"; cap the draw at the cohort size.
    rows[sample.int(length(rows), min(n_per_cohort, length(rows)))]
  }),
  use.names = FALSE
)
rf2 <- randomForest(y ~ x1 + x2, data = df[balanced_rows, ])