创建特定的子样本

时间:2017-04-22 16:54:55

标签: r subsampling

因此,对于我的分析,我需要在我的样本中创建4个子样本:

我想比较选民的投票率百分比

1)家中有0名其他选民的选民 2)家中有另外1名选民的选民 3)家中有另外2名选民的选民 4)家中有3名及以上其他选民的选民

我有7个变量:
1)家庭规模(vn437)
2)HH的第2人年龄(vn438a)
3)HH的第3人年龄(vn438b)
依此类推,直到HH中第6个人的年龄

在我的逻辑中,我需要为每个组创建4个子样本:

第一组(HH中没有其他选民)将是满足以下任一条件的观察:

d$vn437 == 1;  
d$vn437 == 2 & d$vn438a < 18;  
d$vn437 == 3 & d$vn438a < 18 & d$vn438b < 18  
d$vn437 == 4 & d$vn438a < 18 & d$vn438b < 18 & d$vn438c < 18; 

依此类推,直到加上条件 `d$vn438e < 18` 为止

我是R的超级新手,不知道如何解决这个问题。我该如何创建这些组?我真的很绝望,已经找了好几个小时却毫无结果!

正如Richard Telford建议的那样,这是 `dput(head(d))` 命令的输出:

structure(list(dat = c(20091026, 20091025, 20091025, 20091026, 
20091025, 20091025), vn1 = c(1, 2, 1, 1, 1, 1), vn542 = c(27, 
22, 25, 23, 24, 22), vn217 = c(4, 3, 2, 4, 3, 3), n111 = c(1, 
1, 1, 2, 1, 1), vn437 = c(2, 2, 2, 2, 2, 2), vn438a = c(28, 24, 
24, 24, 23, 25), vn438b = c(1000, 1000, 1000, 1000, 1000, 1000
), vn438c = c(1000, 1000, 1000, 1000, 1000, 1000), vn438d = c(1000, 
1000, 1000, 1000, 1000, 1000), vn438e = c(1000, 1000, 1000, 1000, 
1000, 1000), vn5 = c(4, 4, 4, 4, 4, 4), vn9a = c(5, 5, 5, 5, 
5, 5), vn75 = c(1, 1, 3, 2, 1, 3), vn79 = c(2, 2, 2, 2, 2, 2)), .Names = c("dat", 
"vn1", "vn542", "vn217", "n111", "vn437", "vn438a", "vn438b", 
"vn438c", "vn438d", "vn438e", "vn5", "vn9a", "vn75", "vn79"), row.names = c(2174L, 
2175L, 2177L, 2178L, 2180L, 2181L), class = "data.frame")  

vn438b = "1000"是NA值,但是如果我删除它们,我会失去其他观察结果,因此我没有清除HH年龄变量中的第N个人。

这也是我希望最终得到的结果的样子

编辑

我自己设法解决了。对于任何有兴趣的人,这是我的代码:

# Give columns 2 through 15 of `d` readable names in one assignment.
# Column 1 keeps its existing name; the names below are in column order.
colnames(d)[2:15] <- c(
  "sex", "age", "polint", "turnout",
  "HHsize",
  "HHage2", "HHage3", "HHage4", "HHage5", "HHage6",
  "marital", "education", "income", "religion"
)


####################################################################
## Group 0: respondents with no other voter in the household (HH) ##
####################################################################
# A row qualifies when the respondent lives alone (HHsize 1) or when every
# other listed household member is younger than 18. Household sizes 2, 3 and
# 4 are collected; the size-5 case matched no observations in the author's
# data, so it is not part of the result. Pieces are stacked in HHsize order.
zeroHHM <- rbind(
  d[d$HHsize == 1, ],
  d[d$HHsize == 2 & d$HHage2 < 18, ],
  d[d$HHsize == 3 & d$HHage2 < 18 & d$HHage3 < 18, ],
  d[d$HHsize == 4 & d$HHage2 < 18 & d$HHage3 < 18 & d$HHage4 < 18, ]
)

# Split the group by the respondent's own age: the youth subset takes
# age < 26, the non-youth subset takes age > 25.
Youth0 <- zeroHHM[zeroHHM$age < 26, ]
Old0 <- zeroHHM[zeroHHM$age > 25, ]


###############################################################
## Group 1: respondents with exactly 1 other voter in the HH ##
###############################################################
# A row qualifies when the 2nd household member is an adult and every further
# listed member (up to the household size) is younger than 18.
#
# Household ages of 900 or more are treated as missing (the data uses 1000 as
# its missing-value code), so "adult" means 18 or older AND below 900. The
# HHsize == 2 case now applies that guard too; previously a missing age there
# was counted as an adult voter, unlike every other case.
#
# local() keeps the helper masks out of the global workspace; only the
# combined data frame is assigned. Pieces are stacked in HHsize order.
oneHHM <- local({
  adult2 <- d$HHage2 > 17 & d$HHage2 < 900
  minor3 <- d$HHage3 < 18
  minor4 <- d$HHage4 < 18
  minor5 <- d$HHage5 < 18
  minor6 <- d$HHage6 < 18

  rbind(
    d[d$HHsize == 2 & adult2, ],
    d[d$HHsize == 3 & adult2 & minor3, ],
    d[d$HHsize == 4 & adult2 & minor3 & minor4, ],
    d[d$HHsize == 5 & adult2 & minor3 & minor4 & minor5, ],
    d[d$HHsize == 6 & adult2 & minor3 & minor4 & minor5 & minor6, ]
  )
})

# Split the group by the respondent's own age: youth is age < 26, non-youth
# is age > 25. The mask must come from oneHHM itself; a mask built from
# another data frame has the wrong length and selects unrelated rows.
Youth1 <- oneHHM[oneHHM$age < 26, ]
Old1 <- oneHHM[oneHHM$age > 25, ]


################################################################
## Group 2: respondents with exactly 2 other voters in the HH ##
################################################################
# A row qualifies when the 2nd and 3rd household members are both adults and
# every further listed member (up to the household size) is younger than 18.
#
# Household ages of 900 or more are treated as missing (the data uses 1000 as
# its missing-value code), so "adult" means 18 or older AND below 900.
#
# local() keeps the helper masks out of the global workspace; only the
# combined data frame is assigned. Pieces are stacked in HHsize order.
twoHHM <- local({
  adults23 <- d$HHage2 > 17 & d$HHage2 < 900 &
    d$HHage3 > 17 & d$HHage3 < 900
  minor4 <- d$HHage4 < 18
  minor5 <- d$HHage5 < 18
  minor6 <- d$HHage6 < 18

  rbind(
    d[d$HHsize == 3 & adults23, ],
    d[d$HHsize == 4 & adults23 & minor4, ],
    d[d$HHsize == 5 & adults23 & minor4 & minor5, ],
    d[d$HHsize == 6 & adults23 & minor4 & minor5 & minor6, ]
  )
})

# Split the group by the respondent's own age: youth is age < 26, non-youth
# is age > 25. The mask must come from twoHHM itself; a mask built from
# another data frame has the wrong length and selects unrelated rows.
Youth2 <- twoHHM[twoHHM$age < 26, ]
Old2 <- twoHHM[twoHHM$age > 25, ]


###################################################################
## Group 3: respondents with 3 or more other voters in the HH    ##
###################################################################
# A row qualifies when the 2nd, 3rd and 4th household members are all adults,
# for household sizes 4, 5 and 6. Members beyond the 4th are not checked,
# because three adults already put the respondent in the "3+" group.
#
# Household ages of 900 or more are treated as missing (the data uses 1000 as
# its missing-value code), so "adult" means 18 or older AND below 900.
#
# local() keeps the helper mask out of the global workspace; only the
# combined data frame is assigned. Pieces are stacked in HHsize order.
threeHHM <- local({
  adults234 <- d$HHage2 > 17 & d$HHage2 < 900 &
    d$HHage3 > 17 & d$HHage3 < 900 &
    d$HHage4 > 17 & d$HHage4 < 900

  rbind(
    d[d$HHsize == 4 & adults234, ],
    d[d$HHsize == 5 & adults234, ],
    d[d$HHsize == 6 & adults234, ]
  )
})

# Split the group by the respondent's own age: youth is age < 26, non-youth
# is age > 25. The mask must come from threeHHM itself; a mask built from
# another data frame has the wrong length and selects unrelated rows.
Youth3 <- threeHHM[threeHHM$age < 26, ]
Old3 <- threeHHM[threeHHM$age > 25, ]


# Copy each household-group data frame to its final short name
# (HHM<k> = k other voters in the household, HHM3 = three or more).
HHM0 <- zeroHHM
HHM1 <- oneHHM
HHM2 <- twoHHM
HHM3 <- threeHHM

# Drop the long working names now that the copies exist.
rm(list = c("zeroHHM", "oneHHM", "twoHHM", "threeHHM"))

0 个答案:

没有答案