这是我的数据的快照:
structure(list(CPUBID = c(1000001L, 1000002L, 1000003L, 10001L,
1000201L, 1000203L, 10003L, 1000801L, 1000802L, 1000803L, 1001L,
1001101L, 1001102L, 1001601L, 1002401L, 1002402L, 1002403L, 1002601L,
1002602L, 1002604L), MPUBID = c(10000L, 10000L, 10000L, 100L,
10002L, 10002L, 100L, 10008L, 10008L, 10008L, 10L, 10011L, 10011L,
10016L, 10024L, 10024L, 10024L, 10026L, 10026L, 10026L), CYRB = c(1982L,
1984L, 1988L, 1985L, 1986L, 1992L, 1993L, 1984L, 1986L, 1988L,
1983L, 1987L, 1992L, 1977L, 1981L, 1984L, 1998L, 1980L, 1981L,
1984L), twinfam = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), SAMESEX = c(1L, 1L, 1L,
1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L), top25 = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
0, 0, 0, 0), top5 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), quantity = c(1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1)), .Names = c("CPUBID", "MPUBID",
"CYRB", "twinfam", "SAMESEX", "top25", "top5", "quantity"), row.names = c(NA,
20L), class = "data.frame")
我试图使用twinfam(家庭中的双胞胎)和SAMESEX(前两个孩子同性别)二元变量来创建第四个变量,它具有4个可能的值:
1:如果 SAMESEX == 0 & twinfam == 0
2:如果 SAMESEX == 1 & twinfam == 0
3:如果 SAMESEX == 0 & twinfam == 1
4:如果 SAMESEX == 1 & twinfam == 1
玩了一下之后我尝试使用:
# NOTE: this is the asker's failing attempt, kept exactly as posted.
# A `for` loop in R evaluates to NULL, so `mutate()` is handed NULL as the
# value of `both`; that matches the NILSXP error quoted right after this code.
# The loop bound is taken from `PIATmathreg6`, not from the piped `df`.
# Inside the loop, `both = ...` assigns a single scalar that is overwritten
# on every iteration; no per-row vector is ever built.
df <- df %>% mutate(both = for (i in 1:nrow(PIATmathreg6)) {
if(twinfam[i] == 0 & SAMESEX[i] == 0) both = 1
else if(twinfam[i] == 0 & SAMESEX[i] == 1) both = 2
else if(twinfam[i] == 1 & SAMESEX[i] == 0) both = 3
else both = 4})
但是我收到了错误:
Error: Unsupported type NILSXP for column "both"
似乎无法解决此错误。关于我为什么会遇到这个错误以及如何解决这个错误的任何建议都将不胜感激!
答案 0 :(得分:4)
最好创建一个键/值数据集并执行left_join
library(dplyr)

# Key/value lookup table: one row per (SAMESEX, twinfam) combination, mapped
# to its code in `both`. The keys are written as integer literals so they
# share the integer type of the SAMESEX/twinfam columns in `df`, and the join
# does not have to reconcile integer keys with double keys.
df2 <- data.frame(
  SAMESEX = c(0L, 1L, 0L, 1L),
  twinfam = c(0L, 0L, 1L, 1L),
  both = 1:4
)

# Every row of `df` is kept; `both` is filled in from the matching key pair.
left_join(df, df2, by = c("SAMESEX", "twinfam"))
# CPUBID MPUBID CYRB twinfam SAMESEX top25 top5 quantity both
#1 1000001 10000 1982 0 1 0 0 1 2
#2 1000002 10000 1984 0 1 0 0 1 2
#3 1000003 10000 1988 0 1 0 0 1 2
#4 10001 100 1985 0 1 0 0 1 2
#5 1000201 10002 1986 0 0 0 0 1 1
#6 1000203 10002 1992 0 0 1 0 1 1
#7 10003 100 1993 0 1 0 0 1 2
#8 1000801 10008 1984 0 0 0 0 1 1
#9 1000802 10008 1986 0 0 0 0 1 1
#10 1000803 10008 1988 0 0 0 0 1 1
#11 1001 10 1983 0 1 1 0 0 2
#12 1001101 10011 1987 0 0 0 0 0 1
#13 1001102 10011 1992 0 0 0 0 0 1
#14 1001601 10016 1977 0 1 0 0 1 2
#15 1002401 10024 1981 0 0 1 0 1 1
#16 1002402 10024 1984 0 0 0 0 1 1
#17 1002403 10024 1998 0 0 0 0 1 1
#18 1002601 10026 1980 0 0 0 0 1 1
#19 1002602 10026 1981 0 0 0 0 1 1
#20 1002604 10026 1984 0 0 0 0 1 1
答案 1 :(得分:3)
如果您想使用dplyr,可以尝试以下方法。这里把您的数据框称为mydf。您可以使用case_when()来创建both。如果我没有弄错,(在撰写本回答时)还不能在mutate()中使用case_when()。因此,需要先单独创建一个向量,最后再用cbind()把它合并到数据框中。
# Compute the combined code as a free-standing vector; the column names are
# resolved inside `mydf` via with(), so the conditions read without `mydf$`.
both <- with(
  mydf,
  case_when(
    SAMESEX == 0 & twinfam == 0 ~ 1,
    SAMESEX == 1 & twinfam == 0 ~ 2,
    SAMESEX == 0 & twinfam == 1 ~ 3,
    SAMESEX == 1 & twinfam == 1 ~ 4
  )
)

# Attach the vector to the data frame as a new column named `both`.
cbind(mydf, both = both)
正如akrun所评论的,现在您可以在mutate()中使用case_when()。
# Derive `both` from the two binary indicators in a single mutate() step.
# Columns are referenced by bare name instead of `.$col`: inside mutate()
# bare names resolve to the data's own columns (and to the current group's
# rows on grouped data), whereas `.` is always the entire piped data frame.
mydf %>%
  mutate(
    both = case_when(
      SAMESEX == 0 & twinfam == 0 ~ 1,
      SAMESEX == 1 & twinfam == 0 ~ 2,
      SAMESEX == 0 & twinfam == 1 ~ 3,
      SAMESEX == 1 & twinfam == 1 ~ 4
    )
  )
# CPUBID MPUBID CYRB twinfam SAMESEX top25 top5 quantity both
#1 1000001 10000 1982 0 1 0 0 1 2
#2 1000002 10000 1984 0 1 0 0 1 2
#3 1000003 10000 1988 0 1 0 0 1 2
#4 10001 100 1985 0 1 0 0 1 2
#5 1000201 10002 1986 0 0 0 0 1 1
#6 1000203 10002 1992 0 0 1 0 1 1
#7 10003 100 1993 0 1 0 0 1 2
#8 1000801 10008 1984 0 0 0 0 1 1
#9 1000802 10008 1986 0 0 0 0 1 1
#10 1000803 10008 1988 0 0 0 0 1 1
#11 1001 10 1983 0 1 1 0 0 2
#12 1001101 10011 1987 0 0 0 0 0 1
#13 1001102 10011 1992 0 0 0 0 0 1
#14 1001601 10016 1977 0 1 0 0 1 2
#15 1002401 10024 1981 0 0 1 0 1 1
#16 1002402 10024 1984 0 0 0 0 1 1
#17 1002403 10024 1998 0 0 0 0 1 1
#18 1002601 10026 1980 0 0 0 0 1 1
#19 1002602 10026 1981 0 0 0 0 1 1
#20 1002604 10026 1984 0 0 0 0 1 1