我是R新手,目前我的代码遇到一些困难。本质上,我在数据集中有几个变量,这些变量包含有关个人经常参加的活动类型的信息(例如1 =阅读,2 =手工艺,3 =园艺等)。
一些模拟数据:
# Mock data: one row per person, three columns of activity codes stored as
# text. Note that "NA" here is a literal string, not R's missing value.
df <- data.frame(
  ID = c(1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011),
  orig_1 = c("-7", "2", "1", "1", "NA", "2", "3", "NA", "NA", "2", "2"),
  orig_2 = c("1", "1", "2", "1", "3", "2", "2", "3", "NA", "2", "2"),
  orig_3 = c("-7", "3", "NA", "1", "NA", "2", "NA", "1", "NA", "2", "2")
)
基于这些变量,我想创建新的变量,用来反映一个人是否参与了某项特定的活动(例如0 =否,1 =是)。我做的第一件事是把与“不知道”对应的编码值(-7)重新编码为NA:
# Recode the "don't know" code (-7) as missing in each orig_* column
for (nm in c("orig_1", "orig_2", "orig_3")) {
  df[[nm]][df[[nm]] == -7] <- NA
}
然后我创建了新的“活动”变量:
# Add the three activity indicator columns, initialised to NA
df[paste0("activity_", 1:3)] <- NA
接下来,我改编了一个函数(由@Sonny推荐)来搜索以下列,并返回“ 1”(对于那些报告参加活动的人)或“ 0”:
# For each activity code 1..3, scan the orig_* columns (2:4) row by row and
# store 1 in activity_<code> when the code appears in that row, 0 otherwise.
# %in% never yields NA, so every row gets a 0 or a 1.
df[paste0("activity_", 1:3)] <- lapply(1:3, function(code) {
  apply(df[, 2:4], 1, function(row) as.numeric(any(row %in% code)))
})
这部分不起作用,但我的想法是:如果所有原始变量都等于“NA”,则把新变量也设为NA:
# NOTE: `x == NA` evaluates to NA, never TRUE, so this mask selects no rows
# and nothing is assigned; is.na() is the test for missing values. The mock
# data also stores the string "NA", which is.na() does not treat as missing.
df$activity_1[df$orig_1==NA & df$orig_2==NA & df$orig_3==NA] <- NA
理想情况下,结果数据框应如下所示:
ID orig_1 orig_2 orig_3 activity_1 activity_2 activity_3
1 1001 NA 1 NA 1 0 0
2 1002 2 1 NA 1 1 0
3 1003 1 2 NA 1 1 0
4 1004 1 1 1 1 0 0
5 1005 NA 3 NA 0 0 1
6 1006 2 2 2 0 1 0
7 1007 3 2 NA 0 1 1
8 1008 NA NA 1 1 0 0
9 1009 NA NA NA NA NA NA
10 1010 2 2 2 0 1 0
11 1011 2 2 2 0 1 0
非常感谢您对改进此代码的任何建议!
答案 0 :(得分:2)
首先,您需要生成真正的 NA。您写的是 'NA',它是一个字符串,与 NA 不同。我们可以这样解决:
# Replace every literal "NA" string in the data frame with a real missing value
df[df == "NA"] <- NA
然后,我们可以用 apply 找出第 2:4 列全部为 NA 的行,并相应地设置 activity_* 列。
# Row-wise check: rows whose columns 2:4 are all NA get NA in columns 5:7
df[apply(df[2:4], 1, function(x) all(is.na(x))), 5:7] <- NA
或矢量化,如 @akrun 所建议:
# Vectorised form: rowSums() counts the non-missing values in columns 2:4 per
# row; a count of 0 (negated to TRUE) marks the rows to blank in columns 5:7
df[!rowSums(!is.na(df[2:4])), 5:7] <- NA
df
# ID orig_1 orig_2 orig_3 activity_1 activity_2 activity_3
# 1 1001 -7 1 -7 1 0 0
# 2 1002 2 1 3 1 1 1
# 3 1003 1 2 <NA> 1 1 0
# 4 1004 1 1 1 1 0 0
# 5 1005 <NA> 3 <NA> 0 0 1
# 6 1006 2 2 2 0 1 0
# 7 1007 3 2 <NA> 0 1 1
# 8 1008 <NA> 3 1 1 0 1
# 9 1009 <NA> <NA> <NA> NA NA NA
# 10 1010 2 2 2 0 1 0
# 11 1011 2 2 2 0 1 0
数据
# Example data in dput() form. The orig_* columns are factors that still carry
# a literal "NA" level (a string); a real NA already sits in the first row of
# orig_1 and orig_3. The activity_* columns are stored as 0/1 numerics.
df <- structure(list(ID = c(1001, 1002, 1003, 1004, 1005, 1006, 1007,
1008, 1009, 1010, 1011), orig_1 = structure(c(NA, 3L, 2L, 2L,
5L, 3L, 4L, 5L, 5L, 3L, 3L), .Label = c("-7", "1", "2", "3",
"NA"), class = "factor"), orig_2 = structure(c(1L, 1L, 2L, 1L,
3L, 2L, 2L, 3L, 4L, 2L, 2L), .Label = c("1", "2", "3", "NA"), class = "factor"),
orig_3 = structure(c(NA, 4L, 5L, 2L, 5L, 3L, 5L, 2L, 5L,
3L, 3L), .Label = c("-7", "1", "2", "3", "NA"), class = "factor"),
activity_1 = c(1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0), activity_2 = c(0,
1, 1, 0, 0, 1, 1, 0, 0, 1, 1), activity_3 = c(0, 1, 0, 0,
1, 0, 1, 1, 0, 0, 0)), .Names = c("ID", "orig_1", "orig_2",
"orig_3", "activity_1", "activity_2", "activity_3"), row.names = c(NA,
-11L), class = "data.frame")
答案 1 :(得分:1)
使用dplyr:
library(dplyr)
# Optional pre-step (left disabled): convert "NA" strings to real NA first
#df[df == "NA"] <- NA
# Each activity_k is 1 when any orig_* column equals k, otherwise 0
# (rows that do not match the first condition fall through to TRUE ~ 0).
df %>% mutate(activity_1 = case_when( orig_1 == 1 | orig_2 == 1 | orig_3 == 1 ~ 1,
TRUE ~ 0),
activity_2 = case_when( orig_1 == 2 | orig_2 == 2 | orig_3 == 2 ~ 1,
TRUE ~ 0),
activity_3 = case_when( orig_1 == 3 | orig_2 == 3 | orig_3 == 3 ~ 1,
TRUE ~ 0)) %>%
# Disabled variant of the next step, for data that holds real NA values:
#mutate_at(.vars = c(5:7), list(~ifelse(is.na(orig_1) & is.na(orig_2) &is.na(orig_3), NA, .)))
# Blank out columns 5:7 on rows where every orig_* column is the string "NA"
mutate_at(.vars = c(5:7), list(~ifelse(orig_1 =="NA" & orig_2 =="NA" & orig_3 =="NA", NA, .)))
或
# Same pipeline, but "NA" strings are converted to real NA up front so the
# final step can test with is.na().
# NOTE(review): passing a whole data frame to na_if() may not be supported by
# newer dplyr releases, and mutate_at() is superseded by across() — confirm
# against the installed dplyr version.
df %>% na_if(.,"NA") %>% # na_if() replaces the given value (here the string "NA") with a real NA
mutate(activity_1 = case_when( orig_1 == 1 | orig_2 == 1 | orig_3 == 1 ~ 1,
TRUE ~ 0),
activity_2 = case_when( orig_1 == 2 | orig_2 == 2 | orig_3 == 2 ~ 1,
TRUE ~ 0),
activity_3 = case_when( orig_1 == 3 | orig_2 == 3 | orig_3 == 3 ~ 1,
TRUE ~ 0)) %>%
# Blank out columns 5:7 on rows where every orig_* column is missing
mutate_at(.vars = c(5:7), list(~ifelse(is.na(orig_1) & is.na(orig_2) &is.na(orig_3), NA, .)))
ID orig_1 orig_2 orig_3 activity_1 activity_2 activity_3
1 1001 -7 1 -7 1 0 0
2 1002 2 1 3 1 1 1
3 1003 1 2 NA 1 1 0
4 1004 1 1 1 1 0 0
5 1005 NA 3 NA 0 0 1
6 1006 2 2 2 0 1 0
7 1007 3 2 NA 0 1 1
8 1008 NA 3 1 1 0 1
9 1009 NA NA NA NA NA NA
10 1010 2 2 2 0 1 0
11 1011 2 2 2 0 1 0
>