我想逐行统计多个列(t1、t2、t3)中不重复的字符值的个数,并把结果存入一个新变量。某一列的值是否计入,取决于与之对应的另一列(p1、p2、p3)的值是否大于或等于0.05。例如,我有以下数据集:
# Example data: one row per id. Each label column t<k> sits next to a score
# column p<k>; wherever a label is NA its score is NA as well.
dat <- data.frame(
  id = c(1, 2, 3, 4, 5),
  t1 = c("a", "a", "b", "b", "c"),
  p1 = c(0.98, 1, 0.5, 0.9, 1),
  t2 = c("b", NA, "a", "c", NA),
  p2 = c(0.02, NA, 0.25, 0.10, NA),
  t3 = c(NA, NA, "c", NA, NA),
  p3 = c(NA, NA, 0.25, NA, NA)
)
我希望对每一行统计t1、t2、t3列中满足上述条件(对应的p值大于或等于0.05)的不重复值的个数,并把这个个数存入新变量(total)中,期望的输出如下:
# Expected result: the example data plus `total`, the per-row count that the
# question asks for.
output <- data.frame(
  id = c(1, 2, 3, 4, 5),
  t1 = c("a", "a", "b", "b", "c"),
  p1 = c(0.98, 1, 0.5, 0.9, 1),
  t2 = c("b", NA, "a", "c", NA),
  p2 = c(0.02, NA, 0.25, 0.10, NA),
  t3 = c(NA, NA, "c", NA, NA),
  p3 = c(NA, NA, 0.25, NA, NA),
  total = c(1, 1, 3, 2, 1)
)
使用dplyr,我可以使用以下代码计算t1,t2和t3中的唯一字符:
# Per id, count the distinct non-missing labels found across t1, t2 and t3.
# The p1..p3 scores are not consulted here.
output <- mutate(
  group_by(dat, id),
  total = n_distinct(c(t1, t2, t3), na.rm = TRUE)
)
但是,如果要分别计算t1,t2或t3以获得所需的输出,我无法设置p1,p2和p3必须大于等于0.05的条件。有没有办法为每列t1,t2,t3设置此条件?谢谢您的帮助。
答案 0 :(得分:1)
您可以添加条件,然后对结果求和。
# Per id, count the distinct labels in t1..t3 whose paired score p1..p3 is at
# least 0.05.
dat %>%
  group_by(id) %>%
  # Make labels plain character so c(t1, t2, t3) combines label text rather
  # than factor codes.
  mutate_if(is.factor, as.character) %>%
  mutate(
    # Compare each score with 0.05 directly (not scaled by label length).
    # A FALSE comparison drops the label; an NA comparison yields NA, which
    # na.rm = TRUE discards. n_distinct() counts a repeated label only once.
    total = n_distinct(
      c(t1, t2, t3)[c(p1, p2, p3) >= 0.05],
      na.rm = TRUE
    )
  )
# A tibble: 5 x 8
# Groups: id [5]
# id t1 p1 t2 p2 t3 p3 total
# <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <int>
#1 1 a 0.98 b 0.02 NA NA 1
#2 2 a 1 NA NA NA NA 1
#3 3 b 0.5 a 0.25 c 0.25 3
#4 4 b 0.9 c 0.1 NA NA 2
#5 5 c 1 NA NA NA NA 1
答案 1 :(得分:0)
使用tidyverse:
# Per id, count the distinct labels whose paired score is at least 0.05, then
# attach that count to the original data.
dat %>%
  # Blank out labels that fail the threshold. as.character() keeps a missing
  # label as NA, whereas paste() would turn it into the string "NA".
  mutate(
    t1 = ifelse(p1 >= 0.05, as.character(t1), NA_character_),
    t2 = ifelse(p2 >= 0.05, as.character(t2), NA_character_),
    t3 = ifelse(p3 >= 0.05, as.character(t3), NA_character_)
  ) %>%
  select(id, t1, t2, t3) %>%
  gather(var, val, -id) %>% # Wide to long: one row per (id, label column)
  group_by(id) %>%
  # One row per id; no need to reshape back to wide just to drop the labels
  summarise(total = n_distinct(val, na.rm = TRUE)) %>%
  left_join(dat, by = "id") # Joining with the original data
id total t1 p1 t2 p2 t3 p3
<dbl> <int> <fct> <dbl> <fct> <dbl> <fct> <dbl>
1 1. 1 a 0.980 b 0.0200 <NA> NA
2 2. 1 a 1.00 <NA> NA <NA> NA
3 3. 3 b 0.500 a 0.250 c 0.250
4 4. 2 b 0.900 c 0.100 <NA> NA
5 5. 1 c 1.00 <NA> NA <NA> NA
或者:
# Per id, keep only the labels whose paired score is at least 0.05 and count
# how many different ones remain.
dat %>%
  group_by(id) %>%
  mutate(
    # Subsetting with a FALSE condition drops the label; an NA condition gives
    # NA, which na.rm = TRUE ignores. Counting values directly works for labels
    # of any length or case, not just single lower-case letters.
    total = n_distinct(
      c(
        as.character(t1)[p1 >= 0.05],
        as.character(t2)[p2 >= 0.05],
        as.character(t3)[p3 >= 0.05]
      ),
      na.rm = TRUE
    )
  )
id t1 p1 t2 p2 t3 p3 total
<dbl> <fct> <dbl> <fct> <dbl> <fct> <dbl> <int>
1 1. a 0.980 b 0.0200 <NA> NA 1
2 2. a 1.00 <NA> NA <NA> NA 1
3 3. b 0.500 a 0.250 c 0.250 3
4 4. b 0.900 c 0.100 <NA> NA 2
5 5. c 1.00 <NA> NA <NA> NA 1
总体上的唯一字符数:
# Count the distinct labels across the whole table (all rows, all of t1..t3)
# whose paired score is at least 0.05, and attach that single number to every
# row of the original data.
dat %>%
  # as.character() keeps a missing label as NA; paste() would produce "NA".
  mutate(
    t1 = ifelse(p1 >= 0.05, as.character(t1), NA_character_),
    t2 = ifelse(p2 >= 0.05, as.character(t2), NA_character_),
    t3 = ifelse(p3 >= 0.05, as.character(t3), NA_character_)
  ) %>%
  select(id, t1, t2, t3) %>%
  gather(var, val, -id) %>%
  mutate(overall = n_distinct(val, na.rm = TRUE)) %>%
  # Collapse the long table back to one row per id without reshaping to wide
  distinct(id, overall) %>%
  left_join(dat, by = "id")
id overall t1 p1 t2 p2 t3 p3
1 1 3 a 0.98 b 0.02 <NA> NA
2 2 3 a 1.00 <NA> NA <NA> NA
3 3 3 b 0.50 a 0.25 c 0.25
4 4 3 b 0.90 c 0.10 <NA> NA
5 5 3 c 1.00 <NA> NA <NA> NA