我的数据
Chemical date concentration limit
A 01-01-2016 0.2 0.01
A 01-02-2016 0.2 0.01
A 01-01-2017 0.005 0.01
A 01-02-2017 0.2 0.01
B 01-01-2016 0.3 0.1
B 01-02-2016 0.05 0.1
B 01-01-2017 0.2 0.1
B 01-02-2017 0.2 0.1
C 01-01-2016 1.2 1
C 01-02-2016 0.8 1
C 01-01-2017 0.9 1
C 01-02-2017 0.9 1
我想显示每种化学物质超过其每年限量的百分比(请注意,每种限量都不同)。所以我想得到这样的东西
Year A B C
2016 100% 50% 50%
2017 50% 100% 0
我已经有了代码来计算每种化学物质每年超过限量的次数,但是在计算百分比时出了错。
这是我用来计数的代码:
library(tidyverse)

# Count, per chemical and year, how many measurements reach or exceed the limit,
# then reshape so that each chemical becomes its own column.
counts <- data %>%
  group_by(Chemical, grp = format(date, format = "%Y")) %>%
  # exceed is TRUE/FALSE for every measurement
  mutate(exceed = concentration >= limit) %>%
  # summing a logical vector counts its TRUE values
  summarise(tot_exceed = sum(exceed)) %>%
  spread(Chemical, tot_exceed, fill = 0)
于是我得到
Year A B C
2016 2 1 1
2017 1 2 0
对于百分比,我尝试了。
# Attempted percentage calculation; this is the version that does not give the
# desired result.
percentage_exceed<- data %>%
group_by(Chemical, grp = format(date, format = '%Y')) %>%
# countconc is the number of rows in the group, repeated on every row of it
mutate(exceed = concentration >= limit, countconc = length(concentration))
# NOTE(review): a pipe at the start of a line does not continue the previous
# expression in R; this looks like a line-wrap artifact -- confirm.
%>%
# countconc is still a whole column here, so the expression below yields one
# value per row of the group rather than a single value per group
summarise(percent = (sum(exceed)/countconc)*100) %>%
spread(Chemical, percent, fill = 0)
但是我没有得到想要的结果。你能帮我吗?
答案 0 :(得分:2)
使用tidyverse:
library(tidyverse)
library(lubridate)

# Percentage of measurements, per chemical and year, that reach or exceed the
# limit, returned with one row per year and one column per chemical.
data %>%
  mutate(yr = year(mdy(date))) %>%
  group_by(Chemical, yr) %>%
  # mean() of a logical vector is the share of TRUE values, i.e. the number of
  # exceedances divided by the number of measurements in the group. Dividing by
  # the largest yearly exceedance count instead overstates the rate (C in 2016
  # came out as 100 instead of 50).
  summarise(proc = mean(concentration >= limit) * 100) %>%
  ungroup() %>%
  spread(Chemical, proc)
# # A tibble: 2 x 4
#      yr     A     B     C
#   <dbl> <dbl> <dbl> <dbl>
# 1  2016   100    50    50
# 2  2017    50   100     0
答案 1 :(得分:2)
# Rebuild the example data from the question as a data frame.
# header = TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
dt <- read.table(text = "
Chemical date concentration limit
A 01-01-2016 0.2 0.01
A 01-02-2016 0.2 0.01
A 01-01-2017 0.005 0.01
A 01-02-2017 0.2 0.01
B 01-01-2016 0.3 0.1
B 01-02-2016 0.05 0.1
B 01-01-2017 0.2 0.1
B 01-02-2017 0.2 0.1
C 01-01-2016 1.2 1
C 01-02-2016 0.8 1
C 01-01-2017 0.9 1
C 01-02-2017 0.9 1
", header = TRUE)

library(tidyverse)
library(lubridate)

# Percentage of measurements at or above the limit, per year and chemical,
# formatted as text with a trailing "%" and spread to one column per chemical.
dt %>%
  mutate(year = year(dmy(date))) %>%
  group_by(year, Chemical) %>%
  summarise(
    Total = n(),                              # measurements in the group
    Num_exceed = sum(concentration >= limit)  # how many of them exceed
  ) %>%
  ungroup() %>%
  mutate(Prc = paste0(Num_exceed / Total * 100, "%")) %>%
  select(year, Chemical, Prc) %>%
  spread(Chemical, Prc)
# # A tibble: 2 x 4
# year A B C
# <dbl> <chr> <chr> <chr>
# 1 2016 100% 50% 50%
# 2 2017 50% 100% 0%
答案 2 :(得分:0)
您的方法非常好,您只需将sum替换为mean,然后乘以100:
# The question's counting pipeline with sum() swapped for mean(): the mean of a
# logical vector is the share of TRUE values, so scaling by 100 gives a percentage.
data %>%
  group_by(Chemical, grp = format(date, format = "%Y")) %>%
  mutate(exceed = concentration >= limit) %>%
  summarise(tot_exceed = 100 * mean(exceed)) %>%
  spread(Chemical, tot_exceed, fill = 0)
# A tibble: 2 x 4
# grp A B C
# <chr> <dbl> <dbl> <dbl>
# 1 2016 100 50 50
# 2 2017 50 100 0
您尝试的那一行
summarise(percent = (sum(exceed)/countconc) * 100)
几乎可以做到这一点:出错的原因是countconc是一整列,而不是单个值(而summarise需要单个值)。由于它在每个组内无论如何都是常量列,您可以写成例如
summarise(percent = (sum(exceed)/countconc[1]) * 100)
但是结合它前面的那一行,
mutate(exceed = concentration >= limit, countconc = length(concentration))
这最终其实就是在求平均值(mean),所以我们又回到了答案开头的代码。
还要注意,使用lubridate可以将第一行写为
data %>% group_by(Chemical, Year = year(date)) %>%
下面的写法非常简洁,但输出可能不是您想要的格式:
# Compact long-format version: one row per chemical and year.
# Uses >= so that a value equal to the limit counts as an exceedance, matching
# the comparison used in the question's own code.
# year() comes from lubridate; assumes `date` is already a Date -- confirm.
data %>%
  group_by(Chemical, Year = year(date)) %>%
  summarise(Percentage = mean(concentration >= limit) * 100)
# A tibble: 6 x 3
# Groups: Chemical [?]
# Chemical Year Percentage
# <fct> <dbl> <dbl>
# 1 A 2016 100
# 2 A 2017 50
# 3 B 2016 50
# 4 B 2017 100
# 5 C 2016 50
# 6 C 2017 0
答案 3 :(得分:0)
使用tidyverse和reshape2,您可以执行以下操作:
# Percentage of measurements at or above the limit per year and chemical,
# reshaped with reshape2::dcast() to one column per chemical.
df %>%
  # keep characters 7-10 of the date text, i.e. the year in "01-01-2016"
  mutate(date = str_sub(as.character(date), 7, 10)) %>%
  group_by(date, Chemical) %>%
  # sum() of the logical comparison counts exceedances; unlike
  # length(x[condition]) it does not count NA comparisons as exceedances.
  # >= matches the comparison used in the question's own code.
  summarise(temp = sum(concentration >= limit) / n() * 100) %>%
  dcast(date ~ Chemical, value.var = "temp")
date A B C
1 2016 100 50 50
2 2017 50 100 0
或者仅使用tidyverse的spread():