Question

下面是一个示例数据框，其中按天记录了 month（月份）和 precip（降水量）。

# Reproducible example data: 4 months x 30 daily precipitation values, with
# each month drawn from its own normal distribution.
set.seed(560)
df <- data.frame(
  month = rep(1:4, each = 30),
  # The four draws share one random-number stream, so their order matters.
  precip = c(
    rnorm(30, mean = 20, sd = 10),
    rnorm(30, mean = 10, sd = 2),
    rnorm(30, mean = 50, sd = 1),
    rnorm(30, mean = 15, sd = 3)
  )
)

对于每个月（即每个子集），我希望统计 precip 值高于该月平均值 + 2 个标准差（sd）或低于该月平均值 − 2 个标准差的实例数量。也就是说，我需要找出位于分布极端（即分布尾部）的值。结果列命名为 count。

此示例数据集的输出显示如下：

# Expected result: one row per month holding the number of tail values.
# NOTE(review): the answers further down report count 2 for month 3 -- confirm
# which figure is right.
set.seed(560)  # kept from the original; nothing random is drawn here
output <- data.frame(month = 1:4, count = c(1, 2, 1, 1))

请注意，对于第 1 个月，高于 35.969 或低于 2.61 的值落在平均值 ± 2 sd 的范围之外。有一个值（precip = 41.1）符合此条件。证明：

# Per-month cut-offs at mean +/- 2 standard deviations of precip.
# The trailing comments are the values quoted in the question.

# Month 1
sub1 <- subset(df, month == 1)
v1 <- with(sub1, mean(precip) + 2 * sd(precip))  # 35.969
v2 <- with(sub1, mean(precip) - 2 * sd(precip))  # 2.61

# Month 2
sub2 <- subset(df, month == 2)
v3 <- with(sub2, mean(precip) + 2 * sd(precip))  # 13.89
v4 <- with(sub2, mean(precip) - 2 * sd(precip))  # 7.35

# Month 3
sub3 <- subset(df, month == 3)
v5 <- with(sub3, mean(precip) + 2 * sd(precip))  # 51.83
v6 <- with(sub3, mean(precip) - 2 * sd(precip))  # 48.308

# Month 4
sub4 <- subset(df, month == 4)
v7 <- with(sub4, mean(precip) + 2 * sd(precip))  # 18.69
v8 <- with(sub4, mean(precip) - 2 * sd(precip))  # 9.39

我试过了：

 # Attempt from the question: for each month, count the precip values that lie
 # more than 2 standard deviations above or below that month's mean.
 # Presumably relies on dplyr for %>%, group_by() and summarise() -- no
 # library() call is visible here.
 # NOTE(review): the two comparisons are joined with `&`, so a value would have
 # to exceed the upper bound and fall below the lower bound at the same time;
 # this is the problem the answer below addresses.
 # NOTE(review): the last line appears to close one more `)` than was opened.
 output<- 
 df %>%
 group_by(month)%>%
 summarise(count= sum(precip > (mean(precip)+(2*sd(precip)))& 
                      precip < (mean(precip)-(2*sd(precip))))))

Answer 1

修复非常简单：将逻辑 AND（&）改为 OR（|），因为没有任何一行能同时满足这两个条件。

# Count, per month, the precip values lying more than 2 standard deviations
# away from that month's mean. The two tail tests are joined with `|` (OR):
# a value can sit in the lower tail or in the upper tail, never in both.
output <- df %>%
  group_by(month) %>%
  summarise(
    count = sum(
      precip < mean(precip) - 2 * sd(precip) |
        precip > mean(precip) + 2 * sd(precip)
    )
  )

output
# A tibble: 4 x 2
#   month count
#   <int> <int>
# 1     1     1
# 2     2     2
# 3     3     2
# 4     4     1

另外补充一个基础 R 解决方案，使用 by（相当于 dplyr::group_by() 在基础 R 中的对应函数）：

# Base R version: by() splits df on month and applies the function to each
# piece; do.call(rbind, ...) stacks the one-row results into one data frame.
do.call(
  rbind,
  by(df, df$month, FUN = function(grp) {
    # Tail cut-offs for this month: mean -/+ 2 standard deviations.
    lower <- mean(grp$precip) - 2 * sd(grp$precip)
    upper <- mean(grp$precip) + 2 * sd(grp$precip)
    tails <- grp[grp$precip < lower | grp$precip > upper, ]

    data.frame(month = grp$month[[1]], count = nrow(tails))
  })
)

#   month count
# 1     1     1
# 2     2     2
# 3     3     2
# 4     4     1

或者，使用ave，ifelse和aggregate：

# Flag each row with 1 when its precip lies beyond its own month's
# mean +/- 2 sd, otherwise 0. ave() returns the per-month bound repeated for
# every row of that month, so the comparison is row-wise.
df$count <- with(df, {
  upper <- ave(precip, month, FUN = function(g) mean(g) + 2 * sd(g))
  lower <- ave(precip, month, FUN = function(g) mean(g) - 2 * sd(g))
  ifelse(precip > upper | precip < lower, 1, 0)
})

# Sum the flags within each month to get the per-month count.
aggregate(count ~ month, data = df, FUN = sum)

#   month count
# 1     1     1
# 2     2     2
# 3     3     2
# 4     4     1

Answer 2

使用基础 R：

# scale() centres each month's values and divides by their standard deviation;
# count the values whose absolute standardised score is at least 2.
tapply(df$precip, df$month, function(vals) {
  z <- scale(vals)
  sum(abs(z) >= 2)
})

输出

1 2 3 4 
1 2 2 1

使用tidyverse，有条件地对每个子集内的分布求和值

2 个答案: