我有一个数据集,想要统计满足以下 3 个不同条件的行数:
1. `Coins` 和 `Copay` 两列均为“0”的行数(任一列中的 NA 都视为 0)
2. `Coins.Copay` 列 = 1 的行数,但排除满足条件 1 的行
3. `Coins.Copay` 列 = 0 的行数,但排除满足条件 1 的行

以下是来自更大数据集的示例:
Plan Year Coins Copay Type Coins.Copay
A 2018 0 NA HMO 1
B 2018 10 NA HMO 1
C 2017 NA 0 SNP 0
D 2015 20 20 SNP 0
E 2016 20 0 HMO 1
F 2018 10 10 HMO 0
G 2016 NA NA HMO 0
H 2014 NA NA HMO 0
I 2012 NA 10 PPO 0
J 2011 0 0 HMO 0
K 2014 5 10 SNP 0
L 2013 10 NA HMO 1
因此,我希望得到以下数量(基于上述条件):条件 1 为 5,条件 2 为 3,条件 3 为 4。
答案 0 :(得分:7)
这可以使用布尔逻辑非常有效地完成:
# A cell is "empty" when it is NA or 0. A row meets condition 1 when both
# Coins and Copay are empty, expressed here as the negation of "at least one
# of the two columns holds a non-missing, non-zero value".
zeros_or_na <- !((!is.na(df$Coins) & df$Coins != 0) |
                   (!is.na(df$Copay) & df$Copay != 0))
sum(zeros_or_na)                       # condition 1: [1] 5
sum(!(zeros_or_na | !df$Coins.Copay))  # condition 2: [1] 3
sum(!(zeros_or_na | df$Coins.Copay))   # condition 3: [1] 4
答案 1 :(得分:2)
一个选项可能是:
方法:
1. 在内部条件中使用 `&`,因此判断会很快。
2. 只针对条件 2 和 3 进行筛选,因此过滤只需应用一次。
3. 使用过滤后的数据一次性计算条件 2 和 3,再从总行数中减去这两者之和(sum),即可得到条件 1。
# Indices of the rows that do NOT meet condition 1, i.e. rows where at least
# one of Coins / Copay holds a non-missing, non-zero value.
Excluded_conditino_one <- which(
  (!is.na(df$Coins) & df$Coins) | (!is.na(df$Copay) & df$Copay)
)
# Conditions 2 and 3 are counted on the filtered rows only.
coins.copay_1 <- sum(df[Excluded_conditino_one, "Coins.Copay"] == 1) # 3
coins.copay_0 <- sum(df[Excluded_conditino_one, "Coins.Copay"] == 0) # 4
# Condition 1 is whatever remains of the total row count. nrow(df) is used
# so the total does not depend on any particular column being present.
Condition_One <- nrow(df) - (coins.copay_1 + coins.copay_0) # 5
# Test
paste(Condition_One, coins.copay_1, coins.copay_0)
[1] "5 3 4"
基准测试(benchmark)性能分析:
# Benchmark wrapper for the boolean-mask approach.
#
# df: data frame with numeric columns Coins, Copay and Coins.Copay.
#
# Computes the three counts (condition 1, 2, 3). Because the final expression
# is an assignment, the function returns the condition-3 count invisibly.
CBarun <- function(df) {
  # TRUE where a value is missing or zero.
  is_empty <- function(x) is.na(x) | !x

  both_empty <- is_empty(df$Coins) & is_empty(df$Copay)
  flag <- df$Coins.Copay

  n_cond_one <- sum(both_empty)                # condition 1
  n_flag_one <- sum(!(both_empty | !flag))     # condition 2
  n_flag_zero <- sum(!(both_empty | flag))     # condition 3 (returned)
}
# Benchmark wrapper for the rowSums-based approach.
#
# df: data frame with numeric columns Coins, Copay and Coins.Copay.
#
# A row is treated as meeting condition 1 when Coins + Copay (NA ignored)
# sums to 0. Because the final expression is an assignment, the function
# returns the condition-3 count invisibly.
#
# NOTE(review): rowSums() is recomputed for each count, matching the three
# standalone one-liners this wrapper times; hoist it into a local if the
# function is reused outside the benchmark.
Masoud <- function(df) {
  # na.rm = TRUE (not the reassignable shorthand T) so NA cells count as 0.
  Condition_One <- length(which(
    rowSums(cbind(df$Coins, df$Copay), na.rm = TRUE) == 0
  )) # 5
  coins.copay_1 <- length(which(
    rowSums(cbind(df$Coins, df$Copay), na.rm = TRUE) != 0 &
      df$Coins.Copay != 0
  )) # 3
  coins.copay_0 <- length(which(
    rowSums(cbind(df$Coins, df$Copay), na.rm = TRUE) != 0 &
      df$Coins.Copay == 0
  )) # 4
}
# Benchmark wrapper for the filter-once approach.
#
# df: data frame with numeric columns Coins, Copay and Coins.Copay.
#
# Rows with a non-missing, non-zero Coins or Copay are selected once; the
# condition-2 and condition-3 counts are taken on that subset, and condition 1
# is the remainder of the total row count. Because the final expression is an
# assignment, the function returns the condition-1 count invisibly.
MKR <- function(df) {
  # Row indices that do NOT meet condition 1.
  excluded_rows <- which(
    (!is.na(df$Coins) & df$Coins) | (!is.na(df$Copay) & df$Copay)
  )
  coins.copay_1 <- sum(df[excluded_rows, "Coins.Copay"] == 1) # 3
  coins.copay_0 <- sum(df[excluded_rows, "Coins.Copay"] == 0) # 4
  # nrow(df) gives the row total without relying on a Plan column existing.
  Condition_One <- nrow(df) - (coins.copay_1 + coins.copay_0) # 5
}
library(microbenchmark)
# Time each approach on the sample data frame, 10 evaluations per expression.
microbenchmark(
  CBarun(df),
  # AyushNigam(),
  Masoud(df),
  MKR(df),
  times = 10
)
# Unit: microseconds
# expr min lq mean median uq max neval
# CBarun(df) 60.790 61.185 71.7644 62.7645 67.896 137.370 10
# Masoud(df) 185.923 186.317 209.9624 201.3180 222.633 273.949 10
# MKR(df) 101.054 102.633 122.1330 106.1850 121.975 227.370 10
数据:
# Sample data from the question: one row per plan, whitespace-separated,
# with "NA" marking missing Coins / Copay values.
df <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text =
"Plan Year Coins Copay Type Coins.Copay
A 2018 0 NA HMO 1
B 2018 10 NA HMO 1
C 2017 NA 0 SNP 0
D 2015 20 20 SNP 0
E 2016 20 0 HMO 1
F 2018 10 10 HMO 0
G 2016 NA NA HMO 0
H 2014 NA NA HMO 0
I 2012 NA 10 PPO 0
J 2011 0 0 HMO 0
K 2014 5 10 SNP 0
L 2013 10 NA HMO 1"
)
答案 2 :(得分:1)
另一种选择是将 `length` 与 `rowSums` 结合使用:
# rowSums(..., na.rm = TRUE) treats NA cells as 0, so a row total of 0 marks
# a row meeting condition 1. TRUE is spelled out because T can be reassigned.
length(which(rowSums(cbind(df$Coins, df$Copay), na.rm = TRUE) == 0)) # 5
length(which(rowSums(cbind(df$Coins, df$Copay), na.rm = TRUE) != 0 &
               df$Coins.Copay != 0)) # 3
length(which(rowSums(cbind(df$Coins, df$Copay), na.rm = TRUE) != 0 &
               df$Coins.Copay == 0)) # 4