我有一个相当大的数据帧(536 x 46000),我想计算每列的次要元素的频率。示例数据摘要如下:
require(dplyr)
m1 <- c(0:2,NA,0:2,NA)
m2 <- c(NA, NA, 0:2,NA, 0, 2)
m3 <- c(0,1,1,1,2,0,NA,2)
g1 <- seq(1:20)
dat <- as.data.frame(cbind(g1,m1,m2,m3))
dat$g1 <- as.factor(dat$g1)
dat
g1 m1 m2 m3
1 0 NA 0
2 1 NA 1
3 2 0 1
. . . .
. . . .
我写了一个函数来吐出次要元素的频率。该函数只计算每个元素的出现次数,并将次要元素除以NA之外的所有元素的总和:
maf.fun <- function(x) {
m0 <- length(which(x == 0))
m1 <- length(which(x == 1))
m2 <- length(which(x == 2))
MAF <- min(m0,m1,m2)/sum(m0,m1,m2)
MAF
}
然后使用summarize_each
包中的dplyr
来获取次要元素的频率:
MAF <- summarise_each(dat[,-1], funs(maf.fun))
输出:
m1 m2 m3
0.3333333 0.2 0.2857143
现在,当数据集较小时,这一切都快速而且很好,但是,对于庞大的数据框(数千列),代码变得非常缓慢。
是否有有效的方法以有效的方式进行此类计算?
答案 0 :(得分:2)
这是一个选项:
fnc = function(x) min(table(x))/sum(!is.na(x))
dat %>%
summarise_each(funs(fnc), -g1)
然而,这比样本数据上的maf.fun
要慢得多。另一方面,如果数据可以采用许多唯一值(而不仅仅是0,1和2),maf.fun
不灵活,并且可能需要大量输入才能处理超过一些独特的价值观。所以这是一个处理具有任意数量唯一值的向量的函数。此函数的运行速度几乎与maf.fun
一样快,但无论唯一值的数量如何都可以运行:
maf.fun2 = function(vec) {
min(sapply(na.omit(unique(vec)), function(j) sum(vec==j, na.rm=TRUE)))/sum(!is.na(vec))
}
dat %>% summarise_each(funs(maf.fun2), -g1)
基本R等价物是:
sapply(dat[, -1], maf.fun2)
以下是较大数据框的一些时间。请注意,基本sapply
比dplyr
summarise_each
快,而对于数据框越大,fnc
,maf.fun
之间没有太大区别,和maf.fun2
:
536行x 1,000列; 3个可能的列值,加上NA
:
set.seed(10)
dat = data.frame(g1=1:536, replicate(1000, sample(c(0:2,NA), 536, replace=TRUE)))
Unit: seconds expr min lq mean median uq max neval cld dplyr_maf.fun 0.48 0.49 0.50 0.50 0.52 0.53 5 b dplyr_fnc 0.80 0.82 0.84 0.84 0.86 0.86 5 d dplyr_maf.fun2 0.56 0.57 0.59 0.60 0.60 0.62 5 c sapply_maf.fun2 0.10 0.10 0.11 0.11 0.11 0.12 5 a
536行x 1,000列; 100个可能的列值,加上NA
:
set.seed(10)
dat = data.frame(g1=1:536, replicate(1000, sample(c(1:100,NA), 536, replace=TRUE)))
Unit: seconds expr min lq mean median uq max neval cld dplyr_fnc 0.90 0.91 0.92 0.92 0.92 0.93 5 b dplyr_maf.fun2 0.99 1.02 1.05 1.03 1.09 1.11 5 c sapply_maf.fun2 0.52 0.54 0.56 0.56 0.57 0.62 5 a
答案 1 :(得分:1)
作为评论的替代方法,此处尝试使用#include <stdio.h>
char name[20];
float avail_bal;
void options();
void open();
void list();
void deposit();
void withdraw();
void exit();
int main(void)
{
char option;
while(1){
printf("****Banking System WELCOME****\n");
printf("Enter 1-5 of the following options: \n");
option = getchar();
scanf("%c\n", &option);
switch(option)
{
case '1': open();
break;
case '2': list();
break;
case '3': deposit();
break;
case '4': withdraw();
break;
case '5': return 0;
default: exit();
break;
}
}
return 0;
}
void options()
{
printf("1. Open Account\n");
printf("2. List Accounts\n");
printf("3. Deposit\n");
printf("4. Withdraw\n");
printf("5. Exit");
}
void open()
{
float avail_bal = 0;
char name[20];
int acc_num;
printf("Open new account(enter number 1-5)\n\n");
scanf("%d", &acc_num);
printf("Account number: %d\n");
printf("Available balance: %f\n");
}
void list()
{
}
void deposit()
{
float add;
int acc_num;
printf("Which count do you want to deposit money in?");
scanf(" %d", &acc_num);
printf("Amount to deposit: ");
scanf("%f", &add);
while()
{
}
}
void withdraw()
{
int acc_num;
float withdraw;
printf("Account to withdraw from: ");
scanf("%d", &acc_num);
printf("Amount to withdraw from account: ")
scanf("%f", &withdraw);
while()
{
printf("Current balance for account %d: %f ");
break;
} acc_num++
}
来加快您的计算:
<body>
<div><!--add a div here-->
<div class="selectContainer">
...
<!--then end it here-->
</div>
<br />
<div class="row" style="padding: 0;">
<label class="small">Country</label>
答案 2 :(得分:1)
主要因素是您用于在每列上应用的函数,而不是自定义函数本身。以下是一些基准测试:
library(dplyr)
library(data.table)
library(microbenchmark)
dat1 <- as.data.table(dat)
cols <- colnames(dat1)[2:length(dat1)]
# wheatSingh
maf.fun1 <- function(x) {
m0 <- length(which(x == 0))
m1 <- length(which(x == 1))
m2 <- length(which(x == 2))
MAF <- min(m0,m1,m2)/sum(m0,m1,m2)
MAF
}
# lmo1
maf.fun2 <- function(x) {
m0 <- sum(x == 0, na.rm = T)
m1 <- sum(x == 1, na.rm = T)
m2 <- sum(x == 2, na.rm = T)
MAF <- min(m0,m1,m2)/sum(m0,m1,m2)
MAF
}
# lmo2
maf.fun3 <- function(x) {
myTable <- table(x)
myTable <- myTable[names(myTable) %in% c("0", "1", "2")]
min(myTable) / sum(myTable)
}
# sumedh
maf.fun4 <- function(x) {
x1 <- tabulate(x + 1)
x1 <- x1[x1!=0]
x2 <- min(x1)/sum(x1)
return(x2)
}
# eipi10 1
maf.fun5 <- function(x) {
min(table(x))/sum(!is.na(x))
}
# eipi10 2
maf.fun6 <- function(vec) {
min(sapply(na.omit(unique(vec)), function(j) sum(vec==j, na.rm=TRUE)))/sum(!is.na(vec))
}
# summarise each
wheatSingh_each <- function(x) summarise_each(x, funs(maf.fun1), -g1)
lmo1_each <- function(x) summarise_each(x, funs(maf.fun2), -g1)
lmo2_each <- function(x) summarise_each(x, funs(maf.fun3), -g1)
sumedh_each <- function(x) summarise_each(x, funs(maf.fun4), -g1)
eipi10_each <- function(x) summarise_each(x, funs(maf.fun5), -g1)
eipi10_each2 <- function(x) summarise_each(x, funs(maf.fun6), -g1)
microbenchmark(wheatSingh_each(dat), lmo1_each(dat), lmo2_each(dat),
sumedh_each(dat), eipi10_each(dat), eipi10_each2(dat), unit = "ms")
Unit: milliseconds
expr min lq mean median uq max neval
wheatSingh_each(dat) 1.260625 1.292623 1.385346 1.332168 1.414579 3.071865 100
lmo1_each(dat) 1.258813 1.288095 1.387961 1.362054 1.430579 2.224808 100
lmo2_each(dat) 1.782865 1.826939 1.962498 1.909652 2.009874 2.580416 100
sumedh_each(dat) 1.270888 1.298057 1.431485 1.353300 1.416994 3.170276 100
eipi10_each(dat) 1.700756 1.752377 1.896515 1.845957 1.921728 3.758326 100
eipi10_each2(dat) 1.425448 1.482200 1.606445 1.555556 1.628910 3.496904 100
# sapply
wheatSingh_sapply <- function(x) sapply(x, maf.fun1)
lmo1_sapply <- function(x) sapply(x, maf.fun2)
lmo2_sapply <- function(x) sapply(x, maf.fun3)
sumedh_sapply <- function(x) sapply(x, maf.fun4)
eipi10_sapply <- function(x) sapply(x[, names(dat) != "g1"], maf.fun5)
eipi10_sapply2 <- function(x) sapply(x[, names(dat) != "g1"], maf.fun6)
microbenchmark(wheatSingh_sapply(dat[,-1]), lmo1_sapply(dat[,-1]),
lmo2_sapply(dat[,-1]), sumedh_sapply(dat[,-1]),
eipi10_sapply(dat), eipi10_sapply2(dat), unit = "ms")
Unit: milliseconds
expr min lq mean median uq max neval
wheatSingh_sapply(dat[, -1]) 0.061583 0.0664130 0.07586755 0.0760730 0.0802995 0.114712 100
lmo1_sapply(dat[, -1]) 0.054942 0.0597720 0.06603859 0.0639975 0.0700350 0.095393 100
lmo2_sapply(dat[, -1]) 0.482394 0.5062425 0.52361843 0.5216380 0.5337130 0.607370 100
sumedh_sapply(dat[, -1]) 0.063395 0.0694320 0.07805922 0.0754695 0.0812045 0.118336 100
eipi10_sapply(dat) 0.420812 0.4431510 0.46422176 0.4603580 0.4787725 0.636954 100
eipi10_sapply2(dat) 0.171464 0.1880680 0.20320984 0.1965200 0.2155380 0.329646 100
# data.table
wheatSingh_dt <- function(x) t(x[,.(ans=lapply(.SD, maf.fun1)),.SDcols=cols])
lmo1_dt <- function(x) t(x[,.(ans=lapply(.SD, maf.fun2)),.SDcols=cols])
lmo2_dt <- function(x) t(x[,.(ans=lapply(.SD, maf.fun3)),.SDcols=cols])
sumedh_dt <- function(x) t(x[,.(ans=lapply(.SD, maf.fun4)),.SDcols=cols])
eipi10_dt <- function(x) t(x[,.(ans=lapply(.SD, maf.fun5)),.SDcols=cols])
eipi10_dt2 <- function(x) t(x[,.(ans=lapply(.SD, maf.fun6)),.SDcols=cols])
microbenchmark(wheatSingh_dt(dat1), lmo1_dt(dat1), lmo2_dt(dat1), sumedh_dt(dat1),
eipi10_dt(dat1), eipi10_dt2(dat1), unit = "ms")
Unit: milliseconds
expr min lq mean median uq max neval
wheatSingh_dt(dat1) 0.737780 0.7700795 0.8260051 0.8050970 0.8467555 1.307717 100
lmo1_dt(dat1) 0.717856 0.7773255 0.8248158 0.8093235 0.8401145 1.397071 100
lmo2_dt(dat1) 1.232248 1.2971515 1.3635452 1.3454505 1.4046180 2.021950 100
sumedh_dt(dat1) 0.737176 0.7743060 0.8260775 0.8096255 0.8527940 1.364469 100
eipi10_dt(dat1) 1.159195 1.2156455 1.3718648 1.2548890 1.3385075 4.757527 100
eipi10_dt2(dat1) 0.869397 0.9095455 0.9691232 0.9512035 1.0082580 1.246738 100
注意:我的解决方案(maf.fun4
使用tabulate
)仅在列中的值为整数时才有效
@ eipi10提供的较大数据框的结果:
set.seed(10)
dat = data.frame(g1=1:536, replicate(1000, sample(c(1:100,NA), 536, replace=TRUE)))
sumedh_sapply <- function(x) sapply(x, maf.fun4)
eipi10_sapply2 <- function(x) sapply(x, maf.fun6)
identical(sumedh_sapply(dat[,-1]), eipi10_sapply2(dat[,-1]))
[1] TRUE
microbenchmark(sumedh_sapply(dat[,-1]), eipi10_sapply2(dat[,-1]), unit = "s")
Unit: seconds
expr min lq mean median uq max neval cld
sumedh_sapply(dat[, -1]) 0.01308923 0.01393871 0.01615033 0.01714913 0.01766564 0.02056302 100 a
eipi10_sapply2(dat[, -1]) 0.40788421 0.42277774 0.44252427 0.42845991 0.43098025 0.56735431 100 b