使用data.table加速行添加

时间:2014-05-13 14:07:31

标签: r rows data.table subset

我在一个大数据集(10 000 * 25 000)中读到了来自data.table的fread,它很快就超速了。现在我需要用它做一些基本的算术,相比之下,它很慢。我想知道是否有人有一个建议我可能做错了什么/什么是一个很好的调整来加快它(我必须做10 000次,所以每一项改进都很重要!)。我想随机选择两个ngeno行,添加它们,并且只保留不是全部为0(或2)的列,例如在

  mytable = matrix(c(0, 0, 1, 2,
                     0, 1, 2, 2), ncol=4))

第1列和第4列需要删除(分别在第一时间不添加;通常以这种方式删除40%的cols)。到目前为止我有什么

# Test data
nrow = 1000
ncol = 10000
ngeno = 2000
require(data.table)
dat <- data.table(matrix(rbinom(nrow*ncol, 2, 0.001), ncol=ncol))

# What I do
myway <- function(dat, nrow, ngeno) {
  set.seed(123)
  # Choose random rows
  haplo1 <- sample.int(nrow, ngeno, replace=T)
  haplo2 <- sample.int(nrow, ngeno, replace=T)
  # Add them up
  geno <- dat[haplo1,] + dat[haplo2,]
  rec.names <- names(dat)
  maf <- colMeans(geno)/2
  # throw out columns where every row has a 0 or a 2
  throw.out <- maf==0 | maf == 1
  rec.names <- rec.names[!throw.out]
  maf <- maf[!throw.out]
  geno <- subset(geno, select = rec.names)
  return(list(maf, geno))
}

myway2 <- function(dat, nrow, ngeno) {
  set.seed(123)
  haplo1 <- sample.int(nrow, ngeno, replace=T)
  haplo2 <- sample.int(nrow, ngeno, replace=T)
  dat <- data.table(t(dat))
  geno <- dat[,haplo1,with=F] + dat[,haplo2,with=F]
  geno <- data.table(t(geno))
  maf <- colMeans(geno)/2
  throw.out <- maf==0 | maf == 1
  maf <- maf[!throw.out]
  geno <- geno[, which(!throw.out), with=F]
  return(list(maf, geno))
}

eddisway <- function(dat, nrow, ngeno) {
  set.seed(123)
  dat.m <- as.matrix(dat)
  haplo1 <- sample.int(nrow, ngeno, replace=T)
  haplo2 <- sample.int(nrow, ngeno, replace=T)
  geno <- dat.m[haplo1,] + dat.m[haplo2,]
  maf <- colMeans(geno)/2
  throw.out <- maf==0 | maf == 1
  maf <- maf[!throw.out]
  geno <- geno[,!throw.out]
  return(list(maf, geno))
}

require(reshape2)
rolandsway <- function(dat, nrow, ngeno) {
  set.seed(123)
  dat1 <- melt(dat, variable.factor=FALSE)

  # Choose random rows
  haplo1 <- sample.int(nrow, ngeno, replace=T)
  haplo2 <- sample.int(nrow, ngeno, replace=T)

  geno <- dat1[, value[haplo1]+value[haplo2], by=variable]
  maf <- geno[, mean(V1)/2, by=variable]  

  maf <- maf[!(V1==0 | V1==1),]
  setkey(geno, variable)
  geno <- geno[maf[, variable],]
#   geno[,"v2":=rep(1:dim(dat)[1],dim(maf)[1]),with=F]
#   test <- dcast.data.table(geno, v2 ~ variable)
  return(list(maf, geno))
}

# Warning messages from Roland's method:
#   1: In melt.data.table(dat, variable.factor = FALSE) :
#   To be consistent with reshape2's melt, id.vars and measure.vars are internally guessed when both are 'NULL'. All non-numeric/integer/logical type columns are conisdered id.vars, which in this case are columns ''. Consider providing at least one of 'id' or 'measure' vars in future.

require(microbenchmark)
out <- microbenchmark(myway(dat, nrow, ngeno), myway2(dat, nrow, ngeno), eddisway(dat, nrow, ngeno), rolandsway(dat, nrow, ngeno), times=5)

到目前为止的结果:

# Unit: seconds
#                               expr      min       lq   median       uq      max neval
# myway(dat, nrow, ngeno)      3.764377 3.804865 3.841819 3.924095 4.203679    10
# myway2(dat, nrow, ngeno)     3.595477 3.681658 3.703837 3.784004 3.851407    10
# eddisway(dat, nrow, ngeno)   1.388514 1.414389 1.438111 1.479081 1.574927    10
# rolandsway(dat, nrow, ngeno) 2.253587 2.299850 2.390655 2.579183 2.633778    10

分析myway让我有点像

$by.self
                        self.time self.pct total.time total.pct
"[["                         0.94     18.8       2.56      51.2
"[[.data.frame"              0.54     10.8       1.62      32.4
"match"                      0.48      9.6       0.92      18.4
"[.data.table"               0.40      8.0       2.84      56.8
"Ops.data.frame"             0.34      6.8       1.44      28.8
"setattr"                    0.24      4.8       2.18      43.6
"<Anonymous>"                0.18      3.6       0.46       9.2

$by.total
                           total.time total.pct self.time self.pct
"myway"                          5.00     100.0      0.00      0.0
"[.data.table"                   2.84      56.8      0.40      8.0
"["                              2.84      56.8      0.00      0.0
"[["                             2.56      51.2      0.94     18.8
"alloc.col"                      2.26      45.2      0.06      1.2
"setattr"                        2.18      43.6      0.24      4.8
"+"                              1.96      39.2      0.00      0.0

由于“+”仅是从顶部开始的第7个(并且大部分时间都是添加的行),我认为还有改进的余地。我尝试使用subset将haplo1和haplo2转换为布尔值,但这甚至需要更长的时间。编译也无济于事。任何建议如何使它更快或这已经是行的结束?

更新

我更新了功能(另外,我修正了一个错字)。直到现在我才能用Roland的方法获得相同的输出格式。但即使没有它,Eddi的建议似乎更快。还有其他想法吗?

1 个答案:

答案 0 :(得分:1)

melt长格式可以将速度提高2倍,但也会改变输出格式:

myway1 <- function(dat, nrow, ngeno) {

  dat1 <- melt(dat, variable.factor=FALSE)

  # Choose random rows
  haplo1 <- sample.int(nrow, ngeno, replace=T)
  haplo2 <- sample.int(nrow, ngeno, replace=T)

  geno <- dat1[, value[haplo1]+value[haplo2], by=variable]
  maf <- geno[, mean(V1)/2, by=variable]  

  maf <- maf[!(V1==0 | V1==1),]
  setkey(dat1, variable)
  geno <- dat1[maf[, variable],]
  return(list(maf, geno))
}

应该可以进一步改进。