我有一个很大的数据框(`mydata`)(5.2G:6000 行,230,000 列),我需要从中取出一部分列,这些列的名称与注释文件(`annot`)中 `Name` 列的值相匹配,然后将结果转换为 `matrix`。我读到 data.table 应该更快,所以我试图把脚本从使用 `data.frame` 改为使用 `data.table`。
到目前为止,把原始数据转换为 data.table 之后,我并没有感到处理速度有所提高,我想弄清楚原因。
下面是示例数据和 `data.frame` 版本的代码:
# Toy data set: an IID column followed by ten columns named A, B, E, ...
# `header = TRUE` is spelled out: `h = T` relies on partial argument matching
# and on `T`, which is an ordinary variable that can be reassigned.
mydata <- read.table(text = "IID A B E G H W Z D N K
1 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0
4 0 0 0 1 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0
6 0 0 2 0 2 0 0 0 1 1
7 0 2 0 1 0 0 0 0 0 0
8 0 0 2 0 0 0 0 0 0 0
9 0 0 0 0 2 0 0 0 1 0
10 1 0 0 0 0 0 2 0 0 2
11 0 0 0 0 0 0 0 0 0 0
12 0 0 0 1 0 0 0 0 0 0
13 2 0 0 0 0 0 0 0 2 1
14 0 0 0 0 0 0 2 0 0 0
15 1 0 0 0 0 0 0 0 0 0
16 0 0 0 1 0 0 0 0 0 1
17 0 0 0 0 0 0 0 0 0 0
18 0 0 0 0 0 0 0 0 0 0
19 0 0 0 0 0 0 0 0 0 0
20 0 0 0 0 0 0 0 0 0 0", header = TRUE)
# Annotation table mapping each column name of `mydata` (Name) to a Gene label.
# Both columns are kept as character vectors.
annot <- read.table(text = "Name Gene
A Gene1
B Gene2
E Gene3
G Gene4
H Gene5
W Gene6
Z Gene7
D Gene8
N Gene9
K Gene10", stringsAsFactors = FALSE, header = TRUE)
# Genes whose columns are extracted from `mydata` in the code that follows.
genes <- c("Gene2", "Gene4", "Gene9")
下面是使用 data.frame 的代码:
#as DATA FRAME
# Time the data.frame workflow: select the annotation rows for `genes`, keep
# the matching columns of `mydata`, sort by IID and convert to a matrix.
start <- proc.time()
# `%in%` never returns NA, so the logical vector can index directly and the
# extra `which()` is not needed.
annot1 <- annot[annot$Gene %in% genes, ]
# Column 1 (IID) plus every column whose name appears in annot1$Name.
# `drop = FALSE` keeps a data.frame even when no name matches.
keep_cols <- c(1L, which(colnames(mydata) %in% annot1$Name))
mydata1 <- mydata[, keep_cols, drop = FALSE]
mydata1 <- mydata1[order(mydata1$IID), , drop = FALSE]
# Remove the IID column by negative index instead of `2:dim(mydata1)[2]`:
# that range becomes 2:1 when no column matched, and without `drop = FALSE`
# a single matching column would collapse to a vector and lose its name.
genomxwork <- as.matrix(mydata1[, -1L, drop = FALSE])
df_time <- proc.time() - start
df_time
> utilisateur système écoulé
> 0.00 0.00 0.24
相同的代码,但这次先把 `mydata` 转换为 `data.table`(代码见下)。
在这个例子中,差异显然不是很显著,但在我的实际数据上,当需要提取几千列时,耗时上升得非常快。
这是 90,000 列的情况:
# as DATA TABLE
# Same workflow as the data.frame version, run on data.table copies of the
# inputs. Requires the data.table package to be attached.
mydataDT <- as.data.table(mydata)
annotDT <- as.data.table(annot)
start <- proc.time()
annotDT1 <- annotDT[Gene %in% genes]
# Use annotDT1 here: the original referenced `annot1`, an object created by
# the data.frame run, so this script failed (or used stale data) on its own.
keep_cols <- c(1L, which(colnames(mydataDT) %in% annotDT1$Name))
mydataDT1 <- mydataDT[, keep_cols, with = FALSE]
mydataDT1 <- mydataDT1[order(IID)]
# The original line read `mydatDT1` (typo), which is an undefined object.
# Dropping column 1 (IID) by negative index also avoids the `2:ncol` range
# going backwards (2:1) when no column matched.
genomxworkDT <- as.matrix(mydataDT1[, -1L, with = FALSE])
dt_time <- proc.time() - start
dt_time
> utilisateur système écoulé
> 0.00 0.00 0.25
答案 0 :(得分:1)
各条评论提出了不少建议:使用 `microbenchmark` 包,以及简化 `data.table` 代码。(此外,人们普遍担心用于基准测试的数据集太小,以及生产数据集的布局——其列数是行数的 40 倍。)
但是,我还是使用 `microbenchmark` 对原始代码、简化后的 data.table 和 data.frame 版本以及矩阵版本进行了基准测试,结果如下:
Unit: microseconds
expr min lq mean median uq max neval
df_orig 192.944 218.2420 235.02202 225.7940 236.9325 394.950 100
dt_orig 1012.672 1038.1590 1104.42052 1063.2675 1093.2855 3483.561 100
dt2 962.454 984.7315 1040.32245 1001.3445 1026.8315 3130.523 100
df2 47.953 53.2400 64.45366 63.0565 65.6995 217.109 100
mat 2.644 4.5310 6.46469 6.4190 7.1750 51.352 100
对于给出的这个 20 行、11 列的小数据集,矩阵版本比简化的 data.frame 版本快约十倍,而简化的 data.frame 版本又比原始 data.frame 版本快约 4 倍。这种简单的数据检索任务用不到 `data.table` 的强项(例如无需复制的就地更新)。因此,在这个玩具示例中,毫无疑问是固定开销占了主导地位。基准测试代码如下:
# Benchmark five ways of extracting the columns that belong to `genes`:
#   df_orig / dt_orig - the question's data.frame and data.table code
#   dt2 / df2         - streamlined data.table and data.frame versions
#   mat               - plain matrix subsetting
# The *_orig expressions are deliberately kept as written in the question,
# since they are what is being measured.
microbenchmark::microbenchmark(
  df_orig = {
    annot1 <- annot[which(annot$Gene %in% genes), ]
    mydata1 <- mydata[, c(1, which(colnames(mydata) %in% annot1$Name))]
    mydata1 <- mydata1[order(mydata1$IID), ]
    genomxwork <- as.matrix(mydata1[, 2:dim(mydata1)[2]])
  },
  dt_orig = {
    annotDT1 <- annotDT[which(Gene %in% genes), , ]
    # Uses annotDT1 rather than `annot1`: `annot1` is only created by the
    # df_orig expression, so dt_orig depended on df_orig having run first.
    mydataDT1 <- mydataDT[
      , c(1, which(colnames(mydataDT) %in% annotDT1$Name)),
      with = FALSE
    ]
    mydataDT1 <- mydataDT1[order(IID), , ]
    genomxworkDT <- as.matrix(mydataDT1[, 2:dim(mydataDT1)[2]])
  },
  dt2 = {
    # Join on Gene to look up the column names, then select them via .SDcols.
    genomxworkDT <- as.matrix(mydataDT[
      , .SD,
      .SDcols = annotDT[J(genes), on = "Gene"]$Name
    ])
  },
  df2 = {
    # NOTE(review): `mydata0` is not defined in the visible code; presumably
    # it is `mydata` sorted by IID, like the input of `mat` - confirm.
    genomxwork <- as.matrix(
      mydata0[, names(annot_vec)[annot_vec %in% genes]]
    )
  },
  mat = {
    genomxwork <- mat[, names(annot_vec)[annot_vec %in% genes]]
  },
  times = 100L
)
把数据转换为 data.table 和 matrix 的步骤都放在基准测试之外完成,因为在生产环境中很可能也是这样做的。
# Benchmark inputs, prepared once outside the timed expressions.
# `header = TRUE` / `FALSE` are spelled out: `h = T` and `F` rely on partial
# argument matching and on the reassignable variables `T` and `F`.
mydata <- read.table(text = "IID A B E G H W Z D N K
1 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0
4 0 0 0 1 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0
6 0 0 2 0 2 0 0 0 1 1
7 0 2 0 1 0 0 0 0 0 0
8 0 0 2 0 0 0 0 0 0 0
9 0 0 0 0 2 0 0 0 1 0
10 1 0 0 0 0 0 2 0 0 2
11 0 0 0 0 0 0 0 0 0 0
12 0 0 0 1 0 0 0 0 0 0
13 2 0 0 0 0 0 0 0 2 1
14 0 0 0 0 0 0 2 0 0 0
15 1 0 0 0 0 0 0 0 0 0
16 0 0 0 1 0 0 0 0 0 1
17 0 0 0 0 0 0 0 0 0 0
18 0 0 0 0 0 0 0 0 0 0
19 0 0 0 0 0 0 0 0 0 0
20 0 0 0 0 0 0 0 0 0 0", header = TRUE)
# data.table copy of the data (requires the data.table package).
mydataDT <- as.data.table(mydata)
# The `df2` benchmark expression reads `mydata0`, which was never defined in
# this setup. NOTE(review): assumed to be `mydata` sorted by IID, i.e. the
# same data frame `mat` is built from - confirm.
mydata0 <- mydata[order(mydata$IID), ]
# Matrix version of the IID-sorted data; it still contains the IID column.
mat <- as.matrix(mydata0)
annot <- read.table(text = "Name Gene
A Gene1
B Gene2
E Gene3
G Gene4
H Gene5
W Gene6
Z Gene7
D Gene8
N Gene9
K Gene10", stringsAsFactors = FALSE, header = TRUE)
annotDT <- as.data.table(annot)
# Named character vector: values are the Gene labels, names are the column
# names, so `names(annot_vec)[annot_vec %in% genes]` yields the columns.
annot_vec <- setNames(annot$Gene, annot$Name)
genes <- c("Gene2", "Gene4", "Gene9")