给定data.table定义如下:
# Pre-allocate the 8-column result table with n rows; every column is filled
# with an NA of the column's intended type.
n <- 34916339
ds2 <- data.table(
  X = rep(NA_integer_, n),
  Y = rep(NA_integer_, n),
  LAT = rep(NA_real_, n),
  LON = rep(NA_real_, n),
  FCT_DATE = rep(as.Date(NA), n),
  VAR = rep(NA_character_, n),
  TYPE = rep(NA_character_, n),
  VALUE = rep(NA_real_, n)
)
以下代码太慢了。
# ds is a data.table with dim = 572399 x 66
colNames <- rep("any_string", 66) # only an example
# Positions of the measurement columns in ds; columns 1-5 are not read here.
# (The original referenced colIndex without ever defining it.)
colIndex <- 6:66
# For every row of ds, write one row of ds2 per measurement column:
# column 6 and 7 of ds2 receive the two "_"-separated parts of the column name,
# column 8 receives the cell value.
for (i in seq_len(nrow(ds))) {
  for (j in colIndex) {
    colName <- colNames[j]
    colName.split <- strsplit(colName, "_") # Split the elements by "_"
    # Target row in ds2: each row i of ds owns a run of length(colIndex) = 61
    # consecutive rows; j - 5 maps column positions 6..66 onto 1..61.
    k <- ((i - 1) * length(colIndex)) + (j - 5)
    ds2[k, 6] <- colName.split[[1]][1]
    ds2[k, 7] <- colName.split[[1]][2] # so, it reads 61 cols from ds
    ds2[k, 8] <- ds[i, get(colName)] # and creates 61 lines in ds2
  }
}
有谁知道如何改进这段代码?特别是,对第6、7和8列的赋值很慢。我试图将66列的data.table ds转换为8列的data.table。
提前致谢。
已编辑:
# Build an example of the data.table ds: 5 leading columns (two integer
# sequences, two constant numerics, one constant Date) followed by 61 constant
# numeric columns holding i + i/10 for i = 1..61.
# `nds` (the number of rows) must already be defined before this runs.
# All 66 columns are created in one call instead of cbind()-ing one column per
# loop iteration, which rebuilt the table on every pass. seq_len() also keeps
# nds = 0 from producing the sequence c(1, 0).
# NOTE(review): the columns get the default names V1..V66 here; they are
# expected to be renamed by position afterwards - confirm no code relies on
# the placeholder names.
ds <- as.data.table(c(
  list(
    seq_len(nds),
    seq_len(nds),
    rep(3.3, nds),
    rep(4.4, nds),
    rep(as.Date("2014-08-16"), nds)
  ),
  lapply(1:61, function(i) rep(i + i / 10, nds))
))
# Real column names: 5 leading columns, then every variable code combined with
# every type code as "<VAR>_<TYPE>" (variables vary fastest), then "PREC_OBS".
var_codes <- c("UVES", "VVES", "PSNM", "PREC")
# Type codes 01N, 01P, 02N, 02P, ..., 07N, 07P, followed by AVN.
type_codes <- c(
  paste0(rep(sprintf("%02d", 1:7), each = 2), c("N", "P")),
  "AVN"
)
names.ds <- c(
  "X", "Y", "LAT", "LON", "FCT_DATE",
  paste(
    rep(var_codes, times = length(type_codes)),
    rep(type_codes, each = length(var_codes)),
    sep = "_"
  ),
  "PREC_OBS"
)
# Rename the columns of ds by position, in place, using names.ds.
setnames(ds, old = seq_along(names.ds), new = names.ds)
我的目标是将其转换为data.table,如下所示:
X Y LAT LON FCT_DATE VAR TYPE VALUE
1: 312 54 -39.7401 -68.4375 2009-01-02 UVES 01N 0.63
2: 312 54 -39.7401 -68.4375 2009-01-02 VVES 01N -3.17
3: 312 54 -39.7401 -68.4375 2009-01-02 PSNM 01N 1019.52
...
34916339: 341 83 -39.7401 -68.4375 2009-01-02 PREC OBS 0.50
答案 0 :(得分:5)
我认为你正试图重新发明轮子。这有效:
library(reshape2)
# Stack every column after the first five into VAR (source column name) and
# VALUE (cell value); columns 1-5 are kept as identifiers.
ds2 <- melt(ds, id.vars = 1:5, variable.name = "VAR", value.name = "VALUE")
# Work on plain strings before splitting the names.
ds2[, VAR := as.character(VAR)]
# TYPE is whatever follows the last "_"; VAR keeps what precedes the first "_".
# Both right-hand sides are computed from the unsplit VAR.
ds2[, c("TYPE", "VAR") := list(sub(".*_", "", VAR), sub("_.*", "", VAR))]
即使只有1,000,000行,这也相当慢(在运行OS X 10.9、2.8 GHz i7的MacBook Pro上):
# user system elapsed
# 73.373 1.398 74.809
但至少它简洁易懂。你也没有说明“太慢”到底有多慢,所以我不知道这是否算是改进。基于strsplit的解决方案花费的时间更长(> 100秒),而基于stringr::str_match_all的方案耗时还要更长。
答案 1 :(得分:4)
这是一种更快的方法。另一个答案对每一行都调用了两次sub(...)。没有必要这样做,因为这些只是列名,而且只有66个。在设定nds <- 1e6并创建ds之后,下面的代码运行速度提高了约20倍。
library(reshape2)
# code from other answer: melt, then split the names with sub() on every row
system.time({
  ds2 <- melt(ds, id.vars = 1:5, variable.name = "VAR", value.name = "VALUE")
  ds2[, VAR := as.character(VAR)]
  # TYPE = text after the last "_", VAR = text before the first "_"
  ds2[, c("TYPE", "VAR") := list(sub(".*_", "", VAR), sub("_.*", "", VAR))]
})
# user system elapsed
# 239.43 1.05 240.78
# this version splits only the 61 column names, instead of running sub(...)
# over every melted row
system.time({
  # One c(<VAR>, <TYPE>) pair per measurement column (columns 6 to 66 of ds).
  cn <- strsplit(colnames(ds)[6:66], "_")
  ds3 <- melt(ds, id.vars = 1:5, variable.name = "VAR", value.name = "VALUE")
  # Each name part is repeated nrow(ds) times so that it lines up with the
  # block of rows melt() produced for that column.
  ds3[, c("VAR", "TYPE") := list(
    rep(vapply(cn, `[`, character(1), 1), each = nrow(ds)),
    rep(vapply(cn, `[`, character(1), 2), each = nrow(ds))
  )]
})
# user system elapsed
# 13.87 8.96 22.83
# Both approaches give exactly the same table.
identical(ds2, ds3)
# [1] TRUE