我刚接触data.table,但我想用它来解决下面这个问题,因为我觉得它比使用“常规”data.frames快1000倍。
这是我的问题:
我有什么:
两个data.table:dt1和dt2,如下所示:
# Target table: one row per SID (S1..S15). Chromo takes the values 1..3
# (five consecutive rows each) and PP runs 1..5 within each Chromo value.
# P1..P3 are value columns initialised to 0.
dt1 <- data.table(
  SID = paste0("S", 1:15),
  Chromo = rep(1:3, each = 5), # `each` spelled out; no partial matching
  PP = rep(1:5, 3),
  P1 = 0,
  P2 = 0,
  P3 = 0
)

# Fix the RNG seed so that the random `val` column below is reproducible.
set.seed(17)

# Segment table: each row is a [start, end] range on a given Chr, labelled
# with a PID (P1 x 2 rows, P2 x 6 rows, P3 x 3 rows) and a random value.
dt2 <- data.table(
  PID = rep(paste0("P", 1:3), c(2, 6, 3)),
  Chr = c(1, 3, 1, 1, 2, 3, 3, 3, 2, 2, 3),
  start = c(1, 1, 1, 4, 2, 1, 2, 4, 2, 4, 2),
  end = c(3, 4, 2, 5, 4, 1, 3, 5, 3, 5, 5),
  val = rnorm(11)
)
我想要的是什么:
用dt2[, val]的值填充dt1中的P1、P2和P3列:由dt2[, PID]决定填入哪一列,并填入正确的行,即dt1[, Chromo]与dt2[, Chr]相等、且dt1[, PP]介于dt2[, start]和dt2[, end]之间(含端点)的那些行。
我现在做的是什么: (至少可以说,这并不让我感到骄傲......)
输入表和所需的输出(除了我想要
# preparing the tables, computing dt1 rows indices
# Position, among dt1's columns, of the column named by each segment's PID.
# match() already returns positions (NA when a PID has no matching column),
# so no extra indexing through 1:ncol(dt1) is needed.
dt2[, numcol := match(PID, colnames(dt1))]

# Key both tables so the joins below pair (Chr, start/end) with (Chromo, PP).
setkey(dt2, Chr, start, end)
setkey(dt1, Chromo, PP)

# Row numbers in dt1 that correspond to the first and last position of every
# segment; which = TRUE makes the join return row indices instead of rows.
# NOTE: a boundary with no matching dt1 row yields NA here, and the `:` in
# the loop below will then fail.
ind_start <- dt1[dt2[, .(Chr, start)], which = TRUE]
ind_end <- dt1[dt2[, .(Chr, end)], which = TRUE]
dt2[, c("ind_start", "ind_end") := list(ind_start, ind_end)]

# The final fill still falls back to plain data.frames and a `for` loop:
# for each segment, write its value into rows ind_start..ind_end of the
# column selected by numcol.
df1 <- as.data.frame(dt1)
df2 <- as.data.frame(dt2)
nr_seg <- nrow(df2)
# seq_len() is safe when dt2 has no rows (1:0 would iterate over c(1, 0)).
for (i in seq_len(nr_seg)) {
  seg_rows <- df2[i, "ind_start"]:df2[i, "ind_end"]
  df1[seg_rows, df2[i, "numcol"]] <- df2[i, "val"]
}
它是一个data.table):
答案 0 :(得分:6)
library(data.table)

# Example input: dt1 holds one row per (Chromo, PP) position, dt2 holds
# [start, end] segments per Chr, each labelled with a PID and a value.
dt1 <- data.table(
  SID = paste0("S", 1:15),
  Chromo = rep(1:3, each = 5), # `each` spelled out; no partial matching
  PP = rep(1:5, 3),
  P1 = 0,
  P2 = 0,
  P3 = 0
)
set.seed(17)
dt2 <- data.table(
  PID = rep(paste0("P", 1:3), c(2, 6, 3)),
  Chr = c(1, 3, 1, 1, 2, 3, 3, 3, 2, 2, 3),
  start = c(1, 1, 1, 4, 2, 1, 2, 4, 2, 4, 2),
  end = c(3, 4, 2, 5, 4, 1, 3, 5, 3, 5, 5),
  val = rnorm(11)
)

# foverlaps() needs a start and an end column on both sides, so every dt1
# position becomes the zero-width interval [PP, PP1].
dt1[, PP1 := PP]
# Drop the placeholder value columns; the dcast below creates one column per
# PID value present in the overlap result.
dt1[, c("P1", "P2", "P3") := NULL]

# The last two key columns are the interval bounds; the first is matched
# exactly.
setkey(dt2, Chr, start, end)
setkey(dt1, Chromo, PP, PP1)

# type = "within": keep the dt2 segments that fully contain each dt1 position.
res <- foverlaps(dt1, dt2, type = "within")
res[is.na(PID), PID := "P1"] # to ensure that dcast works if there is no match

# Long to wide: one row per position, one column per PID, filled with val.
res <- dcast.data.table(res, SID + Chromo + PP ~ PID, value.var = "val")
setkey(res, Chromo, PP)
# SID Chromo PP P1 P2 P3
# 1: S1 1 1 -1.01500872 -0.2329870 NA
# 2: S2 1 2 -1.01500872 -0.2329870 NA
# 3: S3 1 3 -1.01500872 NA NA
# 4: S4 1 4 NA -0.8172679 NA
# 5: S5 1 5 NA -0.8172679 NA
# 6: S6 2 1 NA NA NA
# 7: S7 2 2 NA 0.7720908 0.2552370
# 8: S8 2 3 NA 0.7720908 0.2552370
# 9: S9 2 4 NA 0.7720908 0.3665811
#10: S10 2 5 NA NA 0.3665811
#11: S11 3 1 -0.07963674 -0.1656119 NA
#12: S12 3 2 -0.07963674 0.9728744 1.1807892
#13: S13 3 3 -0.07963674 0.9728744 1.1807892
#14: S14 3 4 -0.07963674 1.7165340 1.1807892
#15: S15 3 5 NA 1.7165340 1.1807892