以下代码生成所需的输出。然而,缺乏矢量化意味着它运行得非常慢。我怎样才能加快速度?
我将dput
结果放在部分指示性数据中。
输入dput
s
StandRef
输入
structure(list(id = structure(c(43L, 50L, 17L, 45L, 9L, 5L, 49L,
33L, 48L, 39L, 71L, 64L, 44L, 47L, 58L, 24L, 15L, 37L, 14L, 11L,
26L, 57L, 4L, 30L, 72L, 21L, 23L, 60L, 38L, 59L, 29L, 19L, 6L,
46L, 36L, 3L, 63L, 55L, 51L, 35L, 10L, 7L, 16L, 73L, 42L, 52L,
41L, 27L, 25L, 61L, 20L, 70L, 53L, 18L, 31L, 22L, 1L, 8L, 2L,
40L, 65L, 67L, 28L, 56L, 13L, 32L, 54L, 66L, 68L, 34L, 12L, 69L,
62L), .Label = c("ID 1009445", "ID 120763", "ID 133883", "ID 136398",
"ID 171850", "ID 192595", "ID 197597", "ID 216406", "ID 21888",
"ID 230940", "ID 23777", "ID 282791", "ID 306348", "ID 309745",
"ID 326928", "ID 344897", "ID 34974", "ID 350157", "ID 391831",
"ID 402479", "ID 43010", "ID 484078", "ID 484697", "ID 537134",
"ID 562259", "ID 562455", "ID 567042", "ID 572866", "ID 578945",
"ID 595683", "ID 59759", "ID 598460", "ID 603611", "ID 603757",
"ID 607991", "ID 60976", "ID 622720", "ID 646989", "ID 656144",
"ID 668807", "ID 669435", "ID 720522", "ID 740555", "ID 745499",
"ID 746001", "ID 783969", "ID 78979", "ID 792426", "ID 793541",
"ID 797860", "ID 806559", "ID 810517", "ID 826054", "ID 837609",
"ID 839287", "ID 867918", "ID 869788", "ID 875380", "ID 876870",
"ID 882220", "ID 893116", "ID 895909", "ID 899050", "ID 900143",
"ID 908100", "ID 912185", "ID 916371", "ID 916620", "ID 957879",
"ID 966195", "ID 993247", "ID 998911", "ID 999610"), class = "factor"),
region = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), location = c(259090L, 559306L, 2227063L, 2369217L, 4026978L,
4211264L, 4679449L, 5105226L, 5106345L, 5344670L, 5473601L,
5476528L, 5871970L, 6461228L, 6700029L, 6708265L, 7639959L,
9297695L, 10254788L, 10328812L, 11102816L, 11568295L, 11720437L,
12843457L, 14012506L, 14156669L, 14632300L, 14641938L, 15298211L,
15468425L, 15534406L, 16279682L, 16699353L, 17226952L, 17320785L,
269017L, 453097L, 828833L, 954610L, 954842L, 1066378L, 1217332L,
1253530L, 1277716L, 1292857L, 1337952L, 1439657L, 1452989L,
1712345L, 1758035L, 2601630L, 2640359L, 2778095L, 3151129L,
3369931L, 3399080L, 3529525L, 3810217L, 3821120L, 3841588L,
3901557L, 4111633L, 4220440L, 4528632L, 4665450L, 5099307L,
5260242L, 5958770L, 5966356L, 6137405L, 6246065L, 6297231L,
6807949L)), .Names = c("id", "region", "location"), class = "data.frame", row.names = c(NA,
-73L))
两个样本输入
样本1
structure(list(region = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L),
begin = c(0L, 2259252L, 5092077L, 9158205L, 0L, 135094L,
941813L, 5901391L, 6061324L), finish = c(2259252L, 5092077L,
9158205L, 20463033L, 135094L, 941813L, 5901391L, 6061324L,
7092402L), sed = c(3.98106154985726, 7.51649828394875, 5.15440228627995,
2.67456624889746, 7.54309412557632, 4.17413910385221, 7.47043058509007,
6.13362524658442, 1.00084994221106)), .Names = c("region",
"begin", "finish", "sed"), class = "data.frame", row.names = c(NA,
-9L))
样本2
structure(list(region = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L),
begin = c(0L, 2253252L, 7091077L, 9120205L, 0L, 135094L,
941813L, 5901391L, 6061324L), finish = c(2253252L, 7091077L,
9120205L, 17463033L, 135094L, 941813L, 5901391L, 6061324L,
7092402L), sed = c(3.31830840984048, 1.38014704208403, 6.13049140975458,
2.10349875097134, 0.48170587509345, 0.13058713509175, 9.13509713513509,
6.13047153058701, 3.81734081501503)), .Names = c("region",
"begin", "finish", "sed"), class = "data.frame", row.names = c(NA,
-9L))
未实施代码
matchLocationsToRegions <- function(path) {
# get list of data files (around 500 of these; only dput of 2 given: sample262519 and sample252519)
setwd(path,sep="",collapse=NULL)
data_files <- list.files()
# read in template file with complete regional boundaries
standRef <- read.table(paste(path, "StandRef.txt",sep="",collapse=NULL), header=TRUE, sep="\t")
# pre-allocate a df with row dimensions of standRef and num of columns according to num of data files
sediment.df <- as.data.frame(matrix(NA,nrow=nrow(standRef),ncol=length(data_files)))
colnames(sediment.df) <- data_files
rownames(sediment.df) <- standRef[,1]
# create a counter for columns filled
col_counter <- 1
for (file in data_files) {
# read in current, processed data
sample <- read.table(file, header=TRUE, sep="\t")
# pre-allocate vectors for sedimentation data vector
sed <- rep(NA, nrow(standRef))
# create a variable to track end boundary for a particular sample_ID
end_tracker <- 1
index <- unlist(lapply (unique(standRef$region), function(reg) {
reg.filter <- which(standRef$region == reg)
samp.filter <- which(sample$region == reg)
samp.filter[cut(standRef$location[reg.filter],c(0L,sample$finish[samp.filter]),labels=F)]
}))
sed <- sample$sed[index]
# fill in next, unfilled column of relevant df with data from relevant vector
sediment.df[col_counter] <- sed
# update column counter variable
col_counter <- col_counter + 1
}
# save df as a table
write.table(sediment.df,file="samples_sed.txt", row.names=TRUE, sep="\t")
}
正在运行Rprof
表明了这一点
"scan" "read.table" "matchLocationsToRegions"
和
"type.convert" "read.table" "matchLocationsToRegions"
主导运行时。据推测,在这条线上循环是一个瓶颈:
sample <- read.table(file, header=TRUE, sep="\t")
更新:区域上的for
循环已被更快执行的代码替换(h / t Simon Urbanek)。但是,其余的都很慢。
答案 0 :(得分:1)
您可以轻松删除循环:
sediment.df <- as.data.frame(lapply(data_files, function(file) {
sample <- read.table(file, header=TRUE, sep="\t")
index <- unlist(lapply (unique(standRef$region), function(reg) {
reg.filter <- which(standRef$region == reg)
samp.filter <- which(sample$region == reg)
samp.filter[cut(standRef$location[reg.filter],c(0L,sample$finish[samp.filter]),labels=F)]
}))
sample$sed[index]
}))
colnames(sediment.df) <- data_files
rownames(sediment.df) <- standRef[,1]
但是,在read.table
中花费大量时间并不是不太可能,因此您可以考虑a)使用scan
,b)仅使用所有样本创建一个文件(例如,使用额外的列定义样本)所以你不需要加载很多文件。