我从R开始使用预测模型,并有一个性能问题:
目前我有2个包含日落和日出数据的数据集。
我计算了2011年和2012年的数据(每日数据)。我的真实数据只包含每月的前半部分。我试图用循环把日出日落数据合并到数据框中:
library(lubridate)
library(maptools)
sunrise 函数来自:https://gist.github.com/hilaryparker/2a83ca521353e8478c92
# Compute sunrise and sunset times for a run of consecutive days at one place.
#
# Arguments:
#   lat, long  Coordinates of the location; passed to sunriset() as a
#              one-row matrix in (long, lat) column order.
#   date       First day of the run; converted with as.POSIXct() in `timezone`.
#   timezone   Time-zone name used for the conversion (default "UTC").
#   num.days   Number of consecutive days to compute (default 1).
#
# Returns a data frame with one row per day and two columns, `sunrise` and
# `sunset`, taken from the `time` column of each sunriset() result.
sunrise.set <- function (lat, long, date, timezone = "UTC", num.days = 1)
{
  lat.long <- matrix(c(long, lat), nrow = 1)
  day <- as.POSIXct(date, tz = timezone)
  sequence <- seq(from = day, length.out = num.days, by = "days")
  # Spell the argument out in full (POSIXct.out) instead of relying on
  # partial matching of `POSIXct`.
  rise <- sunriset(lat.long, sequence, direction = "sunrise",
                   POSIXct.out = TRUE)
  set <- sunriset(lat.long, sequence, direction = "sunset",
                  POSIXct.out = TRUE)
  # Each result carries a day fraction and a timestamp; keep only the
  # timestamps, selected by name rather than by column position.
  data.frame(sunrise = rise$time, sunset = set$time)
}
# Hourly timestamps covering the first part of January and of February 2011.
# No tz is passed to as.POSIXct() here.
# NOTE(review): "24:00" is presumably read as midnight of the following day - confirm.
datetime<-c(seq(from = as.POSIXct("2011-01-01 00:00"), to = as.POSIXct("2011-01-19 24:00"), by = "hour"),seq(from =as.POSIXct("2011-02-01 00:00"), to = as.POSIXct("2011-02-19 24:00"), by = "hour"))
# Training frame: the timestamps plus a constant column of 1s.
train<-data.frame(datetime,1)
# One entry per calendar day from 2011-01-01 to 2011-02-28, forced to America/Detroit.
# NOTE(review): "%M" and "%D" do not look like month/day conversion codes, and
# `format`/`tzone` may not be used by as.POSIXlt() for Date input - confirm.
Date<-force_tz(as.POSIXlt(seq(as.Date("2011/01/01"),as.Date("2011/02/28"), by = "day"), format="%Y/%M/%D",tzone="America/Detroit"),tzone="America/Detroit")
# Sunrise/sunset for 59 days starting 2011-01-01, placed next to the Date column.
sunrise<-data.frame(Date,sunrise.set(38.889931,-77.009003,"2011/01/01", timezone = "America/Detroit", num.days = 59))
# Result vectors start as a single NA; the loop below assigns into them by index.
sunrise.train<-as.POSIXlt(NA)
sunset.train<-as.POSIXlt(NA)
# For every training row, scan every day and copy that day's sunrise/sunset when
# the dates compare equal. Both indices are printed on every inner iteration,
# and the inner loop keeps scanning after a match.
# NOTE(review): the comparison is between as.Date(...) of the timestamp and
# sunrise$Date - confirm both sides have the same class and time zone.
for (i in 1:length(train$datetime)){
for( j in 1:length(sunrise$Date)){print(i);print(j)
if(as.Date(as.POSIXlt(train$datetime[i]))==sunrise$Date[j]) {sunrise.train[i]<-sunrise$sunrise[j];sunset.train[i]<-sunrise$sunset[j]}
}}
# Combine the training rows with the looked-up sunrise/sunset columns.
train.modified<-data.frame(train,sunrise.train,sunset.train)
不幸的是,循环非常慢,用完整数据计算需要2-3个小时。是否有更快的方式用合适的数据来填充"train"数据框?
非常感谢!
编辑:更改了代码外观
答案 0 :(得分:1)
尝试先为向量预分配内存,如下所示:
# Preallocate both result vectors with one NA slot per training timestamp.
sunrise.train <- as.POSIXlt(rep_len(NA, length(train[["datetime"]])))
sunset.train <- as.POSIXlt(rep_len(NA, length(train[["datetime"]])))
答案 1 :(得分:1)
您想要一个简单的合并
更新
# Shrink both tables (first 100 training rows, first 10 days) for testing.
train <- train[seq_len(100), ]
sunrise <- sunrise[seq_len(10), ]
# Baseline nested-loop lookup, kept deliberately naive so it can be timed.
#
# Reads `train`, `sunrise`, `sunrise.train` and `sunset.train` from the
# enclosing environment. For each row of `train`, every row of `sunrise` is
# scanned, and the sunrise/sunset of a day whose Date equals the calendar day
# of the timestamp is copied into position i. The assignments act on local
# copies; the enclosing vectors are not modified.
#
# Returns list(sunrise.train, sunset.train).
yF <- function() {
  # seq_along() yields an empty sequence for empty input, whereas 1:length(x)
  # would iterate over c(1, 0) and fail on the NA comparison.
  for (i in seq_along(train$datetime)) {
    for (j in seq_along(sunrise$Date)) {
      if (as.Date(as.POSIXlt(train$datetime[i])) == sunrise$Date[j]) {
        sunrise.train[i] <- sunrise$sunrise[j]
        sunset.train[i] <- sunrise$sunset[j]
      }
    }
  }
  list(sunrise.train, sunset.train)
}
# Time the nested-loop baseline on the reduced data; its output is kept in `r`.
system.time(r <- yF()) # ~ 3 sec for 100 x 10
# Attach the looked-up columns to the training rows.
train.modified <- data.frame(
  train,
  sunrise = r[[1L]],
  sunset = r[[2L]]
)
# your results
使用data.table:
# library() stops with an error if the package is missing; require() would
# only return FALSE and let the script fail later.
library(data.table)
# Convert both tables to data.table by reference.
setDT(train)
setDT(sunrise)
# reformat dates for equal formats
# Both tables get a `Date` column of class Date to join on. `:=` adds or
# overwrites the column in place, so `train` keeps its new Date column.
train[, Date := as.Date(as.POSIXlt(datetime))]
sunrise[, Date := as.Date(Date)]
# Left join: every training row is kept and receives that day's sunrise/sunset.
myRez <- merge(train, sunrise, by = "Date", all.x = TRUE)
# Drop the join key by name instead of by column position.
myRez[, Date := NULL]
# Convert the loop result as well so the two tables can be compared.
setDT(train.modified)
# First rows of the loop-based result; the output pasted below is from the
# original run.
head(train.modified)
# datetime X1 sunrise sunset
#1: 2011-01-01 00:00:00 1 2011-01-01 07:26:39 2011-01-01 16:56:37
#2: 2011-01-01 01:00:00 1 2011-01-01 07:26:39 2011-01-01 16:56:37
#3: 2011-01-01 02:00:00 1 2011-01-01 07:26:39 2011-01-01 16:56:37
#4: 2011-01-01 03:00:00 1 2011-01-01 07:26:39 2011-01-01 16:56:37
#5: 2011-01-01 04:00:00 1 2011-01-01 07:26:39 2011-01-01 16:56:37
#6: 2011-01-01 05:00:00 1 2011-01-01 07:26:39 2011-01-01 16:56:37
# First rows of the merge-based result, for side-by-side comparison.
head(myRez)
# datetime X1 sunrise sunset
#1: 2011-01-01 00:00:00 1 2011-01-01 07:26:39 2011-01-01 16:56:37
#2: 2011-01-01 01:00:00 1 2011-01-01 07:26:39 2011-01-01 16:56:37
#3: 2011-01-01 02:00:00 1 2011-01-01 07:26:39 2011-01-01 16:56:37
#4: 2011-01-01 03:00:00 1 2011-01-01 07:26:39 2011-01-01 16:56:37
#5: 2011-01-01 04:00:00 1 2011-01-01 07:26:39 2011-01-01 16:56:37
#6: 2011-01-01 05:00:00 1 2011-01-01 07:26:39 2011-01-01 16:56:37
检查平等:
# Compare the loop-based and merge-based tables. The output pasted below
# reports a `tzone` attribute mismatch on the `sunrise` column and a mean
# relative difference of 1.947453e-05.
all.equal(train.modified, myRez)
# [1] "Column 'sunrise': Attributes: < Component “tzone”: 1 string mismatch > Mean relative difference: 1.947453e-05"
结果有一些细微差别,可能是因为没有明确指定时区。您可能应该在转换日期时定义所需的时区。