我正在创建一个面板数据框。它是学校小组。我要在此面板上合并第一个最近的气象站,然后合并第二个,第三个气象站,以此类推,直到第10个最近的气象站。我编写了一个针对不同变量执行此操作的循环:最高温度,最低温度,降水量等。我遇到的问题是,由于内存不足,我似乎不必要在此循环内的某个地方分配了内存。
我知道我有足够的内存来创建面板,因为我曾经做过一次没有循环的面板。我正在使用8gb RAM在64位Windows上工作。我为2010年至2015年期间的7800所学校和800个气象站进行了抽样。
这是一个可重现的示例,只有5所学校,10个气象站和2个月的数据,并且仅匹配3个最近的气象站。真正的例子是7800所学校,800个气象站,5年的数据并匹配最近的10个站。
library(data.table)
Dist_Temp_Max<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))
Dist_Temp_Min<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))
Dist_Prec<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))
years<-seq.Date(as.Date("2014-01-01"),as.Date("2015-02-28"),by="1 day")
Weather_Data<-data.frame(ID_School=seq(1:5))
Weather_Data<-expand.grid(Weather_Data$ID_School,years)
names(Weather_Data)<-c("ID_Station","Date")
Weather_Data$Temp_Max_T<-runif(nrow(Weather_Data), min=10, max=40)
Weather_Data$Temp_Min_T<-Weather_Data$Temp_Max-10
Weather_Data$Prec_T<-floor(runif(nrow(Weather_Data),min=0, max=10))
Weather_Data$Cod_Merge<-paste(Weather_Data$ID_Station,Weather_Data$Date,sep="-")
#Add Values per Station
var_list<-c("Temp_Max","Temp_Min","Prec")
for (i in var_list) {
dist<-paste0("Dist_",i)
dist<-get(dist)
dist<-as.data.frame(subset(dist,!is.na(dist$ID_Station_1)))
matr<-dist[c("ID_School","ID_Station_1","Dist_1")]
matr<-setDT(matr)[, list(Date=years,ID_Station_1=ID_Station_1,Dist_1=Dist_1) , ID_School]
matr$Cod_Merge<-paste(matr$ID_Station_1,matr$Date,sep="-")
matr<-as.data.frame(matr[,c("Cod_Merge","ID_School","Date","ID_Station_1","Dist_1")])
matr<-merge(matr,Weather_Data[c("Cod_Merge",paste0(i,"_T"))],by="Cod_Merge",all.x=T)
matr$Cod_Merge<-paste(matr$ID_School,matr$Date,sep="-")
names(matr)[6]<-paste0(i,"_T_1")
Sys.sleep(0.1)
print(i)
for(n in 2:3) {
matr2<-dist[c("ID_School",paste0("ID_Station_",n),paste0("Dist_",n))]
matr2<-subset(dist,!is.na(dist[paste0("ID_Station_",n)]))
matr3<-expand.grid(matr2$ID_School,years)
names(matr3)<-c("ID_School","Date")
matr3<-matr3[order(matr3$ID_School,matr3$Date), ]
matr2<-merge(matr3,matr2,by="ID_School")
rm(matr3)
Sys.sleep(0.1)
print(i)
matr2$Cod_Merge<-paste(matr2[,paste0("ID_Station_",n)],matr2$Date,sep="-")
matr2<-matr2[c("Cod_Merge","ID_School","Date",paste0("ID_Station_",n),paste0("Dist_",n))]
matr2<-merge(matr2,Weather_Data[,c("Cod_Merge",paste0(i,"_T"))],by="Cod_Merge",all.x=T)
matr2$Cod_Merge<-paste(matr2$ID_School,matr2$Date,sep="-")
names(matr2)[6]<-paste0(i,"_T_",n)
matr<-merge(matr,matr2[,c("Cod_Merge",
paste0("ID_Station_",n),
paste0("Dist_",n),
paste0(i,"_T_",n))],
by="Cod_Merge",all.x=T)
Sys.sleep(0.1)
print(i)
}
assign(paste0("Mat_Dist_",i),matr)
}
任何帮助将不胜感激。
对于任何感兴趣的人,我在第二个循环中都缺少几个逗号:
library(data.table)
Dist_Temp_Max<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))
Dist_Temp_Min<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))
Dist_Prec<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))
years<-seq.Date(as.Date("2014-01-01"),as.Date("2015-02-28"),by="1 day")
Weather_Data<-data.frame(ID_School=seq(1:5))
Weather_Data<-expand.grid(Weather_Data$ID_School,years)
names(Weather_Data)<-c("ID_Station","Date")
Weather_Data$Temp_Max_T<-runif(nrow(Weather_Data), min=10, max=40)
Weather_Data$Temp_Min_T<-Weather_Data$Temp_Max-10
Weather_Data$Prec_T<-floor(runif(nrow(Weather_Data),min=0, max=10))
Weather_Data$Cod_Merge<-paste(Weather_Data$ID_Station,Weather_Data$Date,sep="-")
#Add Values per Station
var_list<-c("Temp_Max","Temp_Min","Prec")
for (i in var_list) {
dist<-paste0("Dist_",i)
dist<-get(dist)
dist<-as.data.frame(subset(dist,!is.na(dist$ID_Station_1)))
matr<-dist[c("ID_School","ID_Station_1","Dist_1")]
matr<-setDT(matr)[, list(Date=years,ID_Station_1=ID_Station_1,Dist_1=Dist_1) , ID_School]
matr$Cod_Merge<-paste(matr$ID_Station_1,matr$Date,sep="-")
matr<-as.data.frame(matr[,c("Cod_Merge","ID_School","Date","ID_Station_1","Dist_1")])
matr<-merge(matr,Weather_Data[c("Cod_Merge",paste0(i,"_T"))],by="Cod_Merge",all.x=T)
matr$Cod_Merge<-paste(matr$ID_School,matr$Date,sep="-")
names(matr)[6]<-paste0(i,"_T_1")
Sys.sleep(0.1)
print(i)
for(n in 2:3) {
matr2<-dist[c("ID_School",paste0("ID_Station_",n),paste0("Dist_",n))]
matr2<-subset(dist,!is.na(dist[paste0("ID_Station_",n)]))
matr3<-expand.grid(matr2$ID_School,years)
names(matr3)<-c("ID_School","Date")
matr3<-matr3[order(matr3$ID_School,matr3$Date), ]
matr2<-merge(matr3,matr2,by="ID_School")
rm(matr3)
Sys.sleep(0.1)
print(i)
matr2$Cod_Merge<-paste(matr2[,paste0("ID_Station_",n)],matr2$Date,sep="-")
matr2<-matr2[,c("Cod_Merge","ID_School","Date",paste0("ID_Station_",n),paste0("Dist_",n))]
matr2<-merge(matr2,Weather_Data[,c("Cod_Merge",paste0(i,"_T"))],by="Cod_Merge",all.x=T)
matr2$Cod_Merge<-paste(matr2$ID_School,matr2$Date,sep="-")
names(matr2)[6]<-paste0(i,"_T_",n)
matr<-merge(matr,matr2[,c("Cod_Merge",
paste0("ID_Station_",n),
paste0("Dist_",n),
paste0(i,"_T_",n))],
by="Cod_Merge",all.x=T)
Sys.sleep(0.1)
print(i)
}
assign(paste0("Mat_Dist_",i),matr)
}
答案 0 :(得分:0)
似乎您的所有代码需要做的是找到距每个学校最近的10个车站,然后您只需将车站数据子集分配给学校(对日期一无所知)。
您的最终数据帧应该更好,更易于使用-可能不是3个单独的宽数据帧,而是这样的:
set.seed(1) # FAKE DATA
final <- data.frame(ID_School = rep(LETTERS[1],10), ID_Station = sample(1:100,10),
Closeness_Rank = 1:10, Distance = 10*(1:10) + sample(-5:5,10),
Temp.Max = sample(70:100,10), Temp.Min = sample(30:69,10),
Precipitation = sample(20:30,10)/100)
final
# ID_School ID_Station Closeness_Rank Distance Temp.Max Temp.Min Precipitation
#1 A 27 1 7 98 49 0.29
#2 A 37 2 16 76 53 0.26
#3 A 57 3 31 88 48 0.27
#4 A 89 4 38 73 36 0.24
#5 A 20 5 50 77 59 0.23
#6 A 86 6 65 80 68 0.28
#7 A 97 7 72 70 57 0.20
#8 A 62 8 79 79 33 0.21
#9 A 58 9 94 90 64 0.22
#10 A 6 10 103 96 42 0.30
在不知道如何测量车站,学校数据或其他信息的距离的情况下,我无法帮助您获得这种格式,但是如果您提供更多信息,我将很乐意为您提供帮助。
编辑:
此方法似乎很慢,因为我没有真正正确地使用data.tables,但希望它能给您一些想法。我生成的伪造数据可能对您将来解释问题很有用。我的方法是只建立FINAL输出,这是一个日间学校数据表。该数据表是通过对最近10个站点的平均距离得出的天气数据进行反演而得出的。
这个过程非常缓慢,一天的学习时间大约为5分钟,大约需要7分钟的7800天的学校天气,因此6年半的时间才能完成5年-但没有内存问题!您会发布这种代码,询问是否有人可以提高速度。
# Starting from the beginning
set.seed(100)
library(data.table)
n_station <- 800
n_school <- 7800
station_info <- data.frame(ID_Station = 1:n_station,
xcoord = sample(-10000:10000,n_station),
ycoord = sample(-10000:10000,n_station))
school_info <- data.frame(ID_School = 1:n_school,
xcoord = sample(-10000:10000,n_school),
ycoord = sample(-10000:10000,n_school))
# save list of ~20 closest stations by school,
# and always use 10 of the closest where measurements are available
x <- 20
L <- vector('list', nrow(school_info)) # always initialize for speed
for(i in 1:nrow(school_info)){
distances <- sqrt((school_info[i,"xcoord"] - station_info[,"xcoord"])^2 +
(school_info[i,"ycoord"] - station_info[,"ycoord"])^2)
L[[i]] <- cbind.data.frame(ID_School = rep(school_info[i,"ID_School"],x),
ID_Station = station_info[ which(order(distances) <= x),
"ID_Station"],
Distance_Rank = 1:x,
Distance = sort(distances)[1:x])
}
L[[1]]
# ID_School ID_Station Distance_Rank Distance
# 1: 1 2 1 127.2242
# 2: 1 32 2 365.7896
# 3: 1 92 3 573.0428
# 4: 1 141 4 763.5837
# 5: 1 151 5 1003.4127
连续5年的每日虚假天气数据:
days <- seq.Date(as.Date("2010-01-01"),as.Date("2015-12-31"),by="1 day")
d <- length(days)
S <- vector('list', nrow(station_info))
for(i in 1:nrow(station_info)){
S[[i]] <- data.frame(ID_Station = rep(station_info[i,"ID_Station"],d),
Temp.Max = sample(70:100,d,T),
Temp.Min = sample(30:69,d,T),
Precipitation = sample(20:30,d,T)/100,
date = days)
# maybe remove some dates at random
if(sample(c(T,F),1)) S[[i]] <- S[[i]][-sample(1:d,1),]
}
station_data <- as.data.table(do.call(rbind,S))
station_data
# ID_Station Temp.Max Temp.Min Precipitation date
# 1: 1 88 55 0.23 2010-01-01
# 2: 1 73 57 0.24 2010-01-02
# 3: 1 93 33 0.29 2010-01-03
# 4: 1 81 52 0.27 2010-01-04
# 5: 1 82 48 0.24 2010-01-05
# ---
#291610: 800 86 31 0.28 2010-12-27
#291611: 800 98 57 0.22 2010-12-28
#291612: 800 71 50 0.26 2010-12-29
#291613: 800 83 35 0.26 2010-12-30
#291614: 800 71 34 0.23 2010-12-31
算法:
size <- length(days) * n_school
#OUT <- data.table(ID_School = integer(size),
# date = as.Date(x = integer(size), origin = "1970-01-01"),
# wtd_Temp.Max= numeric(size),
# wtd_Temp.Min= numeric(size),
# wtd_Precip= numeric(size))
OUT <- vector('list',size) # faster
unique_school <- unique(school_data$ID_School) # will be length(n_school)
#length(L) is the same as length(unique(school)= n_school)
count = 0
for(i in 1:length(days)){
t1 <- Sys.time()
temp_weather_data = station_data[date==days[i],]
m <- merge(school_data, temp_weather_data, "ID_Station")
setkey(m, ID_School) # the key is ID_School
for(j in 1:length(unique_school)){
count = count + 1
# assuming within the closest 20 stations, at least 10 have data every day
r <- m[.(j),][1:10] # find schools j in key
invd <- 1/r$Distance
sum.invd <- sum(invd)
OUT[[count]] <- data.table(ID_School = unique_school[j],
date = days[i],
wtd_Temp.Max = sum(invd * r$Temp.Max)/sum.invd,
wtd_Temp.Min = sum(invd * r$Temp.Min)/sum.invd,
wtd_Precip = sum(invd * r$Precipitation)/sum.invd)
if(j %% 100 == 0) cat(as.character(days[i]),".....",unique_school[j],"...\n")
}
cat(Sys.time()-t1)
}
哪个给出最终输出:
do.call(rbind,OUT)
# ID_School date wtd_Temp.Max wtd_Temp.Min wtd_Precip
# 1: 1 2010-01-01 88.64974 44.07872 0.2757571
# 2: 2 2010-01-01 83.34549 46.80225 0.2511073
# 3: 3 2010-01-01 85.32834 48.62004 0.2347837
# 4: 4 2010-01-01 82.95667 48.01814 0.2576482
# 5: 5 2010-01-01 87.88982 44.45357 0.2527794
# ---