解决方案

Question

我正在创建一个面板数据框。它是学校小组。我要在此面板上合并第一个最近的气象站，然后合并第二个，第三个气象站，以此类推，直到第10个最近的气象站。我编写了一个针对不同变量执行此操作的循环：最高温度，最低温度，降水量等。我遇到的问题是，由于内存不足，我似乎不必要在此循环内的某个地方分配了内存。

我知道我有足够的内存来创建面板，因为我曾经做过一次没有循环的面板。我正在使用8gb RAM在64位Windows上工作。我为2010年至2015年期间的7800所学校和800个气象站进行了抽样。

这是一个可重现的示例，只有5所学校，10个气象站和2个月的数据，并且仅匹配3个最近的气象站。真正的例子是7800所学校，800个气象站，5年的数据并匹配最近的10个站。

library(data.table)
Dist_Temp_Max<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
                          ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
                          ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))
Dist_Temp_Min<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
                          ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
                          ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))
Dist_Prec<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
                      ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
                      ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))

years<-seq.Date(as.Date("2014-01-01"),as.Date("2015-02-28"),by="1 day")
Weather_Data<-data.frame(ID_School=seq(1:5))
Weather_Data<-expand.grid(Weather_Data$ID_School,years)
names(Weather_Data)<-c("ID_Station","Date")
Weather_Data$Temp_Max_T<-runif(nrow(Weather_Data), min=10, max=40)
Weather_Data$Temp_Min_T<-Weather_Data$Temp_Max-10
Weather_Data$Prec_T<-floor(runif(nrow(Weather_Data),min=0, max=10))
Weather_Data$Cod_Merge<-paste(Weather_Data$ID_Station,Weather_Data$Date,sep="-")

#Add Values per Station
var_list<-c("Temp_Max","Temp_Min","Prec")
for (i in var_list) {
  dist<-paste0("Dist_",i)
  dist<-get(dist)
  dist<-as.data.frame(subset(dist,!is.na(dist$ID_Station_1)))
  matr<-dist[c("ID_School","ID_Station_1","Dist_1")]
  matr<-setDT(matr)[, list(Date=years,ID_Station_1=ID_Station_1,Dist_1=Dist_1) , ID_School]

  matr$Cod_Merge<-paste(matr$ID_Station_1,matr$Date,sep="-")
  matr<-as.data.frame(matr[,c("Cod_Merge","ID_School","Date","ID_Station_1","Dist_1")])
  matr<-merge(matr,Weather_Data[c("Cod_Merge",paste0(i,"_T"))],by="Cod_Merge",all.x=T)
  matr$Cod_Merge<-paste(matr$ID_School,matr$Date,sep="-")
  names(matr)[6]<-paste0(i,"_T_1")
  Sys.sleep(0.1)
  print(i)

  for(n in 2:3) {
    matr2<-dist[c("ID_School",paste0("ID_Station_",n),paste0("Dist_",n))]
    matr2<-subset(dist,!is.na(dist[paste0("ID_Station_",n)]))
    matr3<-expand.grid(matr2$ID_School,years)

    names(matr3)<-c("ID_School","Date")
    matr3<-matr3[order(matr3$ID_School,matr3$Date), ]
    matr2<-merge(matr3,matr2,by="ID_School")
    rm(matr3)
    Sys.sleep(0.1)
    print(i)

    matr2$Cod_Merge<-paste(matr2[,paste0("ID_Station_",n)],matr2$Date,sep="-")
    matr2<-matr2[c("Cod_Merge","ID_School","Date",paste0("ID_Station_",n),paste0("Dist_",n))]
    matr2<-merge(matr2,Weather_Data[,c("Cod_Merge",paste0(i,"_T"))],by="Cod_Merge",all.x=T)
    matr2$Cod_Merge<-paste(matr2$ID_School,matr2$Date,sep="-")

    names(matr2)[6]<-paste0(i,"_T_",n)
    matr<-merge(matr,matr2[,c("Cod_Merge",
                              paste0("ID_Station_",n),
                              paste0("Dist_",n),
                              paste0(i,"_T_",n))],
                by="Cod_Merge",all.x=T)
    Sys.sleep(0.1)
    print(i)
  }
  assign(paste0("Mat_Dist_",i),matr)
}

任何帮助将不胜感激。

解决方案

对于任何感兴趣的人，我在第二个循环中都缺少几个逗号：

library(data.table)
Dist_Temp_Max<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
                          ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
                          ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))
Dist_Temp_Min<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
                          ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
                          ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))
Dist_Prec<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
                      ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
                      ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))

years<-seq.Date(as.Date("2014-01-01"),as.Date("2015-02-28"),by="1 day")
Weather_Data<-data.frame(ID_School=seq(1:5))
Weather_Data<-expand.grid(Weather_Data$ID_School,years)
names(Weather_Data)<-c("ID_Station","Date")
Weather_Data$Temp_Max_T<-runif(nrow(Weather_Data), min=10, max=40)
Weather_Data$Temp_Min_T<-Weather_Data$Temp_Max-10
Weather_Data$Prec_T<-floor(runif(nrow(Weather_Data),min=0, max=10))
Weather_Data$Cod_Merge<-paste(Weather_Data$ID_Station,Weather_Data$Date,sep="-")

#Add Values per Station
var_list<-c("Temp_Max","Temp_Min","Prec")
for (i in var_list) {
  dist<-paste0("Dist_",i)
  dist<-get(dist)
  dist<-as.data.frame(subset(dist,!is.na(dist$ID_Station_1)))
  matr<-dist[c("ID_School","ID_Station_1","Dist_1")]
  matr<-setDT(matr)[, list(Date=years,ID_Station_1=ID_Station_1,Dist_1=Dist_1) , ID_School]

  matr$Cod_Merge<-paste(matr$ID_Station_1,matr$Date,sep="-")
  matr<-as.data.frame(matr[,c("Cod_Merge","ID_School","Date","ID_Station_1","Dist_1")])
  matr<-merge(matr,Weather_Data[c("Cod_Merge",paste0(i,"_T"))],by="Cod_Merge",all.x=T)
  matr$Cod_Merge<-paste(matr$ID_School,matr$Date,sep="-")
  names(matr)[6]<-paste0(i,"_T_1")
  Sys.sleep(0.1)
  print(i)

  for(n in 2:3) {
    matr2<-dist[c("ID_School",paste0("ID_Station_",n),paste0("Dist_",n))]
    matr2<-subset(dist,!is.na(dist[paste0("ID_Station_",n)]))
    matr3<-expand.grid(matr2$ID_School,years)

    names(matr3)<-c("ID_School","Date")
    matr3<-matr3[order(matr3$ID_School,matr3$Date), ]
    matr2<-merge(matr3,matr2,by="ID_School")
    rm(matr3)
    Sys.sleep(0.1)
    print(i)

    matr2$Cod_Merge<-paste(matr2[,paste0("ID_Station_",n)],matr2$Date,sep="-")
    matr2<-matr2[,c("Cod_Merge","ID_School","Date",paste0("ID_Station_",n),paste0("Dist_",n))]
    matr2<-merge(matr2,Weather_Data[,c("Cod_Merge",paste0(i,"_T"))],by="Cod_Merge",all.x=T)
    matr2$Cod_Merge<-paste(matr2$ID_School,matr2$Date,sep="-")

    names(matr2)[6]<-paste0(i,"_T_",n)
    matr<-merge(matr,matr2[,c("Cod_Merge",
                              paste0("ID_Station_",n),
                              paste0("Dist_",n),
                              paste0(i,"_T_",n))],
                by="Cod_Merge",all.x=T)
    Sys.sleep(0.1)
    print(i)
  }
  assign(paste0("Mat_Dist_",i),matr)
}

Answer 1

似乎您的所有代码需要做的是找到距每个学校最近的10个车站，然后您只需将车站数据子集分配给学校（对日期一无所知）。

您的最终数据帧应该更好，更易于使用-可能不是3个单独的宽数据帧，而是这样的：

set.seed(1)  # FAKE DATA
final <- data.frame(ID_School = rep(LETTERS[1],10), ID_Station = sample(1:100,10), 
                   Closeness_Rank = 1:10, Distance = 10*(1:10) + sample(-5:5,10), 
                   Temp.Max = sample(70:100,10), Temp.Min = sample(30:69,10), 
                   Precipitation = sample(20:30,10)/100)
final
#   ID_School ID_Station Closeness_Rank Distance Temp.Max Temp.Min Precipitation
#1          A         27              1        7       98       49          0.29
#2          A         37              2       16       76       53          0.26
#3          A         57              3       31       88       48          0.27
#4          A         89              4       38       73       36          0.24
#5          A         20              5       50       77       59          0.23
#6          A         86              6       65       80       68          0.28
#7          A         97              7       72       70       57          0.20
#8          A         62              8       79       79       33          0.21
#9          A         58              9       94       90       64          0.22
#10         A          6             10      103       96       42          0.30

在不知道如何测量车站，学校数据或其他信息的距离的情况下，我无法帮助您获得这种格式，但是如果您提供更多信息，我将很乐意为您提供帮助。

编辑：

此方法似乎很慢，因为我没有真正正确地使用data.tables，但希望它能给您一些想法。我生成的伪造数据可能对您将来解释问题很有用。我的方法是只建立FINAL输出，这是一个日间学校数据表。该数据表是通过对最近10个站点的平均距离得出的天气数据进行反演而得出的。

这个过程非常缓慢，一天的学习时间大约为5分钟，大约需要7分钟的7800天的学校天气，因此6年半的时间才能完成5年-但没有内存问题！您会发布这种代码，询问是否有人可以提高速度。

# Starting from the beginning
set.seed(100)
library(data.table)

n_station <- 800
n_school <- 7800
station_info <- data.frame(ID_Station = 1:n_station, 
           xcoord = sample(-10000:10000,n_station), 
           ycoord = sample(-10000:10000,n_station))

school_info <- data.frame(ID_School = 1:n_school, 
           xcoord = sample(-10000:10000,n_school), 
           ycoord = sample(-10000:10000,n_school))

# save list of ~20 closest stations by school, 
# and always use 10 of the closest where measurements are available
x <- 20 
L <- vector('list', nrow(school_info)) # always initialize for speed
for(i in 1:nrow(school_info)){
    distances <- sqrt((school_info[i,"xcoord"] - station_info[,"xcoord"])^2 + 
                      (school_info[i,"ycoord"] - station_info[,"ycoord"])^2)
    L[[i]] <- cbind.data.frame(ID_School = rep(school_info[i,"ID_School"],x),
                               ID_Station = station_info[ which(order(distances) <= x), 
                                                         "ID_Station"],
                               Distance_Rank = 1:x,
                               Distance = sort(distances)[1:x])
}
L[[1]]
#        ID_School ID_Station Distance_Rank  Distance
# 1:         1          2             1  127.2242
# 2:         1         32             2  365.7896
# 3:         1         92             3  573.0428
# 4:         1        141             4  763.5837
# 5:         1        151             5 1003.4127

连续5年的每日虚假天气数据：

days <- seq.Date(as.Date("2010-01-01"),as.Date("2015-12-31"),by="1 day")
d <- length(days)
S <- vector('list', nrow(station_info))
for(i in 1:nrow(station_info)){
  S[[i]] <- data.frame(ID_Station = rep(station_info[i,"ID_Station"],d),
                       Temp.Max = sample(70:100,d,T),
                       Temp.Min = sample(30:69,d,T), 
                       Precipitation = sample(20:30,d,T)/100,
                       date = days)
  # maybe remove some dates at random
  if(sample(c(T,F),1)) S[[i]] <- S[[i]][-sample(1:d,1),]
}
station_data <- as.data.table(do.call(rbind,S))
station_data
#        ID_Station Temp.Max Temp.Min Precipitation       date
#     1:          1       88       55          0.23 2010-01-01
#     2:          1       73       57          0.24 2010-01-02
#     3:          1       93       33          0.29 2010-01-03
#     4:          1       81       52          0.27 2010-01-04
#     5:          1       82       48          0.24 2010-01-05
#    ---                                                      
#291610:        800       86       31          0.28 2010-12-27
#291611:        800       98       57          0.22 2010-12-28
#291612:        800       71       50          0.26 2010-12-29
#291613:        800       83       35          0.26 2010-12-30
#291614:        800       71       34          0.23 2010-12-31

算法：

size <- length(days) * n_school
#OUT <- data.table(ID_School = integer(size),
#                  date = as.Date(x = integer(size), origin = "1970-01-01"),
#                  wtd_Temp.Max= numeric(size),
#                  wtd_Temp.Min= numeric(size),
#                  wtd_Precip= numeric(size))
OUT <- vector('list',size) # faster

unique_school <- unique(school_data$ID_School) # will be length(n_school)
#length(L) is the same as length(unique(school)= n_school)

count = 0
for(i in 1:length(days)){
  t1 <- Sys.time()
  temp_weather_data = station_data[date==days[i],]
  m <- merge(school_data, temp_weather_data, "ID_Station")
setkey(m, ID_School) # the key is ID_School
  for(j in 1:length(unique_school)){
    count = count + 1
    # assuming within the closest 20 stations, at least 10 have data every day
    r <- m[.(j),][1:10] # find schools j in key
    invd <- 1/r$Distance
    sum.invd <- sum(invd)
    OUT[[count]] <- data.table(ID_School = unique_school[j], 
                               date = days[i], 
                               wtd_Temp.Max = sum(invd * r$Temp.Max)/sum.invd,
                               wtd_Temp.Min = sum(invd * r$Temp.Min)/sum.invd,
                               wtd_Precip = sum(invd * r$Precipitation)/sum.invd)
  if(j %% 100 == 0) cat(as.character(days[i]),".....",unique_school[j],"...\n")
  }
  cat(Sys.time()-t1)
}

哪个给出最终输出：

do.call(rbind,OUT)
#    ID_School       date wtd_Temp.Max wtd_Temp.Min wtd_Precip
# 1:         1 2010-01-01     88.64974     44.07872  0.2757571
# 2:         2 2010-01-01     83.34549     46.80225  0.2511073
# 3:         3 2010-01-01     85.32834     48.62004  0.2347837
# 4:         4 2010-01-01     82.95667     48.01814  0.2576482
# 5:         5 2010-01-01     87.88982     44.45357  0.2527794
# ---

如何在R中的for循环中减少内存使用

解决方案

1 个答案: