更快地计算雨季的开始日期

时间:2017-07-25 15:49:17

标签: r performance loops

我有大量 csv 数据文件(大约 400 万个),每个文件包含 30 年的气候数据(温度、降水等)。我想用这些数据来计算雨季的开始日期。每个文件对应于网格中的一个单元格(网格共 2544 行 × 1928 列)。我写了一个 for 循环来计算网格中每一行的雨季开始日,每一行的结果是一个 57,840 行(1928 × 30)的文件。脚本工作正常,但网格的每一行大约需要 18 分钟,这意味着我需要 31.8 天才能完成所有计算。

有没有人知道如何加快这些计算?

以下是代码:

# Rainy-season onset per grid cell.
#
# Every CSV found under `path` holds the daily climate record of one grid cell
# (columns used here: year, month, day, precip). For each cell and each target
# year the script finds the day on which the cumulative precipitation anomaly
# reaches its minimum inside a 100-day window centred on the climatological
# onset day, and writes one "year, onset, cell" line per cell and year.

# Start of the run; the elapsed time is reported at the end of the script.
start.time <- Sys.time()

# SET PATH TO THE INPUT DIRECTORY (one directory per grid row)
path <- "Z:/md/projects/.../climate-data/row-0"
files <- list.files(
  path = path, full.names = TRUE, recursive = TRUE,
  pattern = glob2rx("*col*.csv*")
)

# The directory name (e.g. "row-0") labels the output file. basename() is used
# so the label does not depend on how deep `path` sits in the file system.
name <- basename(path)

# Index of the climatological onset day.
#
# Averages precipitation for every calendar day (month/day pair) over all
# years, subtracts the long-term daily mean `Q`, accumulates the anomaly in
# calendar order and returns the position of its minimum. Calendar days that
# never occur (e.g. Feb 30) are dropped before accumulating, so the result is
# an index into the sequence of existing calendar days.
#
# asc: data frame with columns month, day and precip.
# Q:   long-term mean daily precipitation.
# Returns an integer of length 1, or length 0 when no daily mean is available.
climatological_onset_doy <- function(asc, Q) {
  usable <- asc$month %in% 1:12 & asc$day %in% 1:31
  # One slot per (month, day) pair, numbered in calendar order; empty slots
  # come back from tapply() as NA and are removed below.
  slot <- factor(
    (asc$month[usable] - 1) * 31 + asc$day[usable],
    levels = 1:372
  )
  daily_mean <- as.vector(tapply(asc$precip[usable], slot, mean))
  daily_mean <- daily_mean[!is.na(daily_mean)]
  which.min(cumsum(daily_mean - Q))
}

# Onset day of the rainy season for each requested year of one grid cell.
#
# asc:         data frame with columns year, month, day and precip, one row
#              per day.
# years:       target years.
# half_window: number of days searched before the climatological onset day;
#              the window then spans 2 * half_window days in total.
# Returns a data frame with one row per target year and the columns
#   year  - year in which the minimum falls (it can be the year before or
#           after the target year when the window crosses a year boundary),
#   onset - position of that day within its year (1 = first record of the
#           year); NA when no onset could be determined.
#
# NOTE(review): the climatological index counts Feb 29 whenever the record
# contains a leap year, while it is matched against the within-year position
# of each individual year; this mirrors the original calculation.
compute_cell_onsets <- function(asc, years = 1982:2011, half_window = 50L) {
  # The positional logic below needs chronological order; for files that are
  # already sorted this is a no-op.
  asc <- asc[order(asc$year, asc$month, asc$day), , drop = FALSE]
  n_days <- nrow(asc)

  Q <- mean(asc$precip)
  anomaly <- asc$precip - Q
  # Position of every record within its own year.
  doy <- ave(seq_len(n_days), asc$year, FUN = seq_along)
  clim_onset <- climatological_onset_doy(asc, Q)

  find_onset <- function(target_year) {
    unresolved <- c(as.numeric(target_year), NA_real_)

    centre <- which(asc$year == target_year & doy == clim_onset)
    if (length(centre) != 1L) {
      return(unresolved)
    }

    # 100-day window around the climatological onset, restricted to the
    # target year and its two neighbours.
    window <- seq(
      max(1L, centre - half_window),
      min(n_days, centre + half_window - 1L)
    )
    window <- window[which(abs(asc$year[window] - target_year) <= 1)]

    # which.min() returns the first minimum, so ties cannot produce several
    # rows for one year.
    best <- window[which.min(cumsum(anomaly[window]))]
    if (length(best) == 0L) {
      return(unresolved)
    }
    as.numeric(c(asc$year[best], doy[best]))
  }

  found <- vapply(years, find_onset, numeric(2))
  data.frame(year = found[1, ], onset = found[2, ])
}

if (length(files) == 0L) {
  warning(
    "No input files matching '*col*.csv*' found under ", path,
    "; nothing written.",
    call. = FALSE
  )
  results <- data.frame()
} else {
  # One result table per file, bound together once at the end instead of
  # growing a data frame row by row. The files are independent of each other,
  # so this lapply() is also the natural place to parallelise.
  per_cell <- lapply(files, function(file) {
    asc <- read.table(file, sep = ",", skip = 0, header = TRUE)
    onsets <- compute_cell_onsets(asc)
    onsets$cell <- tools::file_path_sans_ext(basename(file))
    onsets
  })
  results <- do.call(rbind, per_cell)

  # Written next to the input files, as before, but without changing the
  # working directory of the R session.
  write.table(
    results,
    file = file.path(path, paste0("results_", name, ".csv")),
    row.names = FALSE, col.names = TRUE, sep = ", ", quote = FALSE
  )
}

end.time <- Sys.time()
time <- end.time - start.time
print(time)

0 个答案:

没有答案