我有一大堆csv数据文件(大约400万),每个文件包含30年的气候数据(温度,降水等)。我想用这些数据来计算雨季的开始。每个文件对应于网格的单元格(2544行×1928列)。我写了一个for循环来计算每一行的开始天数,结果是一个57.840行(1928 x 30)的文件。脚本工作正常,但行需要大约18分钟,这意味着我需要31.8天才能完成所有计算。
有没有人知道如何加快这些计算?
以下是代码:
#SET PATH WORKING DIRECTORY
path="Z:/md/projects/.../climate-data/row-0"
setwd(path)
files <- list.files(path =path, full.names = T, recursive = T, pattern=glob2rx("*col*.csv*"))
results<-data.frame()
name <- strsplit(path, "/")[[1]][6]
for (file in files){
asc<-read.table(file, sep=",", skip=0, header=T)
name2 <- strsplit(file, "/")[[1]][7]
name2<-str_sub(name2,-nchar(name2),-5)
Q<-mean(asc$precip)
x<-1
vector <- vector("numeric")
for (i in 1:12){
for (j in 1:31){
d<-asc[ which(asc$month == i & asc$day == j) , ]
d<-mean(d$prec)
vector[x] <-d # get average precipitation per day
x<-x+1
}
}
vector<-vector[ !is.na(vector)] # delete NA values
vector<-vector-Q # substract annual precipitation from daily precipitation
vector<-cumsum(vector) # build the cumulative sum of anomaly
min<-which.min(vector)
x<-1
for (i in 1982:2011){
df1<-subset(asc, asc$year==i |asc$year==i-1 | asc$year==i+1)
DOY1<-seq(1, nrow(df1[df1$year == i-1,]), 1)
DOY2<-seq(1, nrow(df1[df1$year == i,]), 1)
DOY3<-seq(1, nrow(df1[df1$year == i+1,]), 1)
DOY<-c(DOY1, DOY2, DOY3)
df1$DOY<-DOY
df1$DOY2<-seq(1, nrow(df1), 1)
DOY2<-df1[df1$DOY == min & df1$year == i, ]
DOY2<-DOY2$DOY2
df2<-subset(df1, df1$DOY2>=DOY2-50 & df1$DOY2<DOY2+50)
df2$anomaly<-df2$precip-Q
df2$accum<-cumsum(df2$anomaly)
min_new<-df2[df2$accum == min(df2$accum), ]
year<-min_new$year
onset<-min_new$DOY
results <-rbind(results, c(year, onset, name2), stringsAsFactors = FALSE)
x<-x+1
}
}
write.table(results, file=paste("results_", name, ".csv", sep=""), row.names = FALSE, col.names = T, sep = ", ", quote=F)
end.time <- Sys.time()
time<-end.time-start.time
time