我刚接触R(并且通常是编程),并且使自己失业;)
我编写了一个脚本,该脚本(1)提取一个“ API号”的CSV文件,(2)查找并下载每个API号的HTML表,(3)将信息另存为CSV表。它有效-只是不漂亮。问题之一是我从中下载数据的网站有时会显示500 Internal Server Error。为了解决网站的零星可用性,我构建了一些真正的丑陋的嵌套if语句,这些语句会延迟脚本以增加时间。这太过分了,但是我不希望一夜之间下载失败。
我正在寻找有关变通方法下载延迟的反馈。有一个更好的方法吗?有没有办法告诉R继续尝试下载,直到成功?
此脚本将下载数据并将每个API号保存为单独的CSV。 API编号的示例列表有60个。您可以在这里找到它:https://www.dropbox.com/s/fwvcxun8hr0xy4n/API%20List.csv?dl=0
谢谢!
######################### User-Defined Parameters ##########################################
### Specify where the API list is and where to download temp data
welllist = ".../API List.csv" # each API will have a seperate CSV in this directory as well
tempdata = ".../tempdata.txt"
######################### Get API List and Parse API ##########################################
wells = read.csv(file = welllist, header = 1, sep = ",")
colnum = 1
rownum = nrow(wells)
API = data.frame(1:rownum,"A","B","C",stringsAsFactors = F)
colnames(API) = c("number", "type","county","sequence")
for (i in 1:rownum) {
current = toString(wells[i,colnum])
dashloc = as.data.frame(gregexpr(pattern = "-", text = current))
type = substr(x = current, start = 0, stop = dashloc[1,1]-1)
if (type != "05") {print(paste("WARNING! API DOES NOT BEGIN WITH 05", "- WELL", i,wells[i,2]))}
county = substr(x = current, start = dashloc[1,1]+1, stop = dashloc[2,1]-1)
sequence = substr(x = current, start = dashloc[2,1]+1, stop = nchar(current))
API$type[i] = type
API$county[i] = county
API$sequence[i] = sequence
}
######################### Download the Data ##########################################
end = nrow(API)
for (i in 1:end) {
county = API$county[i]
sequence = API$sequence[i]
dataurl = paste("http://cogcc.state.co.us/production/?&apiCounty=",county,"&apiSequence=",sequence,sep = "")
### ***** U-G-L-Y Retry Data Download if Server Error or if File Size is Too Small ***** ###
err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
if (class(err) == "try-error" || file.size(tempdata) < 300000) {
Sys.sleep(2)
err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
}
if (class(err) == "try-error" || file.size(tempdata) < 300000) {
Sys.sleep(4)
err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
}
if (class(err) == "try-error" || file.size(tempdata) < 300000) {
Sys.sleep(8)
err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
}
if (class(err) == "try-error" || file.size(tempdata) < 300000) {
Sys.sleep(16)
err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
}
if (class(err) == "try-error" || file.size(tempdata) < 300000) {
Sys.sleep(32)
err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
}
if (class(err) == "try-error" || file.size(tempdata) < 300000) {
Sys.sleep(64)
err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
}
if (class(err) == "try-error" || file.size(tempdata) < 300000) {
Sys.sleep(128)
err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
}
if (class(err) == "try-error" || file.size(tempdata) < 300000) {
Sys.sleep(256)
err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
}
if (class(err) == "try-error" || file.size(tempdata) < 300000) {
Sys.sleep(512)
err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
}
if (class(err) == "try-error" || file.size(tempdata) < 300000) {
Sys.sleep(1024)
err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
}
if (class(err) == "try-error" || file.size(tempdata) < 300000) {
write.csv(x = paste("Error downloading", sequence, "at", Sys.time()), file = paste(dirname(wells),"errorlog.txt",sep = "/"))
next
}
### Save the CSV ###
write.csv(x = tempdata, file = paste(dirname(welllist),"/",sequence,"_production.csv",sep = ""))
}
该网站有时会中断并给出以下消息:HTTP状态为“ 500 Internal Server Error”(内部服务器错误)