我正在尝试以编程方式检索数据,并将其存储在具有适当列名等的(已命名)数据框中。我的 for 循环无法检索数据并将其存储到相应命名的数据框中(参见:pg$Df_names)。感谢大家的帮助。修正后的代码见 iv),供参考——如有任何改进建议,欢迎提出。
# ---- Package bootstrap -----------------------------------------------------
# Packages the script relies on. Names are case-sensitive on CRAN: the curl
# bindings are published as "RCurl" (the previous spelling "Rcurl" could never
# be installed or loaded). "rvest" used to be listed twice.
necessary_packages <- c(
  "readr", "readxl", "xlsx", "stringr", "dplyr", "tidyr",
  "rvest", "xml2", "SnowballC", "httr", "RCurl"
)

# Install whatever is not yet present in the local library.
new_packages <- necessary_packages[
  !(necessary_packages %in% installed.packages()[, "Package"])
]
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Attach every package. require() returns FALSE instead of raising an error,
# so the results are collected and any failure is reported explicitly rather
# than being silently dropped. Loading stays best-effort (a warning, not a
# stop) so that one unavailable package does not prevent the rest of the
# script from running.
packages_loaded <- vapply(
  necessary_packages,
  function(pkg) require(pkg, character.only = TRUE),
  logical(1)
)
if (!all(packages_loaded)) {
  warning(
    "Could not load package(s): ",
    paste(necessary_packages[!packages_loaded], collapse = ", "),
    call. = FALSE
  )
}
# ---- Discover the downloadable tables --------------------------------------
# Builds `pg`, a data frame with one row per spreadsheet linked from the index
# page:
#   pg$urls      absolute URL of the file
#   pg$Df_names  file name without extension, punctuation replaced by "_"
Base_URL <- "http://www.rba.gov.au/statistics/tables/"
file_types <- c(".xls", ".csv", ".xlsx")

# Every href found on the index page (anchors without an href yield NA).
hrefs <- html_attr(html_nodes(read_html(Base_URL), "a"), "href")
hrefs <- hrefs[!is.na(hrefs)]

# Match the extensions literally and only at the end of the path (optionally
# followed by a query string or fragment). The previous pattern left the dots
# unescaped, so ".xls" matched any character followed by "xls" anywhere in
# the link.
extension_pattern <- paste0(
  "\\.(", paste(sub("^\\.", "", file_types), collapse = "|"), ")([?#]|$)"
)
file_links <- unique(grep(extension_pattern, hrefs, value = TRUE))

# Resolve each link against the page URL. This gives the same result as the
# old paste0()/gsub() construction for site-relative links and also copes
# with links that are already absolute.
pg <- data.frame(
  urls = xml2::url_absolute(file_links, Base_URL),
  stringsAsFactors = FALSE
)

# Data-frame name: last path segment, cut at the first dot, with every
# punctuation character (including "-") replaced by an underscore.
file_names <- sub(".*/", "", pg$urls)
pg$Df_names <- gsub("[[:punct:]]", "_", sub("\\..*", "", file_names))
# ---- Download every table and collect the data frames ----------------------
#
# Inputs : pg$urls, pg$Df_names (one row per file to download).
# Outputs (same object names as before):
#   df_list        list of data frames, one per worksheet / CSV file, now
#                  *named* so a table can be fetched as df_list[["<name>"]]
#   df_names_list  character vector with those names (NULL if nothing loaded)
#   h              next free index in df_list (kept for backward compatibility)
#
# Problems in the previous version that this rewrite addresses:
#   * grep(".xls", url) also matched ".xlsx" and the second branch looked for
#     the misspelt ".xslx", so .xlsx files were saved with an .xls extension
#     and read with read_xls(); the format is now taken from the real
#     extension and readxl::read_excel() picks the matching reader.
#   * `i <- i + 1` / `j <- j + 1` inside a for loop do not skip an iteration,
#     so after a failure the code carried on with stale `tf`, `shiet`, `tmp1`
#     or `X`; failures now skip the file/sheet with a warning.
#   * Every download and read ran twice (once inside try(), once after it).
#   * HTTP error responses were written to disk and parsed as spreadsheets.
#   * 1:length(shiet) iterated over c(1, 0) when no sheet was left.
#   * Names in df_names_list were never attached to df_list.
#   * Temporary files were never deleted.

df_names_list <- NULL
files_to_remove <- NULL
df_list <- list()

# Worksheets that hold notes rather than data.
skip_sheets <- c("Summary", "Notes", "Series breaks")
# Number of leading rows skipped before the data block (same value for every
# file format, as in the original code).
header_rows <- 11

# Turn the cells of a header row into column names: stem, title-case, then
# strip the "for series breaks ..." suffix, punctuation and spaces.
clean_series_names <- function(header_row) {
  titles <- vapply(
    header_row,
    function(cell) as.character(cell)[1],
    character(1),
    USE.NAMES = FALSE
  )
  cleaned <- rep("", length(titles))
  has_text <- !is.na(titles) & nzchar(titles)
  if (any(has_text)) {
    cleaned[has_text] <- trimws(gsub(
      "for series breaks.*|[[:punct:]]| |Seenot",
      "",
      tools::toTitleCase(
        SnowballC::wordStem(titles[has_text], language = "porter")
      )
    ))
  }
  # Empty header cells get a positional placeholder instead of "" / "NA".
  blank <- !nzchar(cleaned)
  cleaned[blank] <- paste0("V", which(blank) + 1L)
  cleaned
}

# Rename the columns of `data` to "Date" followed by the cleaned header
# names. Only the columns both sides have in common are renamed, so a width
# mismatch between header and data no longer raises an error.
apply_series_names <- function(data, header_row) {
  new_names <- c("Date", clean_series_names(header_row))
  n_common <- min(length(new_names), ncol(data))
  colnames(data)[seq_len(n_common)] <- new_names[seq_len(n_common)]
  data
}

# Read every data worksheet of a local Excel file. Returns a named list of
# data frames; sheets with two or fewer columns, or that fail to parse, are
# left out.
read_excel_tables <- function(path, base_name) {
  sheets <- readxl::excel_sheets(path)
  sheets <- sheets[!(sheets %in% skip_sheets)]

  read_one_sheet <- function(sheet) {
    tryCatch(
      {
        # First pass without skipping: provides the header row (row 2).
        raw <- as.data.frame(
          readxl::read_excel(path, sheet = sheet, col_names = FALSE)
        )
        if (ncol(raw) > 2) {
          # Second pass: the data block below the metadata rows.
          data <- as.data.frame(readxl::read_excel(
            path,
            sheet = sheet,
            skip = header_rows,
            col_names = FALSE
          ))
          apply_series_names(data, raw[2, 2:ncol(raw)])
        } else {
          NULL
        }
      },
      error = function(e) {
        warning(
          "Skipping sheet '", sheet, "' of ", base_name, ": ",
          conditionMessage(e),
          call. = FALSE
        )
        NULL
      }
    )
  }

  tables <- lapply(sheets, read_one_sheet)
  names(tables) <- gsub(" ", "_", trimws(paste0(base_name, "_", sheets)))
  Filter(Negate(is.null), tables)
}

# Read a local CSV file. Returns a named list holding one data frame, or an
# empty list when the file has fewer than two columns.
read_csv_table <- function(path, base_name) {
  data <- as.data.frame(read.csv(path, skip = header_rows, header = FALSE))
  # NOTE(review): this pass keeps read.csv()'s default header = TRUE, exactly
  # as before, so row 2 here is the third line of the file, whereas the Excel
  # path reads its header pass with col_names = FALSE. Confirm which line of
  # the CSV really carries the series titles.
  raw <- as.data.frame(read.csv(path))
  if (ncol(raw) < 2) {
    return(list())
  }
  tables <- list(apply_series_names(data, raw[2, 2:ncol(raw)]))
  names(tables) <- gsub(" ", "_", trimws(base_name))
  tables
}

# Download one file and return its tables as a named list of data frames.
# Any failure (network, HTTP status, parsing) yields a warning and an empty
# list so that the remaining files are still processed.
read_rba_tables <- function(url, base_name) {
  # Extension of the path component, ignoring any query string or fragment.
  ext <- tolower(tools::file_ext(sub("[?#].*$", "", url)))
  if (!(ext %in% c("xls", "xlsx", "csv"))) {
    return(list())
  }

  # Forward slashes keep the path usable on Windows as well.
  tf <- chartr("\\", "/", tempfile(fileext = paste0(".", ext)))
  on.exit(unlink(tf), add = TRUE)

  tryCatch(
    {
      # GET() does not raise on HTTP 4xx/5xx; stop_for_status() does.
      httr::stop_for_status(
        httr::GET(url, httr::write_disk(tf, overwrite = TRUE))
      )
      if (ext == "csv") {
        read_csv_table(tf, base_name)
      } else {
        read_excel_tables(tf, base_name)
      }
    },
    error = function(e) {
      warning("Skipping ", url, ": ", conditionMessage(e), call. = FALSE)
      list()
    }
  )
}

# One list of tables per file, then flattened into a single named list.
tables_per_file <- vector("list", nrow(pg))
for (i in seq_len(nrow(pg))) {
  tables_per_file[[i]] <- read_rba_tables(pg$urls[i], pg$Df_names[i])
}
df_list <- do.call(c, c(list(df_list), tables_per_file))

df_names_list <- names(df_list)
h <- length(df_list) + 1

# To expose each table as its own object in the workspace instead of
# accessing it through the list, run: list2env(df_list, envir = .GlobalEnv)