R,获取数据并将excel / csv文件存储为单独的数据框

时间:2018-12-23 13:20:49

标签: r dataframe web-scraping screen-scraping economics

我正在尝试以编程方式在具有适当列名等的(命名)数据帧中检索和存储数据。我的for循环无法检索数据并将其存储在适当命名的数据框中(请参阅:pg $ Df_names) 。为您的所有帮助加油。 iv)中提供了固定代码以供参考-请提供您可能有的任何改进。

necessary_packages <- c("readr", "readxl", "xlsx","stringr", "dplyr", "tidyr", "rvest", "xml2","SnowballC", "httr", "Rcurl", "rvest")

new_packages <- necessary_packages[!(necessary_packages %in% 
                                       installed.packages()[,"Package"])]

if(length(new_packages)) install.packages(new_packages)

lapply(necessary_packages, require, character.only = TRUE)

i)创建一个数据框,其中包含RBA数据的所有网址以及数据框名称:

Base_URL <- "http://www.rba.gov.au/statistics/tables/"

ii)将html或excel文件中的html读入R解析文本:

pg <- as.data.frame((html_attr(html_nodes(read_html(Base_URL), "a"), "href")), stringsAsFactors = FALSE)
file_types <- c(".xls", ".csv", ".xlsx")
pg <- as.data.frame(paste0(Base_URL, gsub("/statistics/tables/",
                                            "", 
                                            unique(grep(paste(file_types, collapse = "|"), 
                                                        pg[,1], value = TRUE)))), stringsAsFactors = FALSE)
colnames(pg) <- "urls"

iii)存储数据帧名称的字符向量:

pg$Df_names <- gsub("\\-|[[:punct:]]", "_", 
                    gsub("\\..*",
                    "",
                    sapply(strsplit(pg$urls, split= "\\/"), function(x){x[length(x)]})))

iv)将每个excel文件存储为由字符向量的每个元素命名的单独数据框:

df_names_list <- NULL
files_to_remove <- NULL
df_list <- list()
h = 1
i = 1
j = 1
for (i in 1:nrow(pg)) {
  if (length(grep(".xls", pg$urls[i])) > 0) {
    tryit <- try(GET(pg$urls[i], write_disk(tf <-
                                              tempfile(fileext = ".xls"), overwrite = TRUE)))
    if (inherits(tryit, "try-error")) {
      i <- i + 1
    } else{
      GET(pg$urls[i], write_disk(tf <-
                                   tempfile(fileext = ".xls"), overwrite = TRUE))
      shiet <- excel_sheets(tf)[which(
        excel_sheets(tf) != "Summary" &
          excel_sheets(tf) != "Notes" &
          excel_sheets(tf) != "Series breaks"
      )]
    }
    for (j in 1:length(shiet)) {
      tryit <- try(tmp1 <-
                     as.data.frame(read_xls(tf, sheet = shiet[j], col_names = FALSE)))
      if (inherits(tryit, "try-error")) {
        j <- j + 1
      } else{
        tmp1 <-
          as.data.frame(read_xls(tf, sheet = shiet[j], col_names = FALSE))
      }
      if (ncol(tmp1) > 2) {
        tryit <- try(X <- as.data.frame(read_xls(
          gsub('[\\]', '\\/', tf),
          skip = 11,
          col_names = FALSE,
          sheet = shiet[j]
        )))
        if (inherits(tryit, "try-error")) {
          j <- j + 1
        } else{
          X <- as.data.frame(read_xls(
            gsub('[\\]', '\\/', tf),
            skip = 11,
            col_names = FALSE,
            sheet = shiet[j]
          ))
        }
        colnames(X) <- c("Date",
                         c(trimws(
                           gsub(
                             "for series breaks.*|[[:punct:]]| |Seenot",
                             "",
                             tools::toTitleCase(wordStem(tmp1[2, 2:ncol(tmp1)],
                                                         language = "porter"))
                           ),
                           which = c("both", "left", "right")
                         )))
        df_list[[h]] <- X
        df_names_list <-
          c(df_names_list, gsub(" ", "_", trimws(
            paste0(pg$Df_names[i], "_", shiet[j]),
            which = c("both", "left", "right")
          )))
        h <- h + 1
      } else{
        rm(tmp1)
        j <- j + 1
      }
    }
  } else if (length(grep(".xslx", pg$urls[i])) > 0) {
    tryit <- try(GET(pg$urls[i], write_disk(tf <-
                                              tempfile(fileext = ".xlsx"), overwrite = TRUE)))
    if (inherits(tryit, "try-error")) {
      i <- i + 1
    } else{
      GET(pg$urls[i], write_disk(tf <-
                                   tempfile(fileext = ".xlsx"), overwrite = TRUE))
      shiet <- excel_sheets(tf)[which(
        excel_sheets(tf) != "Summary" &
          excel_sheets(tf) != "Notes" &
          excel_sheets(tf) != "Series breaks"
      )]
    }
    for (j in 1:length(shiet)) {
      tryit <- try(tmp1 <-
                     as.data.frame(read_xlsx(tf, sheet = shiet[j], col_names = FALSE)))
      if (inherits(tryit, "try-error")) {
        j <- j + 1
      } else{
        tmp1 <-
          as.data.frame(read_xlsx(tf, sheet = shiet[j], col_names = FALSE))
      }
      if (ncol(tmp1) > 2) {
        tryit <- try(X <- as.data.frame(read_xlsx(
          gsub('[\\]', '\\/', tf),
          skip = 11,
          col_names = FALSE,
          sheet = shiet[j]
        )))
        if (inherits(tryit, "try-error")) {
          j <- j + 1
        } else{
          X <- as.data.frame(read_xlsx(
            gsub('[\\]', '\\/', tf),
            skip = 11,
            col_names = FALSE,
            sheet = shiet[j]
          ))
        }
        colnames(X) <- c("Date",
                         c(trimws(
                           gsub(
                             "for series breaks.*|[[:punct:]]| |Seenot",
                             "",
                             tools::toTitleCase(wordStem(tmp1[2, 2:ncol(tmp1)],
                                                         language = "porter"))
                           ),
                           which = c("both", "left", "right")
                         )))
        df_list[[h]] <- X
        df_names_list <-
          c(df_names_list, gsub(" ", "_", trimws(
            paste0(pg$Df_names[i], "_", shiet[j]),
            which = c("both", "left", "right")
          )))
        h <- h + 1
      } else{
        rm(tmp1)
        j <- j + 1
      }
    }
  } else if (length(grep(".csv", pg$urls[i])) > 0) {
    tryit <- try(X <- as.data.frame(read.csv(pg$urls[i],
                                             skip = 11,
                                             header = FALSE)))
    if (inherits(tryit, "try-error")) {
      i <- i + 1
    } else {
      X <- as.data.frame(read.csv(pg$urls[i],
                                  skip = 11,
                                  header = FALSE))
    }
    tmp1 <- as.data.frame(read.csv(pg$urls[i]))
    if (ncol(tmp1) >= 2) {
      colnames(X) <- c("Date",
                       c(trimws(
                         gsub(
                           "for series breaks.*|[[:punct:]]| |Seenot",
                           "",
                           tools::toTitleCase(wordStem(tmp1[2, 2:ncol(tmp1)],
                                                       language = "porter"))
                         ),
                         which = c("both", "left", "right")
                       )))
      df_list[[h]] <- X
      df_names_list <-
        c(df_names_list, gsub(" ", "_", trimws(
          pg$Df_names[i],
          which = c("both", "left", "right")
        )))
      h <- h + 1

    } else{
      rm(tmp1)
      j <- j + 1
    }
  }
}

0 个答案:

没有答案