使用循环来处理和格式化多个.csv文件

时间:2018-07-31 13:12:10

标签: r loops dplyr lapply

我有一堆要读入R的csv文件。

以下示例数据可帮助进行此查询。

# Sample "_H" tables, given as dput-style structure() literals of class
# c("tbl_df", "tbl", "data.frame"). Each has two rows and the columns
# Name, user_id, sales, shape, size, plus a `spec` attribute of class
# "col_spec" listing one collector per column.
# BCD_H and XYZ_H carry identical values in this sample.
ABC_H <- structure(list(Name = c("Jim Smith", "John Doe"), user_id = c(23L, 25L), sales = c(300L, 140L), shape = c(6L, 200L), size = c(402L, 305L)),row.names= c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"), spec = structure(list(cols = list(Name = structure(list(), class = c("collector_character", "collector")), user_id = structure(list(), class = c("collector_integer", "collector")), sales = structure(list(), class = c("collector_integer", "collector")), shape = structure(list(), class = c("collector_integer", "collector")), size = structure(list(), class = c("collector_integer", "collector"))), default = structure(list(), class = c("collector_guess", "collector"))), class = "col_spec"))
BCD_H <- structure(list(Name = c("Jim Smith", "John Doe"), user_id = c(23L, 25L), sales = c(190L, 120L), shape = c(10L, 5L), size = c(500L, 200L)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"), spec =structure(list(cols = list(Name = structure(list(), class = c("collector_character", "collector")), user_id = structure(list(), class = c("collector_integer", "collector")), sales = structure(list(), class = c("collector_integer", "collector")), shape = structure(list(), class = c("collector_integer", "collector")), size = structure(list(), class = c("collector_integer", "collector"))), default = structure(list(), class = c("collector_guess", "collector"))), class = "col_spec"))
XYZ_H <- structure(list(Name = c("Jim Smith", "John Doe"), user_id = c(23L, 25L), sales = c(190L, 120L), shape = c(10L, 5L), size = c(500L, 200L)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"), spec =structure(list(cols = list(Name = structure(list(), class = c("collector_character", "collector")), user_id = structure(list(), class = c("collector_integer", "collector")), sales = structure(list(), class = c("collector_integer", "collector")), shape = structure(list(), class = c("collector_integer", "collector")), size = structure(list(), class = c("collector_integer", "collector"))), default = structure(list(), class = c("collector_guess", "collector"))), class = "col_spec"))

# Sample "_P" tables, given as dput-style structure() literals of class
# c("tbl_df", "tbl", "data.frame"). Each has two rows and the columns
# Name, user_id, color, number, scale, plus a `spec` attribute of class
# "col_spec" listing one collector per column.
# BCD_P and XYZ_P carry identical values in this sample.
ABC_P <- structure(list(Name = c("Alex Smith", "John Curry"), user_id = c(102L, 433L), color = c(164L, 153L), number = c(20L, 3L), scale = c(6L, 1L)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"), spec = structure(list(cols = list(Name = structure(list(), class = c("collector_character", "collector")), user_id = structure(list(), class = c("collector_integer", "collector")), color = structure(list(), class = c("collector_integer", "collector")), number = structure(list(), class = c("collector_integer", "collector")), scale = structure(list(), class = c("collector_integer", "collector"))), default = structure(list(), class = c("collector_guess", "collector"))), class = "col_spec"))
BCD_P <- structure(list(Name = c("Alex Smith", "John Curry"), user_id = c(102L, 433L), color = c(300L, 200L), number = c(100L, 4L), scale = c(2L, 5L)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"), spec = structure(list(cols = list(Name = structure(list(), class = c("collector_character", "collector")), user_id = structure(list(), class = c("collector_integer", "collector")), color = structure(list(), class = c("collector_integer", "collector")), number = structure(list(), class = c("collector_integer", "collector")), scale = structure(list(), class = c("collector_integer", "collector"))), default = structure(list(), class = c("collector_guess", "collector"))), class = "col_spec"))
XYZ_P <- structure(list(Name = c("Alex Smith", "John Curry"), user_id = c(102L, 433L), color = c(300L, 200L), number = c(100L, 4L), scale = c(2L, 5L)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"), spec = structure(list(cols = list(Name = structure(list(), class = c("collector_character", "collector")), user_id = structure(list(), class = c("collector_integer", "collector")), color = structure(list(), class = c("collector_integer", "collector")), number = structure(list(), class = c("collector_integer", "collector")), scale = structure(list(), class = c("collector_integer", "collector"))), default = structure(list(), class = c("collector_guess", "collector"))), class = "col_spec"))   

我使用的命名约定基于文件名。

这是我当前用于加载.csv文件的手动代码的示例。还有其他几个csv文件。工作目录被加载到脚本的顶部。

# Load each 2018 export from the "input_data" folder under the current
# working directory. Every object is named after its file's source and suffix.
ABC_H <- read_csv(
  file.path(getwd(), "input_data", "ABC_H_2018.csv")
)
ABC_P <- read_csv(
  file.path(getwd(), "input_data", "ABC_P_2018.csv")
)
BCD_H <- read_csv(
  file.path(getwd(), "input_data", "BCD_H_2018.csv")
)
BCD_P <- read_csv(
  file.path(getwd(), "input_data", "BCD_P_2018.csv")
)
XYZ_H <- read_csv(
  file.path(getwd(), "input_data", "XYZ_H_2018.csv")
)
XYZ_P <- read_csv(
  file.path(getwd(), "input_data", "XYZ_P_2018.csv")
)

然后,我使用dplyr分别格式化每个文件。所有带“_H”后缀的文件都选取同一组列,所有带“_P”后缀的文件也都选取同一组列。下面是我目前为这些文件逐个编写的dplyr代码示例。

# Formatted ABC "_H" table: expose `user` as `user_id`, tag every row with
# its data source, and keep the H columns in a fixed order.
ABC_H_formatted <- ABC_H %>%
  rename(user_id = user) %>%
  mutate(data_source = "ABC") %>%
  select(user_id, Name, data_source, size, shape, sales)

# Formatted BCD "_H" table: expose `user` as `user_id`, tag every row with
# its data source, and keep the H columns in a fixed order.
BCD_H_formatted <- BCD_H %>%
  rename(user_id = user) %>%
  mutate(data_source = "BCD") %>%
  select(user_id, Name, data_source, size, shape, sales)


# Formatted XYZ "_H" table: tag every row with its data source, expose `user`
# as `user_id`, and keep the H columns in a fixed order.
# The input must be XYZ_H and the label "XYZ"; starting from BCD_H with the
# "BCD" label would silently produce a second copy of the BCD table.
# NOTE(review): the sample XYZ_H literal in this file already has a `user_id`
# column and no `user` column; the rename assumes the real CSV has `user` —
# confirm against the actual input file.
XYZ_H_formatted <- XYZ_H %>%
  mutate(data_source = "XYZ") %>%
  rename(user_id = user) %>%
  select(user_id, Name, data_source, size, shape, sales)

# Formatted ABC "_P" table: expose `user` as `user_id`, tag every row with
# its data source, and keep the P columns in a fixed order.
ABC_P_formatted <- ABC_P %>%
  rename(user_id = user) %>%
  mutate(data_source = "ABC") %>%
  select(user_id, Name, data_source, color, number, scale)

# Formatted BCD "_P" table: expose `user` as `user_id`, tag every row with
# its data source, and keep the P columns in a fixed order.
BCD_P_formatted <- BCD_P %>%
  rename(user_id = user) %>%
  mutate(data_source = "BCD") %>%
  select(user_id, Name, data_source, color, number, scale)

# Formatted XYZ "_P" table: expose `user` as `user_id`, tag every row with
# its data source, and keep the P columns in a fixed order.
XYZ_P_formatted <- XYZ_P %>%
  rename(user_id = user) %>%
  mutate(data_source = "XYZ") %>%
  select(user_id, Name, data_source, color, number, scale)

这就是我想要做的。

  1. 创建2个循环,一个对所有带有_H后缀的文件,一个对所有带有_P后缀的文件
  2. 根据文件名创建data_source列(如上所述)
  3. 为每个文件选择上面列出的列
  4. 为所有_H文件创建一个data.frame,并为所有_P文件创建另一个data.frame

提前谢谢!

1 个答案:

答案 0 :(得分:1)

在这里,您真正应该考虑的是编写一个自定义函数来执行所需的操作,然后将该函数应用于对象列表。这是R中通常进行迭代的方式。

由于您已经在使用dplyr,所以我将展示一种tidyverse的方法(不过,这也可以在base R中轻松完成)。创建一个执行所需处理步骤的函数,并添加suffix列和data_source列(两者都使用正则表达式和stringr从文件名中提取)。

然后,您将希望使用匹配模式从目录中获取文件列表,而不是手动输入每个文件。

最后,我们使用purrr中的map_df将read_data函数“映射”到列表中的每个文件,然后将所有结果绑定到一个数据框中。这不会产生您想要的两个数据框,但是借助suffix列,您可以轻松地按后缀取出每一部分数据。

您可能会遇到一些错误,因为我没有用于测试的数据。将来,最好为您的问题提供一个可重复的示例,以便我们更好地帮助您!

library(tidyverse) #dplyr, purrr, stringr, readr

# Read one input CSV and return it in the standard layout for its suffix.
#
# The suffix ("H" or "P") and the data source are both parsed from the file's
# base name, e.g. "ABC_H_2018.csv" gives suffix "H" and data source "ABC".
#
# Args:
#   file: Path to a CSV file whose name contains "<SOURCE>_<H|P>".
#
# Returns:
#   A data frame with the columns suffix, user_id, Name, data_source, followed
#   by color, number, scale for "P" files or size, shape, sales for "H" files.
#
# Raises:
#   An error when no "H"/"P" suffix can be found in the file name, or when any
#   of the expected columns is missing from the file.
read_data <- function(file) {
  name <- basename(file)

  # "H" or "P" immediately after an underscore.
  file_suffix <- str_extract(name, "(?<=_)[HP]")

  # First run of capital letters that is followed by an underscore.
  source_label <- str_extract(name, "[A-Z]+(?=_)")

  # identical() stays FALSE (rather than erroring) when the suffix is NA, so
  # an unparseable file name falls through to the explicit stop() below.
  if (identical(file_suffix, "P")) {
    cols <- c("suffix", "user_id", "Name", "data_source", "color", "number", "scale")
  } else if (identical(file_suffix, "H")) {
    cols <- c("suffix", "user_id", "Name", "data_source", "size", "shape", "sales")
  } else {
    stop("Cannot determine an H/P suffix from file name: ", name, call. = FALSE)
  }

  # NOTE(review): assumes the CSV has a `user` column to rename; the sample
  # tables shown with the question already use `user_id` — confirm.
  read_csv(file) %>%
    rename(user_id = user) %>%
    mutate(
      data_source = source_label,
      suffix = file_suffix
    ) %>%
    # all_of() fails loudly on a missing column instead of dropping it.
    select(all_of(cols))
}
# Collect every "<SOURCE>_<H|P>_*.csv" file in the input_data folder. Matching
# on the naming convention, rather than listing source names, picks up all
# sources (ABC, BCD, XYZ, ...) and ignores files that are not CSVs.
files <- dir(
  file.path(getwd(), "input_data"),
  pattern = "^[A-Z]+_[HP]_.*\\.csv$",
  full.names = TRUE
)

# Run read_data() on each file and row-bind the results into one data frame;
# the `suffix` column distinguishes the H rows from the P rows.
result <- map_df(files, read_data)