我有一堆要读入R的csv文件。
以下示例数据可帮助进行此查询。
# Sample data in dput() form: six two-row tibbles (class tbl_df/tbl/data.frame).
# The *_H objects carry the columns Name, user_id, sales, shape, size.
# Each object also carries a `spec` attribute of class "col_spec" listing one
# collector per column (presumably as recorded by readr::read_csv - confirm).
# NOTE(review): BCD_H and XYZ_H hold identical values.
ABC_H <- structure(list(Name = c("Jim Smith", "John Doe"), user_id = c(23L, 25L), sales = c(300L, 140L), shape = c(6L, 200L), size = c(402L, 305L)),row.names= c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"), spec = structure(list(cols = list(Name = structure(list(), class = c("collector_character", "collector")), user_id = structure(list(), class = c("collector_integer", "collector")), sales = structure(list(), class = c("collector_integer", "collector")), shape = structure(list(), class = c("collector_integer", "collector")), size = structure(list(), class = c("collector_integer", "collector"))), default = structure(list(), class = c("collector_guess", "collector"))), class = "col_spec"))
BCD_H <- structure(list(Name = c("Jim Smith", "John Doe"), user_id = c(23L, 25L), sales = c(190L, 120L), shape = c(10L, 5L), size = c(500L, 200L)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"), spec =structure(list(cols = list(Name = structure(list(), class = c("collector_character", "collector")), user_id = structure(list(), class = c("collector_integer", "collector")), sales = structure(list(), class = c("collector_integer", "collector")), shape = structure(list(), class = c("collector_integer", "collector")), size = structure(list(), class = c("collector_integer", "collector"))), default = structure(list(), class = c("collector_guess", "collector"))), class = "col_spec"))
XYZ_H <- structure(list(Name = c("Jim Smith", "John Doe"), user_id = c(23L, 25L), sales = c(190L, 120L), shape = c(10L, 5L), size = c(500L, 200L)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"), spec =structure(list(cols = list(Name = structure(list(), class = c("collector_character", "collector")), user_id = structure(list(), class = c("collector_integer", "collector")), sales = structure(list(), class = c("collector_integer", "collector")), shape = structure(list(), class = c("collector_integer", "collector")), size = structure(list(), class = c("collector_integer", "collector"))), default = structure(list(), class = c("collector_guess", "collector"))), class = "col_spec"))
# The *_P objects carry the columns Name, user_id, color, number, scale.
# NOTE(review): BCD_P and XYZ_P hold identical values.
ABC_P <- structure(list(Name = c("Alex Smith", "John Curry"), user_id = c(102L, 433L), color = c(164L, 153L), number = c(20L, 3L), scale = c(6L, 1L)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"), spec = structure(list(cols = list(Name = structure(list(), class = c("collector_character", "collector")), user_id = structure(list(), class = c("collector_integer", "collector")), color = structure(list(), class = c("collector_integer", "collector")), number = structure(list(), class = c("collector_integer", "collector")), scale = structure(list(), class = c("collector_integer", "collector"))), default = structure(list(), class = c("collector_guess", "collector"))), class = "col_spec"))
BCD_P <- structure(list(Name = c("Alex Smith", "John Curry"), user_id = c(102L, 433L), color = c(300L, 200L), number = c(100L, 4L), scale = c(2L, 5L)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"), spec = structure(list(cols = list(Name = structure(list(), class = c("collector_character", "collector")), user_id = structure(list(), class = c("collector_integer", "collector")), color = structure(list(), class = c("collector_integer", "collector")), number = structure(list(), class = c("collector_integer", "collector")), scale = structure(list(), class = c("collector_integer", "collector"))), default = structure(list(), class = c("collector_guess", "collector"))), class = "col_spec"))
XYZ_P <- structure(list(Name = c("Alex Smith", "John Curry"), user_id = c(102L, 433L), color = c(300L, 200L), number = c(100L, 4L), scale = c(2L, 5L)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"), spec = structure(list(cols = list(Name = structure(list(), class = c("collector_character", "collector")), user_id = structure(list(), class = c("collector_integer", "collector")), color = structure(list(), class = c("collector_integer", "collector")), number = structure(list(), class = c("collector_integer", "collector")), scale = structure(list(), class = c("collector_integer", "collector"))), default = structure(list(), class = c("collector_guess", "collector"))), class = "col_spec"))
我使用的命名约定基于文件名。
这是我当前用于手动加载.csv文件的代码示例。除此之外还有其他几个csv文件。工作目录已在脚本顶部设置。
# Load each 2018 extract from the "input_data" folder under the current
# working directory; every file gets its own object named after the file.

# H-type files
ABC_H <- read_csv(
  file.path(getwd(), "input_data", "ABC_H_2018.csv")
)
BCD_H <- read_csv(
  file.path(getwd(), "input_data", "BCD_H_2018.csv")
)
XYZ_H <- read_csv(
  file.path(getwd(), "input_data", "XYZ_H_2018.csv")
)

# P-type files
ABC_P <- read_csv(
  file.path(getwd(), "input_data", "ABC_P_2018.csv")
)
BCD_P <- read_csv(
  file.path(getwd(), "input_data", "BCD_P_2018.csv")
)
XYZ_P <- read_csv(
  file.path(getwd(), "input_data", "XYZ_P_2018.csv")
)
然后,我使用dplyr分别格式化每个文件。所有带后缀“_H”的文件都按同一组列进行格式化,所有带后缀“_P”的文件也按同一组列进行格式化。以下是我目前为这些文件编写的dplyr代码示例。
# H-type layout: standardise the id column name, tag each row with the
# source it came from, then keep the shared H columns in a fixed order.
# NOTE(review): rename() expects a `user` column in the input - confirm
# against the real CSV headers.
ABC_H_formatted <- ABC_H %>%
  rename(user_id = user) %>%
  mutate(data_source = "ABC") %>%
  select(user_id, Name, data_source, size, shape, sales)

BCD_H_formatted <- BCD_H %>%
  rename(user_id = user) %>%
  mutate(data_source = "BCD") %>%
  select(user_id, Name, data_source, size, shape, sales)
# H-type layout for the XYZ source: tag rows with their source, standardise
# the id column name, and keep the shared H columns in a fixed order.
# The input must be XYZ_H and the label "XYZ"; reading BCD_H here would make
# this object a duplicate of BCD_H_formatted.
# NOTE(review): rename() expects a `user` column in the input - confirm
# against the real CSV headers.
XYZ_H_formatted <- XYZ_H %>%
  mutate(data_source = "XYZ") %>%
  rename(user_id = user) %>%
  select(user_id, Name, data_source, size, shape, sales)
# P-type layout: standardise the id column name, tag each row with the
# source it came from, then keep the shared P columns in a fixed order.
# NOTE(review): rename() expects a `user` column in the input - confirm
# against the real CSV headers.
ABC_P_formatted <- ABC_P %>%
  rename(user_id = user) %>%
  mutate(data_source = "ABC") %>%
  select(user_id, Name, data_source, color, number, scale)

BCD_P_formatted <- BCD_P %>%
  rename(user_id = user) %>%
  mutate(data_source = "BCD") %>%
  select(user_id, Name, data_source, color, number, scale)

XYZ_P_formatted <- XYZ_P %>%
  rename(user_id = user) %>%
  mutate(data_source = "XYZ") %>%
  select(user_id, Name, data_source, color, number, scale)
这就是我想要做的。
提前谢谢!
答案 0 :(得分:1)
在这里,您真正需要的是编写一个自定义函数来执行所需的操作,然后将该函数应用于一个对象列表。这是R中通常进行迭代的方式。
由于您已经在使用dplyr,所以我将展示一种tidyverse的方法(不过,这也可以在base R中轻松完成)。创建一个执行所需处理步骤的函数,并添加suffix列和data_source列(两者均使用正则表达式和stringr从文件名中提取)。
然后,您将希望使用匹配模式从目录中获取文件列表,而不是手动输入每个文件。
最后,我们使用purrr中的map_df将read_data函数“映射”到列表中的每个文件,然后将所有结果绑定到一个数据框中。这不会产生您想要的两个数据框,但是借助suffix列,您可以轻松地按文件类型对数据取子集。
您可能会遇到一些错误,因为我没有用于测试的数据。将来,最好为您的问题提供一个可重复的示例,以便我们更好地帮助您!
library(tidyverse) #dplyr, purrr, stringr, readr
#' Read one input CSV and reshape it to the shared layout for its file type.
#'
#' The file name is parsed with two regular expressions: the first run of
#' capital letters followed by an underscore becomes `data_source`, and the
#' first `H` or `P` preceded by an underscore becomes `suffix`
#' (e.g. `ABC_H_2018.csv` gives `data_source = "ABC"`, `suffix = "H"`).
#'
#' @param file Path to a single CSV file.
#' @return The data read from `file` with `user` renamed to `user_id`, the
#'   `data_source` and `suffix` columns added, and only the columns for that
#'   suffix kept (`one_of()` warns about, and skips, any that are missing).
#'   Stops with an error if no `H`/`P` suffix can be found in the file name.
read_data <- function(file) {
  name <- basename(file)
  # Local names differ from the column names created below so that the
  # mutate() call cannot confuse a column with the extracted value.
  file_suffix <- str_extract(name, "(?<=_)[HP]")
  file_source <- str_extract(name, "[A-Z]+(?=_)")

  # str_extract() returns NA when nothing matches; fail with a clear message
  # instead of an obscure "missing value where TRUE/FALSE needed".
  if (is.na(file_suffix)) {
    stop(
      "Cannot determine the H/P suffix from file name: ", name,
      call. = FALSE
    )
  }

  # The regex above can only yield "H" or "P", so both cases are covered.
  cols <- switch(
    file_suffix,
    P = c("suffix", "user_id", "Name", "data_source", "color", "number", "scale"),
    H = c("suffix", "user_id", "Name", "data_source", "size", "shape", "sales")
  )

  # NOTE(review): rename() expects a `user` column in the CSV - confirm
  # against the real file headers.
  read_csv(file) %>%
    rename(user_id = user) %>%
    mutate(
      data_source = file_source,
      suffix = file_suffix
    ) %>%
    select(one_of(cols))
}
# Collect every CSV in "input_data" whose name follows the
# `<SOURCE>_<H|P>_<anything>.csv` convention, so that all sources (ABC, BCD,
# XYZ, ...) are picked up without listing them by hand.
files <- dir(
  file.path(getwd(), "input_data"),
  pattern = "^[A-Z]+_[HP]_.*\\.csv$",
  full.names = TRUE
)

# Read and format each file with read_data(), then row-bind the results into
# a single data frame.
result <- map_df(files, read_data)