我已经在 R 中用 for 循环编写了一些代码,但想知道是否有人知道更好的 'apply' 类型的写法?下面我会给出我的“循环”解决方案。
目标 - 我有许多同名的表,它们分别存放在以当月、上个月和上上个月命名的目录中,这些月份目录又都位于名为“m1”的目录下。我需要计算这些表中各列的三个月平均值,并将结果写回到“m3”目录中的 csv 文件。
这些表在各个月份之间通过“key”列相关联。各月的记录集中有许多相同的键,但并不完全一致,所以我使用 'merge' 而不是 rbind。
设置和一些数据......
# Reproducible setup: creates the m1/<yyyymm>/ and m3/ directories and writes
# three dummy tables (tbl_1..tbl_3) for each of three months as csv files.
# Leaves only `my_files_paths` behind for the averaging step.
set.seed(1234)

# objects
my_list <- c("tbl_1", "tbl_2", "tbl_3")
month_list <- c("201604", "201605", "201606")
# One list element per table: that table's csv path in every month.
my_files_paths <- lapply(
  my_list,
  function(x) paste0("m1/", month_list, "/", x, ".csv")
)

# dirs ("m1" is listed before its sub-directories so each create can succeed)
for (dir_path in c("m1", "m3", file.path("m1", month_list))) {
  if (!dir.exists(dir_path)) {
    dir.create(dir_path)
  }
}

# Pool of 100 five-letter keys; every table draws 90 of them, so the key sets
# of different months overlap heavily without being identical.
keys <- replicate(100, paste0(sample(letters, 5), collapse = ""))

# Suffix used in the value-column names of each table.
col_suffix <- c(tbl_1 = "abc", tbl_2 = "def", tbl_3 = "ghi")

# some dummy data
# Months form the outer loop and tables the inner loop, and the columns are
# drawn in the order key / count / amount, so the random draws happen in the
# same sequence as writing the nine tables out by hand, month by month.
for (month in month_list) {
  for (tbl_name in my_list) {
    dummy <- data.frame(
      key = sample(keys, 90, replace = FALSE),
      count = sample(1:10, 90, replace = TRUE),
      amount = sample(1:30, 90, replace = TRUE)
    )
    names(dummy)[-1] <- paste0(c("count_", "amount_"), col_suffix[[tbl_name]])
    write.table(
      dummy,
      paste0("m1/", month, "/", tbl_name, ".csv"),
      col.names = TRUE, row.names = FALSE, sep = ","
    )
  }
}

# I am trying to merge the 'same named csvs' from dirs '201604', '201605'
# and '201606' and get the averages for the 'identical' columns in each
# month's dataframes

# cleanup: keep only `my_files_paths`
rm(month_list, my_list, keys, col_suffix, dir_path, month, tbl_name, dummy)
答案 0 :(得分:0)
# Three-month means per table, using the apply family instead of nested loops.
# NB - base R only; stringr is not needed.

# Read one table's monthly csv files, full-outer-join them on `key`, and
# return a data frame holding `key` plus, for every value column, the mean
# over the months in which that key occurs (rounded to 2 decimals).
#
# paths: character vector of csv paths, one per month, all for the same table
#        (a `key` column plus value columns named identically across months).
# Stops with an error if a file has no `key` column or if its keys are
# missing or duplicated.
three_month_means <- function(paths) {
  monthly <- lapply(seq_along(paths), function(i) {
    tbl <- read.table(paths[i], header = TRUE, stringsAsFactors = FALSE,
                      sep = ",")
    if (!"key" %in% names(tbl) || anyNA(tbl[["key"]]) ||
        anyDuplicated(tbl[["key"]]) > 0) {
      stop("'key' must be present, unique and non-missing in ", paths[i],
           call. = FALSE)
    }
    # Keys are handled as text so the result is ordered and written the same
    # way whatever type read.table guessed for the column.
    tbl[["key"]] <- as.character(tbl[["key"]])
    # Tag every value column with its month index before merging. Names then
    # never clash, so merge() adds no '.x'/'.y' suffixes and the original
    # name is recovered by removing exactly the tag added here.
    is_value <- names(tbl) != "key"
    names(tbl)[is_value] <- paste0(names(tbl)[is_value], "..m", i)
    tbl
  })

  # all = TRUE keeps keys that are absent from some months (filled with NA).
  merged <- Reduce(function(x, y) merge(x, y, by = "key", all = TRUE), monthly)

  base_names <- sub("\\.\\.m[0-9]+$", "", names(merged))
  value_names <- setdiff(base_names, "key")

  means <- lapply(value_names, function(nme) {
    # drop = FALSE keeps a data frame even when only one month is present.
    month_cols <- merged[, base_names == nme, drop = FALSE]
    round(rowMeans(month_cols, na.rm = TRUE), 2)
  })
  names(means) <- value_names

  data.frame(c(list(key = merged[["key"]]), means),
             check.names = FALSE, stringsAsFactors = FALSE)
}

# write each new table with 3 month means to 'm3', keeping its file name
invisible(lapply(my_files_paths, function(paths) {
  write.table(three_month_means(paths), file.path("m3", basename(paths[1])),
              sep = ",", col.names = TRUE, row.names = FALSE)
}))

# cleanup
rm(my_files_paths, three_month_means)