如何使用apply而不是loop来获取不同目录中文件的平均值

时间:2016-10-25 12:10:01

标签: r loops apply mean lapply

我已经在 R 中用 for 循环编写了一些代码,但想知道是否有人知道更好的“apply”系列写法?我将在下面给出我的“循环”解决方案。

目标 - 我有许多同名的表,分别存放在以当月、上个月和上上个月命名的目录中;这些月份目录都位于名为“m1”的目录下。我需要计算这些表中各列的三个月平均值,并将结果写入“m3”目录中的 csv 文件。

这些表在各个月份之间通过“key”列相关联。各月的记录集中有许多相同的键,但并不完全一致,所以我使用 merge 而不是 rbind。

设置和一些数据......

# Reproducible setup: creates the directory layout, writes one dummy csv per
# table per month under "m1/<month>/", and leaves `my_files_paths` behind as
# the only object for the averaging step to consume.
set.seed(1234)

# objects
my_list <- c("tbl_1", "tbl_2", "tbl_3")
month_list <- c("201604", "201605", "201606")
# one element per table: that table's csv path in every monthly directory
my_files_paths <- lapply(
  my_list,
  function(x) paste0("m1/", month_list, "/", x, ".csv")
)

# dirs: input root, output root, then one input sub-directory per month
# (the roots come first so the monthly sub-directories can be created)
for (dir_path in c("m1", "m3", file.path("m1", month_list))) {
  if (!dir.exists(dir_path)) {
    dir.create(dir_path)
  }
}

# pool of 100 five-letter keys; every table samples 90 of them, so the
# monthly tables share many keys but not all
keys <- replicate(100, paste0(sample(letters, 5), collapse = ""))

# some dummy data
# each table has its own column suffix: count_<suffix> and amount_<suffix>
col_suffix <- c(tbl_1 = "abc", tbl_2 = "def", tbl_3 = "ghi")

for (month in month_list) {
  for (tbl_name in my_list) {
    # the random draws happen in the order key, count, amount for each table
    # within each month, so the seeded output stays reproducible
    dummy <- data.frame(
      key = sample(keys, 90, replace = FALSE),
      count = sample(1:10, 90, replace = TRUE),
      amount = sample(1:30, 90, replace = TRUE)
    )
    names(dummy)[-1] <- paste0(c("count_", "amount_"), col_suffix[[tbl_name]])

    write.table(
      dummy,
      paste0("m1/", month, "/", tbl_name, ".csv"),
      col.names = TRUE, row.names = FALSE, sep = ","
    )
  }
}

# I am trying to merge the 'same named csvs' from dirs '201604', '201605' and '201606'
# and get the averages for the "identical' columns in each month's dataframes

# keep only `my_files_paths` in the workspace
rm(month_list, my_list, keys, col_suffix, dir_path, month, tbl_name, dummy)

1 个答案:

答案 0 :(得分:0)

# An 'apply family' version of the three-month average.
#
# Expects `my_files_paths`: a list with one character vector per table, each
# holding that table's csv path in every monthly directory. For every table
# the monthly files are outer-joined on "key", the columns that share a name
# across months are averaged row-wise, and the result is written to
# "m3/<same file name>".
#
# NB - base R only, stringr is not required

# Average one table across its monthly files and write the result.
#
# paths:   character vector with the csv path of the same table in each month
# out_dir: directory that receives the averaged csv (file name is kept)
# Returns the output path, invisibly.
average_months <- function(paths, out_dir = "m3") {
  # read the files in across the months and merge, keeping every key
  monthly <- lapply(
    paths, read.table,
    header = TRUE, stringsAsFactors = FALSE, sep = ","
  )
  merged <- Reduce(function(x, y) merge(x, y, by = "key", all = TRUE), monthly)

  # one output row per key is required; repeated keys cannot be averaged
  # unambiguously, so fail loudly instead of writing a misleading table
  if (anyDuplicated(merged$key) > 0) {
    stop(
      "duplicate keys after merging: ", paste(paths, collapse = ", "),
      call. = FALSE
    )
  }

  values <- merged[, names(merged) != "key", drop = FALSE]

  # merge() appends '.x' / '.y' to clashing column names; strip only that
  # trailing suffix so dots elsewhere in a column name are left intact
  base_names <- sub("\\.[xy]$", "", names(values))
  measures <- unique(base_names)

  # row-wise mean over the monthly copies of each measure; NA marks a month
  # in which the key was absent and is ignored. Columns are picked with a
  # logical mask and drop = FALSE so a measure present in one month only
  # still works.
  means <- lapply(measures, function(nme) {
    same_measure <- values[, base_names == nme, drop = FALSE]
    unname(round(rowMeans(same_measure, na.rm = TRUE), 2))
  })
  names(means) <- measures

  new_tbl <- data.frame(
    key = merged$key, means,
    stringsAsFactors = FALSE, check.names = FALSE
  )

  # write the table of means under the same file name in 'out_dir'
  out_path <- file.path(out_dir, basename(paths[1]))
  write.table(
    new_tbl, out_path,
    sep = ",", col.names = TRUE, row.names = FALSE
  )
  invisible(out_path)
}

# one call per table; only the written files matter
invisible(lapply(my_files_paths, average_months))

# cleanup
rm(average_months, my_files_paths)