Question

我需要从几个文件中执行一系列计算，并使用所有文件的结果创建一个输出表。

我尝试过列出文件夹和循环中的所有文件，使用for或lapply函数，但我有些遗漏。

以下是我对某些“假”文件的简化版本：

# Create new folder -- "trials"
setwd("C:/Users/.../Desktop")
dir.create("trials")

# Create 'trial' files
setwd("C:/Users/.../Desktop/trials")
pathFiles <- "C:/Users/.../Desktop/trials"

df_1 <- data.frame(x=c(1,2,3,4,5,6,7,8,9,10))
df_1$y <- c(1,2,3,4,5,6,7,8,9,10)
df_1$z <- c(10,20,30,40,50,60,70,80,90,100)
write.table(df_1, "table1.csv", col.names = TRUE, row.names = FALSE, sep = ",")

df_2 <- data.frame(x=c(2,3,4,5,6,7,8,9,10,11))
df_2$y <- c(2,3,4,5,6,7,8,9,10,11)
df_2$z <- c(20,30,40,50,60,70,80,90,100,110)
write.table(df_2, "table2.csv", col.names = TRUE, row.names = FALSE, sep = ",")

df_3 <- data.frame(x=c(3,4,5,6,7,8,9,10,11,12))
df_3$y <- c(3,4,5,6,7,8,9,10,11,12)
df_3$z <- c(30,40,50,60,70,80,90,100,110,120)
write.table(df_3, "table3.csv", col.names = TRUE, row.names = FALSE, sep = ",")

对于这些文件中的每一个，我想提取某些信息并创建包含所有计算字段的输出表。

我尝试过for循环：

Final <- NULL
M <- NULL
slp <- NULL
eval <- NULL

dfs <- dir(pathFiles, "*.csv", full.names = TRUE, ignore.case = TRUE, all.files = TRUE)

for (df in dfs) {

  t <- read.csv(df, header = TRUE, sep = ",")
  x <- t$x
  y <- t$y
  z <- t$z

  lim_y <- y >= 3 & y <=6
  lim_x <- x[lim_y]
  lim_z <- z[lim_y]

  iFinal <- x[nrow(t)]
  Final <- c(Final, iFinal) # add value to the string

  iM <- mean(lim_z)
  M <- c(M, iM) # add value to the string

  p <- lm(lim_x ~ lim_z)
  iSlp <- summary(p)$coefficients[2,1]
  slp <- c(slp, iSlp) # add value to the string

  ifelse ((Slp <= 0.05 & Slp >= -0.05), ieval <- "ok", ieval <- "false") 
  eval <- c(eval, ieval) # add value to the string
}

sum_df <- data.frame(df, M, Slp, eval, Final)
write.table(sum_df, "sum_df.csv", sep = ",", row.names = FALSE, col.names = TRUE)

之前我以类似的方式使用过这个for循环并且工作正常但不在这里。

使用lapply函数，我得不到更好的结果：

dfs <- list.files(pathFiles, "^.+\\.csv", full.names = TRUE, ignore.case = TRUE, all.files = TRUE)

Final <- NULL
M <- NULL
slp <- NULL
eval <- NULL

model <- function(x){
  t <- read.csv(x, header = TRUE, sep = ",")
  x <- t$x
  y <- t$y
  z <- t$z

  lim_y <- y >= 3 & y <=6
  lim_x <- x[lim_y]
  lim_z <- z[lim_y]

  iFinal <- x[nrow(t)]
  Final <- c(Final, iFinal)

  iM <- mean(lim_z)
  M <- c(M, iM)

  p <- lm(lim_x ~ lim_z)
  iSlp <- summary(p)$coefficients[2,1]
  slp <- c(slp, iSlp)

  ifelse ((Slp <= 0.05 & Slp >= -0.05), ieval <- "ok", ieval <- "false") 
  eval <- c(eval, ieval)
}

lapply(dfs, model)

函数和输出表只有一个文件可以正常工作，所以我猜错误必须是我如何循环遍历文件。但我不知道我哪里出错了。

我将不胜感激。

Answer 1

我建议在data.table库中使用rbindlist。

lapply会返回一个长度文件列表，将此列表一起列入单个表

library(data.table)
files <- dir(pathFiles, "*.csv", full.names = TRUE, ignore.case = TRUE, all.files = TRUE)
desiredTable <- rbindlist(
                          lapply(
                                 files,
                                 function(x){
                                 fileData <- fread(x)
                                 CalculatedData <- ...do stuff...
                                 return(CalculatedData)
                                 }
                                )
                            )

这是一个使用do.call的工作示例，避免使用data.table

numFiles <- 100 #number of random files to generate

# Generate a bunch of .csv with a fileID, some letters, and some numbers and put those files in the working dir
sapply(
  1:numFiles,
  function(f){
    dataReplicates <- 12
    dataLetters <- sample(LETTERS,12)
    dataNumbers <- sample(seq(1:100),12)
    fileID <- rep(f,dataReplicates)
    fileData <- cbind(
      fileID,
      dataLetters,
      dataNumbers
    )
    write.csv(
      fileData,
      paste0(getwd(),"/",Sys.Date(),"_",f,".csv"),
      row.names = FALSE
    )
   }
  )

# Read those files back in and store the names in a vector
thoseRandFiles <- dir(
  path = getwd(),
  pattern = as.character(Sys.Date()),
  full.names = TRUE
)

#using lapply and rbind, read in each file, perform operations, and bind into a single table
desiredTable <- do.call(
      rbind,
       lapply(
       thoseRandFiles,
        function(x){
         fileData <- read.csv(x)
         fileID <- fileData$fileID[1]
         firstLetter <- as.character(fileData$dataLetters[1])
         sumNumbers <- sum(fileData$dataNumbers)
         calData <- cbind.data.frame(fileID,firstLetter,sumNumbers)
         return(calData)
         }
       )
      )

Answer 2

您引用了slp和Slp，因此某处出现了拼写错误。对其中一个修复bug进行全局替换。

您的for循环不会对我产生错误。

您的lapply在几个音符上出错：

一般来说，使用*apply函数的一个好处是它们可以在没有side-effect的情况下工作，这就是你在for循环中所做的事情，以及你在做什么正在设置全局分配eval和朋友的时间。没有试图达到＆＃34; out＆＃34;在lapply内并分配给全局命名空间中的变量，当函数退出时，您对M和朋友的分配将被静默丢弃。当你想到使用这些应用函数时（它们很棒），你应该几乎总是假设当函数退出时它们的宇宙完全消失，并且它们无法退出。如果您是Trekkie，请想一想Remember Me (Star Trek TNG)，其中Beverly的宇宙只是泡沫中的内容。（无论是在R还是在电视节目中都可以刺穿它。）
您的函数仅返回eval，这只是偶然的。如果您想要将所有已经突出显示为＆＃34;有趣＆＃34;的内容归还，那么您需要明确地返回它们，可能是list或data.frame。（不是vector，因为ieval会将所有变量上转换为character。）

所以不要考虑在lapply内连接数据，考虑保持结果良好的结构并稍后合并。试试这个：

model2 <- function(fname) {
  dat <- read.csv(fname, header = TRUE, sep = ",")
  lim_y <- dat$y >= 3 & dat$y <=6
  lim_x <- dat$x[lim_y]
  lim_z <- dat$z[lim_y]

  iFinal <- dat$x[nrow(dat)]
  iM <- mean(lim_z)

  p <- lm(lim_x ~ lim_z)
  iSlp <- summary(p)$coefficients[2,1]

  iEval <- (iSlp <= 0.05 & iSlp >= -0.05) 

  return(data.frame(
    fname = fname,
    M = iM, Slp = iSlp, Eval = iEval, Final = iFinal,
    stringsAsFactors = FALSE))
}

do.call(rbind, lapply(dfs, model2))
# Warning in summary.lm(p) :
#   essentially perfect fit: summary may be unreliable
# Warning in summary.lm(p) :
#   essentially perfect fit: summary may be unreliable
# Warning in summary.lm(p) :
#   essentially perfect fit: summary may be unreliable
#          fname  M Slp  Eval Final
# 1 ./table1.csv 45 0.1 FALSE    10
# 2 ./table2.csv 45 0.1 FALSE    11
# 3 ./table3.csv 45 0.1 FALSE    12

*apply家族有很多方法可以做到这一点，但我认为这是一个不错的方法。

有关*apply，list内的帧等的正确读物，请参阅：

R - 循环遍历文件并创建输出表

2 个答案: