如何快速导入许多.txt文件到R

时间:2018-01-28 22:42:43

标签: r performance file-import

我想提升一个函数的速度,该函数把大量“.txt”文件导入到同一个数据框中(并附加文件名)。“.txt”文件的数量超过10 000个,所有文件结构相同,位于同一目录下的多个子目录中。这10 000个文件的总大小约为800 MB,把它们全部加载到数据框需要几个小时。我的电脑:东芝P50t,8 GB内存,1 TB硬盘。

下面是我目前使用的代码。欢迎提出提高加载速度的建议(我不想借助中间工具,例如先把数据加载到MS SQL再导入R)。我试过用fread代替read_csv,但速度没有明显差别。

#' Import many stock ".txt" files into a single data frame.
#'
#' Recursively lists the ".txt" files below `directory`, keeps those whose
#' path (relative to `directory`) matches `What_stocks`, reads every non-empty
#' file with `readr::read_csv()`, overwrites `Ticker` with the file name
#' (extension removed), saves the combined table and returns it.
#'
#' @param directory Root directory that is searched recursively.
#' @param Output_file_name File handed to `save()`. A relative path is
#'   resolved against `directory`, because the function works from there.
#' @param What_stocks Regular expression (stringr syntax) matched against each
#'   file path relative to `directory`.
#' @return A data.frame with the columns Ticker, Date, Open, High, Low, Close,
#'   Volume and OpenInt. The same table is stored in `Output_file_name` under
#'   the object name `GPW_wse_stocks`.
files_to_df_v01 <- function(directory, Output_file_name, What_stocks) {
  wanted_cols <- c("Ticker", "Date", "Open", "High", "Low", "Close",
                   "Volume", "OpenInt")
  col_spec <- readr::cols(
    Ticker = readr::col_character(),
    Date = readr::col_date(format = "%Y-%m-%d"),
    Open = readr::col_double(),
    High = readr::col_double(),
    Low = readr::col_double(),
    Close = readr::col_double(),
    Volume = readr::col_integer(),
    OpenInt = readr::col_integer()
  )

  # `pattern` is a regular expression, not a glob, so anchor on the extension.
  paths <- sort(dir(directory, pattern = "\\.txt$", recursive = TRUE))
  paths <- paths[stringr::str_count(paths, pattern = What_stocks) > 0]

  # The listed paths are relative to `directory`, so work from there, but
  # always hand the caller's working directory back on exit.
  old_wd <- setwd(directory)
  on.exit(setwd(old_wd), add = TRUE)

  # Skip zero-byte files in one vectorised call. Entries whose size cannot be
  # determined are kept so that read_csv() reports them instead of them being
  # dropped silently.
  sizes <- file.size(paths)
  paths <- paths[is.na(sizes) | sizes != 0]
  if (length(paths) == 0L) {
    stop("No non-empty .txt files matching '", What_stocks,
         "' found below '", directory, "'.", call. = FALSE)
  }

  pieces <- lapply(paths, function(path) {
    dat <- readr::read_csv(path, col_types = col_spec)
    # basename() also handles files that sit directly in `directory`, where
    # there is no "/" in the relative path.
    dat$Ticker <- sub("\\.txt$", "", basename(path))
    dat[wanted_cols]
  })

  # Bind once at the end: growing the table with rbind() for every file copies
  # all previously read rows again and again.
  GPW_wse_stocks <- dplyr::bind_rows(pieces)

  save(GPW_wse_stocks, file = Output_file_name)

  data.frame(GPW_wse_stocks)
}

2 个答案:

答案 0 :(得分:2)

使用data.table,我把速度提高了大约4倍:

# Creating test data: two sub-directories ("v1" and "v2"), each holding 100
# identical comma-separated ".txt" files written with fwrite().
library(data.table)

dd <- file.path("Test", "csvReadingTest2")
sub_dirs <- c("v1", "v2")
for (sub_dir in sub_dirs) {
  # recursive = TRUE creates "Test" and `dd` as needed; re-running the script
  # on an existing tree is not an error.
  dir.create(file.path(dd, sub_dir), recursive = TRUE, showWarnings = FALSE)
}

n <- 3000
# Draw n values from x with replacement.
f <- function(x) sample(x, n, replace = TRUE)

set.seed(123)
d1 <- data.table(Ticker = f(LETTERS),
                 Date = f(seq.Date(as.Date("2016-01-01"), by = "month",
                                   length.out = n / 100)),
                 Open = f(c(1.2, 1.3)), High = f(c(1.2, 1.3)),
                 Low = f(c(1.2, 1.3)), Close = f(c(1.2, 1.3)),
                 Volume = f(1:10), OpenInt = f(1:10))
d1
# Printout as posted with the original answer. The sampled values depend on
# the R version (the default sample() algorithm changed in R 3.6.0).
# NOTE(review): the dates shown cannot come from a 30-month sequence starting
# in 2016, so this printout was presumably produced by an earlier variant of
# the code above.
#       Ticker       Date Open High Low Close Volume OpenInt
#    1:      H 2203-04-01  1.2  1.3 1.2   1.2      6       4
#    2:      N 2121-05-01  1.2  1.3 1.2   1.2      9       6
#    3:      E 2060-04-01  1.3  1.2 1.2   1.3      1       3
#    4:      V 2132-04-01  1.3  1.3 1.3   1.2      7       8
#    5:      F 2253-04-01  1.2  1.3 1.3   1.2      3      10
#  ---                                                     
# 2996:      J 2027-05-01  1.3  1.3 1.2   1.2      7       6
# 2997:      K 2177-05-01  1.2  1.3 1.2   1.2      5       4
# 2998:      S 2200-03-01  1.2  1.2 1.2   1.2      6       2
# 2999:      V 2110-05-01  1.3  1.3 1.3   1.2      4       3
# 3000:      Q 2043-05-01  1.2  1.3 1.2   1.2      3       5

# Write the same table 100 times into each sub-directory (d1.txt ... d100.txt).
for (sub_dir in sub_dirs) {
  for (k in seq_len(100L)) {
    fwrite(d1, file.path(dd, sub_dir, paste0("d", k, ".txt")))
  }
}

稍微修改了你的函数:

################################################################################

#' readr-based import of many ".txt" files (lightly adapted question code).
#'
#' Recursively lists the ".txt" files below `directory`, keeps the paths that
#' match `What_stocks`, reads every non-empty file with `readr::read_csv()`,
#' overwrites `Ticker` with the file name (extension removed), saves the
#' combined table and returns it.
#'
#' @param directory Root directory that is searched recursively.
#' @param Output_file_name File handed to `save()`.
#' @param What_stocks Regular expression matched with `grepl()` against the
#'   full path of each file.
#' @return A data.frame with the columns Ticker, Date, Open, High, Low, Close,
#'   Volume and OpenInt. The same table is stored in `Output_file_name` under
#'   the object name `GPW_wse_stocks`.
yourFunction_modified <- function(directory, Output_file_name, What_stocks) {
  wanted_cols <- c("Ticker", "Date", "Open", "High", "Low", "Close",
                   "Volume", "OpenInt")
  col_spec <- readr::cols(
    Ticker = readr::col_character(),
    Date = readr::col_date(format = "%Y-%m-%d"),
    Open = readr::col_double(),
    High = readr::col_double(),
    Low = readr::col_double(),
    Close = readr::col_double(),
    Volume = readr::col_integer(),
    OpenInt = readr::col_integer()
  )

  # `pattern` is a regular expression, not a glob, so anchor on the extension.
  paths <- list.files(directory, pattern = "\\.txt$", recursive = TRUE,
                      full.names = TRUE)
  paths <- paths[grepl(What_stocks, paths)]

  # Skip zero-byte files; entries with an unknown size are kept so that
  # read_csv() reports them.
  sizes <- file.size(paths)
  paths <- paths[is.na(sizes) | sizes != 0]
  if (length(paths) == 0L) {
    stop("No non-empty .txt files matching '", What_stocks,
         "' found below '", directory, "'.", call. = FALSE)
  }

  GPW_wse_stocks <- NULL
  for (path in paths) {
    dat <- readr::read_csv(path, col_types = col_spec)
    dat$Ticker <- sub("\\.txt$", "", basename(path))
    datt <- dat[wanted_cols]
    # NOTE(review): the per-file rbind() is kept on purpose. This function
    # appears to be the baseline that the timing right after it measures;
    # binding once at the end would be the faster design.
    GPW_wse_stocks <- if (is.null(GPW_wse_stocks)) {
      datt
    } else {
      rbind(GPW_wse_stocks, datt)
    }
  }

  save(GPW_wse_stocks, file = Output_file_name)
  data.frame(GPW_wse_stocks)
}


# Time the readr-based version on the generated test files; the result is
# kept in `x` and the saved copy goes next to the test directory.
baseline_rdat <- file.path(dirname(dd), "csvReadingTest2.Rdat")
system.time({
  x <- yourFunction_modified(dd, baseline_rdat, "/d[0-9]")
})

# Elapsed time reported in the original answer: about 18-25 s.

我的函数:

#' data.table-based import of many ".txt" files.
#'
#' Recursively lists the ".txt" files below `directory`, keeps the non-empty
#' ones whose path matches `What_stocks`, reads each with
#' `data.table::fread()`, overwrites `Ticker` with the file name (extension
#' removed), stacks everything with `rbindlist()`, converts `Date` to class
#' Date, saves the table and returns it.
#'
#' @param directory Root directory that is searched recursively.
#' @param Output_file_name File handed to `save()`.
#' @param What_stocks Regular expression matched with `grepl()` against the
#'   full path of each file.
#' @return A data.table with (at most) the columns Ticker, Date, Open, High,
#'   Low, Close, Volume and OpenInt. The same table is stored in
#'   `Output_file_name` under the object name `dt`.
myFun <- function(directory, Output_file_name, What_stocks) {
  # Plain character vector; no need to load Hmisc just for Cs().
  necessary <- c("Ticker", "Date", "Open", "High", "Low", "Close",
                 "Volume", "OpenInt")

  # `pattern` is a regular expression, not a glob, so anchor on the extension.
  paths <- list.files(directory, pattern = "\\.txt$", recursive = TRUE,
                      full.names = TRUE)
  paths <- paths[grepl(What_stocks, paths)]
  paths <- paths[file.info(paths)$size != 0]
  if (length(paths) == 0L) {
    stop("No non-empty .txt files matching '", What_stocks,
         "' found below '", directory, "'.", call. = FALSE)
  }

  dtList <- lapply(paths, function(path) {
    dat <- data.table::fread(path)
    dat[, Ticker := sub("\\.txt$", "", basename(path))]
    # Delete unnecessary columns by reference.
    extra <- setdiff(colnames(dat), necessary)
    if (length(extra) > 0L) {
      dat[, (extra) := NULL]
    }
    dat
  })

  dt <- data.table::rbindlist(dtList, use.names = TRUE, fill = TRUE)

  if (is.character(dt[["Date"]])) {
    # fastPOSIXct() parses ISO timestamps much faster than as.Date() on text.
    dt[, Date := as.Date(fasttime::fastPOSIXct(Date))]
  } else {
    # fread() may already have parsed the column as a date class (presumably
    # IDate in recent data.table versions); normalise it to plain Date.
    dt[, Date := as.Date(Date)]
  }

  save(dt, file = Output_file_name)
  # `[]` makes the returned table print normally after the `:=` calls.
  dt[]
}

# Time the data.table-based version on the same test files; the result is
# kept in `x2`.
fast_rdat <- file.path(dirname(dd), "csvReadingTest2v2.Rdat")
system.time({
  x2 <- myFun(dd, fast_rdat, "/d[0-9]")
})

# Elapsed time reported in the original answer: about 4-6 s.

# Compare the readr-based result with the data.table-based one.
all.equal(as.data.table(x), x2)
# [1] TRUE

答案 1 :(得分:2)

fread非常快,但如果您有大量小文件并且不关心保留文件名,则最好直接使用操作系统。

由于OP没有提供数据,先生成测试数据:10,000个文件,每个文件100行。

// NOTE(review): this C# fragment looks unrelated to the surrounding R
// discussion (possibly pasted in from another page) — confirm before relying
// on it. The enclosing method is not visible here.
// `response` is created but never read in the visible lines.
var response = new MakesResponse();
var tbl = new DynamicModel("SONICAPI");
// Executes "EXEC pGetMakes" (presumably a stored procedure) via tbl.Query.
string sql = "EXEC pGetMakes";
var result = tbl.Query(sql);

// Packs the query result and fixed status fields into a new MakesResponse.
return new MakesResponse()
{
makes = (string[])result.ToArray(),
// `sw` is not declared in the visible lines; presumably a Stopwatch — TODO confirm.
ExecutionTime = sw.ElapsedMilliseconds,
Result = "200",
ResultText = "OK",
Source = "DB"
};

简单方法(也处理重复的标题并使colClasses更接近事实。)

# Read each file with fread() and stack the resulting tables into one.
per_file_tables <- lapply(files, fread)
rbindlist(per_file_tables)

使用Windows的系统

# Build the benchmark input: 10,000 CSV files of 100 rows each, written into a
# scratch directory below tempdir(), which stays the working directory.
library(data.table)

setwd(tempdir())
dir.create("48492154")
setwd("48492154")

# 500 evenly spaced dates between 2012-01-01 and today, kept as text.
dates <- as.character(
  seq.Date(as.Date("2012-01-01"), Sys.Date(), length.out = 500)
)

for (i in seq_len(1e4)) {
  DT <- data.table(
    Ticker = 1:100,
    Date = sample(dates, size = 100),
    Open = round(runif(100) + 100, 1),
    Close = round(runif(100) + 100, 1),
    Volume = sample(1:100),
    OpenInt = 1:100
  )
  # Progress indicator that is overwritten in place on the console.
  cat(i, "of 10,000\r")
  flush.console()
  fwrite(DT, paste0(i, ".csv"), showProgress = FALSE)
}

# Time reading every CSV in the working directory and stacking the tables.
system.time({
  csv_files <- dir(pattern = "\\.csv")
  res <- rbindlist(lapply(csv_files, fread))
})
#>   user  system elapsed 
#>   5.46    3.17    8.62