我正在寻找一种方法,以加快将多个“.txt”文件导入同一个数据框(并添加文件名)的函数的速度。“.txt”文件的数量超过 10 000 个,所有文件结构相同,位于一个包含多个子目录的目录中。这 10 000 个文件的总大小约为 800 MB。将全部 10 000 个文件加载到数据框需要几个小时。我的电脑:东芝 P50t,8 GB 内存,1 TB 硬盘。
请参阅我正在使用的代码。我很乐意听取关于如何提高加载速度的建议(我不想使用中间工具,例如先把数据加载到 MS SQL 再导入 R)。我尝试过用 fread 代替 read_csv,但速度没有明显差异。
#' Import many identically structured ".txt" price files into one data frame.
#'
#' Lists the ".txt" files below `directory` (recursively), keeps the relative
#' paths matching `What_stocks`, reads every non-empty file with
#' `readr::read_csv()` and stacks the results. The `Ticker` column is
#' overwritten with the file name minus its ".txt" extension.
#'
#' @param directory Root directory that is searched recursively.
#' @param Output_file_name File the combined table is written to with `save()`
#'   under the object name `GPW_wse_stocks`. As before, a relative path is
#'   resolved against `directory`.
#' @param What_stocks Regular expression (stringr syntax) that the relative
#'   file path has to match.
#' @return A `data.frame` with the columns Ticker, Date, Open, High, Low,
#'   Close, Volume and OpenInt. Stops with an error when no non-empty file is
#'   selected.
files_to_df_v01 <- function(directory, Output_file_name, What_stocks) {
  # "\\.txt$" is a proper regular expression; the former "*.txt" was a glob
  # and also matched names that merely contain "txt".
  paths <- list.files(directory, pattern = "\\.txt$", recursive = TRUE)
  # Radix sort orders in the C locale, so the row order of the result does
  # not depend on the session locale.
  paths <- sort(paths, method = "radix")
  paths <- paths[stringr::str_detect(paths, What_stocks)]

  # Files are read (and the output is saved) relative to `directory`, as
  # before, but the caller's working directory is restored on exit.
  old_wd <- setwd(directory)
  on.exit(setwd(old_wd), add = TRUE)

  sizes <- file.size(paths)
  paths <- paths[!is.na(sizes) & sizes > 0]
  if (length(paths) == 0L) {
    stop("No non-empty '.txt' file in '", directory, "' matches '",
         What_stocks, "'.", call. = FALSE)
  }

  wanted <- c("Ticker", "Date", "Open", "High", "Low", "Close",
              "Volume", "OpenInt")
  col_spec <- readr::cols(
    Ticker = readr::col_character(),
    Date = readr::col_date(format = "%Y-%m-%d"),
    Open = readr::col_double(),
    High = readr::col_double(),
    Low = readr::col_double(),
    Close = readr::col_double(),
    Volume = readr::col_integer(),
    OpenInt = readr::col_integer()
  )

  read_one <- function(path) {
    dat <- readr::read_csv(path, col_types = col_spec)
    # basename() also works for files that sit directly in `directory`;
    # locating the last "/" by hand produced NA tickers for those.
    ticker <- sub("\\.txt$", "", basename(path))
    dat$Ticker <- rep(ticker, nrow(dat))
    dat[wanted]
  }

  # Read everything first and bind once: rbind() inside the loop copied the
  # whole accumulated table for every file.
  GPW_wse_stocks <- dplyr::bind_rows(lapply(paths, read_one))

  save(GPW_wse_stocks, file = Output_file_name)
  data.frame(GPW_wse_stocks)
}
答案 0 :(得分:2)
使用data.table
我设法将速度提高了大约 4 倍:
# Creating test data: two sub-directories ("v1", "v2") holding 100 identical
# 3000-row files each.
# library() stops immediately when data.table is missing; require() would
# only return FALSE and let the script fail later at data.table().
library(data.table)

dd <- "Test/csvReadingTest2"
# recursive = TRUE creates the parent directories as well, and
# showWarnings = FALSE keeps a re-run of the script quiet.
invisible(lapply(c("v1", "v2"), function(sub_dir) {
  dir.create(file.path(dd, sub_dir), recursive = TRUE, showWarnings = FALSE)
}))

n <- 3000
# Draw n values with replacement from x (reads the global n).
f <- function(x) sample(x, n, replace = TRUE)
set.seed(123)
d1 <- data.table(Ticker = f(LETTERS),
                 Date = f(seq.Date(as.Date("2016-01-01"), by = "month",
                                   length.out = n / 100)),
                 Open = f(c(1.2, 1.3)), High = f(c(1.2, 1.3)),
                 Low = f(c(1.2, 1.3)), Close = f(c(1.2, 1.3)),
                 Volume = f(1:10), OpenInt = f(1:10))
d1
# Sample output as pasted in the original post. NOTE: the dates below cannot
# come from the monthly 30-step sequence above, so treat this listing as
# illustrative only.
# Ticker Date Open High Low Close Volume OpenInt
# 1: H 2203-04-01 1.2 1.3 1.2 1.2 6 4
# 2: N 2121-05-01 1.2 1.3 1.2 1.2 9 6
# 3: E 2060-04-01 1.3 1.2 1.2 1.3 1 3
# 4: V 2132-04-01 1.3 1.3 1.3 1.2 7 8
# 5: F 2253-04-01 1.2 1.3 1.3 1.2 3 10
# ---
# 2996: J 2027-05-01 1.3 1.3 1.2 1.2 7 6
# 2997: K 2177-05-01 1.2 1.3 1.2 1.2 5 4
# 2998: S 2200-03-01 1.2 1.2 1.2 1.2 6 2
# 2999: V 2110-05-01 1.3 1.3 1.3 1.2 4 3
# 3000: Q 2043-05-01 1.2 1.3 1.2 1.2 3 5

# Write the same table as d1.txt ... d100.txt into both sub-directories.
invisible(lapply(c("v1", "v2"), function(sub_dir) {
  lapply(seq_len(100), function(x) {
    fwrite(d1, file.path(dd, sub_dir, paste0("d", x, ".txt")))
  })
}))
稍微修改了你的函数:
################################################################################
yourFunction_modified <- function(directory, Output_file_name, What_stocks) {
  # Baseline for the timing comparison: every file is still read with
  # read_csv() and appended with rbind(), one file at a time. Compared with
  # the question's version only the file listing differs: list.files() with
  # full paths plus grepl() instead of a data frame filtered with dplyr.
  require(dplyr)
  require(stringr)
  library(readr)

  candidates <- list.files(directory, recursive = TRUE, full.names = TRUE,
                           pattern = "*.txt")
  candidates <- candidates[grepl(What_stocks, candidates)]

  price_cols <- cols(
    Ticker = col_character(),
    Date = col_date(format = "%Y-%m-%d"),
    Open = col_double(),
    High = col_double(),
    Low = col_double(),
    Close = col_double(),
    Volume = col_integer(),
    OpenInt = col_integer()
  )

  first_chunk <- TRUE
  for (path in candidates) {
    # Zero-byte files are skipped.
    if (file.info(path)$size == 0) {
      next
    }
    raw <- read_csv(path, col_types = price_cols)

    # File name = everything after the last "/"; the ticker is that name
    # without its final four characters (the ".txt" suffix).
    slash_hits <- str_locate_all(path, "/")[[1]]
    file_name <- substr(path, max(slash_hits) + 1, nchar(path))
    raw$Ticker <- substr(file_name, 1, nchar(file_name) - 4)

    chunk <- select(raw, Ticker, Date, Open, High, Low, Close,
                    Volume, OpenInt)

    if (first_chunk) {
      GPW_wse_stocks <- chunk
      first_chunk <- FALSE
    } else {
      GPW_wse_stocks <- rbind(GPW_wse_stocks, chunk)
    }
  }

  # The table is stored under the object name GPW_wse_stocks.
  save(GPW_wse_stocks, file = Output_file_name)
  data.frame(GPW_wse_stocks)
}
# Time the read_csv() + incremental rbind() baseline on the generated files.
system.time(
  x <- yourFunction_modified(
    directory = dd,
    Output_file_name = file.path(dirname(dd), "csvReadingTest2.Rdat"),
    What_stocks = "/d[0-9]"
  )
)
# 25 - 18 sec
我的函数:
#' Import many ".txt" price files into one data.table.
#'
#' Lists the ".txt" files below `directory` (recursively, full paths), keeps
#' the paths matching `What_stocks`, drops empty or unreadable files, reads
#' each remaining file with `fread()` and stacks the tables once with
#' `rbindlist()`.
#'
#' @param directory Root directory that is searched recursively.
#' @param Output_file_name File the combined table is written to with `save()`
#'   under the object name `dt`.
#' @param What_stocks Regular expression that the full file path has to match.
#' @return A data.table restricted to the columns Ticker, Date, Open, High,
#'   Low, Close, Volume and OpenInt (those present in the files); `Ticker` is
#'   the file name without ".txt" and `Date` is converted with `as.Date()`.
#'   Stops with an error when no non-empty file is selected.
myFun <- function(directory, Output_file_name, What_stocks) {
  # library() fails loudly when data.table is missing; require() would only
  # return FALSE. Hmisc is no longer attached: it was needed solely for Cs().
  library(data.table)

  # "\\.txt$" is a proper regular expression (the former "*.txt" was a glob).
  l <- list.files(directory, recursive = TRUE, full.names = TRUE,
                  pattern = "\\.txt$")
  l <- l[grepl(What_stocks, l)]
  # An NA size (unreadable file) must not turn into an NA path.
  sizes <- file.size(l)
  l <- l[!is.na(sizes) & sizes > 0]
  if (length(l) == 0L) {
    stop("No non-empty '.txt' file in '", directory, "' matches '",
         What_stocks, "'.", call. = FALSE)
  }

  necessary <- c("Ticker", "Date", "Open", "High", "Low", "Close",
                 "Volume", "OpenInt")
  dtList <- lapply(l, function(i) {
    dat <- fread(i)
    dat$Ticker <- sub("\\.txt$", "", basename(i))
    # Delete unnecessary columns by reference:
    for (ii in setdiff(colnames(dat), necessary)) {
      set(dat, j = ii, value = NULL)
    }
    dat
  })

  dt <- rbindlist(dtList, use.names = TRUE, fill = TRUE, idcol = FALSE)

  # fread() may hand back Date as text or as an already parsed date class,
  # depending on the data.table version; only text goes through the fast
  # parser.
  if (is.character(dt[["Date"]])) {
    dt[, Date := as.Date(fasttime::fastPOSIXct(Date))]
  } else {
    dt[, Date := as.Date(Date)]
  }

  save(dt, file = Output_file_name)
  # `[]` makes the result print normally after the `:=` assignment above.
  dt[]
}
# Time the fread() + rbindlist() implementation on the same files.
system.time(
  x2 <- myFun(
    directory = dd,
    Output_file_name = file.path(dirname(dd), "csvReadingTest2v2.Rdat"),
    What_stocks = "/d[0-9]"
  )
)
# 6 - 4 sec
# Both implementations should produce the same table.
all.equal(as.data.table(x), x2)
# [1] TRUE
答案 1 :(得分:2)
`fread` 非常快,但如果您有大量小文件并且不关心保留文件名,则最好直接使用操作系统。
由于 OP 没有提供数据,这里自行生成:10,000 个文件,每个文件 100 行。
// Fragment of a larger C# routine whose signature is not visible here.
// It sends the SQL text "EXEC pGetMakes" through a DynamicModel and returns
// the rows wrapped in a MakesResponse.
// NOTE(review): `response` is not used anywhere in the visible lines.
var response = new MakesResponse();
// NOTE(review): "SONICAPI" is presumably a connection/configuration name — confirm.
var tbl = new DynamicModel("SONICAPI");
string sql = "EXEC pGetMakes";
var result = tbl.Query(sql);
return new MakesResponse()
{
makes = (string[])result.ToArray(),
// NOTE(review): `sw` is declared outside this fragment — presumably a Stopwatch; confirm.
ExecutionTime = sw.ElapsedMilliseconds,
Result = "200",
ResultText = "OK",
Source = "DB"
};
简单方法(也处理重复的标题并使colClasses更接近事实。)
# Read every path in `files` with fread() and stack the tables row-wise.
rbindlist(l = lapply(X = files, FUN = fread))
使用Windows的系统
setwd(tempdir())
# Build the benchmark fixture: 10,000 CSV files of 100 rows each, written
# into a new sub-directory of the current working directory (which then
# becomes the working directory).
dir.create("48492154")
setwd("48492154")
library(data.table)

# 500 evenly spaced dates from 2012-01-01 until today, kept as strings.
dates <- as.character(
  seq.Date(from = as.Date("2012-01-01"), to = Sys.Date(), length.out = 500)
)

for (i in seq_len(1e4)) {
  # The columns are built in the original order so that the random draws
  # happen in the same sequence.
  DT <- data.table(
    Ticker = seq_len(100),
    Date = sample(dates, size = 100),
    Open = round(100 + runif(100), digits = 1),
    Close = round(100 + runif(100), digits = 1),
    Volume = sample(seq_len(100)),
    OpenInt = seq_len(100)
  )
  # Single-line progress indicator; "\r" returns to the line start.
  cat(i, "of 10,000\r")
  flush.console()
  fwrite(DT, paste0(i, ".csv"), showProgress = FALSE)
}
:
# Timing of the simple approach: fread() every CSV file in the working
# directory and stack the tables with rbindlist().
system.time({
  res <- rbindlist(l = lapply(X = list.files(pattern = "\\.csv"), FUN = fread))
})
#> user system elapsed
#> 5.46 3.17 8.62