Perhaps the answer is already posted but as I can't understand the problem, I can't find it!
I'm benching R to read a csv file of 150MB and I use csv.read / fread and indeed fread is really faster than read.csv : 134.663 / 7.838 s
(time Rscript "${path2}"/R_readcsv/myR_readcsv.R $Data_folder_csv >> $target) 2>> $target2
But if I measure the time elapsed inside the code, I mean after the loading of R and the libraries I find that read.csv is faster than fread : 1759 / 3828 ms
For me it's unexpected, is there a explanation?
R version 3.5.0, Platform: x86_64-redhat-linux-gnu (64-bit), Xeon and 16G ram.
The code for read.csv : if=1 round, else=several rounds but it's the same measure
# arguments of the command line
args <- commandArgs(trailingOnly=TRUE)
path <- args[1]
runs <- args[3]
if (length(args)==1) {
# time measure start
tmp_start <- Sys.time()
##
# import the data
charset="Latin-1"
h=TRUE
fff = read.csv( paste(path,"file.csv",sep=""),
header = h, check.names = FALSE, encoding = charset)
##
# count rows without blank lines
cat( nrow(fff),"\n" )
##
# count cols with separator into quotes
cat( ncol(fff),"\n" )
##
# time measure end
tmp_end <- Sys.time()
cat( "TIME_INSIDE",round(as.numeric((tmp_end-tmp_start)*1000)),"\n" )
##
}else{
capture.output({
vector <- c()
for (i in 1:runs) {
# time measure start
tmp_start <- Sys.time()
##
# import the data
charset="Latin-1"
h=TRUE
fff = read.csv( paste(path,"file.csv",sep=""),
header = h, check.names = FALSE, encoding = charset)
##
# count rows without blank lines
cat( nrow(fff),"\n" )
##
# count cols with separator into quotes
cat( ncol(fff),"\n" )
##
# time measure end
tmp_end <- Sys.time()
vector <- c(vector,round(as.numeric((tmp_end-tmp_start)*1000)))
##
}
})
cat( "BENCH_TIME_INSIDE",round(mean(vector), digits = 0),"\n" )
}
and the code for fread :
#install.packages("data.table")
library(data.table)
# arguments of the command line
args <- commandArgs(trailingOnly=TRUE)
path <- args[1]
runs <- args[3]
if (length(args)==1) {
# time measure start
tmp_start <- Sys.time()
##
# import the data
charset="Latin-1"
h=TRUE
fff = fread( paste(path,"file.csv",sep=""),
header = h, check.names = FALSE, encoding = charset, showProgress = FALSE, blank.lines.skip = TRUE)
##
# count rows without blank lines
cat( nrow(fff),"\n" )
##
# count cols with separator into quotes
cat( ncol(fff),"\n" )
##
# time measure end
tmp_end <- Sys.time()
cat( "TIME_INSIDE",round(as.numeric((tmp_end-tmp_start)*1000)),"\n" )
##
}else{
capture.output({
vector <- c()
for (i in 1:runs) {
# time measure start
tmp_start <- Sys.time()
##
# import the data
charset="Latin-1"
h=TRUE
fff = fread( paste(path,"file.csv",sep=""),
header = h, check.names = FALSE, encoding = charset, showProgress = FALSE, blank.lines.skip = TRUE)
##
# count rows without blank lines
cat( nrow(fff),"\n" )
##
# count cols with separator into quotes
cat( ncol(fff),"\n" )
##
# time measure end
tmp_end <- Sys.time()
vector <- c(vector,round(as.numeric((tmp_end-tmp_start)*1000)))
##
}
})
cat( "BENCH_TIME_INSIDE",round(mean(vector), digits = 0),"\n" )
}