Unexpected results in benchmark of read.csv / fread

时间:2018-07-24 10:18:05

标签: r csv fread read.csv

Perhaps the answer is already posted but as I can't understand the problem, I can't find it!

I'm benchmarking how long R takes to read a 150 MB CSV file with read.csv and with fread. Timed from the shell, fread is indeed much faster than read.csv: 7.838 s versus 134.663 s.

(time Rscript "${path2}"/R_readcsv/myR_readcsv.R $Data_folder_csv >> $target) 2>> $target2

But if I measure the elapsed time inside the script, i.e. after R and the libraries have been loaded, I find that read.csv is faster than fread: 1759 ms versus 3828 ms.

This is unexpected to me; is there an explanation?

R version 3.5.0, Platform: x86_64-redhat-linux-gnu (64-bit), Xeon and 16G ram.

The code for read.csv (the `if` branch does a single run, the `else` branch does several runs, but the measurement is the same in both):

# Benchmark script: time how long read.csv() takes to load "file.csv".
#
# Command line (trailing arguments):
#   args[1]  path prefix; "file.csv" is appended to it verbatim, so it is
#            expected to end with a path separator
#   args[3]  number of timed runs; only used when more than one argument
#            is supplied
#
# With exactly one argument the file is read once; the row count, the column
# count and "TIME_INSIDE <ms>" are printed. Otherwise the read is repeated
# `runs` times, the per-run output is discarded, and
# "BENCH_TIME_INSIDE <mean ms>" is printed.
args <- commandArgs(trailingOnly = TRUE)
path <- args[1]
runs <- args[3]

if (length(args) < 1L) {
  stop("usage: Rscript <script> <path-prefix> [<unused> <runs>]",
       call. = FALSE)
}

csv_file <- paste0(path, "file.csv")

# Read `csv_file` once with read.csv(), print its dimensions, and return the
# elapsed wall-clock time in whole milliseconds.
time_read_csv_ms <- function(csv_file) {
  # time measure start
  tmp_start <- Sys.time()

  # import the data
  # read.csv() only recognises "latin1" / "UTF-8" for `encoding`; the
  # "Latin-1" spelling belongs to data.table::fread() and is silently
  # ignored here, so use the base R spelling.
  fff <- read.csv(csv_file, header = TRUE, check.names = FALSE,
                  encoding = "latin1")

  # count rows (read.csv() skips blank lines by default)
  cat(nrow(fff), "\n")
  # count cols (separators inside quoted fields do not split a column)
  cat(ncol(fff), "\n")

  # time measure end
  tmp_end <- Sys.time()

  # `tmp_end - tmp_start` is a difftime whose unit is picked automatically
  # (secs, mins, hours, ...). A read that takes longer than a minute would
  # be reported in minutes and then mislabelled as milliseconds. Force
  # seconds before converting.
  elapsed_secs <- as.numeric(difftime(tmp_end, tmp_start, units = "secs"))
  round(elapsed_secs * 1000)
}

if (length(args) == 1L) {
  cat("TIME_INSIDE", time_read_csv_ms(csv_file), "\n")
} else {
  # Command-line arguments are character strings: parse and validate the
  # run count instead of letting `1:runs` fail or count backwards.
  runs <- suppressWarnings(as.integer(runs))
  if (is.na(runs) || runs < 1L) {
    stop("the third argument must be a positive number of runs",
         call. = FALSE)
  }

  # capture.output() swallows the per-run row/column counts so that only the
  # summary line reaches stdout.
  invisible(capture.output({
    timings <- vapply(
      seq_len(runs),
      function(i) time_read_csv_ms(csv_file),
      numeric(1)
    )
  }))
  cat("BENCH_TIME_INSIDE", round(mean(timings), digits = 0), "\n")
}

and the code for fread :

#install.packages("data.table")
library(data.table)

# Benchmark script: time how long data.table::fread() takes to load
# "file.csv".
#
# Command line (trailing arguments):
#   args[1]  path prefix; "file.csv" is appended to it verbatim, so it is
#            expected to end with a path separator
#   args[3]  number of timed runs; only used when more than one argument
#            is supplied
#
# With exactly one argument the file is read once; the row count, the column
# count and "TIME_INSIDE <ms>" are printed. Otherwise the read is repeated
# `runs` times, the per-run output is discarded, and
# "BENCH_TIME_INSIDE <mean ms>" is printed.
args <- commandArgs(trailingOnly = TRUE)
path <- args[1]
runs <- args[3]

if (length(args) < 1L) {
  stop("usage: Rscript <script> <path-prefix> [<unused> <runs>]",
       call. = FALSE)
}

csv_file <- paste0(path, "file.csv")

# Read `csv_file` once with fread(), print its dimensions, and return the
# elapsed wall-clock time in whole milliseconds.
time_fread_ms <- function(csv_file) {
  # time measure start
  tmp_start <- Sys.time()

  # import the data
  fff <- fread(csv_file, header = TRUE, check.names = FALSE,
               encoding = "Latin-1", showProgress = FALSE,
               blank.lines.skip = TRUE)

  # count rows (blank lines are skipped via blank.lines.skip = TRUE)
  cat(nrow(fff), "\n")
  # count cols (separators inside quoted fields do not split a column)
  cat(ncol(fff), "\n")

  # time measure end
  tmp_end <- Sys.time()

  # `tmp_end - tmp_start` is a difftime whose unit is picked automatically
  # (secs, mins, hours, ...). A read that takes longer than a minute would
  # be reported in minutes and then mislabelled as milliseconds. Force
  # seconds before converting.
  elapsed_secs <- as.numeric(difftime(tmp_end, tmp_start, units = "secs"))
  round(elapsed_secs * 1000)
}

if (length(args) == 1L) {
  cat("TIME_INSIDE", time_fread_ms(csv_file), "\n")
} else {
  # Command-line arguments are character strings: parse and validate the
  # run count instead of letting `1:runs` fail or count backwards.
  runs <- suppressWarnings(as.integer(runs))
  if (is.na(runs) || runs < 1L) {
    stop("the third argument must be a positive number of runs",
         call. = FALSE)
  }

  # capture.output() swallows the per-run row/column counts so that only the
  # summary line reaches stdout.
  invisible(capture.output({
    timings <- vapply(
      seq_len(runs),
      function(i) time_fread_ms(csv_file),
      numeric(1)
    )
  }))
  cat("BENCH_TIME_INSIDE", round(mean(timings), digits = 0), "\n")
}

0 个答案:

没有答案