Question

我需要反复对一个大型数据框（data frame）按行索引进行切片，并根据某一列的值取子集。我的数据框超过 1e8 行，而这两个步骤似乎最耗时。我想知道有没有办法减少计算时间？

我尝试使用data.table，但setkey需要一些时间，所以我不确定这是否是更好的选择。

library(profvis)

# Profile the whole workflow: building the test data, slicing it by row
# index, and then subsetting the slice on the label column.
profvis({
  library(dplyr)
  set.seed(1234)

  # Test data frame: a label column drawn from "A", "B", "C" and NA, plus a
  # column of standard-normal values.
  n <- 1e8
  raw_data <- data.frame(
    label = sample(c("A", "B", "C", NA), n, replace = TRUE),
    value = rnorm(n)
  )

  # First and last row index of the slice (both inclusive).
  slide_begin <- 3000000
  slide_end <- 9000000

  # Step 1: slice by row index.
  raw_data_sub <- raw_data[slide_begin:slide_end, ]

  # Step 2: subset the slice by label. `== "A"` yields NA for missing labels;
  # which() keeps only the positions that are TRUE, so those rows are dropped.
  raw_data_sub_A <- raw_data_sub[which(raw_data_sub$label == "A"), ]
  raw_data_sub_NA <- raw_data_sub[is.na(raw_data_sub$label), ]
})

Answer 1

我使用以下代码比较了 base R、dplyr 和 data.table 三种对数据框取子集的方法。结果显示 data.table 最快。

# Benchmark three ways of (1) slicing a large data frame by row index and
# (2) subsetting that slice on the `label` column: base R, dplyr and
# data.table.
library(dplyr)
library(data.table)
library(microbenchmark)

# Fix the RNG seed so the simulated data are reproducible.
set.seed(1234)

# Create the test data frame: `label` is sampled from "A", "B", "C" and NA,
# `value` is standard normal.
n <- 1e8
raw_data <- data.frame(label=sample(c("A", "B", "C", NA), n, replace=TRUE), value=rnorm(n))

# First and last row index of the slice (both inclusive).
slide_begin <- 3000000
slide_end <- 9000000

# Convert raw_data to a data.table once, outside the benchmark, so the
# conversion is not included in the m3 timing.
raw_data_dt <- as.data.table(raw_data)

# Each expression performs the same three steps: take rows
# slide_begin:slide_end, keep the rows labelled "A", and keep the rows whose
# label is NA.
#   m1: base R `[` on the data.frame
#   m2: dplyr slice() / filter()
#   m3: data.table `[` on raw_data_dt
# `%in%` is used instead of `==` because it returns FALSE rather than NA for
# missing labels.
microbenchmark(m1 = {raw_data_sub <- raw_data[slide_begin:slide_end,]
                     raw_data_sub_A <-raw_data_sub[which(raw_data_sub$label %in% "A"), ]
                     raw_data_sub_NA <-raw_data_sub[is.na(raw_data_sub$label), ]},
               m2 = {raw_data_sub_tbl <- raw_data %>% slice(slide_begin:slide_end)
                     raw_data_sub_A_tbl <- raw_data_sub_tbl %>% filter(label %in% "A")
                     raw_data_sub_NA_tbl <- raw_data_sub_tbl %>% filter(is.na(label))},
               m3 = {raw_data_sub_dt <- raw_data_dt[slide_begin:slide_end, ]
                     raw_data_sub_A_dt <- raw_data_sub_dt[label %in% "A", ]
                     raw_data_sub_NA_dt <-  raw_data_sub_dt[is.na(label), ]})

**结果**

Unit: milliseconds
 expr      min        lq      mean    median        uq       max neval
   m1 978.6438 1122.0304 1222.5156 1190.4979 1292.8300 1880.6840   100
   m2 392.9376  397.2833  466.8690  409.5631  517.5746 1010.1591   100
   m3 221.8958  225.9560  281.8417  231.0487  281.1333  678.4515   100

在R

1 个答案: