Question

我正在寻找有关在有多个类时如何将列类分配给data.table的指导。因此，例如，如果我只想分配一个，那就很简单了：

# Load data.table for as.data.table() and the dt[i, j] syntax used below.
library(data.table)

# Convert the built-in iris data set to a data.table and show its column classes.
dt <- as.data.table(iris)
str(dt)
#> Classes 'data.table' and 'data.frame':   150 obs. of  5 variables:
#>  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#>  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#>  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#>  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#>  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
#>  - attr(*, ".internal.selfref")=<externalptr>

# Single target class: run as.character() over every column through .SD.
dtnew <- dt[, lapply(.SD, as.character )]
str(dtnew)
#> Classes 'data.table' and 'data.frame':   150 obs. of  5 variables:
#>  $ Sepal.Length: chr  "5.1" "4.9" "4.7" "4.6" ...
#>  $ Sepal.Width : chr  "3.5" "3" "3.2" "3.1" ...
#>  $ Petal.Length: chr  "1.4" "1.4" "1.3" "1.5" ...
#>  $ Petal.Width : chr  "0.2" "0.2" "0.2" "0.2" ...
#>  $ Species     : chr  "setosa" "setosa" "setosa" "setosa" ...
#>  - attr(*, ".internal.selfref")=<externalptr>

请考虑以下情况：其中可能有一个像这样的列类向量：

# Desired class for each column of dt, listed in column order.
col_classes <- c("character", "character", "numeric", "factor", "character")

我想将这些列类应用于dt对象，但不知道在data.table中该用什么合适的方法来实现。

谢谢。

^{由reprex package（v0.3.0）于2019-12-03创建}

下面有两个很好的答案。我认为可以进行一些基准测试来确定采用哪种方法：

library(data.table)


## options
#' Convert every column of `d` to the class named in `col_classes`,
#' looking the columns up by name.
#'
#' @param d A data.table (the benchmarks below pass data.tables).
#' @param col_classes Vector with one class name per column of `d`, in column
#'   order. An entry `"x"` is resolved to the function `as.x` via match.fun().
#' @return A data.table holding the columns of `d`, each passed through its
#'   `as.*` converter.
foo <- function(d, col_classes) {
  # Fail early with a clear message rather than with an obscure error raised
  # later by `names<-` or by the per-column lookup.
  if (length(col_classes) != length(d)) {
    stop(
      "`col_classes` must contain exactly one class per column of `d`.",
      call. = FALSE
    )
  }

  # Named lookup table: column name -> target class.
  cc <- setNames(col_classes, names(d))

  convert_column <- function(n) {
    convert <- match.fun(sprintf("as.%s", cc[[n]]))
    convert(d[[n]])
  }

  # setNames(nm = x) names the vector by itself, so lapply() returns a list
  # keyed by column name.
  res <- lapply(setNames(nm = names(cc)), convert_column)

  # setDT() converts the list to a data.table by reference; the trailing `[]`
  # makes the returned table visible (setDT() returns invisibly).
  setDT(res)[]
}

#' Convert every column of `d` to the class named in `col_classes`,
#' pairing classes with columns by position inside `j`.
#'
#' @param d A data.table.
#' @param col_classes Vector with one class name per column of `d`, in column
#'   order. An entry `"x"` is resolved to the function `as.x` via match.fun().
#' @return A data.table with the same column names as `d`, each column passed
#'   through its `as.*` converter.
bar <- function(d, col_classes) {
  # Map() recycles a shorter argument, so without this check a class vector
  # of the wrong length could be applied cyclically without any error.
  if (length(col_classes) != length(d)) {
    stop(
      "`col_classes` must contain exactly one class per column of `d`.",
      call. = FALSE
    )
  }

  # Resolve the converter functions once, before entering `j`.
  converters <- lapply(paste0("as.", col_classes), match.fun)

  # .SD goes first so that Map() names the result after the columns; this
  # removes the need for a separate setNames(..., names(d)) step.
  d[, Map(function(column, convert) convert(column), .SD, converters)]
}

## Attempt one: benchmark both approaches on the small (150-row) iris data set
dt <- as.data.table(iris)


# Target class for each of the five iris columns, in column order.
col_classes <- c('character', 'character', 'numeric', 'factor', 'character')

# Time foo() and bar() on the same inputs, 10 iterations each.
bench::mark(
  foo(dt, col_classes),
  bar(dt, col_classes),
  iterations = 10
)
#> # A tibble: 2 x 6
#>   expression                min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>           <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 foo(dt, col_classes)  553.2us  594.3us      799.    1.64MB      0  
#> 2 bar(dt, col_classes)   1.07ms   1.82ms      407.  365.71KB     45.2




## Number of rows in the synthetic table; increase it to stress-test further
number_of_rows <- 1E7

## Create fake data, naming the columns directly in the constructor
fake_data <- data.table(
  RAW_SCORE = sample(1:100, number_of_rows, replace = TRUE),
  BIRTHYEAR = sample(1900:2000, number_of_rows, replace = TRUE),
  TYPE = sample(c("MALE", "FEMALE"), number_of_rows, replace = TRUE),
  CLASS = sample(c("E", "M", "H"), number_of_rows, replace = TRUE)
)

## One target class per column, in column order
col_classes <- c("numeric", "character", "factor", "factor")

## Compare both implementations on the large table, 10 iterations each
bench::mark(
  foo(fake_data, col_classes),
  bar(fake_data, col_classes),
  iterations = 10
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 x 6
#>   expression                       min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                  <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 foo(fake_data, col_classes)    4.72s    7.34s     0.143     943MB    0.357
#> 2 bar(fake_data, col_classes)     7.2s    7.62s     0.131    1019MB    0.368

^{由reprex package（v0.3.0）于2019-12-04创建}

Answer 1

将col_classes转换为函数名称，将其与match.fun匹配并应用于每列

# Build the converter names ("as.character", "as.numeric", ...) from
# col_classes, resolve each one with match.fun(), and apply it to the
# corresponding column of .SD. Map() names its result after its first
# argument (the function names), so setNames() puts the column names back.
dt_temp <- dtnew[, setNames(Map(function(x, y) match.fun(x)(y), 
                   paste0("as.", col_classes), .SD), names(dtnew))]

str(dt_temp)
#Classes ‘data.table’ and 'data.frame': 150 obs. of  5 variables:
# $ Sepal.Length: chr  "5.1" "4.9" "4.7" "4.6" ...
# $ Sepal.Width : chr  "3.5" "3" "3.2" "3.1" ...
# $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
# $ Petal.Width : Factor w/ 22 levels "0.1","0.2","0.3",..: 2 2 2 2 2 4 3 2 2 1 ...
# $ Species     : chr  "setosa" "setosa" "setosa" "setosa" ...
# - attr(*, ".internal.selfref")=<externalptr>

PS-必须有一种更好的保留列名的方法，而不是使用setNames。

Answer 2

与@RonakShah的答案相同，但假设OP已显式命名列而不是按位置传递：

# different input format: a named vector mapping column name -> target class
cc <- setNames(col_classes, names(dtnew))

# usage: convert each column by name, then turn the list into a data.table
res <- lapply(setNames(nm = names(cc)), function(n) {
  convert <- match.fun(sprintf("as.%s", cc[[n]]))
  convert(dtnew[[n]])
})
setDT(res)[]

可以通过其他方式解决问题：

如果是在读取数据，请使用fread()或类似函数的colClasses=参数。
也许还可以考虑type.convert()，它会自动猜测每一列的类并加以应用。但是，它不能返回字符列和因子列的混合。

从data.table中的向量设置多个列类

2 个答案: