Question

是否有 dplyr（或其他包）的命令可以获取 SQL 表中各列（字段）的类型？例如：

# Load the SQLite driver and dplyr.
library(RSQLite)
library(dplyr)

data(iris)

# Open the SQLite database file (create = TRUE) and copy iris into it as
# the table "iris_df".
dat_sql <- src_sqlite(path = "test.sqlite", create = TRUE)
copy_to(dest = dat_sql, df = iris, name = "iris_df")

# Reference the remote table and print a preview of it.
iris_tbl <- tbl(src = dat_sql, from = "iris_df")
iris_tbl
# Source:   query [?? x 5]
# Database: sqlite 3.8.6 [test.sqlite]
# 
#    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#           <dbl>       <dbl>        <dbl>       <dbl>   <chr>
# 1           5.1         3.5          1.4         0.2  setosa
# 2           4.9         3.0          1.4         0.2  setosa
# 3           4.7         3.2          1.3         0.2  setosa
# 4           4.6         3.1          1.5         0.2  setosa
# 5           5.0         3.6          1.4         0.2  setosa
# 6           5.4         3.9          1.7         0.4  setosa
# 7           4.6         3.4          1.4         0.3  setosa
# 8           5.0         3.4          1.5         0.2  setosa
# 9           4.4         2.9          1.4         0.2  setosa
# 10          4.9         3.1          1.5         0.1  setosa
# # ... with more rows

我想要一个命令，它能告诉我前四列的类型为 dbl、最后一列为 chr（最好能直接给出 R 类型 numeric 和 character），而无需实际把数据 collect 到内存中。既然打印输出里已经显示了这些类型，就一定有办法获取它们，对吧？我试过 str，但没有用：

# Inspect the internal structure of the lazy table object.
str(iris_tbl)
# List of 2
#  $ src:List of 2
#   ..$ con :Formal class 'SQLiteConnection' [package "RSQLite"] with 5 slots
#   .. .. ..@ Id                 :<externalptr> 
#   .. .. ..@ dbname             : chr "test.sqlite"
#   .. .. ..@ loadable.extensions: logi TRUE
#   .. .. ..@ flags              : int 6
#   .. .. ..@ vfs                : chr ""
#   ..$ path: chr "test.sqlite"
#   ..- attr(*, "class")= chr [1:3] "src_sqlite" "src_sql" "src"
#  $ ops:List of 3
#   ..$ src :List of 2
#   .. ..$ con :Formal class 'SQLiteConnection' [package "RSQLite"] with 5 slots
#   .. .. .. ..@ Id                 :<externalptr> 
#   .. .. .. ..@ dbname             : chr "test.sqlite"
#   .. .. .. ..@ loadable.extensions: logi TRUE
#   .. .. .. ..@ flags              : int 6
#   .. .. .. ..@ vfs                : chr ""
#   .. ..$ path: chr "test.sqlite"
#   .. ..- attr(*, "class")= chr [1:3] "src_sqlite" "src_sql" "src"
#   ..$ x   :Classes 'ident', 'sql', 'character'  chr "iris_df"
#   ..$ vars: chr [1:5] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" ...
#   ..- attr(*, "class")= chr [1:3] "op_base_remote" "op_base" "op"
#  - attr(*, "class")= chr [1:4] "tbl_sqlite" "tbl_sql" "tbl_lazy" "tbl"
# NULL

Answer 1

查看glimpse()

这就像 print 的转置版本：各列沿页面纵向排列，数据则横向展开。这样就可以看到数据框中的每一列。它有点像把 str 应用于数据框，但会尽可能多地显示数据。（而且它总是显示底层数据，即使应用于远程数据源也是如此。）

给出了：

> glimpse(iris_tbl)
#Observations: NA
#Variables: 5
#$ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0,...
#$ Sepal.Width  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4,...
#$ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5,...
#$ Petal.Width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2,...
#$ Species      <chr> "setosa", "setosa", "setosa", "setosa",...

如果你想得到一个向量，可以这样做：

# Pull only the first rows into a local data frame, then report each column's
# storage type as a named character vector.
iris_tbl %>%
  head() %>%
  as.data.frame() %>%
  vapply(typeof, character(1))

给出了：

#Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
#    "double"     "double"     "double"     "double"  "character"

Answer 2

打印远程表的预览时，dplyr 看起来确实对表的前几行调用了 collect。既然 dplyr 自己也是取回一些样本数据，你也可以这样做。

在这里，我们用 head 只查询前几行，用 collect 取回查询结果，然后检查每一列的类。

# Fetch a handful of rows from the database, then map each column to its class.
iris_tbl %>%
  head() %>%
  collect() %>%
  lapply(class) %>%
  unlist()
#> Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
#>    "numeric"    "numeric"    "numeric"    "numeric"  "character"

（当作用于数据框时，lapply 会逐列应用函数，因此它会把 class 应用于每一列。）

要获取dplyr使用的类型名称，请使用type_sum。

# Same preview rows, but summarise each column with type_sum instead of class.
iris_tbl %>%
  head() %>%
  collect() %>%
  lapply(type_sum) %>%
  unlist()
#> Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
#>        "dbl"        "dbl"        "dbl"        "dbl"        "chr"

使用dplyr获取SQL表的列类型

2 个答案: