Question

rbind如何在SQL中创建一个bind_rows或dplyr临时表（在Postgres和SQLite中测试并失败）？

E.g。

library(dplyr)
con <- DBI::dbConnect(RSQLite::SQLite(), path = ":memory:")

copy_to(con, nycflights13::flights, "flights",
        temporary = FALSE, 
        indexes = list(
            c("year", "month", "day"), 
            "carrier", 
            "tailnum",
            "dest"
        )
)

copy_to(con, nycflights13::flights, "flights2",
        temporary = FALSE, 
        indexes = list(
            c("year", "month", "day"), 
            "carrier", 
            "tailnum",
            "dest"
        )
)



flights_db <- tbl(con, "flights")
flights_db_2 <- tbl(con, "flights2")

调用bind_rows会出现以下错误：

> bind_rows(flights_db, flights_db_2)
Error in bind_rows_(x, .id) : 
  Argument 1 must be a data frame or a named atomic vector, not a tbl_dbi/tbl_sql/tbl_lazy/tbl

Answer 1

由于database包含唯一记录，因此此处的对象包括＆＃39;航班＆＃39;航班2＆＃39;是相同的。否则，我们需要

union(flights_db, flights_db_2)

以上内容仅会创建＆＃39; flights_db＆＃39;中的尺寸。因为两个对象都是一样的。如果我们需要创建两倍的行数，那么创建一个唯一标识符

flights1 <- nycflights13::flights %>%
                  mutate(id= 1)

flights2 <- nycflights13::flights %>%
                  mutate(id = 2)

copy_to(con, flights1, "flights",
        temporary = FALSE,
        overwrite = TRUE, 
        indexes = list(
            c("year", "month", "day"), 
            "carrier", 
            "tailnum",
            "dest"
        )
)

copy_to(con, flights2, "flights2",
        temporary = FALSE, 
        overwrite = TRUE,
        indexes = list(
            c("year", "month", "day"), 
            "carrier", 
            "tailnum",
            "dest"
        )
)

flights_db <- tbl(con, "flights")
flights_db_2 <- tbl(con, "flights2")

现在我们执行union

union(flights_db, flights_db_2) %>% 
                  summarise(n = n())
# Source:   lazy query [?? x 1]
# Database: sqlite 3.19.3 []
#       n
#   <int>
#1 673552

dim(nycflights13::flights)
#[1] 336776     19

为了证明唯一性，我们可以为两个对象选择一小部分脱节行，然后执行union

copy_to(con, nycflights13::flights[1:20,], "flights",
        temporary = FALSE,
        overwrite = TRUE, 
        indexes = list(
            c("year", "month", "day"), 
            "carrier", 
            "tailnum",
            "dest"
        )
)

copy_to(con, nycflights13::flights[21:30,], "flights2",
        temporary = FALSE, 
        overwrite = TRUE,
        indexes = list(
            c("year", "month", "day"), 
            "carrier", 
            "tailnum",
            "dest"
        )
)


flights_db <- tbl(con, "flights")
flights_db_2 <- tbl(con, "flights2")
union(flights_db, flights_db_2) %>%
          collect
# A tibble: 30 x 19
#    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin  dest air_time distance
#   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl>   <chr>  <int>   <chr>  <chr> <chr>    <dbl>    <dbl>
# 1  2013     1     1      517            515         2      830            819        11      UA   1545  N14228    EWR   IAH      227     1400
# 2  2013     1     1      533            529         4      850            830        20      UA   1714  N24211    LGA   IAH      227     1416
# 3  2013     1     1      542            540         2      923            850        33      AA   1141  N619AA    JFK   MIA      160     1089
# 4  2013     1     1      544            545        -1     1004           1022       -18      B6    725  N804JB    JFK   BQN      183     1576
# 5  2013     1     1      554            558        -4      740            728        12      UA   1696  N39463    EWR   ORD      150      719
# 6  2013     1     1      554            600        -6      812            837       -25      DL    461  N668DN    LGA   ATL      116      762
# 7  2013     1     1      555            600        -5      913            854        19      B6    507  N516JB    EWR   FLL      158     1065
# 8  2013     1     1      557            600        -3      709            723       -14      EV   5708  N829AS    LGA   IAD       53      229
# 9  2013     1     1      557            600        -3      838            846        -8      B6     79  N593JB    JFK   MCO      140      944
#10  2013     1     1      558            600        -2      753            745         8      AA    301  N3ALAA    LGA   ORD      138      733
# ... with 20 more rows, and 3 more variables: hour <dbl>, minute <dbl>, time_hour <dbl>

Answer 2

感谢Akrun将我指向union家庭，可以在某种程度上复制bind_rows：

Reduce(union_all, list(flights_db, flights_db, flights_db))

正如评论中所述，在Akrun的回答中，union会在结果中生成唯一记录，union_all相当于SQL的{{1}}。

Union使用dplyr创建的所有SQL临时表？

2 个答案: