将数据集过滤为零观测值时,按周和组获取零计数

时间:2019-08-13 10:48:32

标签: tidyverse

我有一个闪亮的应用程序,其中的某些输入选择可能会使数据在进行某些计算时过滤为零观测值。当我按周和组计数观察值,然后尝试将此结果数据集加入下游的另一个数据集时,这会引起问题。

例如,如果下面的dat_2的计算过滤为零观测值,则我得到的group_bycount()如下:

# A tibble: 1 x 3
# Groups:   date, diss_group [1]
  date       diss_group date2_n
  <date>     <fct>        <int>
1 NA         NA               0

当我需要在几周内按组进行零计数时:

    # A tibble: 22 x 3
   date       diss_group date1_n
   <date>     <chr>        <dbl>
 1 2019-05-15 a                0
 2 2019-05-22 a                0
 3 2019-05-29 a                0
 4 2019-06-05 a                0
 5 2019-06-12 a                0
 6 2019-06-19 a                0
 7 2019-06-26 a                0
 8 2019-07-03 a                0
 9 2019-07-10 a                0
10 2019-07-17 a                0
# … with 12 more rows

这是一个例子

library(tidyverse)

# selecting date range via Shiny input
  start <- "2019-05-15"
  end <- "2019-07-25"
  startdate_adjusted <- ymd(start) - lubridate::floor_date(ymd(start), 
                                                           "1 week")

# generate some data
  dat <- data.frame(date1 = c("2019-05-04",
                              "2019-05-15",
                              "2019-05-18",
                              "2019-05-23",
                              "2019-05-23",
                              "2019-06-10",
                              "2019-06-15",
                              "2019-06-25",
                              "2019-06-26"),
                    date2 = c("2019-05-06",
                              "2019-05-16",
                              NA,
                              NA,
                              "2019-06-06",
                              "2019-06-15",
                              NA,
                              "2019-06-29",
                              "2019-06-29"),
                    Total = as.character(rep("Total", 9)),
                    letter = as.character(c("a", "a", "b",
                                            "a", "a", "b",
                                            "a", "a", "b"))
  )

  dat <-
    dat %>%
    mutate(date1 = ymd(as.character(date1))) %>%
    mutate(date2 = ymd(as.character(date2))) %>%
    mutate(Total = factor(Total, levels="Total")) %>%
    mutate(letter = factor(letter, levels=c("a", "b")))

# count 1
  dat_1 <- 
  dat %>%
    filter(!is.na(date1)) %>%
    mutate(date = date(date1)) %>%
    mutate(diss_group = letter) %>%
    filter(!is.na(diss_group)) %>%
    filter(diss_group!="") %>%
    group_by(date = lubridate::floor_date(date, "1 week") + startdate_adjusted,
             diss_group) %>%
    count(name = "date1_n") %>%
    spread(diss_group, date1_n) %>% 
    ungroup() %>%
    padr::pad(interval = "1 week",
              by = "date",
              start_val = ymd(start),
              end_val = ymd(end)
    ) %>% 
    replace(is.na(.), 0) %>%
    gather("diss_group", "date1_n", -date)

 dat_1
 #   # A tibble: 22 x 3
 #  date       diss_group date1_n
 #  <date>     <chr>        <dbl>
 #1 2019-05-15 a                1
 #2 2019-05-22 a                2
 #3 2019-05-29 a                0
 #4 2019-06-05 a                0
 #5 2019-06-12 a                1
 #6 2019-06-19 a                0
 #7 2019-06-26 a                1
 #8 2019-07-03 a                0
 #9 2019-07-10 a                0
 #10 2019-07-17 a                0
 # … with 12 more rows

# count 2, problem from `filter(letter=="c")` 
  dat_2 <- 
  dat %>%
    filter(!is.na(date2)) %>%
    mutate(date = date(date2)) %>%
    mutate(diss_group = letter) %>%
    filter(!is.na(diss_group)) %>%
    filter(diss_group!="") %>%
    filter(letter=="c") %>%          # letter c does not exist
    group_by(date = lubridate::floor_date(date, "1 week") + startdate_adjusted,
             diss_group) %>%
    count(name = "date2_n") %>%
    spread(diss_group, date2_n) %>% 
    ungroup() %>%
    padr::pad(interval = "1 week",
              by = "date",
              start_val = ymd(start),
              end_val = ymd(end)
    ) %>% 
    replace(is.na(.), 0) %>%
    gather("diss_group", "date2_n", -date)

1 个答案:

答案 0 :(得分:0)

.drop = FALSE添加到产生的group_by语句中:

# A tibble: 1 x 3
# Groups:   date [1]
  date           a     b
  <date>     <int> <int>
1 NA             0     0

如果datestart在一起,那么从这里我将is.na(date)设置为等于mutate_if(is.Date, funs(if_else(is.na(.), ymd(start), .)))

dat_2 <- 
dat %>%
  filter(!is.na(date2)) %>%
  mutate(date = date(date2)) %>%
  mutate(diss_group = letter) %>%
  filter(!is.na(diss_group)) %>%
  filter(diss_group!="") %>%
  filter(letter=="c") %>%          # letter c does not exist
  group_by(date = lubridate::floor_date(date, "1 week") + startdate_adjusted,
           diss_group, 
           .drop = FALSE) %>%            #### Added 
  count(name = "date2_n") %>%
  spread(diss_group, date2_n) %>% 
  ungroup() %>%
  mutate_if(is.Date, funs(if_else(is.na(.), ymd(start), .))) %>% #### Added
  padr::pad(interval = "1 week",
            by = "date",
            start_val = ymd(start),
            end_val = ymd(end)
  ) %>% 
  replace(is.na(.), 0) %>%
  gather("diss_group", "date2_n", -date)

产生:

# A tibble: 22 x 3
   date       diss_group date2_n
   <date>     <chr>        <dbl>
 1 2019-05-15 a                0
 2 2019-05-22 a                0
 3 2019-05-29 a                0
 4 2019-06-05 a                0
 5 2019-06-12 a                0
 6 2019-06-19 a                0
 7 2019-06-26 a                0
 8 2019-07-03 a                0
 9 2019-07-10 a                0
10 2019-07-17 a                0
# … with 12 more rows