使用循环,根据多个列的值提取特定日期范围内的数据

时间:2018-11-12 16:11:11

标签: r dplyr

首先,感谢您抽出宝贵时间查看/回答我的问题。

我之前问过这个问题,但当时表述得不够清楚。现在我认为自己已经接近解决方案了,希望您能帮我一把!

我有2个主要df:客户(基本表)和Top_Customers(每季度前n位客户的子集)

两个表具有相同的布局,如下所示:

Cust_ID   Date    QTR     Sales    Action   Link_Cust_ID
  1      1/1/18  2018 Q1   23       NA       NA
  1      1/2/18  2018 Q1   22.2     NA       NA
  1      1/3/18  2018 Q1   12.1     2        5
  1      1/4/18  2018 Q1   14.1     5        NA
  3      1/1/18  2018 Q1   101      NA       NA
  3      1/2/18  2018 Q1   55       2        18
 ...      ...      ...     ...     ...      ...

有时,一个客户可能引用了另一个客户,因此 Link_Cust_ID 列可能会填入另一个客户的 ID。
我的目标是:如果 Action 列 == 2,我想把 Link_Cust_ID 所指的那位客户加入我的 Top_Customers 表,时间范围是从 Link_Cust_ID 被填充的那一天起,到该季度的最后一天为止。

例如,按照上面的表格,我会把 Cust_ID = 5 在 1/3/18 到 3/31/18(季度末)之间的数据包括进来。

我已经尝试了一段时间,并写出了以下代码(目前还不能运行,但我认为思路已经有了)

# Linking_ID: the asker's draft (stated above to be non-working) for appending
# a linked customer's rows to `data`.
#   data: data frame read via $link_type, $link_cust_id and $Date.
# `customer_data` is not a parameter; it is looked up outside the function.
# NOTE(review): `i` is never defined here, so the `[i]` lookups have no row
# index to work with -- presumably a per-row loop was intended.
Linking_ID <- function(data)
{
# NOTE(review): this compares the whole link_type column, while `if` needs a
# single TRUE/FALSE; TODO confirm whether a per-row test was intended.
if (data$link_type == 2)
{
temp.linkid <- data$link_cust_id[i] # the linked customer's id
temp.date   <- data$Date[i] # the date on which the link occurs
# Intended to keep only the linked customer's rows from the link date to the
# end of the quarter.
# NOTE(review): `customer_id = temp.linkid` uses `=` (argument naming), not the
# comparison `==`, and no upper date bound is applied -- TODO confirm intent.
temp.data   <- customer_data %>% group_by(Quarter) %>% filter(customer_id = temp.linkid & Date >= temp.date)
# Rebinds only the local `data`; nothing is explicitly returned.
data <- rbind(data, temp.data)
}}

我对循环不太熟悉,平时也尽量避免在代码中过多使用循环,但是在这种情况下,我可能别无选择。如果您认为另一种方法可能更好,请提出建议!

基本表(所有客户)的 dput 输出:

    # dput() output of the base table: 16 rows, four dated rows for each of
    # Cust_ID 1, 2, 3 and 5; Action == 2 appears on rows 3 and 10.
    structure(list(Cust_ID = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 5, 5, 5, 5), 
               Date = structure(c(1514764800, 1514851200, 1514937600, 1515024000, 
                                  1514764800, 1514851200, 1514937600, 1515024000, 
                                  1514764800, 1514851200, 1514937600, 1515024000, 
                                  1514764800, 1514851200, 1514937600, 1515024000), 
                                class = c("POSIXct", "POSIXt"), tzone = "UTC"),
               Quarter = c("2018 Q1", "2018 Q1", "2018 Q1", "2018 Q1", "2018 Q1", 
                           "2018 Q1", "2018 Q1", "2018 Q1", "2018 Q1", "2018 Q1", 
                           "2018 Q1", "2018 Q1", "2018 Q1", "2018 Q1", "2018 Q1", 
                           "2018 Q1"), 
               Sales = c(23, 22.2, 12.1, 14.1, 18, 18.8, 19.2, 19.8, 101, 55, 56, 
                         55, NA, NA, 10.5, 11.1), 
               Action = c(NA, NA, 2, NA, NA, NA, NA, NA, NA, 2, NA, NA, NA, NA, NA, NA), 
               Link_Cust_ID = c(NA, NA, 5, NA, NA, NA, NA, NA, NA, 18, NA, NA, NA, NA, NA, NA)), 
          row.names = c(NA, -16L), class = c("tbl_df", "tbl", "data.frame"))

top_customer 表的 dput 输出:

# dput() output of the top-customer subset: 8 rows, four dated rows for each
# of Cust_ID 1 and 3; Action == 2 appears on rows 3 and 6.
structure(list(Cust_ID = c(1, 1, 1, 1, 3, 3, 3, 3), Date = 
structure(c(1514764800, 
1514851200, 1514937600, 1515024000, 1514764800, 1514851200, 1514937600, 
1515024000), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
Quarter = c("2018 Q1", "2018 Q1", "2018 Q1", "2018 Q1", "2018 Q1", 
"2018 Q1", "2018 Q1", "2018 Q1"), Sales = c(23, 22.2, 12.1, 
14.1, 101, 55, 56, 55), Action = c(NA, NA, 2, NA, NA, 2, 
NA, NA), Link_Cust_ID = c(NA, NA, 5, NA, NA, 18, NA, NA)), row.names = c(NA, 
-8L), class = c("tbl_df", "tbl", "data.frame"))

2 个答案:

答案 0 :(得分:1)

我认为这可能会有所帮助。您无需在此处执行循环。

# Base table of all customers: four daily rows (2018-01-01 to 2018-01-04, UTC)
# for each of customers 1, 2, 3 and 5, all labelled quarter "2018 Q1".
all_cust <- structure(
  list(
    Cust_ID = rep(c(1, 2, 3, 5), each = 4),
    Date = structure(
      rep(c(1514764800, 1514851200, 1514937600, 1515024000), times = 4),
      class = c("POSIXct", "POSIXt"),
      tzone = "UTC"
    ),
    Quarter = rep("2018 Q1", times = 16),
    Sales = c(
      23, 22.2, 12.1, 14.1,
      18, 18.8, 19.2, 19.8,
      101, 55, 56, 55,
      NA, NA, 10.5, 11.1
    ),
    # Rows 3 and 10 carry Action 2 and link to customers 5 and 18.
    Action = replace(rep(NA_real_, 16), c(3, 10), 2),
    Link_Cust_ID = replace(rep(NA_real_, 16), c(3, 10), c(5, 18))
  ),
  row.names = c(NA, -16L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Top-customer subset: the four daily rows of customers 1 and 3.
top_cust <- structure(
  list(
    Cust_ID = rep(c(1, 3), each = 4),
    Date = structure(
      rep(c(1514764800, 1514851200, 1514937600, 1515024000), times = 2),
      class = c("POSIXct", "POSIXt"),
      tzone = "UTC"
    ),
    Quarter = rep("2018 Q1", times = 8),
    Sales = c(23, 22.2, 12.1, 14.1, 101, 55, 56, 55),
    # Rows 3 and 6 carry Action 2 and link to customers 5 and 18.
    Action = replace(rep(NA_real_, 8), c(3, 6), 2),
    Link_Cust_ID = replace(rep(NA_real_, 8), c(3, 6), c(5, 18))
  ),
  row.names = c(NA, -8L),
  class = c("tbl_df", "tbl", "data.frame")
)


library(dplyr)

# Rows of the top-customer table on which a link is created (Action == 2).
top_cust2 <- filter(top_cust, Action == 2)

# Pair every customer's rows in all_cust with the link rows that reference that
# customer, then keep only rows dated on or after the link date and lying in
# the same quarter. Columns present in both inputs come back with a ".x"
# (all_cust) or ".y" (top_cust2) suffix.
bth <- inner_join(all_cust, top_cust2,
                  by = c("Cust_ID" = "Link_Cust_ID")) %>%
  filter(Date.x >= Date.y, Quarter.x == Quarter.y)

# Drop the columns contributed by the link rows. The dot is escaped and the
# pattern anchored so that only a literal ".y" suffix matches; an unescaped
# ".y" would also hit any name containing a "y" (e.g. "Country").
bth <- bth[, !grepl("\\.y$", colnames(bth))]

# Strip the ".x" suffix from the remaining columns, again matching only a
# literal trailing ".x" so names such as "Tax" or "Index" are left intact.
colnames(bth) <- sub("\\.x$", "", colnames(bth))

# Append the linked customers' rows to the top-customer table.
cmb <- bind_rows(top_cust, bth)

答案 1 :(得分:1)

您可以执行以下操作:

首先,提取那些具有与操作2关联的客户ID的客户,并设置季度结束日期。

library(dplyr) 

# One row per link event (Action == 2) in `customer`:
#   Link_Cust_ID      - the customer being linked in
#   New_Cust_ID       - the customer whose row created the link
#   Start_date        - the date of the linking row
#   Quarter_end_dates - last calendar day of the quarter named in `Quarter`
# The year is taken from the first four characters of `Quarter` (labels in the
# sample data look like "2018 Q1") instead of being fixed to 2018, so other
# years get the right quarter end. An explicit format makes as.Date() yield NA
# for labels that do not produce a valid date.
link_ids_action2 <- customer %>%
  select(Cust_ID, Action, Link_Cust_ID, Date, Quarter) %>%
  filter(Action == 2) %>%
  mutate(
    Quarter_end_dates = paste0(
      substr(Quarter, 1, 4),
      case_when(
        grepl("Q1", Quarter) ~ "-03-31",
        grepl("Q2", Quarter) ~ "-06-30",
        grepl("Q3", Quarter) ~ "-09-30",
        grepl("Q4", Quarter) ~ "-12-31"
      )
    ),
    Quarter_end_dates = as.Date(Quarter_end_dates, format = "%Y-%m-%d")
  ) %>%
  select(Link_Cust_ID, New_Cust_ID = Cust_ID, Start_date = Date, Quarter_end_dates)

link_ids_action2
# A tibble: 2 x 4
#   Link_Cust_ID New_Cust_ID Start_date          Quarter_end_dates
#          <dbl>       <dbl> <dttm>              <date>           
# 1            5           1 2018-01-03 00:00:00 2018-03-31       
# 2           18           3 2018-01-02 00:00:00 2018-03-31     

对原始数据框执行右联接(right join),再按日期过滤,这样得到的就只有被链接的客户在相应日期范围内的记录。我必须加上 as.Date(),才能正确地对您的日期时间进行过滤。最后只选择 top_customer 数据所需的列。

# Attach each link record to the linked customer's rows in `customer`, keep
# the rows dated from the link's start date through the quarter end, and
# return them in the column layout of the top-customer table.
new_top_customers <- customer %>%
  right_join(link_ids_action2, by = c("Cust_ID" = "Link_Cust_ID")) %>%
  filter(as.Date(Date) >= as.Date(Start_date)) %>%
  filter(as.Date(Date) <= Quarter_end_dates) %>%
  select(Cust_ID, Date, Quarter, Sales, Action, Link_Cust_ID)

new_top_customers
# A tibble: 2 x 6
#   Cust_ID Date                Quarter Sales Action Link_Cust_ID
#     <dbl> <dttm>              <chr>   <dbl>  <dbl>        <dbl>
# 1       5 2018-01-03 00:00:00 2018 Q1  10.5     NA           NA
# 2       5 2018-01-04 00:00:00 2018 Q1  11.1     NA           NA

现在,用 bind_rows 添加新的主要客户:

# Stack the newly linked customers' rows underneath the existing top customers.
top_customer %>%
  bind_rows(new_top_customers)

# A tibble: 10 x 6
#    Cust_ID Date                Quarter Sales Action Link_Cust_ID
#      <dbl> <dttm>              <chr>   <dbl>  <dbl>        <dbl>
#  1       1 2018-01-01 00:00:00 2018 Q1  23       NA           NA
#  2       1 2018-01-02 00:00:00 2018 Q1  22.2     NA           NA
#  3       1 2018-01-03 00:00:00 2018 Q1  12.1      2            5
#  4       1 2018-01-04 00:00:00 2018 Q1  14.1     NA           NA
#  5       3 2018-01-01 00:00:00 2018 Q1 101       NA           NA
#  6       3 2018-01-02 00:00:00 2018 Q1  55        2           18
#  7       3 2018-01-03 00:00:00 2018 Q1  56       NA           NA
#  8       3 2018-01-04 00:00:00 2018 Q1  55       NA           NA
#  9       5 2018-01-03 00:00:00 2018 Q1  10.5     NA           NA
# 10       5 2018-01-04 00:00:00 2018 Q1  11.1     NA           NA

数据

# Base table of all customers: four daily rows (2018-01-01 to 2018-01-04, UTC)
# for each of customers 1, 2, 3 and 5, all labelled quarter "2018 Q1".
customer <- structure(
  list(
    Cust_ID = rep(c(1, 2, 3, 5), each = 4),
    Date = structure(
      rep(c(1514764800, 1514851200, 1514937600, 1515024000), times = 4),
      class = c("POSIXct", "POSIXt"),
      tzone = "UTC"
    ),
    Quarter = rep("2018 Q1", times = 16),
    Sales = c(
      23, 22.2, 12.1, 14.1,
      18, 18.8, 19.2, 19.8,
      101, 55, 56, 55,
      NA, NA, 10.5, 11.1
    ),
    # Rows 3 and 10 carry Action 2 and link to customers 5 and 18.
    Action = replace(rep(NA_real_, 16), c(3, 10), 2),
    Link_Cust_ID = replace(rep(NA_real_, 16), c(3, 10), c(5, 18))
  ),
  row.names = c(NA, -16L),
  class = c("tbl_df", "tbl", "data.frame")
)


# Top-customer subset: the four daily rows of customers 1 and 3.
top_customer <- structure(
  list(
    Cust_ID = rep(c(1, 3), each = 4),
    Date = structure(
      rep(c(1514764800, 1514851200, 1514937600, 1515024000), times = 2),
      class = c("POSIXct", "POSIXt"),
      tzone = "UTC"
    ),
    Quarter = rep("2018 Q1", times = 8),
    Sales = c(23, 22.2, 12.1, 14.1, 101, 55, 56, 55),
    # Rows 3 and 6 carry Action 2 and link to customers 5 and 18.
    Action = replace(rep(NA_real_, 8), c(3, 6), 2),
    Link_Cust_ID = replace(rep(NA_real_, 8), c(3, 6), c(5, 18))
  ),
  row.names = c(NA, -8L),
  class = c("tbl_df", "tbl", "data.frame")
)