将日期与R中的不同数据框匹配

时间:2018-08-16 16:52:10

标签: r datetime dplyr

我有两个数据帧。第一个数据帧看起来像这样。

user_id   date  
1         2016-12-30
2         2016-08-01

第二个数据帧看起来像这样

user_id    date         total     type 
1          2016-12-19   100       1
1          2016-11-02   200       2
1          2016-10-18   50        1
1          2016-07-15   100       3
1          2016-01-21   200       1
1          2016-01-18   152       2
2          2016-08-01   30        4
2          2016-01-29   133       2

我想按user_id匹配两个数据框:以第一个数据框中每个用户的日期为基准,从第二个数据框中选出该日期之前最近3个月内的记录,并按type汇总total。

我已经尝试将其分组,但是我仍然无法将日期与数据框进行比较。

library(dplyr)
library(lubridate)

# First attempt: total per user and type. The sample data names the column
# `type`, so group on that.
df %>%
  group_by(user_id, type) %>%
  summarise(total = sum(total))

# Second attempt: keep only rows from the last three months. Note that the
# cutoff is computed from today(), not from the per-user date held in the
# other data frame.
df %>%
  select(user_id, date, total, type) %>%
  filter(date >= today() - months(3))

我正在寻找的结果是:

user_id total type
1       150   1  
1       200   2  
1       0     3
1       0     4
2       0     1
2       0     2
2       0     3
2       30    4    

4 个答案:

答案 0 :(得分:0)

这可以满足您的要求,尽管它不会补全数据中不存在的user_id / type组合。大部分操作只是一个简单的left_join,外加几个mutate步骤,用于将date的值转换为真正的日期对象。

关键在于最后的summarize步骤。我们对每个组的total值求和,但在求和之前,先将其乘以一个logical值,即“date是否在key_date之前的三个月内”这一测试的结果。如果结果为TRUE,其值=1,该total会计入总和;如果结果为FALSE,其值=0,该total不计入总和。

我们也可以使用filter删除未通过此测试的行,但下面的写法即使total == 0也会保留所有user_id / type组合:

df1 %>% 
  mutate(date = ymd(date)) %>% 
  rename(key_date = date) %>% 
  left_join(df2, by='user_id') %>% 
  mutate(date = ymd(date)) %>% 
  group_by(user_id, type) %>% 
  summarize(total = sum(total * (date > key_date - months(3))))

# A tibble: 5 x 3
# Groups:   user_id [?]
  user_id  type total
    <int> <int> <int>
1       1     1   150
2       1     2   200
3       1     3     0
4       2     2     0
5       2     4    30

// Parameterised INSERT for one record_pickup row. The ten placeholders are
// bound positionally below, in the same order as the column list.
$sqlPickInsert = "INSERT INTO record_pickup(pickup_id, order_code, customer_id,
                                s_date, r_date, payment_amount, payment_mode, 
                                pickup_comments, branch, order_status) 
                               VALUES (?,?,?,?,?,?,?,?,?,?)";

    // Debug dump of the statement and of every value about to be bound.
    echo "$sqlPickInsert <br>";
    echo "pickupID $nullvalues" ."<br>";
    echo "ordercode $newPickupCode" ."<br>";
    echo "customer ID $finalexistCustID" ."<br>";
    echo "loginDate $login_Date" ."<br>";
    echo "pickup $finalpick_date" ."<br>";
    echo "amount $newExpenditures" ."<br>";
    echo "paymode $paymentMode" ."<br>";
    echo "notes$finalpick_notes" ."<br>";
    echo "login branch$login_Branch" ."<br>";
    echo "status $nullvalues" ."<br>";

    $stmt1 = $conn->prepare($sqlPickInsert);
    // Check the statement that was just prepared. The original tested the
    // undefined $stmt, which is always falsy, so the script died here even
    // when prepare() had succeeded.
    if (!$stmt1) die ('prepare() failed!');

    // Each failure message names the parameter that failed to bind, so the
    // failing placeholder can be identified from the output.
    $checkID = $stmt1->bindParam(1, $nullvalues,PDO::PARAM_INT); //id
    if (!$checkID) die ('bindParam() ID failed!<br>');

    $checkCODE=$stmt1->bindParam(2, $newPickupCode,PDO::PARAM_STR);//ordercode
    if (!$checkCODE) die ('bindParam() order code failed!<br>');

    $checkCUSTID= $stmt1->bindParam(3, $finalexistCustID,PDO::PARAM_STR);//customerid
    if (!$checkCUSTID) die ('bindParam() customer ID failed!<br>');

    $checkSDATE=$stmt1->bindParam(4, $login_Date,PDO::PARAM_STR);//s_date
    if (!$checkSDATE) die ('bindParam() s_date failed!<br>');

    $checkRDATE=$stmt1->bindParam(5, $finalpick_date,PDO::PARAM_STR);//r_date
    if (!$checkRDATE) die ('bindParam() r_date failed!<br>');

    $checkAMOUNT=$stmt1->bindParam(6, $newExpenditures,PDO::PARAM_STR);//amount
    if (!$checkAMOUNT) die ('bindParam() amount failed!<br>');

    $checkPAYMODE=$stmt1->bindParam(7, $paymentMode,PDO::PARAM_STR);//paymode
    if (!$checkPAYMODE) die ('bindParam() payment mode failed!<br>');

    $checkNOTE=$stmt1->bindParam(8, $finalpick_notes,PDO::PARAM_STR);//note
    if (!$checkNOTE) die ('bindParam() note failed!<br>');

    $checkBRANCH=$stmt1->bindParam(9, $login_Branch,PDO::PARAM_STR);//branch
    if (!$checkBRANCH) die ('bindParam() branch failed!<br>');

    $checkSTATUS=$stmt1->bindParam(10, $nullvalues,PDO::PARAM_STR);//status
    if (!$checkSTATUS) die ('bindParam() status failed!<br>');

    // execute() returns false on failure when PDO is not in exception mode;
    // do not let a failed insert pass silently.
    if (!$stmt1->execute()) die ('execute() failed!<br>');

答案 1 :(得分:0)

# One row per user with that user's date (read as character here).
df1 <- read.table(text = "
user_id   date  
1         2016-12-30
2         2016-08-01
", header = TRUE, stringsAsFactors = FALSE)

# One row per dated amount, tagged with a user and a type.
df2 <- read.table(text = "
user_id    date         total     type 
1          2016-12-19   100       1
1          2016-11-02   200       2
1          2016-10-18   50        1
1          2016-07-15   100       3
1          2016-01-21   200       1
1          2016-01-18   152       2
2          2016-08-01   30        4
2          2016-01-29   133       2
", header = TRUE, stringsAsFactors = FALSE)

library(tidyverse)
library(lubridate)

# Convert to Date columns (not needed if they already are Dates)
df1$date <- ymd(df1$date)
df2$date <- ymd(df2$date)

# Combinations that have no rows inside the window come back from complete()
# with total = NA; pass fill = list(total = 0) to get zeros instead.
df1 %>%
  left_join(df2, by = "user_id") %>%         # date.x is from df1, date.y from df2
  filter(date.y >= date.x - months(3)) %>%   # keep df2 dates within last 3 months from date in df1
  group_by(user_id, type) %>%                # for each user and type
  summarise(total = sum(total)) %>%          # get totals
  ungroup() %>%                              # forget the grouping
  complete(user_id, type = unique(df2$type)) # add missing combinations

哪个返回:

# # A tibble: 8 x 3
#   user_id  type total
#     <int> <int> <int>
# 1       1     1   150
# 2       1     2   200
# 3       1     3    NA
# 4       1     4    NA
# 5       2     1    NA
# 6       2     2    NA
# 7       2     3    NA
# 8       2     4    30

答案 2 :(得分:0)

您需要先连接两个数据框,然后使用group_by / summarize组合。为了保留所有类型,按日期取子集必须在summarize调用中完成,而不是在filter调用中完成(下文会进一步说明)。

library(dplyr)
library(lubridate)

# Total per user and type, counting only rows dated within the three months
# before that user's due date. The window test is applied inside summarise(),
# so a group with no qualifying rows is kept with a total of 0.
my_data1 %>%
  rename(due_date = date) %>%
  right_join(my_data2, by = "user_id") %>%
  mutate(in_window = date >= due_date - months(3)) %>%
  group_by(user_id, type) %>%
  summarise(total = sum(total[in_window]))

# A tibble: 5 x 3
# Groups:   user_id [?]
#   user_id  type total
#     <int> <int> <int>
# 1       1     1   150
# 2       1     2   200
# 3       1     3     0
# 4       2     2     0
# 5       2     4    30

我在第一个数据框中重命名了date列,以明确表明它是用来减去3个月的日期。right_join会保留第二个数据帧中的全部行;如果某个user_id在第一个数据帧中没有对应的日期,其due_date将为NA,此时您可能需要为它另行指定一个due_date。如上所述,按日期范围取子集是在summarize调用中完成的;如果改为事先在filter调用中完成,结果会是这样:

# Same totals, but rows outside the three-month window are removed before
# aggregating, so user/type groups with no qualifying rows drop out entirely.
my_data1 %>%
  rename(due_date = date) %>%
  right_join(my_data2, by = "user_id") %>%
  filter(date >= due_date - months(3)) %>%
  group_by(user_id, type) %>%
  summarise(total = sum(total))

# A tibble: 3 x 3
# Groups:   user_id [?]
#   user_id  type total
#     <int> <int> <int>
# 1       1     1   150
# 2       1     2   200
# 3       2     4    30

数据

# One due date per user.
my_data1 <- data.frame(
  user_id = 1:2,
  date = as.Date(c("2016-12-30", "2016-08-01"))
)

# Dated amounts per user and type.
my_data2 <- data.frame(
  user_id = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L),
  date = as.Date(c(
    "2016-12-19", "2016-11-02", "2016-10-18", "2016-07-15",
    "2016-01-21", "2016-01-18", "2016-08-01", "2016-01-29"
  )),
  total = c(100L, 200L, 50L, 100L, 200L, 152L, 30L, 133L),
  type = c(1L, 2L, 1L, 3L, 1L, 2L, 4L, 2L)
)

答案 3 :(得分:0)

首先,我创建您的数据框。

# One date per user; colClasses parses `date` as POSIXct.
df1 <- read.table(
  header = TRUE,
  colClasses = c("integer", "POSIXct"),
  text = "user_id date
1 2016-12-30
2 2016-08-01"
)

# Dated amounts per user; `type` is read as a factor.
df2 <- read.table(
  header = TRUE,
  colClasses = c("integer", "POSIXct", "integer", "factor"),
  text = "user_id date total type
1 2016-12-19 100 1
1 2016-11-02 200 2
1 2016-10-18 50 1
1 2016-07-15 100 3
1 2016-01-21 200 1
1 2016-01-18 152 2
2 2016-08-01 30 4
2 2016-01-29 133 2"
)

接下来,我将两个数据框连接在一起,并给重名的date列加上后缀以避免混淆。然后,我只保留date_ref(第一个数据帧中date的新名称)之前最近三个月内的记录。我按user_id和type分组,计算总计,并使用complete为缺失的type补上total为0的行。

代码如下:

# Total per user and type over the three months before each user's reference
# date, with every user/type combination present (missing ones get total = 0).
df2 %>%
  left_join(df1, by = "user_id", suffix = c("", "_ref")) %>%
  filter(date >= date_ref %m-% months(3)) %>%
  group_by(user_id, type) %>%
  summarise(total = sum(total)) %>%
  # summarise() leaves the result grouped by user_id, and complete() cannot
  # expand a grouping column, so drop the grouping first.
  ungroup() %>%
  # `type` is a factor, so complete() expands it to all of its levels.
  complete(user_id, type, fill = list(total = 0))