我有两个数据帧。第一个数据帧看起来像这样。
user_id date
1 2016-12-30
2 2016-08-01
第二个数据帧看起来像这样
user_id date total type
1 2016-12-19 100 1
1 2016-11-02 200 2
1 2016-10-18 50 1
1 2016-07-15 100 3
1 2016-01-21 200 1
1 2016-01-18 152 2
2 2016-08-01 30 4
2 2016-01-29 133 2
我想在两个数据框之间匹配 user_id 和日期,选出最近 3 个月的记录,然后汇总 total。
我已经尝试将其分组,但是我仍然无法将日期与数据框进行比较。
library(dplyr)
library(lubridate)

# Attempt 1: total per user and type, with no date restriction.
# The grouping column is `type`, matching the data frame shown above and the
# `select()` call in the second attempt.
df %>%
  group_by(user_id, type) %>%
  summarise(total = sum(total))

# Attempt 2: keep only rows from the last three months. The cut-off is
# measured from today(), not from each user's date in the first data frame.
df %>%
  select(user_id, date, total, type) %>%
  filter(date >= today() - months(3))
我正在寻找的结果是:
user_id total type
1 150 1
1 200 2
1 0 3
1 0 4
2 0 1
2 0 2
2 0 3
2 30 4
答案 0 :(得分:0)
这可以满足您的要求,不过它不会补上数据中不存在的 user_id / type 组合。大部分操作就是一个简单的 left_join,外加几个 mutate 步骤,把 date 的值转换为真正的日期对象。
关键在于最后的 summarize 步骤。我们对每个组的 total 值求和,但在求和之前,先将其乘以一个 logical 值,即“date 是否在 key_date 之前的三个月内”这一测试的结果。如果结果是 TRUE(=1),该值会计入总和;如果是 FALSE(=0),该值不会计入总和。
我们也可以使用 filter 删除未通过此测试的行,但这种写法即使 total == 0 也会保留数据中出现过的所有 user_id / type 组合。
# Attach each user's reference date to every df2 row of that user, then sum
# only the totals that fall inside the three months before that date.
df1 %>%
  mutate(date = ymd(date)) %>%
  rename(key_date = date) %>%
  left_join(df2, by = "user_id") %>%
  mutate(date = ymd(date)) %>%
  group_by(user_id, type) %>%
  # Multiplying by the logical test zeroes out rows outside the window, so
  # groups with nothing in range still appear with total 0.
  # `%m-%` rolls back to the last valid day of the month; plain
  # `key_date - months(3)` yields NA when the resulting day does not exist
  # (e.g. 31 May minus three months).
  summarize(total = sum(total * (date > key_date %m-% months(3))))
# A tibble: 5 x 3
# Groups: user_id [?]
user_id type total
<int> <int> <int>
1 1 1 150
2 1 2 200
3 1 3 0
4 2 2 0
5 2 4 30
组合:
// Parameterised INSERT for one pickup record; all ten values are bound below.
$sqlPickInsert = "INSERT INTO record_pickup(pickup_id, order_code, customer_id,
s_date, r_date, payment_amount, payment_mode,
pickup_comments, branch, order_status)
VALUES (?,?,?,?,?,?,?,?,?,?)";

// Debug output of the statement and of every value about to be bound.
echo "$sqlPickInsert <br>";
echo "pickupID $nullvalues" ."<br>";
echo "ordercode $newPickupCode" ."<br>";
echo "customer ID $finalexistCustID" ."<br>";
echo "loginDate $login_Date" ."<br>";
echo "pickup $finalpick_date" ."<br>";
echo "amount $newExpenditures" ."<br>";
echo "paymode $paymentMode" ."<br>";
echo "notes$finalpick_notes" ."<br>";
echo "login branch$login_Branch" ."<br>";
echo "status $nullvalues" ."<br>";

$stmt1 = $conn->prepare($sqlPickInsert);
// The guard must test $stmt1, the handle assigned above. The original tested
// $stmt, which this snippet never assigns, so the check always aborted.
if (!$stmt1) die ('prepare() failed!');

// Each die() message names the parameter that failed to bind, so a failure
// can be located without counting placeholders.
$checkID = $stmt1->bindParam(1, $nullvalues, PDO::PARAM_INT); // pickup_id
if (!$checkID) die ('bindParam() pickup_id failed!<br>');
$checkCODE = $stmt1->bindParam(2, $newPickupCode, PDO::PARAM_STR); // order_code
if (!$checkCODE) die ('bindParam() order_code failed!<br>');
$checkCUSTID = $stmt1->bindParam(3, $finalexistCustID, PDO::PARAM_STR); // customer_id
if (!$checkCUSTID) die ('bindParam() customer_id failed!<br>');
$checkSDATE = $stmt1->bindParam(4, $login_Date, PDO::PARAM_STR); // s_date
if (!$checkSDATE) die ('bindParam() s_date failed!<br>');
$checkRDATE = $stmt1->bindParam(5, $finalpick_date, PDO::PARAM_STR); // r_date
if (!$checkRDATE) die ('bindParam() r_date failed!<br>');
$checkAMOUNT = $stmt1->bindParam(6, $newExpenditures, PDO::PARAM_STR); // payment_amount
if (!$checkAMOUNT) die ('bindParam() payment_amount failed!<br>');
$checkPAYMODE = $stmt1->bindParam(7, $paymentMode, PDO::PARAM_STR); // payment_mode
if (!$checkPAYMODE) die ('bindParam() payment_mode failed!<br>');
$checkNOTE = $stmt1->bindParam(8, $finalpick_notes, PDO::PARAM_STR); // pickup_comments
if (!$checkNOTE) die ('bindParam() pickup_comments failed!<br>');
$checkBRANCH = $stmt1->bindParam(9, $login_Branch, PDO::PARAM_STR); // branch
if (!$checkBRANCH) die ('bindParam() branch failed!<br>');
$checkSTATUS = $stmt1->bindParam(10, $nullvalues, PDO::PARAM_STR); // order_status
if (!$checkSTATUS) die ('bindParam() order_status failed!<br>');

// execute() returns false on failure when PDO is not in exception mode, so
// its result is checked instead of being discarded.
if (!$stmt1->execute()) die ('execute() failed!<br>');
答案 1 :(得分:0)
# Reference date per user (one row per user_id). Dates are read as character
# strings here.
df1 <- read.table(text = "
user_id date
1 2016-12-30
2 2016-08-01
", header = TRUE, stringsAsFactors = FALSE)

# Transactions: one row per (user_id, date) with an amount and a type code.
df2 <- read.table(text = "
user_id date total type
1 2016-12-19 100 1
1 2016-11-02 200 2
1 2016-10-18 50 1
1 2016-07-15 100 3
1 2016-01-21 200 1
1 2016-01-18 152 2
2 2016-08-01 30 4
2 2016-01-29 133 2
", header = TRUE, stringsAsFactors = FALSE)
library(tidyverse)
library(lubridate)

# Convert the date columns to Date (skip if they are already dates).
df1$date <- ymd(df1$date)
df2$date <- ymd(df2$date)

df1 %>%
  # After the join, date.x is the reference date from df1 and date.y the
  # transaction date from df2.
  left_join(df2, by = "user_id") %>%
  # Keep transactions within the three months before the reference date.
  # `%m-%` rolls back to the last valid day of the month, whereas plain
  # `date.x - months(3)` yields NA when the resulting day does not exist.
  filter(date.y >= date.x %m-% months(3)) %>%
  group_by(user_id, type) %>%
  summarise(total = sum(total)) %>%
  ungroup() %>%
  # Add the user/type combinations with no rows in range; their total is NA.
  # Pass `fill = list(total = 0L)` to get zeros instead.
  complete(user_id, type = unique(df2$type))
哪个返回:
# # A tibble: 8 x 3
# user_id type total
# <int> <int> <int>
# 1 1 1 150
# 2 1 2 200
# 3 1 3 NA
# 4 1 4 NA
# 5 2 1 NA
# 6 2 2 NA
# 7 2 3 NA
# 8 2 4 30
答案 2 :(得分:0)
您需要先连接(join)两个数据框,然后组合使用 group_by 和 summarize。为了保留所有类型,按日期取子集必须在 summarize 调用内部完成,而不是放在 filter 调用中(下文会进一步说明我的意思)。
library(dplyr)
library(lubridate)

my_data1 %>%
  # due_date is the reference date the three-month window is counted back from.
  rename(due_date = date) %>%
  right_join(my_data2, by = "user_id") %>%
  group_by(user_id, type) %>%
  # Subsetting inside sum() keeps every user_id/type group; a group with no
  # rows in range sums an empty vector and gets 0.
  # `%m-%` rolls back to the last valid day of the month. With plain
  # `due_date - months(3)` a non-existent day gives NA, which would turn the
  # whole group total into NA.
  summarise(total = sum(total[date >= due_date %m-% months(3)]))
# A tibble: 5 x 3
# Groups: user_id [?]
# user_id type total
# <int> <int> <int>
# 1 1 1 150
# 2 1 2 200
# 3 1 3 0
# 4 2 2 0
# 5 2 4 30
我将第一个数据框中的 date 列重命名为 due_date,以明确表明它是用来向前推 3 个月的基准日期。right_join 会保留第二个数据框中的所有行,并按 user_id 为其附上对应的 due_date;否则,您可能需要在第一个数据框中为没有日期的 user_id 补充 due_date。
如上所述,按日期范围取子集是在 summarize 调用中完成的;如果改为在此之前的 filter 调用中完成,结果如下:
# Counter-example: filtering before summarising drops every user_id/type group
# that has no row inside the window, so those groups vanish from the result.
my_data1 %>%
  rename(due_date = date) %>%
  right_join(my_data2, by = "user_id") %>%
  group_by(user_id, type) %>%
  # `%m-%` rolls back to the last valid day of the month; plain
  # `due_date - months(3)` yields NA when the resulting day does not exist.
  filter(date >= due_date %m-% months(3)) %>%
  summarise(total = sum(total))
# A tibble: 3 x 3
# Groups: user_id [?]
# user_id type total
# <int> <int> <int>
# 1 1 1 150
# 2 1 2 200
# 3 2 4 30
数据
# Reference date per user.
my_data1 <- data.frame(
  user_id = 1:2,
  date = as.Date(c("2016-12-30", "2016-08-01"))
)

# Transactions: amount (`total`) and `type` code per user and date.
my_data2 <- data.frame(
  user_id = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L),
  date = as.Date(c(
    "2016-12-19", "2016-11-02", "2016-10-18", "2016-07-15",
    "2016-01-21", "2016-01-18", "2016-08-01", "2016-01-29"
  )),
  total = c(100L, 200L, 50L, 100L, 200L, 152L, 30L, 133L),
  type = c(1L, 2L, 1L, 3L, 1L, 2L, 4L, 2L)
)
答案 3 :(得分:0)
首先,我创建您的数据框。
# Reference date per user. colClasses makes read.table parse user_id as
# integer and date as POSIXct.
df1 <- read.table(text="user_id date
1 2016-12-30
2 2016-08-01",
header = TRUE,
colClasses = c("integer", "POSIXct"))
# Transactions. colClasses reads date as POSIXct, total as integer, and type
# as a factor.
df2 <- read.table(text = "user_id date total type
1 2016-12-19 100 1
1 2016-11-02 200 2
1 2016-10-18 50 1
1 2016-07-15 100 3
1 2016-01-21 200 1
1 2016-01-18 152 2
2 2016-08-01 30 4
2 2016-01-29 133 2",
header = TRUE,
colClasses = c("integer", "POSIXct", "integer", "factor"))
接下来,我将两个数据框连接在一起,并重命名 date 以避免混淆。然后,我筛选出 date_ref(第一个数据框中 date 的新名称)之前的最后三个月。我按 user_id 和 type 分组,计算总计,并使用 complete 以 total 为 0 填充缺失的 type 值。
代码如下:
df2 %>%
  # The suffix leaves df2's `date` as is and names df1's date `date_ref`.
  left_join(df1, by = "user_id", suffix = c("", "_ref")) %>%
  # Keep transactions within the three months before the reference date.
  filter(date >= date_ref %m-% months(3)) %>%
  group_by(user_id, type) %>%
  summarise(total = sum(total)) %>%
  # summarise() leaves the result grouped by user_id, and current tidyr
  # refuses to complete() on a grouping column, so drop the grouping first.
  ungroup() %>%
  # `type` is a factor, so complete() expands over all of its levels, including
  # those with no rows left after the filter; missing totals become 0.
  complete(user_id, type, fill = list(total = 0L))