我有一个包含超过200行的交易记录数据集,如下所示:
balance clientId transactionDate type_ Approval Date
0 2038533 44291 2016-06-09 22:10:47 credit 2016-06-16 18:21:44
1 2038533 44291 2016-06-09 22:11:47 debit 2016-06-16 18:21:44
2 61698 44291 2016-06-10 10:16:00 debit 2016-06-16 18:21:44
3 1538533 44291 2016-06-10 10:20:37 debit 2016-06-16 18:21:44
4 561698 44291 2016-06-10 10:55:00 credit 2016-06-16 18:21:44
5 511698 44291 2016-06-10 11:09:00 debit 2016-06-16 18:21:44
我正在尝试获取每个客户在批准日期之前的最近5天内的交易数量
这是我尝试过的一些代码:
library(dplyr)
summarise(groupd, transactions = (`Approval Date`-days(5))
filter(groupd, `Approval Date` == (`Approval Date`-days(5)))
一切都无济于事。我似乎无法绕过它。任何帮助将不胜感激
答案 0 :(得分:0)
这可能不是最优雅的方式,但我会这样做(这里假设 transactionDate 和 ApprovalDate 两列都是字符格式):
# Keep the rows whose transaction date is fewer than 6 calendar days before
# the approval date (both columns are converted to Date first)
df_last5days <- subset(
  df,
  difftime(as.Date(ApprovalDate), as.Date(transactionDate), units = "days") < 6
)
# Number of remaining transactions per client
table(df_last5days$clientID)
或者,如果你喜欢1-liner:
table(subset(df, difftime(as.Date(ApprovalDate), as.Date(transactionDate), units = "days") < 6)$clientID)
答案 1 :(得分:0)
您的意思是统计交易时间(transactionDate)落在批准日期(ApprovalDate)之前 5 天以内的交易次数吗?如果是的话,可以使用
data.table
dt[ApprovalDate - transactionDate < ddays(5), .N, by = .(clientID)]
# General form: DT[i, j, by]  (plus optional extra arguments)
#   i  -> which rows to keep?
#   j  -> what to compute? here .N counts the rows in each group
#   by -> which column(s) to group by?
为了用输入数据做试验,我额外添加了几条满足条件的记录。我使用 lubridate 包来解析时间数据。如果 5 天的限制只按日期(不考虑具体时刻)计算,使用 as.Date() 可能更合适。
# Read the whitespace-separated demo records into columns V1..V8.
# Each timestamp is split into a date column and a time column (V4/V5, V7/V8).
# NOTE: the trailing comma after the first balance makes V2 parse as character.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, which would silently turn the text columns into factors.
df <- read.table(text="
0 2038533, 44291 2016-06-09 22:10:47 credit 2016-06-16 18:21:44
1 2038533 44291 2016-06-09 22:11:47 debit 2016-06-16 18:21:44
2 61698 44291 2016-06-10 10:16:00 debit 2016-06-16 18:21:44
3 1538533 44291 2016-06-10 10:20:37 debit 2016-06-16 18:21:44
4 561698 44291 2016-06-10 10:55:00 credit 2016-06-16 18:21:44
5 511698 44291 2016-06-10 11:09:00 debit 2016-06-16 18:21:44
6 511698 44292 2016-06-10 11:09:00 debit 2016-06-16 18:21:44
7 511692 44291 2016-06-13 11:09:00 debit 2016-06-16 18:21:44
7 511692 44292 2016-06-13 11:09:00 debit 2016-06-16 18:21:44
", stringsAsFactors = FALSE)
library(data.table)
library(lubridate) #required for parse_date_time
# Tidy the raw V1..V8 columns with data.table: glue each date/time pair back
# together, parse it as a date-time, and give every column a meaningful name
dt <- setDT(df)[, {
  txn_time <- parse_date_time(paste0(V4, V5), "YmdHMS")
  approval_time <- parse_date_time(paste0(V7, V8), "YmdHMS")
  list(
    ID = V1,
    balance = V2,
    clientID = V3,
    transactionDate = txn_time,
    type_ = V6,
    ApprovalDate = approval_time
  )
}]
# library(dplyr)
# glimpse(dt)
# Observations: 7
# Variables: 6
# $ ID <int> 0, 1, 2, 3, 4, 5, 6
# $ balance <chr> "2038533,", "2038533", "61698", "1538533", "561698", "511698", "511698"
# $ clientID <int> 44291, 44291, 44291, 44291, 44291, 44291, 44292
# $ transactionDate <dttm> 2016-06-09 22:10:47, 2016-06-09 22:11:47, 2016-06-10 10:16:00, 2016-06-10 10:20:37, 2016-...
# $ type_ <chr> "credit", "debit", "debit", "debit", "credit", "debit", "debit"
# $ ApprovalDate <dttm> 2016-06-16 18:21:44, 2016-06-16 18:21:44, 2016-06-16 18:21:44, 2016-06-16 18:21:44, 2016-...
# Per client, count the transactions made less than 5 days before approval
dt[ApprovalDate - transactionDate < ddays(5), .N, by = .(clientID)]
$ clientID <int> 44291, 44292
$ N <int> 1, 1
答案 2 :(得分:0)
假设您的 `transactionDate` 和 `Approval_Date` 已经是日期时间变量,您可以这样做:
library(dplyr)
# Count, per client, the transactions made less than 5 days before approval.
# The units are fixed to days on purpose: `Approval_Date - transactionDate`
# returns a difftime whose units (secs/mins/hours/days) are chosen
# automatically from the data, so a bare `< 5` could silently mean
# "less than 5 hours" when some transaction is close to its approval time.
df %>%
  group_by(clientId) %>%
  filter(difftime(Approval_Date, transactionDate, units = "days") < 5) %>%
  summarize(num_of_transac = n())
否则,请执行:
library(dplyr)
library(lubridate)
# Parse both character timestamp columns, then count, per client, the
# transactions made less than 5 days before approval.
# - across() replaces mutate_at() + funs(); funs() is deprecated in dplyr.
# - The units are fixed to days on purpose: subtracting two date-times returns
#   a difftime whose units are chosen automatically from the data, so a bare
#   `< 5` could silently mean "less than 5 hours".
df %>%
  mutate(across(
    c(transactionDate, Approval_Date),
    function(x) parse_date_time(x, "Ymd.HMS")
  )) %>%
  group_by(clientId) %>%
  filter(difftime(Approval_Date, transactionDate, units = "days") < 5) %>%
  summarize(num_of_transac = n())
<strong>结果:</strong>
# A tibble: 2 x 2
clientId num_of_transac
<int> <int>
1 44291 2
2 44292 3
我修改了OP的例子,以证明这确实有效。
`transactionDate` 和 `Approval_Date` 已经是日期时间类型时的数据:
# Demo data whose two timestamp columns are already POSIXct (UTC).
# Client 44291 owns the first six rows and client 44292 the last four.
df <- data.frame(
  balance = c(
    2038533L, 2038533L, 61698L, 1538533L, 561698L,
    511698L, 511698L, 5116123L, 511123L, 511244L
  ),
  clientId = rep(c(44291L, 44292L), times = c(6L, 4L)),
  transactionDate = as.POSIXct(
    c(
      1465510247, 1465510307, 1465553760, 1465554037, 1465728900,
      1465729740, 1465556940, 1466075340, 1466075340, 1466161740
    ),
    origin = "1970-01-01", tz = "UTC"
  ),
  type_ = c(
    "credit", "debit", "debit", "debit", "credit",
    "debit", "debit", "debit", "debit", "debit"
  ),
  Approval_Date = as.POSIXct(
    rep(c(1466101304, 1466446904), times = c(6L, 4L)),
    origin = "1970-01-01", tz = "UTC"
  ),
  stringsAsFactors = FALSE
)
`transactionDate` 和 `Approval_Date` 还不是日期时间类型(仍为字符串)时的数据:
# Demo data whose two timestamp columns are still character strings in the
# form "YYYY-MM-DD.HH:MM:SS"; row names run from 0 to 9.
# Client 44291 owns the first six rows and client 44292 the last four.
df <- data.frame(
  balance = c(
    2038533L, 2038533L, 61698L, 1538533L, 561698L,
    511698L, 511698L, 5116123L, 511123L, 511244L
  ),
  clientId = rep(c(44291L, 44292L), times = c(6L, 4L)),
  transactionDate = c(
    "2016-06-09.22:10:47", "2016-06-09.22:11:47",
    "2016-06-10.10:16:00", "2016-06-10.10:20:37",
    "2016-06-12.10:55:00", "2016-06-12.11:09:00",
    "2016-06-10.11:09:00", "2016-06-16.11:09:00",
    "2016-06-16.11:09:00", "2016-06-17.11:09:00"
  ),
  type_ = c(
    "credit", "debit", "debit", "debit", "credit",
    "debit", "debit", "debit", "debit", "debit"
  ),
  Approval_Date = rep(
    c("2016-06-16.18:21:44", "2016-06-20.18:21:44"),
    times = c(6L, 4L)
  ),
  row.names = 0:9,
  stringsAsFactors = FALSE
)