Question

我有一个包含超过200行的交易记录数据集，如下所示：

  balance  clientId     transactionDate   type_    Approval Date 
0 2038533   44291    2016-06-09 22:10:47   credit  2016-06-16 18:21:44                 
1 2038533   44291    2016-06-09 22:11:47   debit   2016-06-16 18:21:44         
2   61698   44291    2016-06-10 10:16:00   debit   2016-06-16 18:21:44
3 1538533   44291    2016-06-10 10:20:37   debit   2016-06-16 18:21:44         
4  561698   44291    2016-06-10 10:55:00   credit  2016-06-16 18:21:44           
5  511698   44291    2016-06-10 11:09:00   debit   2016-06-16 18:21:44

我正在尝试获取每个客户在批准日期之前的最近5天内的交易数量

这是我尝试过的一些代码：

library(dplyr)

# NOTE(review): the parentheses on the next line are unbalanced (one "(" is
# never closed), so the call does not parse as written. It also computes a
# date (approval minus 5 days) rather than a count of transactions.
# days() is presumably lubridate::days(); only dplyr is loaded here -- confirm.
summarise(groupd, transactions = (`Approval Date`-days(5))
# NOTE(review): this compares `Approval Date` with itself minus 5 days, which
# cannot be TRUE for a non-missing date, so no rows can pass the filter.
filter(groupd, `Approval Date` == (`Approval Date`-days(5)))

这些尝试都没有成功，我怎么也解决不了这个问题。任何帮助都将不胜感激。

Answer 1

可能不是最优雅的方式但是我会这样做：我假设transactionDate和ApprovalDate列是字符格式。

# Keep the rows whose approval date is fewer than 6 calendar days after the
# transaction date; rows where the gap is missing are dropped
df_last5days <- df[
  with(df, which(as.Date(ApprovalDate) - as.Date(transactionDate) < 6)),
  ,
  drop = FALSE
]

# Count the remaining transactions per client
table(df_last5days$clientID)

或者，如果你喜欢1-liner：

table(df$clientID[with(df, which(as.Date(ApprovalDate) - as.Date(transactionDate) < 6))])

Answer 2

您的意思是在批准日期的5天内（ApprovalDate）总结交易次数（transactionDate）吗？如果是的话

data.table

# General form: DT[i, j, by] (plus optional extra arguments)
#   i  -> on which rows? here: approval minus transaction is under 5 days
#   j  -> what to do? .N counts the number of rows in each group
#   by -> grouped by what? here: one group per clientID
dt[ApprovalDate - transactionDate < ddays(5), .N, by = clientID]

为了进行测试，我在输入数据中添加了一些满足条件的记录。我使用了lubridate包来解析时间数据。如果5天的限制只按日期计算，使用as.Date()可能会更好。

# Sample input typed in as whitespace-separated text. There is no header row,
# so read.table() names the columns V1..V8; each date-time is split by the
# whitespace into a date column and a time column.
# NOTE: the first row's balance is "2038533," (trailing comma), so the whole
# balance column (V2) is read as character rather than integer.
df <- read.table(text = "
0 2038533,   44291    2016-06-09 22:10:47   credit  2016-06-16 18:21:44
1 2038533   44291    2016-06-09 22:11:47   debit   2016-06-16 18:21:44
2   61698   44291    2016-06-10 10:16:00   debit   2016-06-16 18:21:44
3 1538533   44291    2016-06-10 10:20:37   debit   2016-06-16 18:21:44
4  561698   44291    2016-06-10 10:55:00   credit  2016-06-16 18:21:44
5 511698   44291    2016-06-10 11:09:00   debit   2016-06-16 18:21:44
6  511698   44292    2016-06-10 11:09:00   debit   2016-06-16 18:21:44
7  511692   44291    2016-06-13 11:09:00   debit   2016-06-16 18:21:44
7  511692   44292    2016-06-13 11:09:00   debit   2016-06-16 18:21:44
", stringsAsFactors = FALSE) # spelled out: F is an ordinary, reassignable variable

library(data.table)
library(lubridate) #required for parse_date_time

# Use data.table to tidy the raw V1..V8 columns: glue each date/time pair back
# together, parse it as a date-time, and give every column a meaningful name
dt <- setDT(df)[, {
  txn_stamp <- parse_date_time(paste0(V4, V5), "YmdHMS")
  approval_stamp <- parse_date_time(paste0(V7, V8), "YmdHMS")
  list(
    ID = V1,
    balance = V2,
    clientID = V3,
    transactionDate = txn_stamp,
    type_ = V6,
    ApprovalDate = approval_stamp
  )
}]

# library(dplyr)
# glimpse(dt)
# Observations: 7
# Variables: 6
# $ ID              <int> 0, 1, 2, 3, 4, 5, 6
# $ balance         <chr> "2038533,", "2038533", "61698", "1538533", "561698", "511698", "511698"
# $ clientID        <int> 44291, 44291, 44291, 44291, 44291, 44291, 44292
# $ transactionDate <dttm> 2016-06-09 22:10:47, 2016-06-09 22:11:47, 2016-06-10 10:16:00, 2016-06-10 10:20:37, 2016-...
# $ type_           <chr> "credit", "debit", "debit", "debit", "credit", "debit", "debit"
# $ ApprovalDate    <dttm> 2016-06-16 18:21:44, 2016-06-16 18:21:44, 2016-06-16 18:21:44, 2016-06-16 18:21:44, 2016-...

# Per client, count the rows where approval minus transaction is under 5 days
dt[ApprovalDate - transactionDate < ddays(5), .N, by = clientID]

$ clientID <int> 44291, 44292
$ N        <int> 1, 1

Answer 3

假设您的transactionDate和Approval_Date已经是日期时间变量，您可以这样做：

library(dplyr)
# Count, per client, the transactions made fewer than 5 days before approval.
df %>%
  group_by(clientId) %>%
  # Pin the units to days: plain `-` on date-times lets R choose secs, mins,
  # hours or days from the data, so a bare `< 5` could silently mean "5 hours".
  filter(difftime(Approval_Date, transactionDate, units = "days") < 5) %>%
  summarize(num_of_transac = n())

否则，请执行：

library(dplyr)
library(lubridate)
# Parse the two character columns into date-times, then count, per client,
# the transactions made fewer than 5 days before approval.
df %>%
  # across() replaces the deprecated mutate_at() + funs() combination
  mutate(across(
    all_of(c("transactionDate", "Approval_Date")),
    function(x) parse_date_time(x, "Ymd.HMS")
  )) %>%
  group_by(clientId) %>%
  # Pin the units to days: plain `-` on date-times lets R choose secs, mins,
  # hours or days from the data, so a bare `< 5` could silently mean "5 hours".
  filter(difftime(Approval_Date, transactionDate, units = "days") < 5) %>%
  summarize(num_of_transac = n())

结果：

# A tibble: 2 x 2
  clientId num_of_transac
     <int>          <int>
1    44291              2
2    44292              3

我修改了OP的例子，以证明这确实有效。

transactionDate和Approval_Date已经是日期时间类型的数据：

# Example data in which transactionDate and Approval_Date are already POSIXct
# date-times (UTC), written as seconds since 1970-01-01.
df <- data.frame(
  balance = c(
    2038533L, 2038533L, 61698L, 1538533L, 561698L,
    511698L, 511698L, 5116123L, 511123L, 511244L
  ),
  clientId = rep(c(44291L, 44292L), times = c(6L, 4L)),
  transactionDate = as.POSIXct(
    c(
      1465510247, 1465510307, 1465553760, 1465554037, 1465728900,
      1465729740, 1465556940, 1466075340, 1466075340, 1466161740
    ),
    origin = "1970-01-01", tz = "UTC"
  ),
  type_ = c("credit", rep("debit", 3L), "credit", rep("debit", 5L)),
  Approval_Date = as.POSIXct(
    rep(c(1466101304, 1466446904), times = c(6L, 4L)),
    origin = "1970-01-01", tz = "UTC"
  ),
  stringsAsFactors = FALSE
)

transactionDate和Approval_Date不是日期时间类型的数据：

# Example data in which transactionDate and Approval_Date are still plain
# character strings ("YYYY-MM-DD.HH:MM:SS"); row names run from 0 to 9.
df <- structure(
  list(
    balance = c(
      2038533L, 2038533L, 61698L, 1538533L, 561698L,
      511698L, 511698L, 5116123L, 511123L, 511244L
    ),
    clientId = rep(c(44291L, 44292L), times = c(6L, 4L)),
    transactionDate = c(
      "2016-06-09.22:10:47", "2016-06-09.22:11:47",
      "2016-06-10.10:16:00", "2016-06-10.10:20:37",
      "2016-06-12.10:55:00", "2016-06-12.11:09:00",
      "2016-06-10.11:09:00", "2016-06-16.11:09:00",
      "2016-06-16.11:09:00", "2016-06-17.11:09:00"
    ),
    type_ = c("credit", rep("debit", 3L), "credit", rep("debit", 5L)),
    Approval_Date = rep(
      c("2016-06-16.18:21:44", "2016-06-20.18:21:44"),
      times = c(6L, 4L)
    )
  ),
  row.names = 0:9,
  class = "data.frame"
)

使用R中的日期

3 个答案: