我正在尝试一种新颖的做法:用RFM模型对手机应用的成瘾程度进行细分。先简要解释一下RFM,因为它与我编写的代码有关:这是一种营销模型,R(Recency,新近度)指距客户上一次在网站上购买商品已过去多少天;F(Frequency,频率)指客户在网站上进行了多少次交易;M(Monetary,货币)指客户每次交易在该网站上的平均花费(总金额/频率)。通过把这些得分划分为不同的组,可以知道哪些客户群对您的品牌更忠诚;而我想尝试用它来衡量不同类型应用程序的成瘾性。
我将这些值替换为:
R:用户当天最后一次使用此类应用的时刻距当天午夜(次日00:00)的秒数;
F:用户一天内使用此类应用的次数;
M:此类应用使用的平均持续时间;
您可以在此处找到dput示例数据,很抱歉,该数据只有两个用户的信息,我仍然不知道如何创建随机dput数据集:
structure(list(application = c("com.android.calculator2", "com.whatsapp",
"com.whatsapp", "com.android.mediacenter", "com.whatsapp", "com.whatsapp",
"com.android.mediacenter", "com.whatsapp", "com.facebook.orca",
"com.whatsapp", "com.android.chrome", "com.google.android.youtube",
"com.tinder", "com.android.vending", "com.android.mms", "com.google.android.youtube",
"com.whatsapp", "com.google.android.youtube", "com.facebook.orca",
"com.huawei.android.internal.app", "com.android.chrome", "com.android.calculator2",
"com.android.server.telecom", "com.android.incallui", "com.whatsapp",
"com.android.mediacenter", "com.android.mediacenter", "com.android.settings",
"com.google.android.youtube", "com.whatsapp", "com.facebook.orca",
"com.android.mediacenter", "com.whatsapp", "com.whatsapp", "com.ninegag.android.app",
"com.whatsapp", "com.huawei.android.internal.app", "com.whatsapp",
"com.facebook.orca", "com.android.server.telecom", "com.android.contacts",
"com.whatsapp", "com.whatsapp", "com.facebook.orca", "com.whatsapp",
"com.audible.application", "com.facebook.orca", "com.android.vending",
"com.android.mediacenter", "com.audible.application", "com.spotlightsix.zentimerlite2"
), battery = c(99L, 91L, 91L, 91L, 59L, 59L, 86L, 82L, 82L, 78L,
78L, 78L, 59L, 23L, 24L, 24L, 21L, 20L, 27L, 27L, 27L, 66L, 66L,
66L, 51L, 78L, 79L, 79L, 61L, 15L, 83L, 64L, 64L, 64L, 77L, 77L,
76L, 74L, 74L, 68L, 67L, 26L, 26L, 26L, 14L, 42L, 21L, 7L, 49L,
47L, 7L), endTime = structure(c(1552937669.979, 1552939304.982,
1552940267.085, 1552940491.247, 1552927214.751, 1552927358.731,
1552943502.52, 1552947058.616, 1552947085.757, 1552947640.862,
1552948140.615, 1552950642.956, 1552950670.904, 1552698488.211,
1552699286.179, 1552699661.943, 1552694622.527, 1552695838.488,
1552669634.35, 1552669720.844, 1552669759.436, 1552658315.76,
1552658392.324, 1552658435.825, 1552826238.709, 1552829407.296,
1552830394.329, 1552830666.554, 1552834920.948, 1552843002.461,
1552850435.957, 1552924112.501, 1552924305.967, 1552924485.245,
1552746587.447, 1552746621.156, 1552746808.486, 1552747504.807,
1552747525.748, 1552749348.81, 1552749531.786, 1552774429.995,
1552774593.78, 1552774601.257, 1552765986.942, 1552866265.965,
1552869582.984, 1552871863.451, 1552863539.106, 1552864201.43,
1552872500.501), class = c("POSIXct", "POSIXt"), tzone = ""),
session = c(1552929316L, 1552937670L, 1552937670L, 1552940489L,
1552926942L, 1552926942L, 1552942385L, 1552947023L, 1552947023L,
1552947023L, 1552947023L, 1552947023L, 1552947023L, 1552698280L,
1552698280L, 1552698280L, 1552694528L, 1552695704L, 1552669479L,
1552669479L, 1552669479L, 1552658249L, 1552658249L, 1552658249L,
1552825368L, 1552829142L, 1552830354L, 1552830378L, 1552830378L,
1552842287L, 1552849970L, 1552923851L, 1552924111L, 1552924284L,
1552745790L, 1552746579L, 1552746579L, 1552747501L, 1552747501L,
1552748903L, 1552748903L, 1552774264L, 1552774264L, 1552774264L,
1552765953L, 1552865369L, 1552869549L, 1552869549L, 1552862301L,
1552862301L, 1552869549L), startTime = structure(c(1552937669.974,
1552939288.014, 1552940265.404, 1552940489.402, 1552927083.565,
1552927349.671, 1552943488.401, 1552947031.581, 1552947061.03,
1552947572.997, 1552948109.636, 1552948146.197, 1552950662.47,
1552698481.19, 1552699269.439, 1552699288.018, 1552694548.992,
1552695764.75, 1552669520.073, 1552669719.309, 1552669722.031,
1552658293.438, 1552658391.914, 1552658392.34, 1552826236.588,
1552829400.281, 1552830375.788, 1552830660.017, 1552834299.004,
1552842297.013, 1552850071.788, 1552924108.617, 1552924282.513,
1552924479.884, 1552746579.19, 1552746590.718, 1552746807.361,
1552747501.668, 1552747507.62, 1552749347.688, 1552749522.781,
1552774269.867, 1552774430.015, 1552774600.383, 1552765963.791,
1552866265.186, 1552869577.804, 1552871854.773, 1552863054.623,
1552864194.888, 1552872479.38), class = c("POSIXct", "POSIXt"
), tzone = ""), user_id = c(10161L, 10161L, 10161L, 10161L,
10161L, 10161L, 10161L, 10161L, 10161L, 10161L, 10161L, 10161L,
10161L, 10161L, 10161L, 10161L, 10161L, 10161L, 10161L, 10161L,
10161L, 10161L, 10161L, 10161L, 10161L, 10161L, 10161L, 10161L,
10161L, 10161L, 10161L, 10161L, 10161L, 10162L, 10162L, 10162L,
10162L, 10162L, 10162L, 10162L, 10162L, 10162L, 10162L, 10162L,
10162L, 10162L, 10162L, 10162L, 10162L, 10162L, 10162L),
categories = structure(c(6L, 1L, 1L, 3L, 1L, 1L, 3L, 1L,
1L, 1L, 6L, 2L, 3L, 6L, 1L, 2L, 1L, 2L, 1L, 6L, 6L, 6L, 6L,
6L, 1L, 3L, 3L, 6L, 2L, 1L, 1L, 3L, 1L, 1L, 5L, 1L, 6L, 1L,
1L, 6L, 6L, 1L, 1L, 1L, 1L, 6L, 1L, 6L, 3L, 6L, 3L), .Label = c("communication",
"games & entertainment", "lifestyle", "news & information outlet",
"social network", "utility & tools"), class = "factor"),
date = structure(c(17973, 17973, 17973, 17973, 17973, 17973,
17973, 17973, 17973, 17973, 17973, 17973, 17974, 17971, 17971,
17971, 17971, 17971, 17970, 17970, 17970, 17970, 17970, 17970,
17972, 17972, 17972, 17972, 17972, 17972, 17972, 17973, 17973,
17973, 17971, 17971, 17971, 17971, 17971, 17971, 17971, 17971,
17971, 17971, 17971, 17973, 17973, 17973, 17972, 17973, 17973
), class = "Date"), duration = structure(c(0, 17, 1.7, 1.8,
131.2, 9.1, 14.1, 27, 24.7, 67.9, 31, 2496.8, 8.4, 7, 16.7,
373.9, 73.5, 73.7, 114.3, 1.5, 37.4, 22.3, 0.4, 43.5, 2.1,
7, 18.5, 6.5, 621.9, 705.4, 364.2, 3.9, 23.5, 5.4, 8.3, 30.4,
1.1, 3.1, 18.1, 1.1, 9, 160.1, 163.8, 0.9, 23.2, 0.8, 5.2,
8.7, 484.5, 6.5, 21.1), class = "difftime", units = "secs")), row.names = 162574:162624, class = "data.frame")
由于我对处理时间类型数据不熟悉,因此在Recency部分遇到了麻烦。到目前为止,我只能弄清楚这一点,它不计算每日RFM,而仅计算整个数据集的最后一天。
# Asker's first attempt: one summary per user over the whole data set
# (no per-day or per-category grouping).
df_RFM <- df_data %>%
  group_by(user_id) %>%
  summarise(
    # NOTE(review): subtracts a POSIXct (max(endTime)) from a Date, which
    # mixes day-based and second-based values -- probably not the intended
    # recency.
    recency = as.numeric(as.Date(endTime) - max(endTime)),
    # Number of distinct app categories the user touched.
    frequency = n_distinct(categories),
    # Total duration divided by the number of distinct categories.
    monetary = sum(duration) / n_distinct(categories)
  )
与频率和货币相同,它是基于整个数据集计算的,但我需要每天对其进行计算。简而言之,一个数据集包含每个用户的每日应用RFM(按每个类别划分),看起来像这样(只是示例,不是实际值):
user_id date recency frequency monetary categories
10161 2019-03-15 21040 sec 5 109.7 utility & tools
10161 2019-03-15 77538 sec 1 181.6 Communication
10161 2019-03-16 12345 sec 4 123.5 games&entertainment
10161 2019-03-16 77538 sec 1 181.6 communication
10162 2019-03-15 21040 sec 2 109.7 utility & tools
10162 2019-03-15 77538 sec 3 181.6 Communication
10162 2019-03-17 12345 sec 12 123.5 games&entertainment
10162 2019-03-17 77538 sec 2 181.6 utility & tools
从上表的第一行可以读出这样的信息:用户10161在2019-03-15使用了5次"utility & tools"(实用程序和工具)类应用,平均每次109.7秒;他当天最后一次使用此类应用是在午夜之前21040秒。
欢迎任何建议,谢谢!
答案 0 :(得分:1)
由于您希望按 user_id、date(以及 categories)的组合来计算,因此应该把这些变量一起传给 group_by。另外,建议使用 difftime 来计算时间差,这样可以明确控制返回结果所用的单位。
# Daily RFM per user and app category.
#   recency   - seconds from the last session end in the group to the
#               following midnight
#   frequency - number of sessions (rows) in the group
#   monetary  - mean session duration in the group
df_RFM <- df_data %>%
  group_by(user_id, date, categories) %>%
  summarise(
    recency = difftime(
      # change_on_boundary = TRUE: a session that ends exactly on a day
      # boundary is still rounded up to the next midnight instead of
      # yielding a recency of 0.
      # NOTE(review): the day boundary follows endTime's time zone; confirm
      # it matches how `date` was derived.
      lubridate::ceiling_date(
        max(endTime),
        unit = "day",
        change_on_boundary = TRUE
      ),
      max(endTime),
      units = "secs"
    ),
    frequency = n(),
    monetary = mean(duration)
  ) %>%
  # summarise() only drops the last grouping level; remove the rest so the
  # result does not stay silently grouped by user_id and date.
  ungroup()
答案 1 :(得分:1)
我已经阅读了您的请求,我想我会这样处理:
df_data %>%
  # re-parse endTime with lubridate into a date-time column
  mutate(date_end = ymd_hms(endTime)) %>%
  # group by user, app category and date, as the question asks
  group_by(user_id, categories, date) %>%
  # here RFM
  summarise(
    # max_d and midnight are only kept to make the steps visible:
    # midnight is 00:00:00 of the day after the last session (+ days(1)).
    # Comment out / delete these two columns for the final result.
    max_d = max(date_end),
    midnight = ymd_hms(
      paste0(format.Date(max(date_end), "%Y-%m-%d"), " 00:00:00")
    ) + days(1),
    # recency is the time from the last date_end to the following midnight.
    # NOTE(review): the unit of this difference is picked automatically;
    # use difftime(..., units = "secs") if a fixed unit is required.
    recency = (
      ymd_hms(
        paste0(format.Date(max(date_end), "%Y-%m-%d"), " 00:00:00")
      ) + days(1)
    ) - max(date_end),
    # frequency is the distinct count of categories.
    # NOTE(review): categories is a grouping variable, so this is always 1
    # per group; n() would count the sessions instead.
    frequency = n_distinct(categories),
    # monetary is the average duration within the group
    monetary = mean(duration)
  )
# A tibble: 21 x 8
# Groups: user_id, categories [8]
user_id categories date max_d midnight recency frequency monetary
<int> <fct> <date> <dttm> <dttm> <time> <int> <time>
1 10161 communication 2019-03-15 2019-03-15 18:07:14 2019-03-16 00:00:00 5.879444 hours 1 114.3000 secs
2 10161 communication 2019-03-16 2019-03-16 02:21:26 2019-03-17 00:00:00 21.642778 hours 1 45.1000 secs
3 10161 communication 2019-03-17 2019-03-17 20:20:35 2019-03-18 00:00:00 3.656944 hours 1 357.2333 secs
4 10161 communication 2019-03-18 2019-03-18 23:20:40 2019-03-19 00:00:00 39.333333 hours 1 37.7625 secs
5 10161 games & entertainment 2019-03-16 2019-03-16 02:27:41 2019-03-17 00:00:00 21.538611 hours 1 223.8000 secs
6 10161 games & entertainment 2019-03-17 2019-03-17 16:02:00 2019-03-18 00:00:00 7.966667 hours 1 621.9000 secs
7 10161 games & entertainment 2019-03-18 2019-03-19 00:10:42 2019-03-20 00:00:00 23.821667 hours 1 2496.8000 secs
8 10161 lifestyle 2019-03-17 2019-03-17 14:46:34 2019-03-18 00:00:00 9.223889 hours 1 12.7500 secs
9 10161 lifestyle 2019-03-18 2019-03-18 22:11:42 2019-03-19 00:00:00 1.805000 hours 1 6.6000 secs
10 10161 lifestyle 2019-03-19 2019-03-19 00:11:10 2019-03-20 00:00:00 23.813889 hours 1 8.4000 secs
# ... with 11 more rows
所有值都是按应用程序类型,用户和日期划分的。