Question

我正在尝试计算每月唯一“新”用户的数量。“新”用户是指之前（从数据一开始起）从未出现过的用户。我也在尝试计算上个月没有出现的唯一用户数。

原始数据看起来像

library(dplyr)
    # Sample event data: one element per observation, aligned across the three vectors.
    date <- c("2010-01-10","2010-02-13","2010-03-22","2010-01-11","2010-02-14","2010-03-23","2010-01-12","2010-02-14","2010-03-24")
    # Month label for each observation; the 3-month pattern is repeated to match `date`.
    mth <- rep(c("2010-01","2010-02","2010-03"),3)
    user <- c("123","129","145","123","129","180","180","184","145")

    dt <- data.frame(date,mth,user)

    # Order the rows by date.
    dt <- dt %>% arrange(date)

    # Print the sorted data.
    dt

       date     mth user
1 2010-01-10 2010-01  123
2 2010-01-11 2010-01  123
3 2010-01-12 2010-01  180
4 2010-02-13 2010-02  129
5 2010-02-14 2010-02  129
6 2010-02-14 2010-02  184
7 2010-03-22 2010-03  145
8 2010-03-23 2010-03  180
9 2010-03-24 2010-03  145

答案应该是

    # Expected per-row values, aligned with the rows of the sorted `dt`:
    #   new          - number of users in the row's month who never appeared before
    #   totNew       - running total of `new` over the months
    #   notLastMonth - number of users in the row's month who did not appear in the previous month
    new <- c(2,2,2,2,2,2,1,1,1)
    totNew <- c(2,2,2,4,4,4,5,5,5)
    notLastMonth <- c(2,2,2,2,2,2,2,2,2)

    # Attach the expected columns to the data and print the result.
    tmp <- cbind(dt,new,totNew,notLastMonth)
    tmp

        date     mth user new totNew notLastMonth
1 2010-01-10 2010-01  123   2      2            2
2 2010-01-11 2010-01  123   2      2            2
3 2010-01-12 2010-01  180   2      2            2
4 2010-02-13 2010-02  129   2      4            2
5 2010-02-14 2010-02  129   2      4            2
6 2010-02-14 2010-02  184   2      4            2
7 2010-03-22 2010-03  145   1      5            2
8 2010-03-23 2010-03  180   1      5            2
9 2010-03-24 2010-03  145   1      5            2

Answer 1

这是一次尝试（代码正文中的解释）

# Per-month statistics (new users, cumulative new users, users absent from the
# previous month) joined back onto every row of `dt`.
# Relies on `dt` being sorted by date, so that a user's first row is their
# first appearance.
dt %>%
  group_by(user) %>%
  mutate(Count = row_number()) %>% # Count appearances per user
  group_by(mth) %>%
  mutate(new = sum(Count == 1)) %>% # Count first appearances per month
  summarise(new = first(new), # Summarise new users per month (for cumsum)
            users = list(unique(user))) %>% # Create a list of unique users per month (for notLastMonth)
  mutate(totNew = cumsum(new), # Calculate overall cumulative sum of new users
         notLastMonth = lengths(Map(setdiff, users, lag(users)))) %>% # Users of each month not seen in the previous month
  select(-users) %>%
  right_join(dt, by = "mth") # Join back to the real data, with the key stated explicitly

# A tibble: 9 × 6
#       mth   new totNew notLastMonth       date   user
#    <fctr> <int>  <int>        <int>     <fctr> <fctr>
# 1 2010-01     2      2            2 2010-01-10    123
# 2 2010-01     2      2            2 2010-01-11    123
# 3 2010-01     2      2            2 2010-01-12    180
# 4 2010-02     2      4            2 2010-02-13    129
# 5 2010-02     2      4            2 2010-02-14    129
# 6 2010-02     2      4            2 2010-02-14    184
# 7 2010-03     1      5            2 2010-03-22    145
# 8 2010-03     1      5            2 2010-03-23    180
# 9 2010-03     1      5            2 2010-03-24    145

Answer 2

这里的另一个想法是，先按每个 "mth" 对 "user" 制表：

table(dt[c("user", "mth")]) > 0L

假设这条路径很可能导致内存问题，我们可以从稀疏的替代方案开始：

library(Matrix)
# Sparse user-by-month matrix: TRUE where the user has at least one row in that month.
tab <- as(
  xtabs(~ user + mth, data = dt, sparse = TRUE) > 0L,
  "TsparseMatrix"
)
tab
#5 x 3 sparse Matrix of class "lgTMatrix"
#    2010-01 2010-02 2010-03
#123       |       .       .
#129       .       |       .
#145       .       .       |
#180       |       .       |
#184       .       |       .

然后，得到每个 "user" 第一次出现的 "mth"（作为列索引）：

tapply(tab@j, rownames(tab)[tab@i + 1L], min) + 1L
#123 129 145 180 184 
#  1   2   3   1   2

我们可以找到每个 "mth" 的新条目数：

# Count, for every month column, how many users have their first appearance there.
new <- tabulate(
  tapply(tab@j, rownames(tab)[tab@i + 1L], min) + 1L, # 1-based first-month index per user
  nbins = ncol(tab)
)
names(new) <- colnames(tab)
new
#2010-01 2010-02 2010-03 
#      2       2       1

和新条目的累计总和：

# Running total of new users across the months.
totNew <- cumsum(new)
totNew
#2010-01 2010-02 2010-03 
#      2       4       5

并且，减去每个 "mth" 中同时出现在该 "mth" 及其前一个 "mth" 的 "user" 数量：

setNames(colSums(cbind(FALSE, tab[, -ncol(tab)]) & tab), colnames(tab))
#2010-01 2010-02 2010-03 
#      0       0       0

（从）每月的用户数量（中减去上面的数量）：

colSums(tab)
#2010-01 2010-02 2010-03 
#      2       2       2

我们得到：

# Users present in each month minus those also present in the month before.
# The previous-month matrix is `tab` shifted right by one column, padded with FALSE.
notLast <- colSums(tab) -
  colSums(cbind(FALSE, tab[, -ncol(tab)]) & tab)
notLast
#2010-01 2010-02 2010-03 
#      2       2       2

达到所需输出的一种方法可能是：

# Attach the per-month statistics to every row of the original data.
merge(
  x = dt,
  y = data.frame(mth = names(new), new = new, totNew = totNew, notLast = notLast),
  by = "mth"
)
#      mth       date user new totNew notLast
#1 2010-01 2010-01-10  123   2      2       2
#2 2010-01 2010-01-11  123   2      2       2
#3 2010-01 2010-01-12  180   2      2       2
#4 2010-02 2010-02-13  129   2      4       2
#5 2010-02 2010-02-14  129   2      4       2
#6 2010-02 2010-02-14  184   2      4       2
#7 2010-03 2010-03-22  145   1      5       2
#8 2010-03 2010-03-23  180   1      5       2
#9 2010-03 2010-03-24  145   1      5       2

Answer 3

由于还没有人发布，这是我的首选方式：

library(zoo)
dt <- dt %>% mutate(ym = as.yearmon(mth))

# Per-month counts of brand-new users (`new`) and of users who were not active
# in the immediately preceding month (`not_last_ym`).
ct_dt <- dt %>%
  distinct(user, ym) %>%
  arrange(user, ym) %>%
  group_by(user) %>%
  mutate(last_ym = dplyr::lag(ym)) %>% # previous active month of the same user (NA if none)
  group_by(ym) %>%
  summarise(
    new         = sum(is.na(last_ym)),
    # yearmon values are stored as year + (month - 1)/12, so 12 * difference is
    # only approximately a whole number; round it before comparing, otherwise
    # some consecutive months come out slightly above 1 and are miscounted.
    not_last_ym = sum(is.na(last_ym) | round(12 * (ym - last_ym)) > 1)
  )

# # A tibble: 3 x 3
#              ym   new not_last_ym
#   <S3: yearmon> <int>       <int>
# 1      Jan 2010     2           2
# 2      Feb 2010     2           2
# 3      Mar 2010     1           2

如果您真的想要 totNew 列，可以从此处对 new 取 cumsum；如果你真的想要查看这些数据（令人困惑地）在多行上展开，你可以对 dt 和 ct_dt 做 left_join。

或者使用data.table ...

dt

Answer 4

这是一个纯 base R 的解决方案。当变量不是因子并假设数据按月排序时，它最有效。

# Unique users seen in each month, one list element per month
# (months taken in their order of appearance in `dt`)
activeUsers <- lapply(unique(dt$mth), function(m) {
  in_month <- dt$mth == m
  unique(dt[in_month, "user"])
})
# Running union: element k holds every user seen in months 1..k
allUsers <- Reduce(union, activeUsers, accumulate = TRUE)

现在，所有月度用户都存储在activeUsers中，并且所有用户的增长列表都存储在allUsers中。有了这些信息，我们就可以轻松计算出前两个变量。

# Cumulative number of distinct users up to and including each month
totNew <- lengths(allUsers)
# Month-over-month increase; every user of the first month counts as new
new <- c(totNew[1], diff(totNew))
# Users active in a month but not in the one before it, pairing each month
# with its predecessor; the first month is again seeded with totNew[1]
notLastMonth <- c(
  totNew[1],
  lengths(Map(setdiff, activeUsers[-1], activeUsers[-length(activeUsers)]))
)

lengths函数有效地计算每个列表项的长度。第二行使用diff来计算新用户的数量。第二行和第三行都使用totNew变量添加初始值（2）。第三行涉及更多，并使用setdiff和lapply构建上个月不存在的月份中的活动用户集。 lengths再次被用来计算。

# Attach the per-month statistics to every row of the data set
merge(
  dt,
  data.frame(
    mth = unique(dt$mth),
    new = new,
    totNew = totNew,
    notLastMonth = notLastMonth
  ),
  by = "mth"
)

      mth       date user new totNew notLastMonth
1 2010-01 2010-01-10  123   2      2            2
2 2010-01 2010-01-12  180   2      2            2
3 2010-01 2010-01-11  123   2      2            2
4 2010-02 2010-02-13  129   2      4            2
5 2010-02 2010-02-14  129   2      4            2
6 2010-02 2010-02-14  184   2      4            2
7 2010-03 2010-03-23  180   1      5            2
8 2010-03 2010-03-22  145   1      5            2
9 2010-03 2010-03-24  145   1      5            2

数据

# Build the data set from the `date`, `mth` and `user` vectors without converting strings to factors.
dt <- data.frame(date,mth,user, stringsAsFactors=FALSE)

计算之前未发生的新值，而不是在最后一组中发生的值

4 个答案: