Question

我有一个数据集test，其中包含用户ID以及该用户进行交易的季度：

> test
   id quarter
1   7      Q1
2   7      Q1
3   5      Q1
4   8      Q1
5   3      Q1
6   6      Q1
7  10      Q1
8   3      Q2
9  10      Q2
10  8      Q2
11  2      Q2
12  7      Q2
13  6      Q2
14  6      Q3
15  9      Q3
16  6      Q3
17  4      Q3
18  9      Q3
19  2      Q3
20  5      Q4
21  8      Q4
22 10      Q4
23  7      Q4
24  1      Q4
25  8      Q4

我的目标是先找出在第1季度中处于活动状态的唯一用户数，然后在这些用户中，统计在后续各季度进行过交易的用户数。

以上样本的输出应为：

> output
  quarter unique_cohort_ids
1      Q1                 6
2      Q2                 5
3      Q3                 1
4      Q4                 4

以下代码有效，但它冗长且重复，并且对于具有数十个季度的较大数据集而言并不理想。有关如何改进它的任何建议将不胜感激：

# Split the transactions by quarter and expose each piece as its own data
# frame named Q1, Q2, ... (by position in the split result).
# NOTE(review): split() orders groups by sorted quarter label, so with ten or
# more quarters "Q10" sorts before "Q2" and positional naming would mislabel
# the pieces -- confirm before using on larger data.
subsets <- split(test, test$quarter, drop = TRUE)
for (i in seq_along(subsets)) {
  assign(paste0("Q", i), as.data.frame(subsets[[i]]))
}

# Size of the Q1 cohort: the number of distinct ids that transacted in Q1.
Q1_ids <- length(unique(Q1$id))

# Number of Q1-cohort ids that transacted again in each later quarter.
Q2_ids <- sum(unique(Q1$id) %in% unique(Q2$id))
Q3_ids <- sum(unique(Q1$id) %in% unique(Q3$id))
Q4_ids <- sum(unique(Q1$id) %in% unique(Q4$id))

队列分析还包括计算每个季度的新用户数量。与其手动找出每个季度中未在之前任何季度出现过的唯一用户ID，我希望有一种程序化的解决方案。在这种情况下，输出将是以下列表：

> mylist
[[1]]
[1]  7  5  8  3  6 10

[[2]]
[1] 2

[[3]]
[1] 9 4

[[4]]
[1] 1

任何建议都将不胜感激

以下是示例数据：

> dput(test)
structure(list(id = c(7, 7, 5, 8, 3, 6, 10, 3, 10, 8, 2, 7, 6, 6, 9, 6, 4, 9, 2, 5, 8, 10, 7, 1, 8), quarter = c("Q1", "Q1", "Q1", "Q1", "Q1", "Q1", "Q1", "Q2", "Q2", "Q2", "Q2", "Q2", "Q2", "Q3", "Q3", "Q3", "Q3", "Q3", "Q3", "Q4", "Q4", "Q4", "Q4", "Q4", "Q4")), .Names = c("id", "quarter"), row.names = c(NA, -25L), class = "data.frame")

Answer 1

以下是使用data.table的示例：

library(data.table)
# Convert `test` to a data.table by reference, then, for every quarter, count
# the distinct ids that also appear among the Q1 transactions (the cohort).
setDT(test)
test[, .(V1 = length(intersect(id, test[quarter == "Q1", id]))), by = quarter]

#   quarter V1
#1:      Q1  6
#2:      Q2  5
#3:      Q3  1
#4:      Q4  4

分析的第二部分：

# Keep only the first row in which each id occurs, then group those ids by
# the quarter recorded on that row.
# NOTE(review): this treats row order as chronological -- confirm that the
# data is sorted by quarter before relying on it.
with(test[!duplicated(test$id), ], split(id, quarter))

#$Q1
#[1]  7  5  8  3  6 10

#$Q2
#[1] 2

#$Q3
#[1] 9 4

#$Q4
#[1] 1

根据新要求进行更新：

这在我看来有点笨拙，但当你逐个季度向后推进时，这是跟踪已出现过的ID的最简单方法。

# Build a quarter-by-cohort activity table: for every quarter, take the ids
# first seen in that quarter (the cohort) and count how many of them are
# active in each quarter.
quarts <- sort(unique(test$quarter))
test$occur <- 1

# One row per id, one column per quarter, holding that id's transaction count.
# as.data.table() makes the call work whether or not `test` was already
# converted to a data.table.
mat <- dcast(
  as.data.table(test),
  id ~ quarter,
  value.var = "occur",
  fun.aggregate = sum
)
quarter_cols <- colnames(mat)[-1]

# `seen` flags the ids that were active in any earlier quarter.
seen <- rep(FALSE, nrow(mat))
# Preallocate one result row per cohort instead of growing with rbind().
cohorts <- vector("list", length(quarts))

for (k in seq_along(quarts)) {
  active <- mat[[quarts[k]]] > 0
  # Cohort k: ids active in this quarter and in no previous quarter.
  is_new <- active & !seen
  cohorts[[k]] <- mat[
    is_new,
    lapply(.SD, function(x) sum(x > 0)),
    .SDcols = quarter_cols
  ]
  seen <- seen | active
}

res <- rbindlist(cohorts)
# Transpose so that rows are quarters and columns are cohorts.
t(res)

#   [,1] [,2] [,3] [,4]
#Q1    6    0    0    0
#Q2    5    1    0    0
#Q3    1    1    2    0
#Q4    4    0    0    1

Answer 2

这种做法怎么样？

library(tidyverse)
# Count, per quarter, the distinct ids that were also active in Q1.
test %>%
  # One row per (id, quarter) pair so repeat transactions are not counted.
  distinct() %>%
  mutate(value = TRUE) %>%
  # Widen to one column per quarter; names_sort keeps the columns in sorted
  # order, matching what the superseded spread() produced.
  pivot_wider(
    names_from = quarter,
    values_from = value,
    names_sort = TRUE
  ) %>%
  # Keep only the Q1 cohort.
  filter(!is.na(Q1)) %>%
  select(-id) %>%
  # Each TRUE counts as 1; missing quarters are NA and are skipped.
  colSums(na.rm = TRUE)
# Q1 Q2 Q3 Q4 
# 6  5  1  4

Answer 3

计算每个季度中属于该群组的唯一用户数：

# Rows belonging to the Q1 cohort: every transaction whose id also occurs in Q1.
in_q1_cohort <- test$id %in% test$id[test$quarter == "Q1"]
# Drop duplicate rows so each id is counted at most once per quarter.
cohort_rows <- unique(test[in_q1_cohort, ])
# Tabulate the remaining rows by quarter and label the two result columns.
output <- setNames(
  as.data.frame(table(cohort_rows$quarter)),
  c("quarter", "unique_cohort_ids")
)

对于第二部分，data.frame是否可以接受？

# Row index of the first occurrence of every id, in order of appearance.
first_idx <- which(!duplicated(test$id))
# Pair each distinct id with the quarter recorded on its first row.
first_appearance <- data.frame(
  id = test$id[first_idx],
  quarter = test$quarter[first_idx]
)

如果不行，那么可以用tapply把它转换成列表：

# Group the ids by the quarter of their first appearance; simplify = FALSE
# keeps one list element per quarter instead of collapsing the result.
with(
  first_appearance,
  tapply(id, quarter, FUN = "identity", simplify = FALSE)
)

R中的队列分析

3 个答案:

根据新要求进行更新：