考虑以下数据集:
# Example data set, written out as a literal data.frame.
# It has 300 rows (row.names = c(NA, 300L)) and four integer columns:
#   CustumerId     - customer identifier (values 1 to 34); note the spelling,
#                    which the code using this data frame must match exactly
#   ProductId      - product identifier (values 1 to 10)
#   DaysSinceEpoch - integer time stamp of the row
#   BoughtPAD      - 0/1 flag recorded on each row
# Presumably each row is one purchase event -- confirm against the data source.
SimulatedDated <- structure(list(CustumerId = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L,
15L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 17L, 17L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L, 18L, 18L,
18L, 18L, 18L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 23L, 23L, 23L, 23L,
23L, 23L, 23L, 23L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L,
25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 26L, 26L, 26L,
26L, 26L, 26L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 28L,
28L, 28L, 28L, 28L, 28L, 28L, 29L, 29L, 29L, 29L, 29L, 29L, 29L,
29L, 29L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 31L, 31L,
31L, 31L, 31L, 31L, 31L, 31L, 31L, 32L, 32L, 32L, 32L, 32L, 32L,
32L, 32L, 32L, 32L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L,
34L, 34L, 34L, 34L, 34L), ProductId = c(6L, 3L, 4L, 9L, 8L, 10L,
1L, 5L, 7L, 1L, 5L, 3L, 4L, 2L, 7L, 6L, 10L, 8L, 7L, 4L, 10L,
5L, 1L, 3L, 8L, 6L, 2L, 9L, 6L, 1L, 2L, 4L, 7L, 8L, 5L, 9L, 10L,
3L, 2L, 5L, 9L, 4L, 10L, 3L, 6L, 1L, 8L, 8L, 10L, 2L, 4L, 3L,
9L, 5L, 6L, 5L, 6L, 4L, 9L, 10L, 8L, 2L, 7L, 1L, 3L, 10L, 3L,
2L, 8L, 9L, 7L, 5L, 4L, 1L, 7L, 1L, 3L, 2L, 4L, 8L, 9L, 6L, 5L,
10L, 1L, 9L, 2L, 4L, 7L, 3L, 8L, 7L, 9L, 8L, 4L, 10L, 3L, 5L,
1L, 6L, 2L, 6L, 4L, 9L, 3L, 10L, 1L, 8L, 7L, 5L, 2L, 9L, 5L,
7L, 4L, 10L, 1L, 3L, 2L, 6L, 5L, 9L, 2L, 4L, 3L, 8L, 1L, 10L,
6L, 7L, 10L, 9L, 2L, 1L, 5L, 8L, 6L, 4L, 7L, 3L, 9L, 8L, 3L,
5L, 6L, 10L, 1L, 7L, 4L, 1L, 6L, 9L, 10L, 3L, 4L, 2L, 8L, 7L,
10L, 8L, 1L, 6L, 4L, 5L, 9L, 3L, 7L, 2L, 4L, 8L, 3L, 7L, 10L,
1L, 6L, 5L, 5L, 6L, 4L, 7L, 1L, 10L, 3L, 10L, 8L, 3L, 1L, 4L,
5L, 6L, 2L, 9L, 5L, 6L, 4L, 8L, 2L, 10L, 3L, 1L, 8L, 4L, 10L,
6L, 9L, 7L, 2L, 3L, 8L, 3L, 6L, 7L, 9L, 4L, 5L, 2L, 10L, 1L,
5L, 9L, 3L, 7L, 6L, 10L, 8L, 2L, 4L, 8L, 7L, 1L, 4L, 2L, 10L,
10L, 3L, 8L, 1L, 7L, 5L, 4L, 6L, 2L, 10L, 6L, 1L, 2L, 5L, 4L,
8L, 1L, 10L, 8L, 3L, 2L, 9L, 5L, 6L, 4L, 9L, 10L, 6L, 2L, 1L,
7L, 4L, 8L, 5L, 1L, 5L, 9L, 10L, 3L, 8L, 7L, 2L, 4L, 10L, 1L,
5L, 7L, 6L, 2L, 3L, 4L, 9L, 8L, 1L, 5L, 2L, 7L, 3L, 6L, 10L,
4L, 9L, 9L, 5L, 10L, 8L, 2L), DaysSinceEpoch = c(7L, 20L, 31L,
40L, 105L, 146L, 162L, 169L, 212L, 10L, 18L, 31L, 65L, 84L, 122L,
156L, 202L, 206L, 1L, 4L, 7L, 11L, 14L, 24L, 25L, 100L, 148L,
149L, 3L, 10L, 12L, 14L, 18L, 26L, 35L, 41L, 96L, 147L, 9L, 22L,
66L, 80L, 102L, 104L, 170L, 199L, 234L, 10L, 24L, 36L, 38L, 75L,
122L, 163L, 169L, 9L, 16L, 35L, 39L, 54L, 58L, 79L, 116L, 133L,
224L, 27L, 35L, 37L, 49L, 73L, 91L, 105L, 141L, 252L, 16L, 28L,
51L, 73L, 76L, 83L, 126L, 202L, 97L, 105L, 150L, 172L, 203L,
207L, 223L, 256L, 259L, 25L, 28L, 38L, 40L, 63L, 100L, 120L,
176L, 186L, 191L, 7L, 22L, 36L, 37L, 40L, 41L, 53L, 67L, 114L,
233L, 1L, 16L, 17L, 23L, 40L, 52L, 125L, 184L, 186L, 12L, 42L,
53L, 65L, 67L, 69L, 83L, 149L, 154L, 265L, 10L, 14L, 33L, 47L,
67L, 106L, 133L, 181L, 247L, 258L, 6L, 21L, 26L, 41L, 49L, 68L,
89L, 112L, 119L, 9L, 34L, 88L, 91L, 102L, 110L, 132L, 171L, 200L,
6L, 14L, 21L, 36L, 40L, 60L, 64L, 88L, 109L, 208L, 8L, 17L, 21L,
55L, 77L, 85L, 97L, 168L, 18L, 28L, 42L, 44L, 70L, 77L, 101L,
14L, 23L, 33L, 84L, 107L, 123L, 124L, 125L, 25L, 29L, 33L, 57L,
79L, 83L, 98L, 112L, 119L, 5L, 31L, 64L, 91L, 102L, 131L, 222L,
234L, 27L, 46L, 48L, 60L, 61L, 64L, 72L, 103L, 161L, 8L, 24L,
27L, 50L, 60L, 62L, 92L, 99L, 147L, 159L, 16L, 19L, 20L, 84L,
175L, 202L, 17L, 21L, 25L, 46L, 69L, 121L, 161L, 175L, 267L,
10L, 14L, 20L, 39L, 58L, 90L, 229L, 32L, 35L, 39L, 40L, 60L,
66L, 98L, 153L, 173L, 2L, 3L, 25L, 46L, 51L, 80L, 96L, 166L,
202L, 43L, 70L, 76L, 77L, 115L, 160L, 183L, 202L, 223L, 25L,
33L, 61L, 72L, 74L, 77L, 85L, 91L, 152L, 265L, 16L, 62L, 63L,
64L, 66L, 82L, 104L, 126L, 181L, 47L, 49L, 55L, 58L, 67L), BoughtPAD = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L)), .Names = c("CustumerId",
"ProductId", "DaysSinceEpoch", "BoughtPAD"), row.names = c(NA,
300L), class = "data.frame")
然后运行以下代码:
library(TraMineR)
# Build the event sequences: sequences are identified by the customer id,
# events are the product ids, and DaysSinceEpoch supplies the time stamps.
SimSeq <- seqecreate(
  id = SimulatedDated[["CustumerId"]],
  timestamp = SimulatedDated[["DaysSinceEpoch"]],
  event = SimulatedDated[["ProductId"]]
)

# Grouping factor built directly from the BoughtPAD column, so it has one
# entry per row of SimulatedDated (300) rather than one per sequence.
# NOTE(review): labels are applied in level order, so 0 is labelled "PAD" and
# 1 is labelled "NPAD" -- confirm that this mapping is the intended one.
Cohort <- factor(
  SimulatedDated[["BoughtPAD"]],
  labels = c("PAD", "NPAD")
)

# Search for frequent subsequences, with pMinSupport set to 0.01.
Fsubseq <- seqefsub(seq = SimSeq, pMinSupport = 0.01)

# Compare the subsequences between the groups given by Cohort. With the
# row-level Cohort defined above, this is the call that stops with the
# "variable lengths differ (found for 'group')" error.
DiscrCohort <- seqecmpgroup(subseq = Fsubseq, group = Cohort)
产生:
Error in model.frame.default(formula = ww ~ group + seqmatrix[, index]) :
variable lengths differ (found for 'group')
我想知道,是什么原因可能导致了这个问题?
答案 0 :(得分:1)
group 变量的长度应等于序列数,即您案例中的客户数。此外,它在同一个序列内应保持不变(在您的示例中并非如此)。
您只有 34 个客户,而用作 group 参数的 Cohort 变量的长度却是事件总数(300)。因此,您需要按 CustumerId 对它进行聚合。
具体做法如下(此处对每个客户取组变量的最大值):
# Collapse the row-level BoughtPAD flag to a single value per customer, so
# that the grouping vector has one entry per customer instead of one per row.
bylist <- list(id = SimulatedDated$CustumerId)

# Aggregate only the flag column: the `id` column of the result already
# identifies the customer, so aggregating CustumerId as well is redundant.
# Taking the maximum gives 1 for a customer as soon as any of its rows has
# BoughtPAD == 1, and 0 otherwise.
agg.PAD <- aggregate(
  SimulatedDated["BoughtPAD"],
  by = bylist,
  FUN = max
)

# One group value per customer, in the row order returned by aggregate().
# NOTE(review): this assumes the sequences are in the same customer order as
# the rows of agg.PAD -- confirm against how the sequences were created.
Cohort <- agg.PAD$BoughtPAD
现在,您可以查找最能区分群组的子序列
# Look for the subsequences that best discriminate between the groups given
# by Cohort, then print the first ten entries of the result.
DiscrCohort <- seqecmpgroup(
  subseq = Fsubseq,
  group = Cohort
)
print(DiscrCohort[seq_len(10)])
希望这有帮助。