希望有人能帮助我:)
我有一个包含 3 列(ID、日期和项目)的(非常大的)数据集,每个 ID 和日期有多个行。
简化示例(请参阅下面的可重现示例)
ID day item
A 1 xc
A 185 aa
B 1 xc
B 102 cc
B 102 aa
B 128 tg
C 55 xx
C 183 aa
我想聚合数据,以便对于每个 ID,所有项目都按天数排列。 注意(1)对于每个重复的一天应该有一个新的序列(即同一天的项目不能在序列中)和(2) 序列应该相隔不超过 182 天。
像这样...
ID items
A xc
A aa
B xc, cc, tg
B xc, aa, tg
C xx, aa
我的编程技能和知识有限,所以我掉了几个兔子洞……有人可以帮忙吗?
附言。我发布了我需要的全部内容here。希望将其分成可管理的部分使其更易于理解/如果上述问题得到解决,我认为我可以管理。
可重现的样本数据 (每行包含多个项目,以防在同一天)
DT <- structure(list(items = c("bxM01, T-Other", "bxD01", "S-In",
"bxD02", "L-I", "A9", "R-S, bxR01, bxR03",
"bxA02", "HDTR", "S-In", "HVAL", "SC.R", "bxD11, S-Other",
"SC.R", "K-Other", "bxD06, S-In", "A-s.spec", "LON",
"bxJ01", "S-Other", "HVAL, SC.R", "bxN02, bxN02, bxC07, S-Other",
"K-Other", "A-s.spec", "bxC09", "R-all-rhin", "S-S, bxD07, bxD01",
"S-In", "bxD07, ECZM", "X-resp-prev", "bxD07", "HVAL",
"T-Other", "bxA11", "HVAL", "HVAL", "P-S", "K-Other",
"bxN01, NKSH", "A-s.spec", "bxJ01", "X-resp-prev", "D-S",
"FYS, B-Other", "K-Other", "bxC07, RON, NKSH", "bxM01, bxA01",
"bxS01", "NKSH", "T-Other", "bxC08", "bxD04, K-Other", "bxN02",
"bxD07, Y-S", "bxD07, bxR06, ALGY", "bxJ01", "SC.R, S-In",
"bxD10", "bxD10", "bxJ01", "SC.R", "S-In", "L-I",
"Y-S", "S-S", "K-Other", "bxR03, LON", "S-In",
"RON, S-Ne", "S-In", "S-In", "SC.R",
"S-In", "S-Other", "Z-S", "SX", "NKSH", "F-In, bxS01",
"N-Other", "FYS, NKSH", "bxN02, TROT", "S-Sdf", "OBES",
"bxJ01, bxN02, K-Other", "bxR01, LAB, TROT", "OBES", "K-Other",
"A-Unknown", "Z-S", "K-S", "OBES", "bxM01, bxA02",
"SC.R", "L-Other", "bxD02", "X-Other", "bxN05", "bxR06",
"bxJ01, bxA02, bxN02", "TROT"), days = c(613L, 861L, 883L,
1210L, 1408L, 1699L, 391L, 409L, 745L, 1448L, 28L, 32L, 43L,
98L, 105L, 231L, 439L, 442L, 446L, 544L, 704L, 801L, 845L, 846L,
851L, 1097L, 1131L, 1168L, 1246L, 1264L, 1309L, 1313L, 1323L,
1327L, 1452L, 1475L, 1482L, 1484L, 1518L, 1588L, 1629L, 1630L,
1631L, 1634L, 1641L, 1645L, 1699L, 1727L, 1741L, 1769L, 1809L,
28L, 790L, 953L, 999L, 1004L, 1013L, 1015L, 1034L, 1055L, 1168L,
1190L, 1211L, 1375L, 1544L, 1802L, 241L, 353L, 416L, 437L, 451L,
547L, 548L, 706L, 831L, 832L, 839L, 1099L, 1276L, 1301L, 1567L,
1598L, 287L, 574L, 854L, 872L, 943L, 1089L, 1147L, 1170L, 1177L,
1201L, 1202L, 1512L, 20L, 30L, 52L, 53L, 87L, 309L), ID = c("G",
"G", "G", "G", "G", "G", "F", "F", "F", "F", "E", "E", "E", "E",
"E", "E", "E", "E", "E", "E", "E", "E", "E", "E", "E", "E", "E",
"E", "E", "E", "E", "E", "E", "E", "E", "E", "E", "E", "E", "E",
"E", "E", "E", "E", "E", "E", "E", "E", "E", "E", "E", "D", "D",
"D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D",
"C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C",
"C", "C", "C", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "A", "A", "A", "A", "A", "A")), row.names = c(NA, -100L
), class = c("data.table", "data.frame"))
取消嵌套以每天获取一件商品
DT <- DT %>%
mutate(items = strsplit(as.character(items), ",")) %>%
unnest(items)
**** 编辑 ****
不知道这是否有帮助,但我尝试了以下操作,但又卡住了:
# gives unique events in one day a unique number
DT[(duplicated(DT, by = c("ID", "day")) | duplicated(DT, by = c("id", "day"), fromLast=TRUE)), Dup_val := 1]
# put different sequence (1,2,3,4......) for duplicated and 0 for non duplicated (item that only has single item in the list)
DT[!is.na(Dup_val), Dup_val := seq_len(.N), by = ID]
DT[is.na(Dup_val), Dup_val := 0]
**** 编辑 ****
基于 DT 的预期结果(前 11 行)
ID items
#1: A bxD02, X-Other, bxN05, bxN06, bxJ01
#2: A bxD02, X-Other, bxN05, bxN06, bxA02
#3: A bxD02, X-Other, bxN05, bxN06, bxN02
#4: A TROT
#5: B OBES
#6: B bxJ01
#7: B bxN02
#8: B K-Other
#9: B bxR01, OBES
#10: B LAB, OBES
#11: B TROT, OBES
答案 0 :(得分:-1)
我已设法按照上述我需要的方式获取序列,并根据需要选择序列(请参阅 )。虽然我认为应该有一种更有效的方式,但我需要发布答案。如果有人对如何提高效率有任何建议,也请发表。
# create a column with same number for duplicate days
DT[(duplicated(DT, by = c("ID", "days")) | duplicated(DT, by = c("ID", "days"), fromLast=TRUE)), Dup_val := 1]
DT[!is.na(Dup_val), Dup_val := seq_len(.N), by = ID]
DT[is.na(Dup_val), Dup_val := 0]
max(DT$Dup_val) # is 15, so I make 15 subsets below.
# subset all duplicate days (>0) + all non-duplicates (0)
DT1 <- DT[Dup_val == 0 | Dup_val == 1]
DT2 <- DT[Dup_val == 0 | Dup_val == 2]
DT3 <- DT[Dup_val == 0 | Dup_val == 3]
DT4 <- DT[Dup_val == 0 | Dup_val == 4]
DT5 <- DT[Dup_val == 0 | Dup_val == 5]
DT6 <- DT[Dup_val == 0 | Dup_val == 6]
DT7 <- DT[Dup_val == 0 | Dup_val == 7]
DT8 <- DT[Dup_val == 0 | Dup_val == 8]
DT9 <- DT[Dup_val == 0 | Dup_val == 9]
DT10 <- DT[Dup_val == 0 | Dup_val == 10]
DT11 <- DT[Dup_val == 0 | Dup_val == 11]
DT12 <- DT[Dup_val == 0 | Dup_val == 12]
DT13 <- DT[Dup_val == 0 | Dup_val == 13]
DT14 <- DT[Dup_val == 0 | Dup_val == 14]
DT15 <- DT[Dup_val == 0 | Dup_val == 15]
# create column indicating if 182 days apart
DT1[, grp := cumsum(c(TRUE, diff(days) >= 182)), ID]
DT1=DT1[, .(items = toString(unique(items))), .(ID, grp)]
DT2[, grp := cumsum(c(TRUE, diff(days) >= 182)), ID]
DT2=DT2[, .(items = toString(unique(items))), .(ID, grp)]
DT3[, grp := cumsum(c(TRUE, diff(days) >= 182)), ID]
DT3=DT3[, .(items = toString(unique(items))), .(ID, grp)]
DT4[, grp := cumsum(c(TRUE, diff(days) >= 182)), ID]
DT4=DT4[, .(items = toString(unique(items))), .(ID, grp)]
DT5[, grp := cumsum(c(TRUE, diff(days) >= 182)), ID]
DT5=DT5[, .(items = toString(unique(items))), .(ID, grp)]
DT6[, grp := cumsum(c(TRUE, diff(days) >= 182)), ID]
DT6=DT6[, .(items = toString(unique(items))), .(ID, grp)]
DT7[, grp := cumsum(c(TRUE, diff(days) >= 182)), ID]
DT7=DT7[, .(items = toString(unique(items))), .(ID, grp)]
DT8[, grp := cumsum(c(TRUE, diff(days) >= 182)), ID]
DT8=DT8[, .(items = toString(unique(items))), .(ID, grp)]
DT9[, grp := cumsum(c(TRUE, diff(days) >= 182)), ID]
DT9=DT9[, .(items = toString(unique(items))), .(ID, grp)]
DT10[, grp := cumsum(c(TRUE, diff(days) >= 182)), ID]
DT10=DT10[, .(items = toString(unique(items))), .(ID, grp)]
DT11[, grp := cumsum(c(TRUE, diff(days) >= 182)), ID]
DT11=DT11[, .(items = toString(unique(items))), .(ID, grp)]
DT12[, grp := cumsum(c(TRUE, diff(days) >= 182)), ID]
DT12=DT12[, .(items = toString(unique(items))), .(ID, grp)]
DT13[, grp := cumsum(c(TRUE, diff(days) >= 182)), ID]
DT13=DT13[, .(items = toString(unique(items))), .(ID, grp)]
DT14[, grp := cumsum(c(TRUE, diff(days) >= 182)), ID]
DT14=DT14[, .(items = toString(unique(items))), .(ID, grp)]
DT15[, grp := cumsum(c(TRUE, diff(days) >= 182)), ID]
DT15=DT15[, .(items = toString(unique(items))), .(ID, grp)]
# row bind subsets
DT_seq <- rbindlist(list(DT1, DT2, DT3, DT4, DT5, DT6, DT7, DT8, DT9, DT10, DT11, DT12, DT13, DT14, DT15))
# remove duplicates
DT_seq = DT_seq[!duplicated(DT_seq, by = c("ID", "items"), fromLast = T)]