我有一个大型分类数据集,希望快速生成一张两两对照表(reciprocal table),用来比较各类别之间共有元素所占的比例。
输入示例:
# Sample input `df`: YEAR is a factor with levels "2013".."2017" and
# Entry_Number_F is an integer entry number; 40 rows in total.
df <- structure(list(YEAR = structure(c(5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L), .Label = c("2013", "2014", "2015", "2016", "2017"), class = "factor"),
Entry_Number_F = c(3170L, 3182L, 3169L, 3178L, 3180L, 3181L,
3097L, 3168L, 3164L, 3179L, 3171L, 3169L, 3170L, 3178L, 3097L,
3177L, 3168L, 3164L, 3179L, 3097L, 3164L, 3168L, 3169L, 3170L,
3171L, 3172L, 3173L, 3174L, 3175L, 3176L, 3097L, 3164L, 3168L,
3169L, 3170L, 3097L, 3156L, 3168L, 3169L, 3170L)), .Names = c("YEAR",
"Entry_Number_F"), row.names = c(181L, 182L, 183L, 184L, 186L,
196L, 199L, 202L, 204L, 768L, 3213L, 3948L, 3950L, 3954L, 3957L,
3958L, 3963L, 3964L, 3969L, 7836L, 7837L, 7838L, 7839L, 7840L,
7841L, 7842L, 7843L, 7844L, 7845L, 7846L, 10785L, 10786L, 10787L,
10788L, 10789L, 13679L, 13680L, 13681L, 13682L, 13683L), class = "data.frame")
期望的输出:
# Desired result: a 5 x 5 data frame of pairwise proportions with columns
# X2017..X2013 and row names 2017..2013; every diagonal value is 1.
structure(list(X2017 = c(1, 0.6666667, 0.4888889, 0.4888889,
0.4888889), X2016 = c(0.7317073, 1, 0.6585366, 0.6341463, 0.6097561
), X2015 = c(0.44, 0.54, 1, 0.68, 0.58), X2014 = c(0.468, 0.553,
0.723, 1, 0.702), X2013 = c(0.423, 0.481, 0.557, 0.6346, 1)), .Names = c("X2017",
"X2016", "X2015", "X2014", "X2013"), class = "data.frame", row.names = c(2017L,
2016L, 2015L, 2014L, 2013L))
我可以通过以下方式获得我想要的答案:
# List the distinct values of the YEAR column.
unique(df[["YEAR"]])
需要取子集的年份:
# Bind Year1..Year5 to the year labels, newest first.
invisible(list2env(
  setNames(
    as.list(c("2017", "2016", "2015", "2014", "2013")),
    paste0("Year", 1:5)
  ),
  envir = environment()
))
按年份分别取子集:
# One data frame per year: keep the rows whose YEAR matches, then drop the
# factor levels that no longer occur in the subset.
df.2017 <- droplevels(df[which(df$YEAR == "2017"), , drop = FALSE])
df.2016 <- droplevels(df[which(df$YEAR == "2016"), , drop = FALSE])
df.2015 <- droplevels(df[which(df$YEAR == "2015"), , drop = FALSE])
df.2014 <- droplevels(df[which(df$YEAR == "2014"), , drop = FALSE])
df.2013 <- droplevels(df[which(df$YEAR == "2013"), , drop = FALSE])
计算各年份之间共有条目所占的比例。
# For 2017 against each year in turn (2017, 2016, 2015, 2014, 2013):
# number of entries shared with that year divided by the number of distinct
# 2017 entries. Each year is compared exactly once.
length(intersect(df.2017$Entry_Number_F, df.2017$Entry_Number_F)) /
  length(unique(df.2017$Entry_Number_F))
length(intersect(df.2017$Entry_Number_F, df.2016$Entry_Number_F)) /
  length(unique(df.2017$Entry_Number_F))
length(intersect(df.2017$Entry_Number_F, df.2015$Entry_Number_F)) /
  length(unique(df.2017$Entry_Number_F))
length(intersect(df.2017$Entry_Number_F, df.2014$Entry_Number_F)) /
  length(unique(df.2017$Entry_Number_F))
length(intersect(df.2017$Entry_Number_F, df.2013$Entry_Number_F)) /
  length(unique(df.2017$Entry_Number_F))
这显然并不理想。我尝试过一些办法,例如下面的代码,以及类似此处链接example中的做法,但都无法通过简单的步骤得到我想要的对照表。
# Attempt 1 (needs dplyr): add to every row the number of distinct entries
# recorded for that row's year.
x <- df %>%
  group_by(YEAR) %>%
  mutate(count = n_distinct(Entry_Number_F))
# Attempt 2 (base R): collect the distinct entries of each year.
# Note that this overwrites the `x` produced by attempt 1.
x <- aggregate(Entry_Number_F ~ YEAR, df, unique)
任何人都可以提出直截了当的做法吗?
非常感谢!
编辑:
感谢大家的快速反馈。不幸的是,我认为以下建议都没有给出正确的答案。这可能是因为我的问题表述得不够清楚,和/或我的示例不够充分。
此数据集与我的实际数据更具可比性,但为了清楚起见,我删除了许多列。
# Second sample `df2`: same two columns as `df`, but only the years 2017 and
# 2016 occur, and entry numbers are repeated within a year.
df2 <- structure(list(YEAR = structure(c(5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L), .Label = c("2013", "2014", "2015", "2016",
"2017"), class = "factor"), Entry_Number_F = c(3170L, 3182L,
3169L, 3178L, 3169L, 3180L, 3180L, 3170L, 3182L, 3178L, 3170L,
3180L, 3178L, 3169L, 3182L, 3181L, 3181L, 3181L, 3097L, 3097L,
3097L, 3168L, 3168L, 3164L, 3168L, 3164L, 3164L, 3097L, 3164L,
3168L, 3169L, 3170L, 3178L, 3180L, 3181L, 3182L, 3180L, 3182L,
3180L, 3182L, 3169L, 3178L, 3170L, 3182L, 3169L, 3097L, 3178L,
3170L, 3181L, 3169L, 3170L, 3178L, 3181L, 3180L, 3181L, 3168L,
3164L, 3097L, 3097L, 3164L, 3168L, 3168L, 3164L, 3097L, 3164L,
3168L, 3169L, 3170L, 3178L, 3180L, 3181L, 3182L, 3182L, 3182L,
3178L, 3168L, 3170L, 3170L, 3178L, 3169L, 3181L, 3169L, 3097L,
3168L, 3182L, 3164L, 3097L, 3178L, 3169L, 3181L, 3180L, 3164L,
3181L, 3164L, 3097L, 3168L, 3180L, 3170L, 3180L, 3097L, 3164L,
3168L, 3169L, 3170L, 3178L, 3180L, 3181L, 3182L, 3170L, 3181L,
3182L, 3169L, 3180L, 3169L, 3169L, 3182L, 3170L, 3180L, 3182L,
3180L, 3170L, 3168L, 3181L, 3178L, 3097L, 3178L, 3168L, 3178L,
3164L, 3097L, 3097L, 3181L, 3164L, 3168L, 3164L, 3097L, 3164L,
3168L, 3169L, 3170L, 3178L, 3180L, 3181L, 3182L, 3170L, 3170L,
3169L, 3169L, 3164L, 3178L, 3178L, 3170L, 3097L, 3168L, 3177L,
3097L, 3178L, 3164L, 3168L, 3097L, 3164L, 3168L, 3169L, 3177L,
3179L, 3177L, 3179L, 3179L, 3097L, 3164L, 3168L, 3169L, 3170L,
3177L, 3178L, 3179L, 3169L, 3169L, 3178L, 3178L, 3170L, 3170L,
3170L, 3170L, 3178L, 3177L, 3169L, 3168L, 3168L, 3178L, 3097L,
3168L, 3177L, 3177L, 3164L, 3097L, 3097L, 3164L, 3177L, 3097L,
3164L, 3164L, 3179L, 3179L, 3179L, 3168L, 3169L, 3179L, 3170L,
3169L, 3170L, 3178L, 3169L, 3178L, 3169L, 3178L, 3097L, 3170L,
3097L, 3097L, 3177L, 3177L, 3177L, 3164L, 3164L, 3168L, 3168L,
3164L, 3168L, 3179L, 3179L, 3179L, 3097L, 3164L, 3168L, 3169L,
3170L, 3177L, 3178L, 3179L, 3169L, 3178L, 3170L, 3169L, 3097L,
3178L, 3170L, 3170L, 3169L, 3178L, 3168L, 3164L, 3177L, 3177L,
3097L, 3168L, 3168L, 3097L, 3177L, 3164L, 3164L, 3097L, 3164L,
3168L, 3169L, 3170L, 3177L, 3178L, 3179L)), .Names = c("YEAR",
"Entry_Number_F"), class = "data.frame", row.names = c(181L,
182L, 183L, 184L, 185L, 186L, 187L, 188L, 189L, 190L, 191L, 192L,
193L, 194L, 195L, 196L, 197L, 198L, 199L, 200L, 201L, 202L, 203L,
204L, 205L, 206L, 207L, 208L, 209L, 210L, 211L, 212L, 213L, 214L,
215L, 216L, 552L, 553L, 554L, 555L, 556L, 557L, 558L, 559L, 560L,
561L, 562L, 563L, 564L, 565L, 566L, 567L, 568L, 569L, 570L, 571L,
572L, 573L, 574L, 575L, 576L, 577L, 578L, 579L, 580L, 581L, 582L,
583L, 584L, 585L, 586L, 587L, 984L, 985L, 986L, 987L, 988L, 989L,
990L, 991L, 992L, 993L, 994L, 995L, 996L, 997L, 998L, 999L, 1000L,
1001L, 1002L, 1003L, 1004L, 1005L, 1006L, 1007L, 1008L, 1009L,
1010L, 1011L, 1012L, 1013L, 1014L, 1015L, 1016L, 1017L, 1018L,
1019L, 1357L, 1358L, 1359L, 1360L, 1361L, 1362L, 1363L, 1364L,
1365L, 1366L, 1367L, 1368L, 1369L, 1370L, 1371L, 1372L, 1373L,
1374L, 1375L, 1376L, 1377L, 1378L, 1379L, 1380L, 1381L, 1382L,
1383L, 1384L, 1385L, 1386L, 1387L, 1388L, 1389L, 1390L, 1391L,
1392L, 4139L, 4140L, 4141L, 4142L, 4143L, 4144L, 4145L, 4146L,
4147L, 4148L, 4149L, 4150L, 4151L, 4152L, 4153L, 4154L, 4155L,
4156L, 4157L, 4158L, 4159L, 4160L, 4161L, 4162L, 4163L, 4164L,
4165L, 4166L, 4167L, 4168L, 4169L, 4170L, 4444L, 4445L, 4446L,
4447L, 4448L, 4449L, 4450L, 4451L, 4452L, 4453L, 4454L, 4455L,
4456L, 4457L, 4458L, 4459L, 4460L, 4461L, 4462L, 4463L, 4464L,
4465L, 4466L, 4467L, 4468L, 4469L, 4470L, 4471L, 4472L, 4473L,
4474L, 4475L, 4968L, 4969L, 4970L, 4971L, 4972L, 4973L, 4974L,
4975L, 4976L, 4977L, 4978L, 4979L, 4980L, 4981L, 4982L, 4983L,
4984L, 4985L, 4986L, 4987L, 4988L, 4989L, 4990L, 4991L, 4992L,
4993L, 4994L, 4995L, 4996L, 4997L, 4998L, 4999L, 5409L, 5410L,
5411L, 5412L, 5413L, 5414L, 5415L, 5416L, 5417L, 5418L, 5419L,
5420L, 5421L, 5422L, 5423L, 5424L, 5425L, 5426L, 5427L, 5428L,
5429L, 5430L, 5431L, 5432L, 5433L, 5434L, 5435L, 5436L, 5437L
))
我们现在看到每个子集中的唯一条目。
# Build the per-year subsets of `df2` (unused factor levels dropped), then
# list the distinct entry numbers recorded in each year.
df2.2017 <- droplevels(subset(df2, YEAR == "2017"))
df2.2016 <- droplevels(subset(df2, YEAR == "2016"))
unique(df2.2017$Entry_Number_F)
unique(df2.2016$Entry_Number_F)
以上显示:
2017年有9个唯一条目,2016年有8个唯一条目,两者共有6个唯一条目。因此:6/9 = 0.6666667
以下内容也会返回此号码。
# Entries shared by 2017 and 2016, as a fraction of 2017's distinct entries.
length(intersect(df2.2017$Entry_Number_F, unique(df2.2016$Entry_Number_F))) /
  length(unique(df2.2017$Entry_Number_F))
这会让我的问题更清晰吗?由于空间限制,我无法提供更多数据,但如果我有更多年份,我将如何比较所有年份?
示例here,使用crossprod(table(stack(l)))似乎是一种可能性,但我不确定如何将每年的唯一条目列表放入列表中。
一位教R的同事说:"expand.grid、outer 和 lapply 可能是可行的办法。"但他没有时间详细说明。
答案 0 :(得分:3)
这并没有给出与你的期望输出相同的答案,但也许它才是你真正想要的。如果您想要转置后的结果,请使用 length(y) 代替 length(x);如果您希望年份以相反的顺序显示,请使用 s <- rev(s)。
# Proportion of the distinct values of `x` that also occur in `y`.
#
# x, y: vectors of entry identifiers; duplicates are ignored.
# Returns a single number: shared distinct values / distinct values of `x`
# (NaN when `x` is empty, because that is 0 / 0).
prop <- function(x, y) {
  # De-duplicate first so repeated entries cannot inflate the denominator.
  x <- unique(x)
  y <- unique(y)
  length(intersect(x, y)) / length(x)
}
# Entry numbers per year, taken from the de-duplicated rows of `df`,
# as a list named by year.
s <- local({
  distinct_rows <- unique(df)
  split(distinct_rows$Entry_Number_F, distinct_rows$YEAR)
})
# Apply `prop` to every ordered pair of years: the row's year is passed as
# `x`, the column's year as `y`.
outer(X = s, Y = s, FUN = Vectorize(prop))
结果为:
2013 2014 2015 2016 2017
2013 1.0000000 0.8000000 0.8000000 0.8000000 0.8000000
2014 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000
2015 0.3636364 0.4545455 1.0000000 0.4545455 0.5454545
2016 0.5000000 0.6250000 0.6250000 1.0000000 0.8750000
2017 0.3636364 0.4545455 0.5454545 0.6363636 1.0000000
答案 1 :(得分:0)
我也不确定您给出的期望输出是否是您真正想要的,但这是一个使用 dplyr 和 tidyr 的解决方案:
# Join `df` to itself on the entry number, count the joined rows for each
# (YEAR.x, YEAR.y) pair, divide by the number of rows `df` holds for YEAR.x,
# then spread YEAR.x across the columns (needs dplyr and tidyr).
df %>%
  left_join(df, by = "Entry_Number_F") %>%
  group_by(YEAR.x, YEAR.y) %>%
  summarise(newcol = n() / sum(df$YEAR == YEAR.x[1])) %>%
  spread(key = YEAR.x, value = newcol)
# A tibble: 5 x 6
# YEAR.y `2013` `2014` `2015` `2016` `2017`
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2013 1.00 0.800 0.364 0.500 0.364
# 2 2014 0.800 1.00 0.455 0.625 0.455
# 3 2015 0.800 1.00 1.00 0.625 0.545
# 4 2016 0.800 1.00 0.455 1.00 0.636
# 5 2017 0.800 1.00 0.545 0.875 1.00