在R中,如何在类别组合之间轻松返回常见唯一元素的比例?

时间:2018-04-27 16:06:06

标签: r

我有大型分类数据集,并希望快速生成互惠表,比较类别之间常见元素的比例。

输入示例:

df <- structure(list(YEAR = structure(c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 
1L), .Label = c("2013", "2014", "2015", "2016", "2017"), class = "factor"), 
    Entry_Number_F = c(3170L, 3182L, 3169L, 3178L, 3180L, 3181L, 
    3097L, 3168L, 3164L, 3179L, 3171L, 3169L, 3170L, 3178L, 3097L, 
    3177L, 3168L, 3164L, 3179L, 3097L, 3164L, 3168L, 3169L, 3170L, 
    3171L, 3172L, 3173L, 3174L, 3175L, 3176L, 3097L, 3164L, 3168L, 
    3169L, 3170L, 3097L, 3156L, 3168L, 3169L, 3170L)), .Names = c("YEAR", 
"Entry_Number_F"), row.names = c(181L, 182L, 183L, 184L, 186L, 
196L, 199L, 202L, 204L, 768L, 3213L, 3948L, 3950L, 3954L, 3957L, 
3958L, 3963L, 3964L, 3969L, 7836L, 7837L, 7838L, 7839L, 7840L, 
7841L, 7842L, 7843L, 7844L, 7845L, 7846L, 10785L, 10786L, 10787L, 
10788L, 10789L, 13679L, 13680L, 13681L, 13682L, 13683L), class = "data.frame")

期望的输出:

structure(list(X2017 = c(1, 0.6666667, 0.4888889, 0.4888889, 
0.4888889), X2016 = c(0.7317073, 1, 0.6585366, 0.6341463, 0.6097561
), X2015 = c(0.44, 0.54, 1, 0.68, 0.58), X2014 = c(0.468, 0.553, 
0.723, 1, 0.702), X2013 = c(0.423, 0.481, 0.557, 0.6346, 1)), .Names = c("X2017", 
"X2016", "X2015", "X2014", "X2013"), class = "data.frame", row.names = c(2017L, 
2016L, 2015L, 2014L, 2013L))

我可以通过以下方式获得我想要的答案:

unique(df$YEAR)

这些年子集。

Year1 <- "2017"
Year2 <- "2016"
Year3 <- "2015"
Year4 <- "2014"
Year5 <- "2013"

每年的子集。

df.2017 <- droplevels(subset(df, YEAR=="2017"))
df.2016 <- droplevels(subset(df, YEAR=="2016"))
df.2015 <- droplevels(subset(df, YEAR=="2015"))
df.2014 <- droplevels(subset(df, YEAR=="2014"))
df.2013 <- droplevels(subset(df, YEAR=="2013"))

查找年份之间常见条目的比例。

length(Reduce(intersect, 
              list(df.2017$Entry_Number_F, 
                   df.2017$Entry_Number_F
              )))/length(unique(df.2017$Entry_Number_F))

length(Reduce(intersect, 
              list(df.2017$Entry_Number_F, 
                   df.2016$Entry_Number_F
              )))/length(unique(df.2017$Entry_Number_F))

length(Reduce(intersect, 
              list(df.2017$Entry_Number_F, 
                   df.2015$Entry_Number_F
              )))/length(unique(df.2017$Entry_Number_F))

length(Reduce(intersect, 
              list(df.2017$Entry_Number_F, 
                   df.2015$Entry_Number_F
              )))/length(unique(df.2017$Entry_Number_F))

length(Reduce(intersect, 
              list(df.2017$Entry_Number_F, 
                   df.2014$Entry_Number_F
              )))/length(unique(df.2017$Entry_Number_F))

length(Reduce(intersect, 
              list(df.2017$Entry_Number_F, 
                   df.2013$Entry_Number_F
              )))/length(unique(df.2017$Entry_Number_F))

这显然不是理想的。我已尝试过一些事情,例如下面的代码,以及类似于此处链接example的内容,但它并没有通过简单的步骤向我提供互惠表。

x <- df %>% group_by(YEAR) %>% mutate(count = n_distinct(Entry_Number_F))

x <- aggregate(Entry_Number_F ~ YEAR, Data.input, function(x) unique(x))

任何人都可以提出直截了当的做法吗?

非常感谢!

编辑:

我很欣赏快速反馈。不幸的是,以下建议都没有回复正确的答案,我不这么认为。这可能是因为我没有让我的问题足够清楚和/或我的例子不够充分。

此数据集与我的实际数据更具可比性,但为了清楚起见,我删除了许多列。

df2 <- structure(list(YEAR = structure(c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L), .Label = c("2013", "2014", "2015", "2016", 
"2017"), class = "factor"), Entry_Number_F = c(3170L, 3182L, 
3169L, 3178L, 3169L, 3180L, 3180L, 3170L, 3182L, 3178L, 3170L, 
3180L, 3178L, 3169L, 3182L, 3181L, 3181L, 3181L, 3097L, 3097L, 
3097L, 3168L, 3168L, 3164L, 3168L, 3164L, 3164L, 3097L, 3164L, 
3168L, 3169L, 3170L, 3178L, 3180L, 3181L, 3182L, 3180L, 3182L, 
3180L, 3182L, 3169L, 3178L, 3170L, 3182L, 3169L, 3097L, 3178L, 
3170L, 3181L, 3169L, 3170L, 3178L, 3181L, 3180L, 3181L, 3168L, 
3164L, 3097L, 3097L, 3164L, 3168L, 3168L, 3164L, 3097L, 3164L, 
3168L, 3169L, 3170L, 3178L, 3180L, 3181L, 3182L, 3182L, 3182L, 
3178L, 3168L, 3170L, 3170L, 3178L, 3169L, 3181L, 3169L, 3097L, 
3168L, 3182L, 3164L, 3097L, 3178L, 3169L, 3181L, 3180L, 3164L, 
3181L, 3164L, 3097L, 3168L, 3180L, 3170L, 3180L, 3097L, 3164L, 
3168L, 3169L, 3170L, 3178L, 3180L, 3181L, 3182L, 3170L, 3181L, 
3182L, 3169L, 3180L, 3169L, 3169L, 3182L, 3170L, 3180L, 3182L, 
3180L, 3170L, 3168L, 3181L, 3178L, 3097L, 3178L, 3168L, 3178L, 
3164L, 3097L, 3097L, 3181L, 3164L, 3168L, 3164L, 3097L, 3164L, 
3168L, 3169L, 3170L, 3178L, 3180L, 3181L, 3182L, 3170L, 3170L, 
3169L, 3169L, 3164L, 3178L, 3178L, 3170L, 3097L, 3168L, 3177L, 
3097L, 3178L, 3164L, 3168L, 3097L, 3164L, 3168L, 3169L, 3177L, 
3179L, 3177L, 3179L, 3179L, 3097L, 3164L, 3168L, 3169L, 3170L, 
3177L, 3178L, 3179L, 3169L, 3169L, 3178L, 3178L, 3170L, 3170L, 
3170L, 3170L, 3178L, 3177L, 3169L, 3168L, 3168L, 3178L, 3097L, 
3168L, 3177L, 3177L, 3164L, 3097L, 3097L, 3164L, 3177L, 3097L, 
3164L, 3164L, 3179L, 3179L, 3179L, 3168L, 3169L, 3179L, 3170L, 
3169L, 3170L, 3178L, 3169L, 3178L, 3169L, 3178L, 3097L, 3170L, 
3097L, 3097L, 3177L, 3177L, 3177L, 3164L, 3164L, 3168L, 3168L, 
3164L, 3168L, 3179L, 3179L, 3179L, 3097L, 3164L, 3168L, 3169L, 
3170L, 3177L, 3178L, 3179L, 3169L, 3178L, 3170L, 3169L, 3097L, 
3178L, 3170L, 3170L, 3169L, 3178L, 3168L, 3164L, 3177L, 3177L, 
3097L, 3168L, 3168L, 3097L, 3177L, 3164L, 3164L, 3097L, 3164L, 
3168L, 3169L, 3170L, 3177L, 3178L, 3179L)), .Names = c("YEAR", 
"Entry_Number_F"), class = "data.frame", row.names = c(181L, 
182L, 183L, 184L, 185L, 186L, 187L, 188L, 189L, 190L, 191L, 192L, 
193L, 194L, 195L, 196L, 197L, 198L, 199L, 200L, 201L, 202L, 203L, 
204L, 205L, 206L, 207L, 208L, 209L, 210L, 211L, 212L, 213L, 214L, 
215L, 216L, 552L, 553L, 554L, 555L, 556L, 557L, 558L, 559L, 560L, 
561L, 562L, 563L, 564L, 565L, 566L, 567L, 568L, 569L, 570L, 571L, 
572L, 573L, 574L, 575L, 576L, 577L, 578L, 579L, 580L, 581L, 582L, 
583L, 584L, 585L, 586L, 587L, 984L, 985L, 986L, 987L, 988L, 989L, 
990L, 991L, 992L, 993L, 994L, 995L, 996L, 997L, 998L, 999L, 1000L, 
1001L, 1002L, 1003L, 1004L, 1005L, 1006L, 1007L, 1008L, 1009L, 
1010L, 1011L, 1012L, 1013L, 1014L, 1015L, 1016L, 1017L, 1018L, 
1019L, 1357L, 1358L, 1359L, 1360L, 1361L, 1362L, 1363L, 1364L, 
1365L, 1366L, 1367L, 1368L, 1369L, 1370L, 1371L, 1372L, 1373L, 
1374L, 1375L, 1376L, 1377L, 1378L, 1379L, 1380L, 1381L, 1382L, 
1383L, 1384L, 1385L, 1386L, 1387L, 1388L, 1389L, 1390L, 1391L, 
1392L, 4139L, 4140L, 4141L, 4142L, 4143L, 4144L, 4145L, 4146L, 
4147L, 4148L, 4149L, 4150L, 4151L, 4152L, 4153L, 4154L, 4155L, 
4156L, 4157L, 4158L, 4159L, 4160L, 4161L, 4162L, 4163L, 4164L, 
4165L, 4166L, 4167L, 4168L, 4169L, 4170L, 4444L, 4445L, 4446L, 
4447L, 4448L, 4449L, 4450L, 4451L, 4452L, 4453L, 4454L, 4455L, 
4456L, 4457L, 4458L, 4459L, 4460L, 4461L, 4462L, 4463L, 4464L, 
4465L, 4466L, 4467L, 4468L, 4469L, 4470L, 4471L, 4472L, 4473L, 
4474L, 4475L, 4968L, 4969L, 4970L, 4971L, 4972L, 4973L, 4974L, 
4975L, 4976L, 4977L, 4978L, 4979L, 4980L, 4981L, 4982L, 4983L, 
4984L, 4985L, 4986L, 4987L, 4988L, 4989L, 4990L, 4991L, 4992L, 
4993L, 4994L, 4995L, 4996L, 4997L, 4998L, 4999L, 5409L, 5410L, 
5411L, 5412L, 5413L, 5414L, 5415L, 5416L, 5417L, 5418L, 5419L, 
5420L, 5421L, 5422L, 5423L, 5424L, 5425L, 5426L, 5427L, 5428L, 
5429L, 5430L, 5431L, 5432L, 5433L, 5434L, 5435L, 5436L, 5437L
))

我们现在看到每个子集中的唯一条目。

unique(df2.2017$Entry_Number_F)
unique(df2.2016$Entry_Number_F)

以上显示:

2017年有9个独特的条目 2016年有8个独特的条目 他们共享6个独特的条目。 因此:6/11 = 0.6666667

以下内容也会返回此号码。

length(Reduce(intersect, list(df2.2017$Entry_Number_F, unique(df2.2016$Entry_Number_F))))/length(unique(df2.2017$Entry_Number_F))

这会让我的问题更清晰吗?由于空间限制,我无法提供更多数据,但如果我有更多年份,我将如何比较所有年份?

示例here,使用crossprod(table(stack(l)))似乎是一种可能性,但我不确定如何将每年的唯一条目列表放入列表中。

一位教R的同事说,&gt; expand.grid,外部和lapply可能是要走的路。&#34;但他没有时间详细说明。

2 个答案:

答案 0 :(得分:3)

这并没有给出与你的输出相同的答案,但也许它真的是你想要的。如果您想要移调,请使用length(y)代替length(x),如果您希望年份以相反的顺序显示,请使用s <- rev(s)

prop <- function(x, y) length(intersect(x, y)) / length(x)
s <- with(unique(df), split(Entry_Number_F, YEAR))
outer(s, s, Vectorize(prop))

,并提供:

          2013      2014      2015      2016      2017
2013 1.0000000 0.8000000 0.8000000 0.8000000 0.8000000
2014 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000
2015 0.3636364 0.4545455 1.0000000 0.4545455 0.5454545
2016 0.5000000 0.6250000 0.6250000 1.0000000 0.8750000
2017 0.3636364 0.4545455 0.5454545 0.6363636 1.0000000

答案 1 :(得分:0)

我也不确定您所需的输出是否是您真正想要的,但这是一个使用dplyrtidyr的解决方案:

left_join(df, df, by = "Entry_Number_F") %>% 
    group_by(YEAR.x, YEAR.y) %>% 
    summarise(newcol = n() / sum(YEAR.x[1] == df$YEAR)) %>% 
    spread(YEAR.x, newcol)

# A tibble: 5 x 6
#   YEAR.y `2013` `2014` `2015` `2016` `2017`
#   <fct>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
# 1 2013    1.00   0.800  0.364  0.500  0.364
# 2 2014    0.800  1.00   0.455  0.625  0.455
# 3 2015    0.800  1.00   1.00   0.625  0.545
# 4 2016    0.800  1.00   0.455  1.00   0.636
# 5 2017    0.800  1.00   0.545  0.875  1.00