我有一个像这样的数据框:
# Example data rebuilt from dput() output: a tibble with 11 rows, one row per
# (id, expertise) pair, covering ids "A" to "E".
# NOTE(review): the `spec` attribute (class "col_spec") is presumably left over
# from a readr import; the examples below only use the two columns.
df<-structure(list(id = c("A", "A", "A", "B", "B", "C", "C", "D",
"D", "E", "E"), expertise = c("r", "python", "julia", "python",
"r", "python", "julia", "python", "julia", "r", "julia")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -11L), .Names = c("id",
"expertise"), spec = structure(list(cols = structure(list(id = structure(list(), class = c("collector_character",
"collector")), expertise = structure(list(), class = c("collector_character",
"collector"))), .Names = c("id", "expertise")), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))
# Print the example data.
df
id expertise
1 A r
2 A python
3 A julia
4 B python
5 B r
6 C python
7 C julia
8 D python
9 D julia
10 E r
11 E julia
我可以使用以下方法获得“专业知识”的总数:
library(dplyr)
# Attach to every row the number of rows sharing its expertise value.
# The result keeps all 11 rows and stays grouped by expertise.
df %>%
  group_by(expertise) %>%
  mutate(counts_overall = n())
但是,我想要的是 expertise 取值两两组合的计数。换句话说,有多少个 id 同时拥有同一对专业知识,例如 "r" 和 "julia"?这是所需的输出:
# Desired result rebuilt from dput() output: a tibble with three rows, one per
# pair of expertise values (expertise1, expertise2), and an integer `count`
# column holding the number of ids that have both values.
df_out<-structure(list(expertise1 = c("r", "r", "python"), expertise2 = c("python",
"julia", "julia"), count = c(2L, 2L, 3L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -3L), .Names = c("expertise1",
"expertise2", "count"), spec = structure(list(cols = structure(list(
expertise1 = structure(list(), class = c("collector_character",
"collector")), expertise2 = structure(list(), class = c("collector_character",
"collector")), count = structure(list(), class = c("collector_integer",
"collector"))), .Names = c("expertise1", "expertise2", "count"
)), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))
# Print the desired output.
df_out
expertise1 expertise2 count
1 r python 2
2 r julia 2
3 python julia 3
答案 0 :(得分:4)
链接答案中 latemail 的评论给出了如下创建矩阵的方法:
# table(df) cross-tabulates id against expertise; `> 0` turns the counts into
# a logical presence matrix, and crossprod() (i.e. t(x) %*% x) then yields, for
# every pair of expertise values, the number of ids that have both.
# The diagonal holds the number of ids per single expertise value.
crossprod(table(df) > 0)
         expertise
expertise julia python r
   julia      4      3 2
   python     3      4 2
   r          2      2 3
OP希望使用长格式的数据帧。
下面是一个 data.table 解决方案,它使用了 CJ()(交叉联接)函数:
library(data.table)
# Count, for every unordered pair of expertise values, the ids having both.
#
# Step 1 (per id): cross-join the id's expertise values with themselves and
#   keep only rows with V1 < V2, so each unordered pair appears once.
#   - The CJ() columns are named explicitly: recent data.table versions
#     auto-name CJ() inputs from the argument symbols, so the default names
#     V1/V2 can no longer be relied on.
#   - unique = TRUE prevents a repeated (id, expertise) row from counting the
#     same pair more than once for one id.
# Step 2: count the rows (one per id) for each pair.
#
# Note: setDT() converts df to a data.table by reference, i.e. df is modified.
setDT(df)[
  , CJ(V1 = expertise, V2 = expertise, unique = TRUE)[V1 < V2], by = id
][
  , .N, by = .(expertise1 = V1, expertise2 = V2)
]
   expertise1 expertise2 N
1:      julia     python 3
2:      julia          r 2
3:     python          r 2
CJ(expertise, expertise)[V1 < V2] 是 t(combn(df$expertise, 2)) 或 combinat::combn2(df$expertise) 在 data.table 中的等效写法。
这是另一个使用自联接(self-join)的变体:
library(data.table)
# Self-join variant: joining df to itself on id pairs every row with every
# row of the same id.
# allow.cartesian is spelled out in full rather than relying on partial
# argument matching (`allow = TRUE`); it is required because the join returns
# more rows than nrow(x) + nrow(i), which data.table otherwise rejects.
# The filter `expertise < i.expertise` keeps each unordered pair once, and .N
# counts the matching rows per pair.
# Note: setDT() converts df to a data.table by reference, i.e. df is modified.
setDT(df)[df, on = "id", allow.cartesian = TRUE][
  expertise < i.expertise,
  .N,
  by = .(expertise1 = expertise, expertise2 = i.expertise)
]
答案 1 :(得分:2)
下面这种解决方案不如 crossprod/table 方法高效,但更易于理解:
library(tidyr)
# Count, for every unordered pair of expertise values, the ids having both.
# NOTE: the pair is encoded as "a_b" and split again at the end, so this
# assumes expertise values themselves contain no underscore.
df %>%
  # A repeated (id, expertise) row must not count a pair twice for one id.
  distinct(id, expertise) %>%
  group_by(id) %>%
  # combn(x, 2) errors when x has fewer than two elements, so ids with a
  # single expertise value (which cannot form a pair) are dropped first.
  filter(n() >= 2) %>%
  # Sorting makes the pair key order-independent: always "julia_python",
  # never "python_julia".
  summarize(
    expertise = list(
      combn(sort(expertise), 2, FUN = paste, collapse = "_")
    )
  ) %>%
  unnest(expertise) %>%
  group_by(expertise) %>%
  summarize(count = n()) %>%
  separate(expertise, c("expertise1", "expertise2"), sep = "_")
# # A tibble: 3 x 3
# expertise1 expertise2 count
# <chr> <chr> <int>
# 1 julia python 3
# 2 julia r 2
# 3 python r 2