test_data <- data.frame(event= c("event1","event2","event3","event4","event5","event6","event7"),
rater1_1 = c("red", "orange", "red", "purple", "orange", "red", "yellow"),
rater2_1 = c("red", "orange", "orange", "purple", "orange", "red", "purple"),
rater3_1 = c("red", "red", "yellow", "purple", "orange", "red", "yellow"),
rater4_1 = c("orange", "orange", "blue", "orange", "orange", "red", "purple"),
rater5_1 = c("blue", "blue", "purple", "orange", "orange", "blue", "yellow")
)
利用上述数据,我试图创建一个混淆矩阵,在该矩阵中我可以观察到每个事件的所有评估者之间的分歧。也就是说,对于事件1,3个评分者给出了“红色”,1个给出了“橙色”,1个给出了“蓝色”。
我相信解决此问题的最佳方法是进行每个评估者对比较(y轴上的rater1和x轴上的rater2),然后在所有评估者对之间进行迭代和计算。我希望找到类似以下内容的东西:
red orange blue yellow purple
red 22 6 2 3 2
orange 6 13 1 4 1
blue 2 1 10 3 1
yellow 3 4 3 9 2
purple 2 1 1 2 9
(注意:这些值是由人为组成的,我没有手动计算以上值)
我什至不知道从哪里开始。我搜索的大多数混淆矩阵都是将实际模型输出与预测模型输出(例如,link)进行比较。任何建议将不胜感激。
答案 0 :(得分:3)
对于此解决方案,我使用的是dplyr
和purrr
软件包
library(dplyr)
library(purrr)
# convert to long format
df_long <- test_data %>% pivot_longer(-event)
# df_long
# # A tibble: 35 x 3
# event name value
# <fct> <chr> <fct>
# 1 event1 rater1_1 red
# 2 event1 rater2_1 red
# 3 event1 rater3_1 red
# 4 event1 rater4_1 orange
# 5 event1 rater5_1 blue
# 6 event2 rater1_1 orange
# 7 event2 rater2_1 orange
# 8 event2 rater3_1 red
# 9 event2 rater4_1 orange
#10 event2 rater5_1 blue
# # ... with 25 more rows
# create function to compute the confusion matrix for two given events
create_confusion_matrix <- function(raters){
df_long %>% filter(name %in% raters) %>%
pivot_wider(names_from=name,values_from=value) %>%
select(-event) %>%
table()
}
# lets try this function with rater1_1 and rater2_1
create_confusion_matrix(c('rater1_1','rater2_1'))
# rater2_1
#rater1_1 orange purple red yellow blue
# orange 2 0 0 0 0
# purple 0 1 0 0 0
# red 1 0 2 0 0
# yellow 0 1 0 0 0
# blue 0 0 0 0 0
# now we need to get all combinations of two raters
raters2 <- combn(unique(df_long$name),2,simplify=FALSE)
# raters2 is a list, each element is a vector containing 2 raters
# loop over the list and apply create_confusion_matrix for each element
result_list <- map(raters2,create_confusion_matrix)
# result_list is a list, each element is a confusion matrix
#we can them sum all theses tables
contingency <- Reduce('+',result_list)
# rater2_1
#rater1_1 orange purple red yellow blue
# orange 14 1 2 1 5
# purple 6 4 0 3 0
# red 5 1 9 1 9
# yellow 0 4 0 3 1
# blue 0 1 0 0 0
# getting rid of rater1_1 and rater2_1 in dimnames
dimnames(contingency) <- list(dimnames(contingency)[[1]],dimnames(contingency)[[2]])
# orange purple red yellow blue
#orange 14 1 2 1 5
#purple 6 4 0 3 0
#red 5 1 9 1 9
#yellow 0 4 0 3 1
#blue 0 1 0 0 0
# sum symmetric cells and make contingency table lower triangular
# first lets extract the diagonal
# diag is needed twice, first to extract the diagonal from contingency as a vector
# second to convert this vector to a diagonal matrix
diag_contingency <- diag(diag(contingency))
# sum lower and upper matrices by adding the transposed matrix
# and substracting the diagonal (otherwise added twice)
contingency <- contingency + t(contingency) - diag_contingency
# we know have a symmetrical matrix
# orange purple red yellow blue
#orange 14 7 7 1 5
#purple 7 4 1 7 1
#red 7 1 9 1 9
#yellow 1 7 1 3 1
#blue 5 1 9 1 0
# set the upper triangular matrix to 0
contingency[upper.tri(contingency)] <- 0
# we get this matrix in the end
contingency
# orange purple red yellow blue
#orange 14 0 0 0 0
#purple 7 4 0 0 0
#red 7 1 9 0 0
#yellow 1 7 1 3 0
#blue 5 1 9 1 0