我有一个数据集
# Sample data: three rows. `universityies` holds ";"-separated university
# labels (a label may repeat within a row); every row has a `papers` value of
# 1 and its own `cited` value.
universityies <- c(
  "UNI.1;UNI.1;UNI.2;UNI.3",
  "UNI.5",
  "UNI.3;UNI.4"
)
papers <- rep(1, times = 3)
cited <- c(10, 5, 20)
df <- data.frame(universityies, papers, cited)
df
我想得到类似下面的结果:
#total papers total cited
#UNI.1 1 10
#UNI.2 1 10
#UNI.3 2 30
#UNI.4 1 20
#UNI.5 1 5
在此先感谢
答案 0 :(得分:2)
我们可以按 ";" 拆分数据,去除重复的行,再按 universityies 分组(group_by),统计不重复的论文数和引用总数。
library(dplyr)

# Tag every source row with an id, explode the ";"-separated universities into
# one row each, drop repeats of the same university within a source row (the
# id column keeps different source rows distinct), then count distinct source
# rows and sum citations per university.
df %>%
  mutate(row = seq_len(n())) %>%
  tidyr::separate_rows(universityies, sep = ";") %>%
  distinct() %>%
  group_by(universityies) %>%
  summarize(
    total_papers = n_distinct(row),
    total_cited = sum(cited)
  )
# universityies total_papers total_cited
# <chr> <int> <dbl>
#1 UNI.1 1 10
#2 UNI.2 1 10
#3 UNI.3 2 30
#4 UNI.4 1 20
#5 UNI.5 1 5
答案 1 :(得分:1)
您可以先使用 strsplit 拆分,然后使用 aggregate 汇总。
# Expand `df` to one row per (source row, university) pair and total the
# `papers` and `cited` columns per university.
#
# Repeated universities are dropped within each source row only. Calling
# unique() on the expanded table would also merge two different source rows
# that share the same university, papers and cited values, undercounting both
# totals. Columns are read directly from `df` rather than through apply(),
# which would first coerce the whole data frame to a character matrix.
parts <- lapply(
  strsplit(as.character(df$universityies), ";", fixed = TRUE),
  unique
)
tmp <- data.frame(
  universityies = unlist(parts, use.names = FALSE),
  papers = rep(df$papers, times = lengths(parts)),
  cited = rep(df$cited, times = lengths(parts)),
  stringsAsFactors = FALSE
)
res <- aggregate(
  cbind(total.papers = papers, total.cited = cited) ~ universityies,
  data = tmp,
  FUN = sum
)
# Present the universities in alphabetical order.
res[order(res$universityies), ]
# universityies total.papers total.cited
# 1 UNI.1 1 10
# 2 UNI.2 1 10
# 3 UNI.3 2 30
# 4 UNI.4 1 20
# 5 UNI.5 1 5
答案 2 :(得分:1)
我们可以使用 splitstackshape 包中的 cSplit,并结合 data.table 的方法
library(data.table)
library(splitstackshape)

# as.data.table() works on a copy, so `df` stays a plain data.frame; setDT()
# would convert it in place and add an `rn` column for any code that uses `df`
# afterwards. `rn` holds the original row names and identifies the source row.
# After splitting to long form, unique() drops a university repeated within one
# source row; the totals are then computed per university.
unique(
  cSplit(as.data.table(df, keep.rownames = TRUE), "universityies", ";", "long")
)[
  ,
  .(total_papers = uniqueN(rn), total_cited = sum(cited)),
  by = .(universityies)
]
# universityies total_papers total_cited
#1: UNI.1 1 10
#2: UNI.2 1 10
#3: UNI.3 2 30
#4: UNI.5 1 5
#5: UNI.4 1 20