我有一个记录关键字之间相似度的大型数据框,我想把它转换成包含所有成对相似度(每对关键字只出现一次)的格式。我目前的数据框如下:
> df
kwd1 kwd2 sim
1 a b 1
2 b a 1
3 c a 2
4 a c 2
我希望将其转换为如下格式的 data.frame(每对关键字只保留一行,原数据中没有出现的组合其相似度记为 0):
> df
kwd1 kwd2 sim
a b 1
a c 2
b c 0
感谢您的帮助!
到目前为止我的代码是:
# Example input: one row per directed keyword pair with its similarity score.
# The columns are named directly in the constructor.
df <- data.frame(
  kwd1 = c("a", "b", "c", "a"),
  kwd2 = c("b", "a", "a", "c"),
  sim = c(0.1, 0.1, 0.2, 0.2)
)
> dput(df)
structure(list(kwd1 = structure(c(1L, 2L, 3L, 1L), .Label = c("a",
"b", "c"), class = "factor"), kwd2 = structure(c(2L, 1L, 1L,
3L), .Label = c("a", "b", "c"), class = "factor"), sim = c(1,
1, 2, 2)), .Names = c("kwd1", "kwd2", "sim"), row.names = c(NA,
-4L), class = "data.frame")
答案 0 :(得分:2)
library(plyr)
# Base-R version (plyr is not needed for this snippet).
# Collect every keyword from either column as plain character strings, so the
# code works for factor or character input and for keywords of any length
# (not only single lowercase letters).
keys <- sort(unique(c(as.character(df$kwd1), as.character(df$kwd2))))
src <- data.frame(kwd1 = as.character(df$kwd1),
                  kwd2 = as.character(df$kwd2),
                  sim = df$sim,
                  stringsAsFactors = FALSE)
# Every ordered (kwd1, kwd2) combination; combinations that are missing from
# df get sim = NA. merge() sorts the rows by kwd1, then kwd2.
res <- merge(expand.grid(kwd1 = keys, kwd2 = keys, stringsAsFactors = FALSE),
             src, by = c("kwd1", "kwd2"), all.x = TRUE)
# A keyword paired with itself is not a pairwise similarity.
res <- res[res$kwd1 != res$kwd2, ]
# Store each pair in one canonical direction (kwd1 sorts before kwd2), so that
# "b a" and "a b" become the same row.
pos1 <- match(res$kwd1, keys)
pos2 <- match(res$kwd2, keys)
res$kwd1 <- keys[pmin(pos1, pos2)]
res$kwd2 <- keys[pmax(pos1, pos2)]
rownames(res) <- NULL
res1 <- res[!duplicated(res), ]
# If a pair was listed in only one direction, the other direction produced an
# NA placeholder row; drop it whenever a real value exists for the same pair.
pair_id <- paste(res1$kwd1, res1$kwd2, sep = "\r")
res1 <- res1[!(is.na(res1$sim) & pair_id %in% pair_id[!is.na(res1$sim)]), ]
# Present the pairs in sorted order (row names are kept as they were).
res1 <- res1[order(res1$kwd1, res1$kwd2), ]
> res1
kwd1 kwd2 sim
1 a b 0.1
2 a c 0.2
4 b c NA
你可以把它变成一个函数,以便更容易调用。
# Convert a table of directed keyword similarities into one row per unordered
# keyword pair.
#
# Arguments:
#   df    data.frame with columns `kwd1`, `kwd2` (character or factor) and
#         `sim`. A pair may be listed in one or both directions.
#   fill  value stored in `sim` for pairs that do not occur in `df`
#         (default NA; use 0 to treat missing pairs as "no similarity").
#
# Returns a data.frame with columns `kwd1`, `kwd2`, `sim`, where `kwd1` always
# sorts before `kwd2`, rows are ordered by (kwd1, kwd2), and the key columns
# are character vectors. Self-pairs (kwd1 == kwd2) are omitted. If the input
# lists different `sim` values for the same pair, each distinct value keeps
# its own row.
#
# Uses base R only, and works for keywords of any length (not only single
# lowercase letters).
convert_df <- function(df, fill = NA) {
  stopifnot(is.data.frame(df),
            all(c("kwd1", "kwd2", "sim") %in% names(df)))

  # Plain character copies: c() on factors can yield integer codes, and
  # comparing factors with different level sets is an error.
  src <- data.frame(kwd1 = as.character(df$kwd1),
                    kwd2 = as.character(df$kwd2),
                    sim = df$sim,
                    stringsAsFactors = FALSE)
  keys <- sort(unique(c(src$kwd1, src$kwd2)))

  # Fewer than two distinct keywords: there is no pair to report.
  if (length(keys) < 2L) {
    return(data.frame(kwd1 = character(0), kwd2 = character(0),
                      sim = numeric(0), stringsAsFactors = FALSE))
  }

  # Every ordered combination; combinations missing from df get sim = NA.
  res <- merge(expand.grid(kwd1 = keys, kwd2 = keys, stringsAsFactors = FALSE),
               src, by = c("kwd1", "kwd2"), all.x = TRUE)
  res <- res[res$kwd1 != res$kwd2, ]

  # Store each pair in one canonical direction so both directions collapse.
  pos1 <- match(res$kwd1, keys)
  pos2 <- match(res$kwd2, keys)
  res$kwd1 <- keys[pmin(pos1, pos2)]
  res$kwd2 <- keys[pmax(pos1, pos2)]
  rownames(res) <- NULL
  res <- res[!duplicated(res), ]

  # A pair listed in only one direction leaves an NA placeholder from the
  # other direction; drop it whenever a real value exists for that pair.
  pair_id <- paste(res$kwd1, res$kwd2, sep = "\r")
  has_value <- pair_id %in% pair_id[!is.na(res$sim)]
  res <- res[!(is.na(res$sim) & has_value), ]

  res <- res[order(res$kwd1, res$kwd2), ]
  res$sim[is.na(res$sim)] <- fill
  res
}
# Then simply run this to convert your actual data.frame.
# convert_df() returns the converted data.frame, so assign the result
# (e.g. `res1 <- convert_df(df)`) if you want to keep it.
convert_df(df)
答案 1 :(得分:0)
可能有更优雅的方式,但这是一种方法:
# collect every keyword that occurs in either column, as character strings,
# so that pairs are not lost when a keyword appears in only one column and
# factor columns with different level sets do not break the comparisons.
keys <- sort(unique(c(as.character(df$kwd1), as.character(df$kwd2))))
src <- data.frame(kwd1 = as.character(df$kwd1),
                  kwd2 = as.character(df$kwd2),
                  sim = df$sim,
                  stringsAsFactors = FALSE)
# make a data.frame with all possible combinations of kwd1 and kwd2.
# the ones that aren't in df are NA for sim.
k <- merge(expand.grid(kwd1 = keys, kwd2 = keys, stringsAsFactors = FALSE),
           src, by = c("kwd1", "kwd2"), all.x = TRUE)
# order the result to put the NA rows at the end, so that rows that are in df
# have priority in the following step.
k <- k[order(k$sim), ]
# store every pair in one direction (kwd1 sorts before kwd2), so that "b a"
# and "a b" are recognised as the same combo.
pos1 <- match(k$kwd1, keys)
pos2 <- match(k$kwd2, keys)
k$kwd1 <- keys[pmin(pos1, pos2)]
k$kwd2 <- keys[pmax(pos1, pos2)]
# remove all rows where the kwd1-kwd2 combo appears earlier in the data.frame
k <- k[!duplicated(k[c("kwd1", "kwd2")]), ]
# assuming you don't want the rows where kwd1 and kwd2 are the same, remove them.
k <- k[k$kwd1 != k$kwd2, ]
# set the NA values to 0 (only in sim; the key columns are left alone)
k$sim[is.na(k$sim)] <- 0
# sort the pairs and renumber the rows
k <- k[order(k$kwd1, k$kwd2), ]
rownames(k) <- NULL
# k for the example df:
#   kwd1 kwd2 sim
# 1    a    b 0.1
# 2    a    c 0.2
# 3    b    c 0.0