我在R中有一个data.frame:
user hobby user_profile_url
1 reading "https://...user1"
1 dancing "https://...user1"
2 dancing "https://...user2"
2 gaming "https://...user2"
3 gaming "https://...user3"
4 cooking "https://...user4"
4 singing "https://...user4"
...
我试图为每个独特的爱好选择一个独特的用户(只要有可能)。
期望的输出:
user hobby user_profile_url
1 reading "https://...user1"
2 dancing "https://...user2"
3 gaming "https://...user3"
4 cooking "https://...user4"
4 singing "https://...user4"
...
有谁知道怎么做?谢谢!
修改
我想我有一些有用的东西。
uniqueUserPerHobby <- function(df){
vec1Arr <- c()
vec2Arr <- c()
used_id <- c()
for (a_label in unique(df$hobby)){
if (nrow(df[df$hobby==a_label,])==1) {
used_id <- c(used_id, df[df$hobby==a_label,]$user )
vec1Arr <- c(vec1Arr, df[df$hobby==a_label,]$user)
vec2Arr <- c(vec2Arr, a_label)
} else {
i<-1
df_multy <- df[df$hobby==a_label,]
for (a_user in df_multy$user) {
i<-i+1
if (nrow(df[df$user==a_user,])==1) {
used_id <- c(used_id, a_user)
vec1Arr <- c(vec1Arr, a_user)
vec2Arr <- c(vec2Arr, a_label)
break
} else if (i == length(df$user)) {
if (! a_user %in% used_id){
used_id <- c(used_id, a_user)
}
vec1Arr <- c(vec1Arr, a_user)
vec2Arr <- c(vec2Arr, a_label)
} else if (! a_user %in% used_id) {
used_id <- c(used_id, a_user)
vec1Arr <- c(vec1Arr, a_user)
vec2Arr <- c(vec2Arr, a_label)
break
}
}
if (!a_label %in% vec2Arr){
if (!df[df$hobby==a_label,]$user[1] %in% used_id){
used_id <- c(used_id, df[df$hobby==a_label,]$user[1])
}
vec1Arr <- c(vec1Arr, df[df$hobby==a_label,]$user[1])
vec2Arr <- c(vec2Arr, a_label)
}
}
}
new.df <- dplyr::left_join(data.frame(user=vec1Arr, hobby=vec2Arr, stringsAsFactors = F), df, by=c("user", "hobby"))
return(new.df)
}
这可能不是最好的方法,但我在两个不同的data.frames上尝试了它并且它有效。
> dat <- data.frame(user = c(1,1,2,2,3,4,4), hobby = c("reading","dancing","dancing","gaming","gaming","cooking","singing"), user_profile_url = c("https://...user1","https://...user1","https://...user2","https://...user2","https://...user3","https://...user4","https://...user4"), stringsAsFactors = F)
> uniqueUserPerHobby(dat)
user hobby user_profile_url
1 1 reading https://...user1
2 2 dancing https://...user2
3 3 gaming https://...user3
4 4 cooking https://...user4
5 4 singing https://...user4
答案 0 :(得分:2)
Wooooweee并不容易。我用基础R做了它并且也做了一个功能。试一试:
您的数据:
dat <- data.frame(user = c(1,1,2,2,3,4,4), hobby = c("reading","dancing","dancing","gaming","gaming","cooking","singing"), user_profile_url = c("https://...user1","https://...user1","https://...user2","https://...user2","https://...user3","https://...user4","https://...user4"), stringsAsFactors = F)
我的功能:
somewhatUnique <- function(df, colA, colB){
uniq.df <- df[!duplicated(df[,c(colA, colB)]),]
tb.uniq.df <- table(uniq.df[,c(colA, colB)])
new.tb <- row(tb.uniq.df)
new.tb[] <- rownames(tb.uniq.df)[new.tb]
new.tb[tb.uniq.df == 0] <- NA
j <- apply(new.tb, 2, as.list) # supply table columns as individual lists
# expand.grid can take list arguments so we can handle dynamic unique hobbies
combos <- expand.grid(lapply(j, function(x) do.call(rbind,x)),
stringsAsFactors = F) # all possible options
k <- combos[complete.cases(combos),] # options without NA
s <- rep(NA,nrow(k)) # initialize vector
for(i in 1:nrow(k)) s[i] <- length(unique(k[i,,drop = T]))
L <- as.list(c(k[which.max(s),]))
names(L) <- unique(df[,colB])
# find position in split and return correct row
by_B <- split(df, df[,colB])
takerows <- as.list(c(mapply(function(x,y) match(x,y[,colA]),
x = L, y = by_B)))
out <- as.data.frame(t(mapply(function(z,r) z[r,], z = by_B, r = takerows)))
out <- do.call(cbind.data.frame, lapply(out, unlist)) # formatting output correctly
out <- out[order(out[,colA]),] # sorting by user
rownames(out) <- NULL
out
}
使用示例:
somewhatUnique(dat, "user", "hobby") # all unique hobbies, maximizing unique users
# user hobby user_profile_url
#1 1 reading https://...user1
#2 2 dancing https://...user2
#3 3 gaming https://...user3
#4 4 cooking https://...user4
#5 4 singing https://...user4
somewhatUnique(dat, "hobby", "user") # all unique users, maximizing unique hobbies
# user hobby user_profile_url
#1 4 cooking https://...user4
#2 2 dancing https://...user2
#3 3 gaming https://...user3
#4 1 reading https://...user1
如果您有任何问题或疑问,请与我们联系!
答案 1 :(得分:1)
好吧,我试图让它尽可能一般,但是我只测试了你的数据,并且它会给你一些警告,但不是; t影响输出(基于当前数据)。 我无法保证它会一直在工作,但它应该让你开始。
从您的数据开始,我将其保存为tibble:
df <- tibble(user=c(1,1,2,2,3,4,4), hobby=c("reading","dancing","dancing","gaming","gaming","cooking","singing"),user_profile=c("user1","user1","user2","user2","user3","user4","user4"))
数据预处理
我只在一个单独的数据框中保存了唯一的user
和user_profile
。这将在最后使用:
up <- df %>%
group_by(user) %>%
summarise(user_profile=unique(user_profile))
user user_profile
<dbl> <chr>
1 1 user1
2 2 user2
3 3 user3
4 4 user4
我重新格式化原始数据框,以便hobby
为列:
library(tidyverse)
new <- df %>%
mutate(dummy=1) %>%
complete(user,hobby) %>%
group_by(user) %>%
spread(.,hobby,dummy) %>%
filter(!is.na(user_profile))
user user_profile cooking dancing gaming reading singing
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 user1 NA 1 NA 1 NA
2 2 user2 NA 1 1 NA NA
3 3 user3 NA NA 1 NA NA
4 4 user4 1 NA NA NA 1
大功能
然后我写了一个递归函数,它将输入数据帧和预期的输出数据帧作为参数。我试着给每一步注释:
complicated <- function(x,y) {
# Find unique user:hobby pairs
col.sum <- x %>%
select(-user_profile) %>%
colSums(.,na.rm=T)
unique.col <- names(col.sum)[col.sum==1]
# Format unique user:hobby pairs
# Save as 2-column data frame: user, key
unique.hobby <- x %>%
select_(.dots = unique.col) %>%
gather(key,value,unique.col) %>%
filter(value==1) %>%
select(-value)
# Filter out (ie remove) unique user:hobby pairs
restof.hobby <- x %>%
filter(!(user %in% unique(unique.hobby$user)))
# Row-bind output data into single data frame
y <- rbind(ungroup(y),ungroup(unique.hobby))
# If all unique user:hobby pairs have not been found
# Perform above operations again but with only subset of original data frame
if (nrow(restof.hobby) > 0) {
complicated( restof.hobby, y )
} else {
return(y)
}
}
数据处理
我初始化最终输出数据框:
final <- tibble(user=NA,key=NA)
函数调用:
complicated(new,final) %>%
filter(!is.na(user)) %>%
arrange(user) %>%
full_join(.,up,by="user")
<强>输出强>
user key user_profile
<dbl> <chr> <chr>
1 1 reading user1
2 2 dancing user2
3 3 gaming user3
4 4 cooking user4
5 4 singing user4