从A列中选择唯一值以及B列中的唯一值

时间:2017-07-02 16:18:55

标签: r dataframe

我在R中有一个data.frame:

user    hobby      user_profile_url
1       reading    "https://...user1"
1       dancing    "https://...user1"
2       dancing    "https://...user2"
2       gaming     "https://...user2"
3       gaming     "https://...user3"
4       cooking    "https://...user4"
4       singing    "https://...user4"
...

我试图为每个独特的爱好选择一个独特的用户(只要有可能)。

期望的输出:

user    hobby      user_profile_url
1       reading    "https://...user1"
2       dancing    "https://...user2"
3       gaming     "https://...user3"
4       cooking    "https://...user4"
4       singing    "https://...user4"
...

有谁知道怎么做?谢谢!

修改

我想我有一些有用的东西。

uniqueUserPerHobby <- function(df){
  vec1Arr <- c()
  vec2Arr <- c()
  used_id <- c()
  for (a_label in unique(df$hobby)){
    if (nrow(df[df$hobby==a_label,])==1) {
      used_id <- c(used_id, df[df$hobby==a_label,]$user )
      vec1Arr <- c(vec1Arr, df[df$hobby==a_label,]$user)
      vec2Arr <- c(vec2Arr, a_label)
    } else {
      i<-1
      df_multy <- df[df$hobby==a_label,]
      for (a_user in df_multy$user) {
        i<-i+1
        if (nrow(df[df$user==a_user,])==1) {
          used_id <- c(used_id, a_user)
          vec1Arr <- c(vec1Arr, a_user)
          vec2Arr <- c(vec2Arr, a_label)
          break
        } else if (i == length(df$user)) {
          if (! a_user %in% used_id){
            used_id <- c(used_id, a_user)
          }
          vec1Arr <- c(vec1Arr, a_user)
          vec2Arr <- c(vec2Arr, a_label)
        } else if (! a_user %in% used_id) {
          used_id <- c(used_id, a_user)
          vec1Arr <- c(vec1Arr, a_user)
          vec2Arr <- c(vec2Arr, a_label)
          break
        }
      }
      if (!a_label %in% vec2Arr){
        if (!df[df$hobby==a_label,]$user[1] %in% used_id){
          used_id <- c(used_id, df[df$hobby==a_label,]$user[1])
        }
        vec1Arr <- c(vec1Arr, df[df$hobby==a_label,]$user[1])
        vec2Arr <- c(vec2Arr, a_label)
      }
    }
  }
  new.df <- dplyr::left_join(data.frame(user=vec1Arr, hobby=vec2Arr, stringsAsFactors = F), df, by=c("user", "hobby"))
  return(new.df)
}

这可能不是最好的方法,但我在两个不同的data.frames上尝试了它并且它有效。

> dat <- data.frame(user  = c(1,1,2,2,3,4,4), hobby = c("reading","dancing","dancing","gaming","gaming","cooking","singing"), user_profile_url = c("https://...user1","https://...user1","https://...user2","https://...user2","https://...user3","https://...user4","https://...user4"), stringsAsFactors = F)

> uniqueUserPerHobby(dat)

  user   hobby user_profile_url
1    1 reading https://...user1
2    2 dancing https://...user2
3    3  gaming https://...user3
4    4 cooking https://...user4
5    4 singing https://...user4

2 个答案:

答案 0 :(得分:2)

Wooooweee并不容易。我用基础R做了它并且也做了一个功能。试一试:

您的数据:

dat <- data.frame(user  = c(1,1,2,2,3,4,4), hobby = c("reading","dancing","dancing","gaming","gaming","cooking","singing"), user_profile_url = c("https://...user1","https://...user1","https://...user2","https://...user2","https://...user3","https://...user4","https://...user4"), stringsAsFactors = F)

我的功能:

somewhatUnique <- function(df, colA, colB){
  uniq.df <- df[!duplicated(df[,c(colA, colB)]),]
  tb.uniq.df <- table(uniq.df[,c(colA, colB)])
  new.tb <- row(tb.uniq.df)
  new.tb[] <- rownames(tb.uniq.df)[new.tb]
  new.tb[tb.uniq.df == 0] <- NA
  j <- apply(new.tb, 2, as.list) # supply table columns as individual lists
  # expand.grid can take list arguments so we can handle dynamic unique hobbies
  combos <- expand.grid(lapply(j, function(x) do.call(rbind,x)), 
                        stringsAsFactors = F) # all possible options
  k <- combos[complete.cases(combos),] # options without NA
  s <- rep(NA,nrow(k)) # initialize vector
  for(i in 1:nrow(k)) s[i] <- length(unique(k[i,,drop = T]))
  L <- as.list(c(k[which.max(s),]))
  names(L) <- unique(df[,colB]) 
  # find position in split and return correct row
  by_B <- split(df, df[,colB])
  takerows <- as.list(c(mapply(function(x,y) match(x,y[,colA]), 
                               x = L, y = by_B)))
  out <- as.data.frame(t(mapply(function(z,r) z[r,], z = by_B, r = takerows)))
  out <- do.call(cbind.data.frame, lapply(out, unlist)) # formatting output correctly
  out <- out[order(out[,colA]),] # sorting by user
  rownames(out) <- NULL
  out
}    

使用示例:

somewhatUnique(dat, "user", "hobby") # all unique hobbies, maximizing unique users
#  user   hobby user_profile_url
#1    1 reading https://...user1
#2    2 dancing https://...user2
#3    3  gaming https://...user3
#4    4 cooking https://...user4
#5    4 singing https://...user4

somewhatUnique(dat, "hobby", "user") # all unique users, maximizing unique hobbies
#  user   hobby user_profile_url
#1    4 cooking https://...user4
#2    2 dancing https://...user2
#3    3  gaming https://...user3
#4    1 reading https://...user1

如果您有任何问题或疑问,请与我们联系!

答案 1 :(得分:1)

好吧,我试图让它尽可能一般,但是我只测试了你的数据,并且它会给你一些警告,但不是; t影响输出(基于当前数据)。 我无法保证它会一直在工作,但它应该让你开始。

从您的数据开始,我将其保存为tibble:

df <- tibble(user=c(1,1,2,2,3,4,4), hobby=c("reading","dancing","dancing","gaming","gaming","cooking","singing"),user_profile=c("user1","user1","user2","user2","user3","user4","user4"))

数据预处理
我只在一个单独的数据框中保存了唯一的useruser_profile。这将在最后使用:

up <- df %>% 
        group_by(user) %>% 
        summarise(user_profile=unique(user_profile))

   user user_profile
  <dbl>        <chr>
1     1        user1
2     2        user2
3     3        user3
4     4        user4

我重新格式化原始数据框,以便hobby为列:

library(tidyverse)
new <- df %>% 
         mutate(dummy=1) %>%
         complete(user,hobby) %>%
         group_by(user) %>%
         spread(.,hobby,dummy) %>%
         filter(!is.na(user_profile))

   user user_profile cooking dancing gaming reading singing
  <dbl>        <chr>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>
1     1        user1      NA       1     NA       1      NA
2     2        user2      NA       1      1      NA      NA
3     3        user3      NA      NA      1      NA      NA
4     4        user4       1      NA     NA      NA       1

大功能
然后我写了一个递归函数,它将输入数据帧和预期的输出数据帧作为参数。我试着给每一步注释:

complicated <- function(x,y) {
    # Find unique user:hobby pairs
    col.sum <- x %>% 
                 select(-user_profile) %>% 
                 colSums(.,na.rm=T)
    unique.col <- names(col.sum)[col.sum==1]

    # Format unique user:hobby pairs
    # Save as 2-column data frame: user, key
    unique.hobby <- x %>% 
                     select_(.dots = unique.col) %>% 
                     gather(key,value,unique.col) %>%
                     filter(value==1) %>%
                     select(-value)

    # Filter out (ie remove) unique user:hobby pairs
    restof.hobby <- x %>%
                     filter(!(user %in% unique(unique.hobby$user)))

    # Row-bind output data into single data frame
    y <- rbind(ungroup(y),ungroup(unique.hobby))

    # If all unique user:hobby pairs have not been found
    # Perform above operations again but with only subset of original data frame
    if (nrow(restof.hobby) > 0) {
         complicated( restof.hobby, y )
    } else {
         return(y)
    } 
}

数据处理
我初始化最终输出数据框:

final <- tibble(user=NA,key=NA)

函数调用:

complicated(new,final) %>% 
    filter(!is.na(user)) %>% 
    arrange(user) %>%
    full_join(.,up,by="user")

<强>输出

   user     key user_profile
  <dbl>   <chr>        <chr>
1     1 reading        user1
2     2 dancing        user2
3     3  gaming        user3
4     4 cooking        user4
5     4 singing        user4