使用其他数据框

时间:2016-12-16 16:05:50

标签: r dplyr tidyr tidyverse

如何按组完成缺失值?

我有一个带有recomendations和rank的df,当我没有至少4个时,我需要插入默认的recomendations。

输入示例:

library(tidyverse)

fixed_recomendations <- data.frame(recomendation_id = 50:54, name = paste("recomendation", 50:54, sep = "_"), stringsAsFactors = FALSE)

content_id <- c(1,1,2,rep(3, 6))
rank <- c(1, 2, 1, 1:6)
recomendation_id <- c(1:9)
name <- paste("recomendation", recomendation_id, sep = "_")
df <- data.frame(content_id, rank, recomendation_id, name, stringsAsFactors = FALSE)

# content_id rank recomendation_id  name
# 1         1     1                 recomendation_1
# 1         2     2                 recomendation_2
# 2         1     3                 recomendation_3
# 3         1     4                 recomendation_4
# 3         2     5                 recomendation_5
# 3         3     6                 recomendation_6
# 3         4     7                 recomendation_7
# 3         5     8                 recomendation_8
# 3         6     9                 recomendation_9

我尝试使用完整/填充但是它不尊重组,它也会削减等级范围之外的值。

df %>% 
  complete(content_id, rank = 1:4, 
           fill = list(
            recomendation_id = fixed_recomendations$recomendation_id,
            name = fixed_recomendations$name
            ))

# content_id  rank recomendation_id name
# 1           1      1               recomendation_1
# 1           2      2               recomendation_2
# 1           3     50              recomendation_50
# 1           4     51              recomendation_51
# 2           1      3               recomendation_3
# 2           2     52              recomendation_52
# 2           3     53              recomendation_53
# 2           4     54              recomendation_54
# 3           1      4               recomendation_4
# 3           2      5               recomendation_5
# 3           3      6               recomendation_6
# 3           4      7               recomendation_7

期望的输出:

# content_id  rank recomendation_id name
# 1           1      1               recomendation_1
# 1           2      2               recomendation_2
# 1           3     50              recomendation_50
# 1           4     51              recomendation_51
# 2           1      3               recomendation_3
# 2           2     50              recomendation_50
# 2           3     51              recomendation_51
# 2           4     52              recomendation_52
# 3           1      4               recomendation_4
# 3           2      5               recomendation_5
# 3           3      6               recomendation_6
# 3           4      7               recomendation_7
# 3           5      8               recomendation_8
# 3           6      9               recomendation_9

2 个答案:

答案 0 :(得分:1)

我没有使用你所使用的同一套软件包,但这是一个使用foreach的解决方案,我在几分钟内就开始了。我可以使用基本的for循环来完成它,但foreach中的.combine功能很有用,而且如果真正的问题很大,它可以并行化。

fixed_recomendations <- data.frame(recomendation_id = 50:54, name = paste("recomendation", 50:54, sep = "_"), stringsAsFactors = FALSE)

content_id <- c(1,1,2,rep(3, 6))
rank <- c(1, 2, 1, 1:6)
recomendation_id <- c(1:9)
name <- paste("recomendation", recomendation_id, sep = "_")
df <- data.frame(content_id, rank, recomendation_id, name, stringsAsFactors = FALSE)


tab <- table(content_id)
library(foreach)
defaults <- foreach (i = 1:length(tab), .combine = rbind) %do%{
  if (tab[i]<4){
    foreach( j = 1:(4-tab[i]), .combine = rbind) %do% {
      data.frame(names(tab)[i], j + tab[i], fixed_recomendations$recomendation_id[j], fixed_recomendations$name[j])
    }
  }
}
colnames(defaults) <- colnames(df)
df.new <- rbind(df, defaults)
df.new <- df.new[with(df.new, order(content_id, rank)),]
df.new

   content_id rank recomendation_id             name
1           1    1                1  recomendation_1
2           1    2                2  recomendation_2
12          1    3               50 recomendation_50
11          1    4               51 recomendation_51
3           2    1                3  recomendation_3
23          2    2               50 recomendation_50
21          2    3               51 recomendation_51
22          2    4               52 recomendation_52
4           3    1                4  recomendation_4
5           3    2                5  recomendation_5
6           3    3                6  recomendation_6
7           3    4                7  recomendation_7
8           3    5                8  recomendation_8
9           3    6                9  recomendation_9

答案 1 :(得分:0)

我不知道这是否是最好的方法,但我分两步完成解决,首先我使用完全创建NA值的NA值,然后我过滤NA值并使用split / map_df更新它组:

add_missing <- function(x){
  for(i in 1:nrow(x)){
    x[i,]$recomendation_id = fixed_recomendations[i,]$recomendation_id
    x[i,]$name = fixed_recomendations[i,]$name     
  }
  x
}

df_missing <- df %>% 
  complete(content_id, rank = 1:4) %>% 
  filter(is.na(recomendation_id)) %>% 
  split(.$content_id) %>% 
  map_df(add_missing)

rbind(df, df_missing) %>% arrange(content_id, rank)

>   content_id rank recomendation_id             name
1           1    1                1  recomendation_1
2           1    2                2  recomendation_2
3           1    3               50 recomendation_50
4           1    4               51 recomendation_51
5           2    1                3  recomendation_3
6           2    2               50 recomendation_50
7           2    3               51 recomendation_51
8           2    4               52 recomendation_52
9           3    1                4  recomendation_4
10          3    2                5  recomendation_5
11          3    3                6  recomendation_6
12          3    4                7  recomendation_7
13          3    5                8  recomendation_8
14          3    6                9  recomendation_9