如何按组补全缺失的行?
我有一个数据框 df,其中包含每个 content_id 的推荐(recomendation)及其排名(rank)。当某个 content_id 的推荐不足 4 条时,我需要按顺序插入默认推荐,把它补足到 4 条。
输入示例:
library(tidyverse)
# Default recommendations available for padding.
fixed_recomendations <- data.frame(
  recomendation_id = 50:54,
  name = paste0("recomendation_", 50:54),
  stringsAsFactors = FALSE
)

# Sample input: content 1 has two recommendations, content 2 has one and
# content 3 has six.
content_id <- rep(c(1, 2, 3), times = c(2, 1, 6))
rank <- c(1, 2, 1, seq_len(6))
recomendation_id <- seq_len(9)
name <- paste0("recomendation_", recomendation_id)
df <- data.frame(
  content_id = content_id,
  rank = rank,
  recomendation_id = recomendation_id,
  name = name,
  stringsAsFactors = FALSE
)
# content_id rank recomendation_id name
# 1 1 1 recomendation_1
# 1 2 2 recomendation_2
# 2 1 3 recomendation_3
# 3 1 4 recomendation_4
# 3 2 5 recomendation_5
# 3 3 6 recomendation_6
# 3 4 7 recomendation_7
# 3 5 8 recomendation_8
# 3 6 9 recomendation_9
我尝试使用 tidyr 的 complete() 及其 fill 参数,但它不会按组处理(默认推荐没有在每个 content_id 内从第一条重新开始分配),而且还会丢掉 rank 超出 1:4 范围的行。
# Attempt: ask complete() for every content_id x rank 1..4 combination and
# pass the default ids and names as the `fill` values for the new rows.
complete(
  df,
  content_id,
  rank = 1:4,
  fill = list(
    recomendation_id = fixed_recomendations[["recomendation_id"]],
    name = fixed_recomendations[["name"]]
  )
)
# content_id rank recomendation_id name
# 1 1 1 recomendation_1
# 1 2 2 recomendation_2
# 1 3 50 recomendation_50
# 1 4 51 recomendation_51
# 2 1 3 recomendation_3
# 2 2 52 recomendation_52
# 2 3 53 recomendation_53
# 2 4 54 recomendation_54
# 3 1 4 recomendation_4
# 3 2 5 recomendation_5
# 3 3 6 recomendation_6
# 3 4 7 recomendation_7
期望的输出:
# content_id rank recomendation_id name
# 1 1 1 recomendation_1
# 1 2 2 recomendation_2
# 1 3 50 recomendation_50
# 1 4 51 recomendation_51
# 2 1 3 recomendation_3
# 2 2 50 recomendation_50
# 2 3 51 recomendation_51
# 2 4 52 recomendation_52
# 3 1 4 recomendation_4
# 3 2 5 recomendation_5
# 3 3 6 recomendation_6
# 3 4 7 recomendation_7
# 3 5 8 recomendation_8
# 3 6 9 recomendation_9
答案 0 :(得分:1)
我没有使用你用的那套软件包,不过这里有一个基于 foreach 的解决方案,是我花几分钟写出来的。
用基本的 for 循环也能完成同样的事情,但 foreach 的 .combine 参数很方便,
而且如果实际问题的数据量很大,它还可以并行化。
# Default recommendations available for padding.
fixed_recomendations <- data.frame(
  recomendation_id = 50:54,
  name = sprintf("recomendation_%d", 50:54),
  stringsAsFactors = FALSE
)

# Sample input, identical to the data in the question.
content_id <- c(rep(1, 2), 2, rep(3, 6))
rank <- as.numeric(c(1:2, 1L, 1:6))
recomendation_id <- 1:9
name <- sprintf("recomendation_%d", recomendation_id)
df <- data.frame(
  content_id, rank, recomendation_id, name,
  stringsAsFactors = FALSE
)
# Number of recommendations each content_id already has. table() orders its
# cells like sort(unique(.)), so `ids` lines up with `tab` position by
# position while keeping content_id's original type. names(tab) is character,
# which would turn the whole content_id column into character after rbind()
# and make the final ordering lexical ("10" before "2").
tab <- table(df$content_id)
ids <- sort(unique(df$content_id))
# Every content_id should end up with at least this many rows. Must not
# exceed nrow(fixed_recomendations), otherwise the surplus rows are NA.
min_recs <- 4L
library(foreach)
# One data frame of default rows per short group, built in a single
# vectorized call. Groups that already have enough rows yield NULL, which
# the rbind() combine step drops.
defaults <- foreach(i = seq_along(tab), .combine = rbind) %do% {
  n_have <- tab[[i]]
  n_missing <- min_recs - n_have
  if (n_missing > 0) {
    pick <- seq_len(n_missing)
    data.frame(
      content_id = ids[i],
      rank = n_have + pick,
      recomendation_id = fixed_recomendations$recomendation_id[pick],
      name = fixed_recomendations$name[pick],
      stringsAsFactors = FALSE
    )
  }
}
# The columns are already named, so no colnames() patch is needed. rbind()
# ignores a NULL `defaults`, so this also works when no group is short.
df.new <- rbind(df, defaults)
df.new <- df.new[order(df.new$content_id, df.new$rank), ]
df.new
content_id rank recomendation_id name
1 1 1 1 recomendation_1
2 1 2 2 recomendation_2
12 1 3 50 recomendation_50
11 1 4 51 recomendation_51
3 2 1 3 recomendation_3
23 2 2 50 recomendation_50
21 2 3 51 recomendation_51
22 2 4 52 recomendation_52
4 3 1 4 recomendation_4
5 3 2 5 recomendation_5
6 3 3 6 recomendation_6
7 3 4 7 recomendation_7
8 3 5 8 recomendation_8
9 3 6 9 recomendation_9
答案 1 :(得分:0)
我不确定这是不是最好的方法,但我分两步解决了这个问题:首先用 complete() 生成缺失的行(这些行的值为 NA),然后筛选出这些 NA 行,并用 split / map_df 按组填入默认推荐:
#' Fill placeholder rows with the default recommendations, in order.
#'
#' Row i of `x` receives the `recomendation_id` and `name` of row i of
#' `defaults`. A zero-row `x` is returned unchanged.
#'
#' @param x Data frame holding the rows to fill (one group's placeholder
#'   rows); its `recomendation_id` and `name` columns are overwritten.
#' @param defaults Data frame with `recomendation_id` and `name` columns.
#'   Defaults to the `fixed_recomendations` object visible when the function
#'   is called, so existing one-argument calls keep working.
#' @return `x` with `recomendation_id` and `name` replaced. Rows beyond
#'   `nrow(defaults)` receive `NA`.
add_missing <- function(x, defaults = fixed_recomendations) {
  # seq_len() yields an empty index for a zero-row group, unlike 1:nrow(x).
  idx <- seq_len(nrow(x))
  x$recomendation_id <- defaults$recomendation_id[idx]
  x$name <- defaults$name[idx]
  x
}
# Step 1: expand to every content_id x rank 1..4 combination and keep only
# the rows whose recomendation_id is NA. Step 2: fill each content_id's NA
# rows via add_missing() and stack the per-group results.
df_missing <- local({
  gaps <- filter(
    complete(df, content_id, rank = 1:4),
    is.na(recomendation_id)
  )
  map_df(split(gaps, gaps$content_id), add_missing)
})

# Append the filled rows to the original data and sort by group and rank.
arrange(rbind(df, df_missing), content_id, rank)
> content_id rank recomendation_id name
1 1 1 1 recomendation_1
2 1 2 2 recomendation_2
3 1 3 50 recomendation_50
4 1 4 51 recomendation_51
5 2 1 3 recomendation_3
6 2 2 50 recomendation_50
7 2 3 51 recomendation_51
8 2 4 52 recomendation_52
9 3 1 4 recomendation_4
10 3 2 5 recomendation_5
11 3 3 6 recomendation_6
12 3 4 7 recomendation_7
13 3 5 8 recomendation_8
14 3 6 9 recomendation_9