Question

我有两个数据集x和y

> x
   a index  b
1  1     1  5
2 NA     2  6
3  2     3 NA
4 NA     4  9
> y
  index   a
1     2 100
2     4 101
>

我想用y中包含的值填充x的缺失值。

我试过使用merge函数，但结果不是我想要的。

> merge(x,y, by = 'index', all=T)
  index a.x  b a.y
1     1   1  5  NA
2     2  NA  6 100
3     3   2 NA  NA
4     4  NA  9 101

在真正的问题中还有其他限制： 1 - y不会填充所有缺失的值 2 - x和y共同有更多变量（所以不仅是a和index）

编辑：更现实的例子

> x
   a index  b  c
1  1     1  5 NA
2 NA     2  6 NA
3  2     3 NA  5
4 NA     4  9 NA
5 NA     5 10  6
> y
  index   a  c
1     2 100  4
2     4 101 NA
>

使用Python或R的解决方案均可接受。

Answer 1

我使用了您的merge想法并使用dplyr执行了以下操作。我相信会有更好的方法来完成这项任务。

# Example data for the question: `ana` plays the role of x (the table with
# gaps) and `bob` the role of y (the table holding replacement values).
# Columns are built inline so no stray `index`/`a`/`b`/`c` vectors are left in
# the workspace and base `c` is not shadowed by a numeric vector.
ana <- data.frame(
  index = 1:5,
  a = c(1, NA, 2, NA, NA),
  b = c(5, 6, NA, 9, 10),
  c = c(NA, NA, 5, NA, 6),
  # Spelled out: `F` is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE
)

# `bob` covers only some index values and only some of ana's columns.
bob <- data.frame(
  index = c(2, 4),
  a = c(100, 101),
  c = c(4, NA),
  stringsAsFactors = FALSE
)

> ana
  index  a  b  c
1     1  1  5 NA
2     2 NA  6 NA
3     3  2 NA  5
4     4 NA  9 NA
5     5 NA 10  6

> bob
  index   a  c
1     2 100  4
2     4 101 NA

# Load dplyr explicitly: the pipeline below needs `%>%`, `mutate()` and
# `coalesce()`.
library(dplyr)

# Full outer merge on `index`; columns present in both tables come back as
# a.x / c.x (from ana) and a.y / c.y (from bob). coalesce() keeps ana's value
# and falls back to bob's value only where ana's is NA. Unlike ifelse(), it
# does not strip the column's attributes.
# a.y and c.y are intentionally left in the result; drop them afterwards if
# they are not needed.
ana %>%
  merge(bob, by = "index", all = TRUE) %>%
  mutate(
    a.x = coalesce(a.x, a.y),
    c.x = coalesce(c.x, c.y)
  )

  index a.x  b c.x a.y c.y
1     1   1  5  NA  NA  NA
2     2 100  6   4 100   4
3     3   2 NA   5  NA  NA
4     4 101  9  NA 101  NA
5     5  NA 10   6  NA  NA

我用mutate将a.x（ana$a）中的缺失值替换为a.y（bob$a）中的值，并对c.x（ana$c）做了同样的处理。最后再删除a.y和c.y这两列，就能得到您期望的结果。

Answer 2

尝试：

# Keep only the first two columns of x (`a` and `index` in the example) and
# merge them with y on the columns the two tables have in common.
xa <- x[, 1:2]
m1 <- merge(y, xa, all = TRUE)

# Where an index value appears more than once, keep only its first row.
m1 <- m1[!duplicated(m1$index), ]

# Copy b and c over from x, aligned on index. Note that this replaces any `c`
# values that came in from y with x's own values.
row_pos <- match(m1$index, x$index)
m1$b <- x$b[row_pos]
m1$c <- x$c[row_pos]

m1
  index   a  b  c
1     1   1  5 NA
2     2 100  6 NA
4     3   2 NA  5
5     4 101  9 NA
7     5  NA 10  6

或者，如果有许多其他列，如b和c：

# Same merge as above: first two columns of x joined with y, then one row
# kept per index value.
xa <- x[, 1:2]
m1 <- merge(y, xa, all = TRUE)
m1 <- m1[!duplicated(m1$index), ]

# Row positions in x for each index in m1; computed once because the loop
# below never changes m1$index.
row_pos <- match(m1$index, x$index)

# Copy the 3rd and 4th columns of x into m1 under the same names.
for (nn in names(x)[3:4]) {
  m1[[nn]] <- x[[nn]][row_pos]
}
m1
  index   a  b  c
1     1   1  5 NA
2     2 100  6 NA
4     3   2 NA  5
5     4 101  9 NA
7     5  NA 10  6

Answer 3

如果要替换多个列，可以尝试把（如前两种方法那样merge得到的）宽（wide）格式转换为长（long）格式，然后一步完成替换：

 # Full outer merge; columns shared by x and y come back as <name>.x / <name>.y.
 m1 <- merge(x, y, by = "index", all = TRUE)

 # Stack the suffixed columns into long form. `time` then holds the suffix and
 # the value columns get their bare names back.
 suffixed_cols <- grep("\\.", colnames(m1))
 long_df <- reshape(m1, idvar = "index", varying = suffixed_cols,
                    direction = "long", sep = ".")
 row.names(long_df) <- seq_len(nrow(long_df))

 # One data frame per suffix: the first holds x's values, the second y's.
 by_source <- split(long_df, long_df$time)
 from_x <- by_source[[1]]
 from_y <- by_source[[2]]

 # Columns 4:5 are the reshaped value columns (a and c in the example).
 # Wherever x's value is NA, take the value at the same position from y.
 filled <- from_x[, 4:5]
 gaps <- is.na(filled)
 filled[gaps] <- from_y[, 4:5][gaps]
 from_x[, 4:5] <- filled

 # Reorder the columns to a, index, b, c and show the result.
 res <- from_x[, c(4, 1, 2, 5)]
 res
 #    a index  b  c
 #1   1     1  5 NA
 #2 100     2  6  4
 #3   2     3 NA  5
 #4 101     4  9 NA
 #5  NA     5 10  6

或者您可以将dplyr与tidyr一起使用：

 library(dplyr)
 library(tidyr)

  # Join y's columns onto x by index (shared columns become <name>.x / <name>.y),
  # stack every suffixed column into Var/Val pairs, then split Var into
  # Var1 (the column name) and Var2 (the suffix, e.g. "a.x" -> "a", "x").
  z <- left_join(x, y, by="index")  %>% 
            gather(Var, Val, matches("\\.")) %>%
             separate(Var, c("Var1", "Var2"))
  # Positions of rows that carry x's value and are still missing.
  indx1 <- which(is.na(z$Val) & z$Var2=="x")
  # Fill each gap from the row exactly half the table further down.
  # NOTE(review): this assumes the stacked rows are laid out with all "x" rows
  # first and the matching "y" rows in the same order in the second half --
  # confirm against gather()'s stacking order for the joined column layout.
  z$Val[indx1] <- z$Val[indx1+nrow(z)/2]
  # Back to one column per variable, keeping only the rows built from x's
  # (now filled) values and dropping the suffix column.
  z %>%
    spread(Var1, Val) %>%
    filter(Var2=="x") %>% 
    select(-Var2)
  #  index  b   a  c
  #1     1  5   1 NA
  #2     2  6 100  4
  #3     3 NA   2  5
  #4     4  9 101 NA
  #5     5 10  NA  6

或者，先按“.”之前的部分匹配names，再对这些列名进行split，并使用sapply来替换各组列中的NA：

 indx <- grep("\\.", colnames(m1),value=TRUE)
 res <- cbind(m1[!names(m1) %in% indx],
              sapply(split(indx, gsub("\\..*", "", indx)), function(x) {
                x1 <- m1[x]
                indx1 <- is.na(x1[,1])
                x1[,1][indx1] <- x1[,2][indx1]
                x1[,1]} ))
 res
 #  index  b   a  c
 #1     1  5   1 NA
 #2     2  6 100  4
 #3     3 NA   2  5
 #4     4  9 101 NA
 #5     5 10  NA  6

{{1}}

使用新数据R-Python填充缺失值

3 个答案: