我有两个数据框:
df1 <- data.frame(id = c("LABEL1", "LABEL2", "LABEL3", "LABEL4", "LABEL5", "LABEL6"),matrix(1:60,6,10))
df1[c(4:6), c(2:4)] = NA
df2 = data.frame(id = c( "LABEL3", "LABEL4", "LABEL5", "LABEL6"),matrix(seq(100,10000, length.out = 32),4,8))
我想使用键值=&#39; id&#39;来查找DF2中DF1的缺失值。这是所需的输出: enter image description here
以下是我尝试的方法: 1. merge:但我得到X1:X3的重复列。 2.匹配:
df1[,2]= df2[,2][match(df1$id, df2$id)]
但我会在DF1中获得标签3。 3.从qdap包中查找:
library(qdap)
apply(df1, 2, lookup, df2)
与方法2相同的结果。
谢谢!
答案 0 :(得分:2)
您可以使用tidyr
处理整洁的数据格式,然后使用dplyr
来合并表格
library(dplyr)
library(tidyr)
以管道
的一种方式df1 %>%
mutate(id = as.character(id)) %>%
gather(key = "col", value = "val", -id) %>%
left_join(df2 %>%
mutate(id = as.character(id)) %>%
gather(key = "col", value = "val", -id),
by =c("id", "col")) %>%
transmute(id, col, val = ifelse(is.na(val.x), val.y, val.x)) %>%
spread(col, val) %>%
select(id, num_range("X", 1:10))
#> id X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
#> 1 LABEL1 1.0000 7.000 13.000 19 25 31 37 43 49 55
#> 2 LABEL2 2.0000 8.000 14.000 20 26 32 38 44 50 56
#> 3 LABEL3 3.0000 9.000 15.000 21 27 33 39 45 51 57
#> 4 LABEL4 419.3548 1696.774 2974.194 22 28 34 40 46 52 58
#> 5 LABEL5 738.7097 2016.129 3293.548 23 29 35 41 47 53 59
#> 6 LABEL6 1058.0645 2335.484 3612.903 24 30 36 42 48 54 60
逐步解释
# id as character instead of factor
df1 <- df1 %>% mutate(id = as.character(id))
# tidy data
df1 <- df1 %>% gather(key = "col", value = "val", -id)
# print result as dplyr tbl
df1 %>% as.tbl()
#> Source: local data frame [60 x 3]
#>
#> id col val
#> (chr) (chr) (int)
#> 1 LABEL1 X1 1
#> 2 LABEL2 X1 2
#> 3 LABEL3 X1 3
#> 4 LABEL4 X1 NA
#> 5 LABEL5 X1 NA
#> 6 LABEL6 X1 NA
#> 7 LABEL1 X2 7
#> 8 LABEL2 X2 8
#> 9 LABEL3 X2 9
#> 10 LABEL4 X2 NA
#> .. ... ... ...
# idem on df2
df2 <- df2 %>%
mutate(id = as.character(id)) %>%
tidyr::gather(key = "col", value = "val", -id)
# print result as dplyr tbl
df2 %>% as.tbl()
#> Source: local data frame [32 x 3]
#>
#> id col val
#> (chr) (chr) (dbl)
#> 1 LABEL3 X1 100.0000
#> 2 LABEL4 X1 419.3548
#> 3 LABEL5 X1 738.7097
#> 4 LABEL6 X1 1058.0645
#> 5 LABEL3 X2 1377.4194
#> 6 LABEL4 X2 1696.7742
#> 7 LABEL5 X2 2016.1290
#> 8 LABEL6 X2 2335.4839
#> 9 LABEL3 X3 2654.8387
#> 10 LABEL4 X3 2974.1935
#> .. ... ... ...
# join only id and col level of df1 with df2
new.df <- left_join(df1, df2, by = c("id", "col"))
# print result as dplyr tbl
new.df %>% as.tbl()
#> Source: local data frame [60 x 4]
#>
#> id col val.x val.y
#> (chr) (chr) (int) (dbl)
#> 1 LABEL1 X1 1 NA
#> 2 LABEL2 X1 2 NA
#> 3 LABEL3 X1 3 100.0000
#> 4 LABEL4 X1 NA 419.3548
#> 5 LABEL5 X1 NA 738.7097
#> 6 LABEL6 X1 NA 1058.0645
#> 7 LABEL1 X2 7 NA
#> 8 LABEL2 X2 8 NA
#> 9 LABEL3 X2 9 1377.4194
#> 10 LABEL4 X2 NA 1696.7742
#> .. ... ... ... ...
#replace NA in col val.x from df1 by value val.y of df2
# and only keep id, col and new column val
new.df <- new.df %>% transmute(id, col, val = ifelse(is.na(val.x), val.y, val.x))
new.df %>% as.tbl()
#> Source: local data frame [60 x 3]
#>
#> id col val
#> (chr) (chr) (dbl)
#> 1 LABEL1 X1 1.0000
#> 2 LABEL2 X1 2.0000
#> 3 LABEL3 X1 3.0000
#> 4 LABEL4 X1 419.3548
#> 5 LABEL5 X1 738.7097
#> 6 LABEL6 X1 1058.0645
#> 7 LABEL1 X2 7.0000
#> 8 LABEL2 X2 8.0000
#> 9 LABEL3 X2 9.0000
#> 10 LABEL4 X2 1696.7742
#> .. ... ... ...
# put back data in wide format
new.df %>%
spread(col, val) %>%
select(id, num_range("X", 1:10)) # put column in same order as df1
#> id X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
#> 1 LABEL1 1.0000 7.000 13.000 19 25 31 37 43 49 55
#> 2 LABEL2 2.0000 8.000 14.000 20 26 32 38 44 50 56
#> 3 LABEL3 3.0000 9.000 15.000 21 27 33 39 45 51 57
#> 4 LABEL4 419.3548 1696.774 2974.194 22 28 34 40 46 52 58
#> 5 LABEL5 738.7097 2016.129 3293.548 23 29 35 41 47 53 59
#> 6 LABEL6 1058.0645 2335.484 3612.903 24 30 36 42 48 54 60
答案 1 :(得分:0)
可能有人有比我更好的方法,但这应该有效。这确实假设列的顺序是相同的,所以要小心。
row.matches = match(df1$id, df2$id)
nas = which(is.na(df1), arr.ind = TRUE)
replacements = nas
replacements[ ,1] = row.matches[nas[ ,1]]
df1[nas] = df2[replacements]
基本上,我所做的就是找到匹配的行和df1
中出现NA的索引。使用匹配的行向量替换这些NA索引的行索引,并将df1
中的这些值替换为df2
中的相应值。