合并含有缺失值的数据框

时间:2018-05-15 21:22:37

标签: r dataframe dplyr

我有几个数据框,其中包含来自同一项调查的数据,我想把它们合并起来进行分析。每个数据框都有各自独有的变量,另外还有两个所有数据框共有的变量(ID和Contest_no),分别记录受访者和竞赛编号(取值为1、2、3,因为同一问题向每位受访者问了三次)。

难点在于数据框缺少值:

# Three partial extracts of the same survey, sharing the key columns
# ID and Contest_no. DF1 holds all four respondent/contest pairs; DF2 has
# no row for (x1, 2) and DF3 has no row for (y2, 2).
DF1 <- data.frame(
  V1 = c("A", "B", "C", "D"),
  V2 = c("A", "B", "C", "D"),
  ID = c("x1", "x1", "y2", "y2"),
  Contest_no = c("1", "2", "1", "2"),
  stringsAsFactors = TRUE
)

DF2 <- data.frame(
  V3 = c("A", "C", "D"),
  V4 = c("A", "C", "D"),
  ID = c("x1", "y2", "y2"),
  Contest_no = c("1", "1", "2"),
  stringsAsFactors = TRUE
)

DF3 <- data.frame(
  V5 = c("A", "B", "C"),
  V6 = c("A", "B", "C"),
  ID = c("x1", "x1", "y2"),
  Contest_no = c("1", "2", "1"),
  stringsAsFactors = TRUE
)

因此,各数据框中的受访者ID和竞赛编号并不完全一致。我想按受访者ID和竞赛编号对数据进行匹配,使合并后的数据框如下所示:

# Desired result: one row per (ID, Contest_no) pair, with NA in V3/V4 and
# V5/V6 wherever a pair has no corresponding values.
DF_merged <- data.frame(
  V1 = c("A", "B", "C", "D"),
  V2 = c("A", "B", "C", "D"),
  V3 = c("A", NA, "C", "D"),
  V4 = c("A", NA, "C", "D"),
  V5 = c("A", "B", "C", NA),
  V6 = c("A", "B", "C", NA),
  ID = c("x1", "x1", "y2", "y2"),
  Contest_no = c("1", "2", "1", "2"),
  stringsAsFactors = TRUE
)

我以为full_join可以解决这个问题,但执行 `DF_merged <- full_join(DF1, DF2, DF3, by="ID")` 得到的结果毫无意义。

如何将这样的不同数据合并?

更新后的新示例(用于说明行数成倍增加的问题)。在此示例中,完全没有缺失值,两个数据框的行数也相同,但代码却使结果的行数成倍增加。首先是要合并的两个数据框:

DF1:

structure(list(ID = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L), .Label = c("EE1", "EE101", "EE102"), class = "factor"), 
    Contest_no = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 2L, 2L, 3L, 
    3L), Option = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"), 
    Chosen_option = c(0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
    0L, 1L), Combination = structure(c(5L, 5L, 6L, 6L, 4L, 4L, 
    2L, 2L, 1L, 1L, 3L, 3L), .Label = c("V133", "V181", "V234", 
    "V252", "V32", "V67"), class = "factor"), Attribute1 = structure(c(1L, 
    1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L), .Label = c("has strong ties to the government", 
    "has weak ties to the government"), class = "factor"), Attribute2 = structure(c(1L, 
    2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community", 
    "has weak ties to the local pastoralist community"), class = "factor"), 
    Attribute3 = structure(c(2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 
    2L, 1L, 1L, 2L), .Label = c("is poor", "is wealthy"), class = "factor"), 
    Attribute4 = structure(c(2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 
    1L, 2L, 2L, 2L), .Label = c("has attained a high level of formal education (for example university degree)", 
    "has not attained a high level of formal education (for example never went to school or only attended primary school)"
    ), class = "factor")), .Names = c("ID", "Contest_no", "Option", 
"Chosen_option", "Combination", "Attribute1", "Attribute2", "Attribute3", 
"Attribute4"), class = "data.frame", row.names = c(NA, -12L))

DF2:

structure(list(ID = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 
3L, 3L, 3L, 3L), .Label = c("EE1", "EE101", "EE102"), class = "factor"), 
    Contest_no = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 2L, 2L, 3L, 
    3L), Option = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"), 
    Chosen_option = c(1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 
    0L, 1L), Combination = structure(c(6L, 6L, 4L, 4L, 1L, 1L, 
    3L, 3L, 5L, 5L, 2L, 2L), .Label = c("V150", "V249", "V252", 
    "V29", "V56", "V77"), class = "factor"), Attribute1 = structure(c(2L, 
    2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("has strong ties to the government", 
    "has weak ties to the government"), class = "factor"), Attribute2 = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community", 
    "has weak ties to the local pastoralist community"), class = "factor"), 
    Attribute3 = structure(c(2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 
    2L, 1L, 1L, 2L), .Label = c("is poor", "is wealthy"), class = "factor"), 
    Attribute4 = structure(c(2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 
    1L, 1L, 2L, 2L), .Label = c("has attained a high level of formal education (for example university degree)", 
    "has not attained a high level of formal education (for example never went to school or only attended primary school)"
    ), class = "factor")), .Names = c("ID", "Contest_no", "Option", 
"Chosen_option", "Combination", "Attribute1", "Attribute2", "Attribute3", 
"Attribute4"), class = "data.frame", row.names = c(NA, -12L))

现在尝试合并这两个数据框,但失败了:

# NOTE(review): `df1`/`df2` are presumably the objects listed as DF1 and DF2
# above; R is case-sensitive, so confirm the names match.
# Every (ID, Contest_no) pair occurs twice in each input (once per Option),
# so joining on these two keys alone pairs each left row with two right rows:
# the two 12-row inputs yield the 24-row result shown below.
df_merge_attempt <- dplyr::full_join(df1, df2, by=c("ID","Contest_no"))

结果:

structure(list(ID = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L
), .Label = c("EE1", "EE101", "EE102"), class = "factor"), Contest_no = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 
2L, 2L, 2L, 3L, 3L, 3L, 3L), Option.x = structure(c(1L, 1L, 2L, 
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 
2L, 1L, 1L, 2L, 2L), .Label = c("Option1", "Option2"), class = "factor"), 
    Chosen_option.x = c(0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 
    1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L), 
    Combination.x = structure(c(5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 
    4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 
    3L), .Label = c("V133", "V181", "V234", "V252", "V32", "V67"
    ), class = "factor"), Attribute1.x = structure(c(1L, 1L, 
    1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 
    2L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("has strong ties to the government", 
    "has weak ties to the government"), class = "factor"), Attribute2.x = structure(c(1L, 
    1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 
    2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L), .Label = c("has strong ties to the local pastoralist community", 
    "has weak ties to the local pastoralist community"), class = "factor"), 
    Attribute3.x = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 
    1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
    2L), .Label = c("is poor", "is wealthy"), class = "factor"), 
    Attribute4.x = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
    2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
    2L), .Label = c("has attained a high level of formal education (for example university degree)", 
    "has not attained a high level of formal education (for example never went to school or only attended primary school)"
    ), class = "factor"), Option.y = structure(c(1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"), 
    Chosen_option.y = c(1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 
    1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L), 
    Combination.y = structure(c(6L, 6L, 6L, 6L, 4L, 4L, 4L, 4L, 
    1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 2L, 2L, 2L, 
    2L), .Label = c("V150", "V249", "V252", "V29", "V56", "V77"
    ), class = "factor"), Attribute1.y = structure(c(2L, 2L, 
    2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 2L, 1L, 2L, 1L), .Label = c("has strong ties to the government", 
    "has weak ties to the government"), class = "factor"), Attribute2.y = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community", 
    "has weak ties to the local pastoralist community"), class = "factor"), 
    Attribute3.y = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 
    2L), .Label = c("is poor", "is wealthy"), class = "factor"), 
    Attribute4.y = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
    1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 
    2L), .Label = c("has attained a high level of formal education (for example university degree)", 
    "has not attained a high level of formal education (for example never went to school or only attended primary school)"
    ), class = "factor")), class = "data.frame", row.names = c(NA, 
-24L), .Names = c("ID", "Contest_no", "Option.x", "Chosen_option.x", 
"Combination.x", "Attribute1.x", "Attribute2.x", "Attribute3.x", 
"Attribute4.x", "Option.y", "Chosen_option.y", "Combination.y", 
"Attribute1.y", "Attribute2.y", "Attribute3.y", "Attribute4.y"
))

1 个答案:

答案 0 :(得分:1)

您可以尝试使用dplyr::full_join,并指定参数 by=c("ID","Contest_no"):

library(dplyr)

df1 <- full_join(DF1, DF2, by=c("ID","Contest_no")) %>%
  full_join(DF3, by=c("ID","Contest_no"))

df1
#  V1 V2   V3   V4   V5   V6 ID Contest_no
#1  A  A    A    A    A    A x1          1
#2  B  B <NA> <NA>    B    B x1          2
#3  C  C    C    C    C    C y2          1
#4  D  D    D    D <NA> <NA> y2          2

已更新:已修改答案,在full_join中把另一列Option也作为连接键:

df1 <- full_join(DF1, DF2, by=c("ID","Contest_no", "Option"))

注意:为了得到预期的结果,我必须按照@Gregor的建议对数据做相应调整。