按“ID”将数据帧变量中的NA值替换为另一个数据帧中的值

时间:2013-10-25 15:27:55

标签: r merge dataframe

我想知道是否有一种更简洁的方法来替换数据帧中变量的NA值,而不是我在下面所做的。下面的代码似乎比在R中实际需要的更冗长。例如,可能存在我不了解的软件包/工具,可以更简洁地完成此操作。

有没有办法仅在值为NA时才替换或合并值?在使用all.x = T合并两个数据帧后,我有一些NA值,我想用另一个数据框中的信息替换它们,使用公共变量来链接替换。

# Build the three example data frames ----

# breaks: fallback Value for each Break id, used where bps has no value.
breaks <- data.frame(
  Break = 1:11,
  Value = c(2L, 13L, 7L, 9L, 40L, 21L, 10L, 37L, 7L, 26L, 42L)
)

# fsites: every Site/Plot combination together with its Break id.
fsites <- data.frame(
  Site = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L),
  Plot = c(0L, 1L, 2L, 3L, 4L, 0L, 1L, 2L, 0L, 1L, 2L, 3L, 4L, 5L),
  Break = c(1L, 5L, 7L, 8L, 11L, 1L, 6L, 11L, 1L, 4L, 6L, 8L, 9L, 11L)
)

# bps: Value for a subset of the Site/Plot combinations.
bps <- data.frame(
  Site = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L),
  Plot = c(0L, 1L, 2L, 3L, 1L, 2L, 0L, 1L, 2L, 3L, 4L),
  Value = c(
    0.393309653, 0.12465733, 0.27380161, 0.027288989, 0.439712533,
    0.289724079, 0.036429062, 0.577460008, 0.820375917, 0.323217357,
    0.28637503
  )
)

# Merge and fill ----

# Left join of bps onto fsites: Site/Plot rows of fsites that have no match
# in bps are kept and get Value = NA.
df1 <- merge(fsites, bps, by = c("Site", "Plot"), all.x = TRUE)

# Join the fallback values by "Break". Both inputs carry a "Value" column, so
# merge() suffixes them: Value.x (from df1, may be NA) and Value.y (from
# breaks).
df2 <- merge(df1, breaks, by = "Break")

# Create a new column "Value" that uses Value.x, unless NA, then Value.y.
df3 <- df2
df2.na <- is.na(df2$Value.x)
df3$Value <- df2$Value.x
df3$Value[df2.na] <- df2$Value.y[df2.na]

# Drop the helper columns. Columns are selected by name rather than by
# position so the selection does not depend on the column order produced by
# merge().
cols <- c("Break", "Site", "Plot", "Value")
df4 <- df3[, cols]

1 个答案:

答案 0 :(得分:5)

在只有`breaks`、`fsites`、`bps`和`df1`存在的阶段:

# Fill only the missing entries of df1$Value with the breaks$Value whose
# Break id equals the row's Break; match() returns the row position in breaks.
# Indexed assignment is used instead of ifelse() so that the lookup runs only
# for the NA rows and the column keeps its type however many values are
# missing (ifelse() returns logical(0) for an empty input and an integer
# vector when every row is NA).
na_rows <- is.na(df1$Value)
df1$Value[na_rows] <- breaks$Value[match(df1$Break[na_rows], breaks$Break)]

#> df1
#   Site Plot Break       Value
#1     1    0     1  0.39330965
#2     1    1     5  0.12465733
#3     1    2     7  0.27380161
#4     1    3     8  0.02728899
#5     1    4    11 42.00000000
#6     2    0     1  2.00000000
#7     2    1     6  0.43971253
#8     2    2    11  0.28972408
#9     3    0     1  0.03642906
#10    3    1     4  0.57746001
#11    3    2     6  0.82037592
#12    3    3     8  0.32321736
#13    3    4     9  0.28637503
#14    3    5    11 42.00000000

#just to test with your `df4`
> sort(df1$Value) == sort(df4$Value)
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE