R:自动合并 merge 之后产生的带后缀的变量

时间:2015-07-11 08:52:12

标签: r join merge

我正在寻找一种方法:在合并数据框之后,把变量名相同但带有不同后缀(.x 和 .y)的列的值合并到一列中。例如:'name.x' 和 'name.y' 将被合并为一个名为 'name' 的列,其中 'name.x' 的值优先于 'name.y' 的值;只有当 'name.x' 没有值时,才使用 'name.y' 的值。我希望对数据框中所有带后缀的列执行此操作。

这是我正在寻找的一个例子:

# Example input: an ID column plus one pair of merge-suffixed columns.
# Empty strings and the literal text "NA" stand in for missing entries.
df <- data.frame(
  ID = as.numeric(1:5),
  variable1.x = c("5.0", "", "7.9", "NA", "12"),
  variable1.y = c("1.5", "3.0", "", "8.9", "3.9")
)

ID variable1.x variable1.y
1  5.0         1.5
2              3.0
3  7.9
4  NA          8.9
5  12          3.9

预期产出:

ID variable1
1  5.0
2  3.0
3  7.9
4  8.9
5  12

我的数据样本如下:

# Sample data in dput()-style form: a data.frame literal with 10 rows whose
# columns (see .Names at the end) are five unsuffixed identifier/date columns
# followed by the `.x`-suffixed and then the `.y`-suffixed columns.
# NOTE(review): the expression is not assigned to a name here; the "Update"
# code further down appears to refer to this data as `df2` - confirm.
structure(list(cikcode = c(20, 20, 20, 20, 20, 20, 20, 20, 20, 
20), yearendeddate = structure(c(3L, 3L, 1L, 1L, 2L, 2L, 4L, 
4L, 5L, 5L), .Label = c("2000-12-30", "2001-12-29", "2002-12-28", 
"2004-01-03", "2005-01-01"), class = "factor"), source = structure(c(1L, 
3L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 3L), .Label = c("10-K", "10-K405", 
"DEF 14A"), class = "factor"), sourcedate = structure(c(5L, 6L, 
1L, 2L, 3L, 4L, 7L, 8L, 9L, 10L), .Label = c("2001-03-26", "2001-03-28", 
"2002-03-20", "2002-03-25", "2003-03-27", "2003-03-31", "2004-04-01", 
"2004-04-06", "2005-03-31", "2005-04-04"), class = "factor"), 
financialsdate = structure(c(3L, 3L, 1L, 1L, 2L, 2L, 4L, 
4L, 5L, 5L), .Label = c("2000-12-30", "2001-12-29", "2002-12-28", 
"2004-01-03", "2005-01-01"), class = "factor"), ticker.x =        structure(c(1L, 
NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA), .Label = "", class = "factor"), 
statecode.x = structure(c(1L, NA, 1L, NA, 1L, NA, 1L, NA, 
1L, NA), .Label = "NJ", class = "factor"), statename.x = structure(c(1L, 
NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA), .Label = "NEW JERSEY", class =   "factor"), 
siccode.x = c(3823, NA, 3823, NA, 3823, NA, 3823, NA, 3823, 
NA), naicscode.x = c(334513, NA, 334513, NA, 334513, NA, 
334513, NA, 334513, NA), auditor.x = structure(c(3L, NA, 
1L, NA, 1L, NA, 2L, NA, 2L, NA), .Label = c("Arthur Andersen LLP", 
"Grant Thornton LLP", "KPMG LLP"), class = "factor"), auditfees.x =   structure(c(5L, 
NA, 3L, NA, 1L, NA, 4L, NA, 2L, NA), .Label = c("185,000", 
"225,000", "200,000", "137,100", "123,700"), class = "factor"), 
revenue.x = structure(c(3L, NA, 1L, NA, 4L, NA, 5L, NA, 2L, 
NA), .Label = c("84,912,000", "112,494,000", "68,231,000", 
"71,819,000", "94,676,000"), class = "factor"), earnings.x = structure(c(3L, 
NA, 4L, NA, 2L, NA, 1L, NA, 1L, NA), .Label = c("", "1,048,000", 
"3,284,000", "5,838,000"), class = "factor"), assets.x = structure(c(2L, 
NA, 3L, NA, 1L, NA, 4L, NA, 5L, NA), .Label = c("47,644,000", 
"50,459,000", "54,421,000", "83,081,000", "93,016,000"), class = "factor"), 
ticker.y = c(NA, "", NA, "", NA, "", NA, "", NA, ""), statecode.y = c(NA, 
"NJ", NA, "NJ", NA, "NJ", NA, "NJ", NA, "NJ"), statename.y = c(NA, 
"NEW JERSEY", NA, "NEW JERSEY", NA, "NEW JERSEY", NA, "NEW JERSEY", 
NA, "NEW JERSEY"), siccode.y = c(NA, 3823, NA, 3823, NA, 
3823, NA, 3823, NA, 3823), naicscode.y = c(NA, "334513", 
NA, "334513", NA, "334513", NA, "334513", NA, "334513"), 
auditor.y = c(NA, "KPMG LLP", NA, "Arthur Andersen LLP", 
NA, "Arthur Andersen LLP", NA, "Grant Thornton LLP", NA, 
"Grant Thornton LLP"), auditfees.y = c(NA, "123,700", NA, 
"200,000", NA, "185,000", NA, "137,100", NA, "225,000"), 
revenue.y = c(NA, "68,231,000", NA, "84,912,000", NA, "71,819,000", 
NA, "94,676,000", NA, "112,494,000"), earnings.y = c(NA, 
"3,284,000", NA, "5,838,000", NA, "1,048,000", NA, "", NA, 
""), assets.y = c(NA, "50,459,000", NA, "54,421,000", NA, 
"47,644,000", NA, "83,081,000", NA, "93,016,000")), .Names = c("cikcode", 
"yearendeddate", "source", "sourcedate", "financialsdate", "ticker.x", 
"statecode.x", "statename.x", "siccode.x", "naicscode.x", "auditor.x", 
"auditfees.x", "revenue.x", "earnings.x", "assets.x", "ticker.y", 
"statecode.y", "statename.y", "siccode.y", "naicscode.y", "auditor.y", 
"auditfees.y", "revenue.y", "earnings.y", "assets.y"), row.names = c(NA, 
10L), class = "data.frame")

2 个答案:

答案 0 :(得分:4)

根据所示的示例,我认为你想要的是pmax。如果需要保留'variable1.x'中不为''或NA的值,可以使用ifelse。在示例中,"NA"是字符串,并不是真正的NA;NA值不需要加引号。如果是真正的NA,我们可以使用is.na(df[,2])。

# Merge the suffixed pair into one numeric column: take column 2 (the `.x`
# column) unless it is missing, in which case fall back to column 3 (`.y`).
# "Missing" covers the empty string, the literal text "NA" used in the
# example, and a real NA (%in% matches NA against NA, so it is caught too).
data.frame(df[1], variable = ifelse(df[, 2] %in% c("", "NA", NA),
    as.numeric(as.character(df[, 3])), as.numeric(as.character(df[, 2]))),
    stringsAsFactors = FALSE)
#   ID variable
#1  1      5.0
#2  2      3.0
#3  3      7.9
#4  4      8.9
#5  5       12

如果有多列,例如'df1'(这里我使用了真正的NA),我们先用sub去掉后缀部分(即'.x'、'.y'),再按去掉后缀后的名称对'variable'列进行split。使用lapply循环遍历列表元素,并像上面那样用ifelse得到结果。另外请注意,这里我通过stringsAsFactors=FALSE创建的是'character'列。

# Base name of every non-ID column: strip only a trailing merge suffix
# (.x / .y), so dots elsewhere in a column name are left alone.
nm1 <- sub("\\.(x|y)$", "", names(df1)[-1])
df2 <- data.frame(ID = df1$ID)
# split() returns its groups in factor-level order, which for a plain
# character vector is sorted order. Fixing the levels to unique(nm1) keeps
# the groups in the same order as the target column names, so each merged
# column is stored under its own name.
df2[unique(nm1)] <- lapply(
  split(names(df1)[-1], factor(nm1, levels = unique(nm1))),
  function(x) {
    x1 <- df1[x]
    # Prefer the first column of the pair (.x); where it is NA or the
    # empty string, use the second column (.y) instead.
    as.numeric(ifelse(is.na(x1[, 1]) | x1[, 1] == "", x1[, 2], x1[, 1]))
  }
)
df2
#   ID variable1 variable2
#1  1       5.0       4.2
#2  2       3.0       3.5
#3  3       7.9       3.2
#4  4       8.9       1.2
#5  5      12.0       4.0

我们也可以不使用ifelse,而是通过行/列(row/column)索引来完成此操作:
  # Same merge as the ifelse() version, but each value is picked with a
  # (row, column) index matrix: column 1 (.x) normally, column 2 (.y) where
  # the .x value is NA or the empty string.
  # The factor levels pin the split() groups to the order of unique(nm1), so
  # every result is stored under its own name; seq_len() keeps the row index
  # valid for a zero-row input.
  df2[unique(nm1)] <- lapply(
    split(names(df1)[-1], factor(nm1, levels = unique(nm1))),
    function(x) {
      x1 <- df1[x]
      use_y <- is.na(x1[, 1]) | x1[, 1] == ""
      as.numeric(x1[cbind(seq_len(nrow(x1)), use_y + 1L)])
    }
  )

更新

基于OP帖子中的dput输出('df2')

 # Names of the columns that end in a merge suffix (`.x` or `.y`)
 v1 <- grep("\\.(x|y)$", names(df2), value = TRUE)
 # Start the result from every column that is NOT in 'v1'
 df2N <- df2[setdiff(names(df2), v1)]

 # Convert the 'factor' columns in 'v1' to 'character' so that ifelse()
 # below yields the labels rather than the integer codes.
 # vapply() always returns a logical vector, even when 'v1' is empty.
 indx <- vapply(df2[v1], is.factor, logical(1))
 df2[v1][indx] <- lapply(df2[v1][indx], as.character)

 # Strip only the trailing suffix ('.x', '.y'), using the same pattern as
 # the grep() above, so other dots in a column name are preserved
 nm1 <- sub("\\.(x|y)$", "", v1)
 # split() returns its groups in sorted order, so the new column names are
 # sorted as well to line up with them
 nm2 <- sort(unique(nm1))
 # For each base name, merge its pair of columns: where the first column
 # (i.e. '.x') is NA or an empty string, take the element of the second
 # column instead
 df2N[nm2] <- lapply(split(v1, nm1), function(x) {
   x1 <- df2[x]
   ifelse(is.na(x1[, 1]) | x1[, 1] == "", x1[, 2], x1[, 1])
 })
 head(df2N, 3)
 #  cikcode yearendeddate  source sourcedate financialsdate     assets auditfees
 #1      20    2002-12-28    10-K 2003-03-27     2002-12-28 50,459,000   123,700
 #2      20    2002-12-28 DEF 14A 2003-03-31     2002-12-28 50,459,000   123,700
 #3      20    2000-12-30    10-K 2001-03-26     2000-12-30 54,421,000   200,000
 #              auditor  earnings naicscode    revenue siccode statecode
 #1            KPMG LLP 3,284,000    334513 68,231,000    3823        NJ
 #2            KPMG LLP 3,284,000    334513 68,231,000    3823        NJ
 #3 Arthur Andersen LLP 5,838,000    334513 84,912,000    3823        NJ
 #   statename ticker
 #1 NEW JERSEY   <NA>
 #2 NEW JERSEY       
 #3 NEW JERSEY   <NA>

数据

set.seed(24)
# Demo input with genuine NA values. The first three value columns are
# character vectors (numbers written as the strings they coerce to, with ""
# for blanks); variable2.y stays numeric.
df1 <- data.frame(
  ID = seq_len(5L),
  variable1.x = c("5", "", "7.9", NA, "12"),
  variable1.y = c("1.5", "3", "", "8.9", "3.9"),
  variable2.x = c("4.2", "3.5", "", NA, "4"),
  variable2.y = c(1.2, 1.5, 3.2, 1.2, NA),
  stringsAsFactors = FALSE
)

答案 1 :(得分:0)

处理此问题的另一种方法是在合并时处理它。

我的软件包safejoin具有可处理这些冲突的连接

# devtools::install_github("moodymudskipper/safejoin")
library(safejoin)

# Two inputs sharing the key `ID` and a clashing `variable1` column.
# Empty strings and the literal text "NA" mark the missing entries.
df1 <- data.frame(
  ID = as.numeric(1:5),
  variable1 = c("5.0", "", "7.9", "NA", "12")
)
df2 <- data.frame(
  ID = as.numeric(1:5),
  variable1 = c("1.5", "3.0", "", "8.9", "3.9")
)

# Left-join df1 and df2 on "ID", resolving the clashing `variable1` columns
# with the `conflict` lambda: both sides are converted to numeric and
# dplyr::coalesce() keeps the first non-missing value of the pair.
# NOTE(review): presumably `.x` is the df1 column and `.y` the df2 column -
# confirm against the safejoin documentation.
safe_left_join(df1, df2, by = "ID", conflict = ~ dplyr::coalesce(
  as.numeric(as.character(.x)), as.numeric(as.character(.y))))
#   ID variable1
# 1  1       5.0
# 2  2       3.0
# 3  3       7.9
# 4  4       8.9
# 5  5      12.0

如果您能够事先清理数据,并从数值列开始,那么只需要写成:

# Variant for inputs whose clashing columns are already numeric: the
# coalesce function is passed directly as the conflict handler, with no
# per-column conversion.
safe_left_join(df1, df2, by = "ID", conflict = dplyr::coalesce)