我正在寻找一种方法:在合并数据框之后,把变量名相同、只是后缀不同(.x 和 .y)的列合并成一列。例如,'name.x' 和 'name.y' 将合并为名为 'name' 的一列,其中优先使用 'name.x' 的值;只有当 'name.x' 没有值时,才使用 'name.y' 的值。我希望对数据框中所有带后缀的列执行此操作。
这是我正在寻找的一个例子:
# Toy example: every value is stored as text in either the .x or the .y
# column; "" and the literal string "NA" stand in for missing entries.
df <- data.frame(
  ID = c(1, 2, 3, 4, 5),
  variable1.x = c("5.0", "", "7.9", "NA", "12"),
  variable1.y = c("1.5", "3.0", "", "8.9", "3.9")
)
ID variable1.x variable1.y
1 5.0 1.5
2 3.0
3 7.9
4 NA 8.9
5 12 3.9
预期产出:
ID variable1
1 5.0
2 3.0
3 7.9
4 8.9
5 12
我的数据样本如下:
structure(list(cikcode = c(20, 20, 20, 20, 20, 20, 20, 20, 20,
20), yearendeddate = structure(c(3L, 3L, 1L, 1L, 2L, 2L, 4L,
4L, 5L, 5L), .Label = c("2000-12-30", "2001-12-29", "2002-12-28",
"2004-01-03", "2005-01-01"), class = "factor"), source = structure(c(1L,
3L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 3L), .Label = c("10-K", "10-K405",
"DEF 14A"), class = "factor"), sourcedate = structure(c(5L, 6L,
1L, 2L, 3L, 4L, 7L, 8L, 9L, 10L), .Label = c("2001-03-26", "2001-03-28",
"2002-03-20", "2002-03-25", "2003-03-27", "2003-03-31", "2004-04-01",
"2004-04-06", "2005-03-31", "2005-04-04"), class = "factor"),
financialsdate = structure(c(3L, 3L, 1L, 1L, 2L, 2L, 4L,
4L, 5L, 5L), .Label = c("2000-12-30", "2001-12-29", "2002-12-28",
"2004-01-03", "2005-01-01"), class = "factor"), ticker.x = structure(c(1L,
NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA), .Label = "", class = "factor"),
statecode.x = structure(c(1L, NA, 1L, NA, 1L, NA, 1L, NA,
1L, NA), .Label = "NJ", class = "factor"), statename.x = structure(c(1L,
NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA), .Label = "NEW JERSEY", class = "factor"),
siccode.x = c(3823, NA, 3823, NA, 3823, NA, 3823, NA, 3823,
NA), naicscode.x = c(334513, NA, 334513, NA, 334513, NA,
334513, NA, 334513, NA), auditor.x = structure(c(3L, NA,
1L, NA, 1L, NA, 2L, NA, 2L, NA), .Label = c("Arthur Andersen LLP",
"Grant Thornton LLP", "KPMG LLP"), class = "factor"), auditfees.x = structure(c(5L,
NA, 3L, NA, 1L, NA, 4L, NA, 2L, NA), .Label = c("185,000",
"225,000", "200,000", "137,100", "123,700"), class = "factor"),
revenue.x = structure(c(3L, NA, 1L, NA, 4L, NA, 5L, NA, 2L,
NA), .Label = c("84,912,000", "112,494,000", "68,231,000",
"71,819,000", "94,676,000"), class = "factor"), earnings.x = structure(c(3L,
NA, 4L, NA, 2L, NA, 1L, NA, 1L, NA), .Label = c("", "1,048,000",
"3,284,000", "5,838,000"), class = "factor"), assets.x = structure(c(2L,
NA, 3L, NA, 1L, NA, 4L, NA, 5L, NA), .Label = c("47,644,000",
"50,459,000", "54,421,000", "83,081,000", "93,016,000"), class = "factor"),
ticker.y = c(NA, "", NA, "", NA, "", NA, "", NA, ""), statecode.y = c(NA,
"NJ", NA, "NJ", NA, "NJ", NA, "NJ", NA, "NJ"), statename.y = c(NA,
"NEW JERSEY", NA, "NEW JERSEY", NA, "NEW JERSEY", NA, "NEW JERSEY",
NA, "NEW JERSEY"), siccode.y = c(NA, 3823, NA, 3823, NA,
3823, NA, 3823, NA, 3823), naicscode.y = c(NA, "334513",
NA, "334513", NA, "334513", NA, "334513", NA, "334513"),
auditor.y = c(NA, "KPMG LLP", NA, "Arthur Andersen LLP",
NA, "Arthur Andersen LLP", NA, "Grant Thornton LLP", NA,
"Grant Thornton LLP"), auditfees.y = c(NA, "123,700", NA,
"200,000", NA, "185,000", NA, "137,100", NA, "225,000"),
revenue.y = c(NA, "68,231,000", NA, "84,912,000", NA, "71,819,000",
NA, "94,676,000", NA, "112,494,000"), earnings.y = c(NA,
"3,284,000", NA, "5,838,000", NA, "1,048,000", NA, "", NA,
""), assets.y = c(NA, "50,459,000", NA, "54,421,000", NA,
"47,644,000", NA, "83,081,000", NA, "93,016,000")), .Names = c("cikcode",
"yearendeddate", "source", "sourcedate", "financialsdate", "ticker.x",
"statecode.x", "statename.x", "siccode.x", "naicscode.x", "auditor.x",
"auditfees.x", "revenue.x", "earnings.x", "assets.x", "ticker.y",
"statecode.y", "statename.y", "siccode.y", "naicscode.y", "auditor.y",
"auditfees.y", "revenue.y", "earnings.y", "assets.y"), row.names = c(NA,
10L), class = "data.frame")
答案 0 :(得分:4)
根据所示示例,我认为你需要的是 `pmax`。如果需要保留 'variable1.x' 中除 `''` 或 `NA` 以外的值,可以使用 `ifelse`。示例中的 `"NA"` 是字符串,并不是真正的 NA;NA 值不需要加引号。如果是真正的 NA,我们可以使用 `is.na(df[,2])`。
# Merge the pair held in columns 2 and 3 of `df` into one numeric column:
# use column 3 wherever column 2 is "" or the string "NA", otherwise keep
# column 2. local() keeps the intermediates out of the global environment.
local({
  x_raw <- df[, 2]
  y_raw <- df[, 3]
  x_is_blank <- x_raw %in% c("", "NA")
  data.frame(
    df[1],
    variable = ifelse(
      x_is_blank,
      as.numeric(as.character(y_raw)),
      as.numeric(as.character(x_raw))
    ),
    stringsAsFactors = FALSE
  )
})
# ID variable
#1 1 5.0
#2 2 3.0
#3 3 7.9
#4 4 8.9
#5 5 12
如果有多列,例如 'df1'(这里我使用了真正的 NA),我们先用 `sub` 去掉列名的后缀部分(即 '.x'、'.y'),再按去掉后缀后的名称对这些 'variable' 列进行 `split`。然后使用 `lapply` 遍历得到的 `list` 的各个元素,并像上面那样用 `ifelse` 得到结果。另外请注意,这里我通过 `stringsAsFactors=FALSE` 创建的是 'character' 列。
# Collapse every `<name>.x` / `<name>.y` pair of df1 into a single numeric
# column `<name>`, taking the first column of the pair unless it is NA or "",
# in which case the second column is used.

# Base names with the merge suffix removed. Only a trailing ".x" / ".y" is
# stripped, so names that contain other dots are not truncated.
nm1 <- sub("\\.(x|y)$", "", names(df1)[-1])

# split() orders its groups by (sorted) factor level, which need not match the
# order of unique(nm1). The list is therefore indexed by name below rather
# than assigned positionally, so each merged column gets the right label.
merged <- lapply(split(names(df1)[-1], nm1), function(x) {
  x1 <- df1[x]
  # Relies on the .x column preceding the .y column in df1.
  as.numeric(ifelse(is.na(x1[, 1]) | x1[, 1] == "", x1[, 2], x1[, 1]))
})

df2 <- data.frame(ID = df1$ID)
# Keep the columns in order of first appearance in df1.
df2[unique(nm1)] <- merged[unique(nm1)]
df2
# ID variable1 variable2
#1 1 5.0 4.2
#2 2 3.0 3.5
#3 3 7.9 3.2
#4 4 8.9 1.2
#5 5 12.0 4.0
我们也可以不使用 `ifelse`,而是通过行/列(row/column)索引来完成同样的操作:
# Same merge as the ifelse() version, done with (row, column) matrix indexing:
# for each row pick column 1 of the pair, or column 2 when column 1 is NA/"".
merged <- lapply(split(names(df1)[-1], nm1), function(x) {
  x1 <- df1[x]
  # TRUE where the first column is unusable; adding 1L turns that into the
  # column index to read (1 = first column, 2 = second column).
  use_second <- is.na(x1[, 1]) | x1[, 1] == ""
  # seq_len() stays correct for a zero-row data frame, unlike 1:nrow().
  as.numeric(x1[cbind(seq_len(nrow(x1)), use_second + 1L)])
})
# split() returns groups in sorted order, so index the list by name instead of
# assigning positionally to unique(nm1).
df2[unique(nm1)] <- merged[unique(nm1)]
基于提问者帖子中的 dput 输出(这里记为 'df2'):
# Merge every `.x` / `.y` column pair of df2 into one column named after the
# shared prefix, and show the first rows of the result.

# Names of the columns that end in ".x" or ".y".
v1 <- grep("\\.(x|y)$", names(df2), value = TRUE)
# Start the result from all columns that are not part of a pair.
df2N <- df2[setdiff(names(df2), v1)]
# Convert factor columns among the pairs to character before ifelse() is
# applied to them. vapply() always yields a logical vector, even when v1 is
# empty, so the index below stays valid.
indx <- vapply(df2[v1], is.factor, logical(1))
df2[v1][indx] <- lapply(df2[v1][indx], as.character)
# Strip only the trailing ".x" / ".y" (same pattern as the grep above), so
# names containing other dots are not truncated.
nm1 <- sub("\\.(x|y)$", "", v1)
# Sorted unique base names, used as the names of the new columns.
nm2 <- sort(unique(nm1))
# For each pair: where the first column (expected to be the .x one) is NA or
# an empty string, take the value from the second column instead.
merged <- lapply(split(v1, nm1), function(x) {
  x1 <- df2[x]
  ifelse(is.na(x1[, 1]) | x1[, 1] == "", x1[, 2], x1[, 1])
})
# Index by name so labels cannot drift from the split() group order.
df2N[nm2] <- merged[nm2]
head(df2N, 3)
# cikcode yearendeddate source sourcedate financialsdate assets auditfees
#1 20 2002-12-28 10-K 2003-03-27 2002-12-28 50,459,000 123,700
#2 20 2002-12-28 DEF 14A 2003-03-31 2002-12-28 50,459,000 123,700
#3 20 2000-12-30 10-K 2001-03-26 2000-12-30 54,421,000 200,000
# auditor earnings naicscode revenue siccode statecode
#1 KPMG LLP 3,284,000 334513 68,231,000 3823 NJ
#2 KPMG LLP 3,284,000 334513 68,231,000 3823 NJ
#3 Arthur Andersen LLP 5,838,000 334513 84,912,000 3823 NJ
# statename ticker
#1 NEW JERSEY <NA>
#2 NEW JERSEY
#3 NEW JERSEY <NA>
set.seed(24)
# Example input with real NAs. The first three value columns contain "" and
# are therefore character vectors; variable2.y is purely numeric.
df1 <- data.frame(
  ID = 1:5,
  variable1.x = c("5", "", "7.9", NA, "12"),
  variable1.y = c("1.5", "3", "", "8.9", "3.9"),
  variable2.x = c("4.2", "3.5", "", NA, "4"),
  variable2.y = c(1.2, 1.5, 3.2, 1.2, NA),
  stringsAsFactors = FALSE
)
答案 1 :(得分:0)
处理此问题的另一种方法是在合并时处理它。
我的软件包safejoin具有可处理这些冲突的连接
# Install safejoin from GitHub if needed (left commented out on purpose):
# devtools::install_github("moodymudskipper/safejoin")
library(safejoin)
# Two inputs that share the key `ID` and a same-named column `variable1`.
# Values are text; "" and the string "NA" stand in for missing entries.
df1 <- data.frame(ID=c(1,2,3,4,5), variable1=c('5.0',"",'7.9','NA','12'))
df2 <- data.frame(ID=c(1,2,3,4,5), variable1=c('1.5','3.0',"",'8.9','3.9'))
# Left join on ID, passing a formula as `conflict` for the clashing column.
# Both sides are converted to numeric first, which turns "" and "NA" into
# real NA, so dplyr::coalesce() keeps the .x value and falls back to .y.
# NOTE(review): presumably .x / .y are the left / right versions of the
# conflicting column -- confirm against the safejoin documentation.
safe_left_join(df1, df2, by = "ID", conflict = ~ dplyr::coalesce(
as.numeric(as.character(.x)), as.numeric(as.character(.y))))
# ID variable1
# 1 1 5.0
# 2 2 3.0
# 3 3 7.9
# 4 4 8.9
# 5 5 12.0
如果您能够事先清理数据、一开始就使用数值列,那么只需写成:
# With already-numeric columns the conflict can be resolved by passing
# dplyr::coalesce directly.
safe_left_join(
  df1,
  df2,
  by = "ID",
  conflict = dplyr::coalesce
)