我有以下数据框:
# Reproducible example data. `col` and `pr` each have two candidate source
# columns (suffix .dfA / .dfB); `priority` names the suffix whose values
# should be used in that row. The rnorm() calls must stay in this order so
# that the seeded random stream yields the same numbers.
set.seed(123)
df <- data.frame(
  col.dfA  = rnorm(n = 8, mean = 40, sd = 2),
  col.dfB  = rnorm(n = 8, mean = 20, sd = 2),
  colt     = rnorm(n = 8, mean = 100, sd = 20),
  pr.dfA   = rnorm(n = 8, mean = 20, sd = 2),
  pr.dfB   = rnorm(n = 8, mean = 30, sd = 2),
  priority = c("dfA", "dfB", "dfA", "dfA", "dfB", "dfA", "dfB", "dfB")
)
现在,我想将 `col.dfA` 与 `col.dfB` 合并为 `df$col`,
并将 `pr.dfA` 与 `pr.dfB` 合并为 `df$pr`。
`df$priority` 列指明每一行应使用哪一组列的值:在第 1、3、4、6 行中,应优先使用后缀为 `dfA` 的列;
在其余各行中,应优先使用后缀为 `dfB` 的列。该解决方案需要适用于具有数十列、数百行的大型数据集。
初始数据框:
col.dfA col.dfB colt pr.dfA pr.dfB priority
1 38.87905 18.62629 109.95701 18.74992 31.79025 dfA
2 39.53965 19.10868 60.66766 16.62661 31.75627 dfB
3 43.11742 22.44816 114.02712 21.67557 31.64316 dfA
4 40.14102 20.71963 90.54417 20.30675 31.37728 dfA
5 40.25858 20.80154 78.64353 17.72373 31.10784 dfB
6 43.43013 20.22137 95.64050 22.50763 29.87618 dfA
7 40.92183 18.88832 79.47991 20.85293 29.38807 dfB
8 37.46988 23.57383 85.42218 19.40986 29.23906 dfB
预期结果:
col colt pr priority
1 38.87905 109.95701 18.74992 dfA
2 19.10868 60.66766 31.75627 dfB
3 43.11742 114.02712 21.67557 dfA
4 40.14102 90.54417 20.30675 dfA
5 20.80154 78.64353 31.10784 dfB
6 43.43013 95.64050 22.50763 dfA
7 18.88832 79.47991 29.38807 dfB
8 23.57383 85.42218 29.23906 dfB
答案 0 :(得分:1)
可以试试下面这种做法:
library(tidyverse)
# Reshape so that every "<name>.<suffix>" column pair becomes a single
# "<name>" column plus a `suffix` column, then keep, for each original row,
# only the long-format row whose suffix equals that row's `priority`.
# pivot_longer()/filter() replace the superseded gather()/spread() pair and
# the per-row regex match, and keep the rows in their original order.
df %>%
  pivot_longer(
    # literal (non-regex) match on ".<priority value>" at the end of the name
    cols = ends_with(
      paste0(".", unique(as.character(.$priority))),
      ignore.case = FALSE
    ),
    # ".value": the first capture group names the output column (col, pr, ...)
    names_to = c(".value", "suffix"),
    names_pattern = "^(.*)\\.([^.]+)$"
  ) %>%
  filter(suffix == as.character(priority)) %>%
  select(-suffix)
答案 1 :(得分:1)
另一种方法是利用 data.table 版本的 `melt()`,它能够同时重塑多个值列:
library(data.table)
library(magrittr) # used to improve readability
# pick column names ending in .dfA or .dfB
pat <- names(df) %>% stringr::str_extract(".+\\.df(?=[AB]$)") %>% unique() %>% na.omit()
# new column names without trailing .df
col <- pat %>% stringr::str_replace("\\.df$", "")
# suffixes (dfA, dfB) in the order their columns appear; melt() numbers the
# measure columns of each pattern 1, 2, ... in that same order in `variable`
suffix <- unique(sub(".*\\.", "", grep("\\.df[AB]$", names(df), value = TRUE)))
# reshape from wide to long format (rn remembers the original row order)
melt(setDT(df)[, rn := .I], measure.vars = patterns(pat), value.name = col)[
  # keep the rows whose suffix equals priority; comparing as character works
  # whether priority is a factor or (the default since R 4.0) a character
  # column, where as.integer(priority) would only produce NA
  as.character(priority) == suffix[as.integer(variable)]][
  # re-order and clean up
  order(rn)][, variable := NULL][]
        colt priority rn      col       pr
1: 109.95701      dfA  1 38.87905 18.74992
2:  60.66766      dfB  2 19.10868 31.75627
3: 114.02712      dfA  3 43.11742 21.67557
4:  90.54417      dfA  4 40.14102 20.30675
5:  78.64353      dfB  5 20.80154 31.10784
6:  95.64050      dfA  6 43.43013 22.50763
7:  79.47991      dfB  7 18.88832 29.38807
8:  85.42218      dfB  8 23.57383 29.23906
答案 2 :(得分:0)
我不确定是否完全理解了你的需求……不过我想大概是这样:
library(data.table)
# Convert df to a data.table by reference and key it on `priority`, so the
# keyed subsets df[.("dfA")] / df[.("dfB")] below select the matching rows.
setDT(df)
setkeyv(df, "priority")
# For each priority group, copy that group's source columns into col / pr.
df[.("dfA"), `:=`(col = col.dfA, pr = pr.dfA)]
df[.("dfB"), `:=`(col = col.dfB, pr = pr.dfB)]
如果需要,可丢弃源列:
# Delete every column whose name ends in .dfA or .dfB (selected by position),
# then print the result.
df[, which(grepl('\\.df[AB]$', names(df))) := NULL][]
# colt priority col pr
# 1: 66.99376 dfA 40.81422 22.05758
# 2: 104.97938 dfA 41.09455 19.78682
# 3: 66.33725 dfA 40.08257 21.65990
# 4: 112.25391 dfA 42.89993 19.72432
# 5: 72.14824 dfB 23.47501 30.76038
# 6: 103.57393 dfB 19.43745 26.52382
# 7: 93.28548 dfB 20.46695 29.38035
# 8: 89.28845 dfB 19.21581 28.66537
如果列的顺序对您很重要,请使用 `setcolorder` 对列重新排序。
更通用、更程序化的写法可以使用 `mget`:
# Programmatic generalisation: derive the merged column names from the data
# instead of hard-coding col / pr.
setDT(df) # no-op if df is already a data.table
# as.character() so this also works when priority is a factor
priorities <- unique(as.character(df$priority))
nm <- names(df)
# Base names ("col", "pr", ...) of all columns that carry a ".<priority>"
# suffix. The same regex is used for selecting and for stripping, so only
# names that really end in ".<suffix>" are considered.
merge_cols <- unique(unlist(lapply(priorities, function(suff) {
  suffix_rx <- sprintf('\\.%s$', suff)
  sub(suffix_rx, '', grep(suffix_rx, nm, value = TRUE))
})))
for (PRIORITY in priorities) {
  # e.g. "col.dfA", "pr.dfA" -- same order as merge_cols
  source_cols <- sprintf('%s.%s', merge_cols, PRIORITY)
  # on = "priority" makes the join independent of any key set on df
  df[.(PRIORITY), on = 'priority', (merge_cols) := mget(source_cols)]
  # this suffix's source columns are no longer needed
  df[, (source_cols) := NULL]
}
已知 `mget` 运行较慢,改用 `eval` 可能更高效:
# Variant of the mget() assignment: look each source column up by evaluating
# its name as a symbol. Uses PRIORITY, merge_cols and source_cols as defined
# in the loop.
df[
  .(PRIORITY),
  (merge_cols) := lapply(source_cols, function(x) eval(as.symbol(x)))
]