我想遍历数据框的列名,然后借助 dplyr,按照列值中出现的定界符(->)把这些列拆分开。数据集如下所示:
# Input data for the question, shown as a dput(df) call followed by a
# structure() literal (presumably the output of that call).
# v1 and v3 hold "left->right" pairs (some v3 values have spaces around "->");
# v2 contains no "->" delimiter.
dput(df)
structure(list(v1 = c("Silva->Mark", "Brandon->Livo", "Mango->Apple"),
v2 = c("Austin", "NA ", "Orange"),
v3 = c("James -> Jacy","NA->Jane", "apple -> Orange")),
class = "data.frame", row.names = c(NA, -3L))
现在,我编写了一段代码,筛选出取值中包含定界符(->)的列,即 v1 和 v3。代码如下:
# Flag every column in which at least one value contains the "->" delimiter.
# vapply() walks the columns directly (apply() would first coerce the data
# frame to a matrix) and always returns a named logical vector; grepl() is
# already vectorised over the values, so no inner sapply() is needed.
rows_true <- vapply(
  df,
  function(x) any(grepl("->", x, fixed = TRUE)),
  logical(1)
)
# Keep only the flagged columns; drop = FALSE keeps the result a data frame
# even when a single column matches.
ss <- df[, rows_true, drop = FALSE]
然后我尝试遍历这些列名,以便用下面的代码按分隔符拆分各列,但它不起作用:
# NOTE(review): this is the asker's non-working attempt, kept verbatim below.
# - `names` is never assigned in this snippet; the column names are stored in
#   `cols`, and `cols` is taken from `df`, not from the filtered `ss`.
# - separate() is handed the whole `cols` vector instead of one column per call.
# - In the `into = ...` argument the quotes around the second paste0() are
#   unbalanced, so the line does not parse as R, and `+ "old"` applies a unary
#   plus to a string.
cols<- names(df)
if (names %in% df){
splitcols <- ss %>%
tidyr::separate(cols, into = c(paste0(names,+ "old"), "paste0(names,+ "New")"), sep = "->")
}
我使用 paste0 的原因是:我希望按定界符把每一列拆成两部分,并用原始列名加后缀来命名新生成的列——第一个拆分列加后缀 Old(旧),第二个拆分列加后缀 New(新)。
遍历列名并递归地拆分各列之后,最终结果应如下所示:
# Desired result: each column that contained "->" (v1, v3) is replaced by two
# columns, <name>_Old (text before the delimiter) and <name>_New (text after
# it), with the surrounding spaces removed; v2 is not part of the result.
dput(df)
structure(list(v1_Old = c("Silva", "Brandon", "Mango"),
v1_New = c("Mark", "Livo", "Apple"),
v3_Old = c("James","NA", "apple"),
v3_New = c("Jacy","Jane", "Orange")),
class = "data.frame", row.names = c(NA, -3L))
答案 0 :(得分:1)
涉及 dplyr 和 tidyr 的一种可能做法是:
library(dplyr)
library(tidyr)
library(tibble) # provides rowid_to_column()

# Split v1 and v3 on "->" and spread the pieces into v1, v1.1, v3, v3.1.
df %>%
  select(v1, v3) %>%
  # Remember the original row so the pieces can be put back side by side
  rowid_to_column() %>%
  # Long format: one row per (rowid, column) pair
  pivot_longer(-rowid, names_to = "var", values_to = "val") %>%
  # One row per piece of each "a->b" value
  separate_rows(val, sep = "->", convert = TRUE) %>%
  group_by(rowid) %>%
  mutate(
    # Pieces may still carry the spaces that surrounded "->"
    val = trimws(val),
    # Within one original row: v1, v1, v3, v3 -> v1, v1.1, v3, v3.1
    var = make.unique(var)
  ) %>%
  ungroup() %>%
  # Back to wide format: one column per unique piece name
  pivot_wider(names_from = var, values_from = val) %>%
  select(-rowid)
v1 v1.1 v3 v3.1
<chr> <chr> <chr> <chr>
1 Silva Mark James Jacy
2 Brandon Livo <NA> Jane
3 Mango Apple apple Orange
或者进一步匹配预期输出:
library(dplyr)
library(tidyr)
library(tibble) # provides rowid_to_column()

# Split v1 and v3 on "->" into <name>_Old (first piece) and <name>_New
# (second piece), as requested in the question.
df %>%
  select(v1, v3) %>%
  # Remember the original row so the pieces can be put back side by side
  rowid_to_column() %>%
  pivot_longer(-rowid, names_to = "var", values_to = "val") %>%
  # One row per piece of each "a->b" value
  separate_rows(val, sep = "->", convert = TRUE) %>%
  group_by(rowid, var) %>%
  mutate(
    val = trimws(val),
    # The FIRST piece of each value is the old name, the second the new one
    var2 = if_else(
      row_number() == 1,
      paste0(var, "_Old"),
      paste0(var, "_New")
    )
  ) %>%
  ungroup() %>%
  select(-var) %>%
  # pivot_wider() keeps the columns in order of first appearance:
  # v1_Old, v1_New, v3_Old, v3_New
  pivot_wider(names_from = var2, values_from = val) %>%
  select(-rowid)
#>   v1_Old  v1_New v3_Old v3_New
#>   <chr>   <chr>  <chr>  <chr>
#> 1 Silva   Mark   James  Jacy
#> 2 Brandon Livo   <NA>   Jane
#> 3 Mango   Apple  apple  Orange
答案 1 :(得分:1)
以下是使用 dplyr、purrr 和 stringr 的另一种方法。
library(dplyr)
library(purrr)
library(stringr)

# Detect the columns with at least one "->"
# (na.rm = TRUE so a missing value cannot turn the flag into NA)
my_df_cols <- map_lgl(df, ~ any(str_detect(., "->"), na.rm = TRUE))

df %>%
  # Select only the columns with at least one "->"
  select(which(my_df_cols)) %>%
  # Split every selected column on "->", trim the spaces that surrounded the
  # delimiter, and keep only the new <name>_Old / <name>_New columns
  transmute_all(list(
    Old = ~ str_trim(str_split(., "->", simplify = TRUE)[, 1]),
    New = ~ str_trim(str_split(., "->", simplify = TRUE)[, 2])
  ))
# v1_Old v3_Old v1_New v3_New
# 1 Silva James Mark Jacy
# 2 Brandon NA Livo Jane
# 3 Mango apple Apple Orange
答案 2 :(得分:1)
我们还可以使用 splitstackshape 包中的 cSplit:
# Detect columns in which at least one value contains "->"
cols <- names(df)[
  vapply(df, function(x) any(grepl("->", x, fixed = TRUE)), logical(1))
]
# Remove whitespace only around "->" and at both ends of each value, so that
# spaces inside a value (e.g. "New York") are preserved
df[cols] <- lapply(
  df[cols],
  function(x) trimws(gsub("\\s*->\\s*", "->", x))
)
# Split into new columns specifying sep as "->"
splitstackshape::cSplit(df[cols], cols, sep = "->")
# v1_1 v1_2 v3_1 v3_2
#1: Silva Mark James Jacy
#2: Brandon Livo <NA> Jane
#3: Mango Apple apple Orange
答案 3 :(得分:1)
为了完整起见,这里再给出一个使用 data.table 的解决方案。
与到目前为止发布的其他答案有些不同:
- 不包含 "->" 的列会即时从结果中删除。
- 使用正则表达式 " *-> *" 进行拆分,它会连同定界符两侧的空格(如果有)一起匹配。这样可以避免之后对拆分得到的片段调用 trimws(),或事先删除空白。
library(data.table)
library(magrittr) # piping used to improve readability

# Convert df to a data.table by reference
setDT(df)

# For every column, split its values on "->" (together with any spaces around
# it); keep the pieces only when the split produced exactly two columns and
# name them <column>_Old / <column>_New. Other columns yield NULL.
names(df) %>%
  lapply(function(col_name) {
    pieces <- df[, tstrsplit(get(col_name), " *-> *")]
    if (ncol(pieces) != 2L) {
      return(invisible(NULL))
    }
    setnames(pieces, paste0(col_name, c("_Old", "_New")))
  }) %>%
  as.data.table()
    v1_Old v1_New v3_Old v3_New
1:   Silva   Mark  James   Jacy
2: Brandon   Livo     NA   Jane
3:   Mango  Apple  apple Orange