我有一个包含数千列的数据集,其中某些列具有相同的列名。我想合并具有相同列名的列,以便将值附加为行。并且,对于没有相同名称的列,将0
附加在行中。
说明:下面仅是一个示例,我拥有的真实数据集具有数千个列,其中许多具有重复的列名,而许多却没有。
样本输入数据
Col_1 Col_1 Col_1 Col_1 Col_2
1 2 3 4 5
5 6 7 8 5
9 10 11 12 5
13 14 15 16 5
示例输出数据
Col_1 Col_2
1 5
2 5
3 5
4 5
5 0
6 0
7 0
8 0
9 0
10 0
11 0
12 0
13 0
14 0
15 0
16 0
答案 0 :(得分:1)
这是我的工作方式,涉及一些手动工作。假设您的数据集在变量test
# may only require some of the packages of tidyverse
library(tidyverse)
# this will give all column unique names
renamed_test <- test %>%
set_names(str_c(names(test), 1:ncol(test)))
# then for each duplicated column name, they now start with the same prefix;
# so select all these columns and use gather to append them one after another,
# and finally rename the merged column back to the original name
bound_col_1 <- renamed_test %>%
select(starts_with("Col_1")) %>%
gather %>%
transmute(Col_1 = value)
# repeat this for 'Col_2'
# .....
# last, column bind all these results
bind_cols(bound_col_1, bound_col_2, [potentiall other variables])
我对解决方案进行了概括,因此它将自动查找所有重复的列,并分别绑定行
library(tidyverse)
# testing data
test <- data.frame(c(1,2,3), c(7,8,9), c(4,5,6), c(10,11,12), c(100, 101, 102)) %>%
set_names(c("Col_1", "Col_2", "Col_1", "Col_2", "Col_3"))
col_names <- names(test)
# find all columns that have duplicated columns
dup_names <- col_names[duplicated(col_names)]
# make the column names unique so it will work with tidyr
renamed_test <- test %>%
set_names(str_c(col_names, "-", 1:ncol(test)))
unique_data <- test[!(duplicated(col_names) | duplicated(col_names, fromLast = TRUE))]
# for each duplicated column name, merge all columns that have the same name
dup_names %>% map(function(col_name) {
renamed_test %>%
select(starts_with(col_name)) %>%
gather %>% # bind rows
select(-1) %>% # merged value is the last column
set_names(c(col_name)) # rename the column name back to its original name
}) %>% bind_cols
result <- bind_rows(tmp_result, unique_data)
当您尝试绑定列时这很棘手,因为合并的数据可能具有不同的行号。您可以每次合并时比较长度,并通过添加0来填充较短的列表。
答案 1 :(得分:0)
尝试一下。逻辑尚不清楚: 编辑:,我认为最好的办法就是像这样融化数据
library(tidyverse)
df1<-df %>%
gather("ID","Value") %>%
group_by(ID) %>%
arrange(Value)
df1$ID<-str_replace_all(df1$ID,"Col_1.\\d","Col_1")
您可以这样进行,但我觉得让数据融化会更好。
library(reshape2)
df1 %>%
ungroup() %>%
dcast(Value~ID,fun=mean) %>%
mutate(Col_2=ifelse(Col_1<=4,5,0)) %>%
select(-Value)
结果(熔化):然后的问题是如何处理重复项。
ID Value
<chr> <int>
1 Col_1 1
2 Col_1 2
3 Col_1 3
4 Col_1 4
5 Col_1 5
6 Col_2 5
7 Col_2 5
8 Col_2 5
9 Col_2 5
10 Col_1 6
11 Col_1 7
12 Col_1 8
13 Col_1 9
14 Col_1 10
15 Col_1 11
16 Col_1 12
17 Col_1 13
18 Col_1 14
19 Col_1 15
20 Col_1 16
原文:
library(tidyverse)
df %>%
gather(key,value,-Col_2) %>%
arrange(value) %>%
rename(Col_1=value) %>%
mutate(Col_2=ifelse(Col_1<=4,5,0)) %>%
select(Col_1,everything(),-key)
结果:
Col_1 Col_2
1 1 5
2 2 5
3 3 5
4 4 5
5 5 0
6 6 0
7 7 0
8 8 0
9 9 0
10 10 0
11 11 0
12 12 0
13 13 0
14 14 0
15 15 0
16 16 0
答案 2 :(得分:0)
这是一个非常复杂的答案。有些代码有些笨拙,但这是一个通用解决方案。
library(tidyverse)
library(magrittr)
# function to create lookup table, matching duplicate column names to syntactically valid names
rel <- function(x) {x %>%
colnames %>%
make.names(., unique = TRUE) %>%
as.data.frame() %>%
mutate(names(x)) %>%
setNames(c("New", "Old")) }
# create lookup table to match old and new column names
lookup <- rel(df)
# gather df into long format
df_long <- df %>%
setNames(lookup$New) %>%
gather(var, value)
# match new names to original names
df_colnames <- lapply(1:length(unique(lookup$Old)), function(x) grepl(unique(lookup$Old)[x], df_long$var)) %>%
setNames(unique(lookup$Old)) %>%
as.data.frame
# vector replacing new syntactically valid names with original names
column <- lapply(names(df_colnames), function(x) ifelse(df_colnames[, x], x, F)) %>%
setNames(unique(lookup$Old)) %>%
as.data.frame %>%
unite(comb, sep = "") %>%
magrittr::extract(, "comb") %>%
gsub("FALSE", "", .)
# put original columns into lists
final_list <- df_long %>%
mutate(var = column) %>%
arrange(var, value) %>%
split(.$var) %>%
map(~select_at(.x, c("value"))) %>%
lapply(function(x) x$value)
# create vectors of zeros to append to original data
final_list_extend <- sapply(abs(unlist(lapply(final_list, length)) - max(unlist(lapply(final_list, length)))), function(x) rep(0, x))
# append zeros to original data and rename columns to match original names
output <- sapply(1:length(final_list), function(x) c(final_list[[x]], final_list_extend[[x]])) %>%
as_data_frame %>%
setNames(unique(lookup$Old))
#show result
output
# A tibble: 16 x 2
Col_1 Col_2
<dbl> <dbl>
1 1 5
2 2 5
3 3 5
4 4 5
5 5 0
6 6 0
7 7 0
8 8 0
9 9 0
10 10 0
11 11 0
12 12 0
13 13 0
14 14 0
15 15 0
16 16 0
df <- read.table(header = T, text = "
Col_1 Col_1 Col_1 Col_1 Col_2
1 2 3 4 5
5 6 7 8 5
9 10 11 12 5
13 14 15 16 5") %>%
setNames(c("Col_1", "Col_1", "Col_1", "Col_1", "Col_2"))