解决方案

Question

我有一个包含数千列的数据集，其中某些列具有相同的列名。我想合并具有相同列名的列，以便将值附加为行。并且，对于没有相同名称的列，将0附加在行中。

说明：下面仅是一个示例，我拥有的真实数据集具有数千个列，其中许多具有重复的列名，而许多却没有。

样本输入数据

Col_1 Col_1 Col_1 Col_1 Col_2
  1     2     3     4   5
  5     6     7     8   5
  9    10    11    12   5
 13    14    15    16   5

示例输出数据

Col_1 Col_2
  1    5
  2    5
  3    5
  4    5
  5    0
  6    0
  7    0 
  8    0
  9    0
 10    0
 11    0
 12    0
 13    0
 14    0
 15    0
 16    0

Answer 1

这是我的做法，其中涉及一些手动工作。假设您的数据集保存在变量 test 中。

# Sketch of a manual, per-column merge of duplicated columns in `test`.
# Only some of the tidyverse packages may actually be needed.
library(tidyverse)

# Append each column's position to its name so every column name is unique.
renamed_test <- test %>%
                set_names(str_c(names(test), 1:ncol(test)))

# Columns that shared a name now share the same prefix. Select them by that
# prefix, stack them one after another with gather, and name the stacked
# column after the original column.
bound_col_1 <- renamed_test %>%
               select(starts_with("Col_1")) %>%
               gather %>%
               transmute(Col_1 = value)

# Repeat the step above for 'Col_2' (producing bound_col_2) and for every
# other duplicated name.
# .....

# Finally, column-bind all the merged results. The bracketed part is a
# placeholder for the remaining merged columns, not valid R syntax.
# NOTE(review): bind_cols() expects inputs with the same number of rows, so
# shorter merged columns presumably need padding first - confirm.
bind_cols(bound_col_1, bound_col_2, [potentiall other variables])

编辑：

我对解决方案进行了概括，因此它将自动查找所有重复的列，并分别绑定行

library(tidyverse)

# testing data: Col_1 and Col_2 each appear twice, Col_3 appears once
test <- data.frame(
  c(1, 2, 3), c(7, 8, 9), c(4, 5, 6), c(10, 11, 12), c(100, 101, 102)
) %>%
  set_names(c("Col_1", "Col_2", "Col_1", "Col_2", "Col_3"))

col_names <- names(test)

# every distinct column name, in order of first appearance
unique_names <- unique(col_names)

# For each name, stack all columns carrying exactly that name into one vector
# (first such column on top, then the next one, and so on). Columns are picked
# by exact name comparison rather than by prefix, so "Col_1" never pulls in
# "Col_10".
merged <- unique_names %>%
  map(function(col_name) {
    unlist(test[col_names == col_name], use.names = FALSE)
  }) %>%
  set_names(unique_names)

# Merged columns can differ in length (a name may occur once or many times),
# so pad the shorter ones with 0 up to the longest before binding them.
# c(0L, ...) keeps max() well defined when `test` has no columns at all.
target_length <- max(c(0L, lengths(merged)))

result <- merged %>%
  map(function(values) c(values, rep(0, target_length - length(values)))) %>%
  as_tibble()

绑定列时比较棘手，因为合并后的各列可能具有不同的行数。您可以在每次合并时比较长度，并通过补 0 来填充较短的列。

Answer 2

可以试试下面的方法，不过问题的逻辑尚不十分清楚。编辑：我认为最好的办法是像下面这样把数据融化（转换为长格式）：

library(tidyverse)

# Melt df into long format: one row per cell, with the source column name in
# ID and the cell value in Value; rows are then sorted by Value. The result
# stays grouped by ID.
df1 <- df %>%
  gather("ID", "Value") %>%
  group_by(ID) %>%
  arrange(Value)

# Collapse suffixed names such as "Col_1.1", "Col_1.2" back to "Col_1"
# (presumably the suffixes added when duplicated headers are made unique on
# import - confirm against how df was read).
# The dot is escaped, the suffix may have several digits, and the pattern is
# anchored, so "Col_1.10" becomes "Col_1" instead of "Col_10".
df1$ID <- str_replace_all(df1$ID, "^Col_1\\.\\d+$", "Col_1")

您可以这样进行，但我觉得让数据融化会更好。

library(reshape2)

# Cast the long data back to wide form: one row per distinct Value and one
# column per ID, each cell holding the mean of the matching Value entries.
# fun.aggregate and value.var are spelled out instead of relying on partial
# argument matching and on dcast() guessing the value column.
# Col_2 is then rebuilt from Col_1 (5 where Col_1 <= 4, otherwise 0), which is
# specific to the sample data, and the helper Value column is dropped.
df1 %>%
  ungroup() %>%
  dcast(Value ~ ID, fun.aggregate = mean, value.var = "Value") %>%
  mutate(Col_2 = ifelse(Col_1 <= 4, 5, 0)) %>%
  select(-Value)

结果（融化后）：接下来的问题是如何处理重复项。

 ID    Value
   <chr> <int>
 1 Col_1     1
 2 Col_1     2
 3 Col_1     3
 4 Col_1     4
 5 Col_1     5
 6 Col_2     5
 7 Col_2     5
 8 Col_2     5
 9 Col_2     5
10 Col_1     6
11 Col_1     7
12 Col_1     8
13 Col_1     9
14 Col_1    10
15 Col_1    11
16 Col_1    12
17 Col_1    13
18 Col_1    14
19 Col_1    15
20 Col_1    16

原文：

  library(tidyverse)

  # Stack every column except Col_2 into key/value pairs, sort by the stacked
  # value and expose it as Col_1. Col_2 is then recomputed from Col_1
  # (5 where Col_1 <= 4, otherwise 0), the key column is discarded, and Col_1
  # is moved to the front.
  df %>%
    gather(key = "key", value = "value", -Col_2) %>%
    arrange(value) %>%
    rename(Col_1 = value) %>%
    mutate(Col_2 = ifelse(Col_1 <= 4, 5, 0)) %>%
    select(-key) %>%
    select(Col_1, everything())

结果：

      Col_1 Col_2
1      1     5
2      2     5
3      3     5
4      4     5
5      5     0
6      6     0
7      7     0
8      8     0
9      9     0
10    10     0
11    11     0
12    12     0
13    13     0
14    14     0
15    15     0
16    16     0

Answer 3

这是一个非常复杂的答案。有些代码有些笨拙，但这是一个通用解决方案。

解决方案

library(tidyverse)
library(magrittr)

# Build a lookup table for the columns of `x`.
# Returns a data frame with one row per column of `x`:
#   New - the column names after make.names(unique = TRUE), i.e. syntactically
#         valid and de-duplicated
#   Old - the column names exactly as they appear on `x`
rel <- function(x) {
  valid_names <- make.names(colnames(x), unique = TRUE)
  lookup <- as.data.frame(valid_names)
  lookup[["Old"]] <- names(x)
  names(lookup) <- c("New", "Old")
  lookup
}

# create lookup table to match old and new column names
lookup <- rel(df)

# original column names, de-duplicated, in order of first appearance
original_names <- unique(as.character(lookup$Old))

# gather df into long format; the columns are temporarily given the unique
# names from the lookup table so each one can be addressed individually
df_long <- df %>%
  setNames(lookup$New) %>%
  gather(var, value)

# original name for every long-format row, found by exact lookup
# (a pattern/substring test would also treat "Col_10" as "Col_1")
column <- as.character(lookup$Old)[match(df_long$var, lookup$New)]

# put original columns into a list: one vector of sorted values per original
# name; the factor levels keep the list in first-appearance order so that it
# lines up with `original_names`
final_list <- df_long %>%
  mutate(var = factor(column, levels = original_names)) %>%
  arrange(var, value) %>%
  split(.$var) %>%
  lapply(function(part) part$value)

# create vectors of zeros to append to original data, one per column, sized
# so that every column reaches the length of the longest one
# (c(0L, ...) keeps max() well defined when there are no columns)
target_length <- max(c(0L, lengths(final_list)))
final_list_extend <- lapply(
  final_list,
  function(x) rep(0, target_length - length(x))
)

# append zeros to original data; Map() keeps the original column names
output <- Map(c, final_list, final_list_extend) %>%
  as_tibble()

#show result
output

# A tibble: 16 x 2
   Col_1 Col_2
   <dbl> <dbl>
 1     1     5
 2     2     5
 3     3     5
 4     4     5
 5     5     0
 6     6     0
 7     7     0
 8     8     0
 9     9     0
10    10     0
11    11     0
12    12     0
13    13     0
14    14     0
15    15     0
16    16     0

数据

# Sample data from the question. read.table() makes duplicated header names
# unique on import (check.names defaults to TRUE), so the duplicated names are
# restored afterwards with setNames(). Only base R is used, so this runs
# before any package is attached.
df <- setNames(
  read.table(header = TRUE, text = "
Col_1 Col_1 Col_1 Col_1 Col_2
  1     2     3     4   5
5     6     7     8   5
9    10    11    12   5
13    14    15    16   5"),
  c("Col_1", "Col_1", "Col_1", "Col_1", "Col_2")
)

如何在数据框中行添加具有相同列名的附加列

3 个答案:

编辑：

解决方案

数据