Question

我有一个包含许多变量的数据框，我想把其中的变量成对连接起来，并存入同一数据框的新变量中。我的数据框 df 的简化版本如下：

first.1 second.1 first.2 second.2 
1222 3223 3333 1221 
1111 2212 2232 2113

这是我在没有for循环的情况下效率低下的方法：

# Join each first/second pair with "-" into a new column, one pair per line.
df[["concatenated.1"]] <- paste(df[["first.1"]], df[["second.1"]], sep = "-")
df[["concatenated.2"]] <- paste(df[["first.2"]], df[["second.2"]], sep = "-")

这将导致以下数据帧 df ：

first.1 second.1 first.2 second.2 concatenated.1 concatenated.2 
1222 3223 3333 1221 1222-3223 3333-1221 
1111 2212 2232 2113 1111-2212 2232-2113

我要连接的变量对多于2对，所以我想在for循环中这样做：

# Placeholder from the question: the loop body that should build
# concatenated.i from first.i and second.i is what is being asked for.
for (i in 1:2){
??
}

关于如何实现此目标的任何想法？

Answer 1

如果您能找到一种对列进行分组的方法，那就容易多了。例如，根据提供的示例，我们可以根据列名的最后一个字符（1、1、2、2）对列进行分组。

使用基础 R（base R），我们用 split.default 根据列名对列进行分组（如上所述），然后对每个组按行执行 paste，并将结果添加为新列。

# Group columns by the last character of their names ("1", "1", "2", "2" for
# the example data). Only a single character is used as the key, so suffixes
# of 10 or more would need a different key.
group_names <- substring(names(df), nchar(names(df)))

# split.default() returns the groups ordered by sorted group label, which is
# not necessarily the order in which the labels first appear in names(df).
# The assignment below is positional, so the new column names are taken from
# the split result itself to keep every concatenated column paired with its
# own group.
column_groups <- split.default(df, group_names)
df[paste0("concatenated.", names(column_groups))] <-
  lapply(column_groups, function(x) do.call(paste, c(x, sep = "-")))

df
#  first.1 second.1 first.2 second.2 concatenated.1 concatenated.2
#1    1222     3223    3333     1221      1222-3223      3333-1221
#2    1111     2212    2232     2113      1111-2212      2232-2113

Answer 2

如果实际数据中的名称遵循本示例数据中所示的清晰模式，则Ronak的split / lapply答案可能是最好的。如果没有，则可以仅创建名称的向量，然后将Map与paste一起使用。

# Name vectors: the result columns and the two sides of every pair.
new.names <- sprintf("concatenated.%d", 1:2)
names.1 <- sprintf("first.%d", 1:2)
names.2 <- sprintf("second.%d", 1:2)

# Walk the two column lists in parallel and join each pair with "-".
df[new.names] <- Map(
  function(lhs, rhs) paste(lhs, rhs, sep = "-"),
  df[names.1],
  df[names.2]
)

df

#   first.1 second.1 first.2 second.2 concatenated.1 concatenated.2
# 1    1222     3223    3333     1221      1222-3223      3333-1221
# 2    1111     2212    2232     2113      1111-2212      2232-2113

Answer 3

这是一个 tidyverse 解决方案，可助您一臂之力。唯一的区别是，输出的列按字母顺序排列，即“concatenated”、“first”、“second”。

# Sample data from the question, kept as text so the example is self-contained.
txt <- 'first.1 second.1 first.2 second.2 
1222 3223 3333 1221 
1111 2212 2232 2113'

# Spell out TRUE: T is an ordinary variable that can be reassigned.
df <- read.table(text = txt, header = TRUE)

library(tidyverse)

# Reshape to one row per (row, pair), build the "-"-joined value, then reshape
# back to one column per variable/pair combination.
# NOTE(review): gather()/spread() are superseded by pivot_longer()/pivot_wider()
# in current tidyr; kept here so the result matches the output shown below.
df2 <- df %>%
  mutate(row.num = row_number()) %>%
  gather(key = variable, value = value, -row.num) %>%
  separate(col = variable, into = c("order", "pair")) %>%
  spread(key = order, value = value) %>%
  mutate(concatenated = paste(first, second, sep = "-")) %>%
  gather(key = variable, value = value, -row.num, -pair) %>%
  unite(col = name, variable, pair) %>%
  spread(key = name, value = value)

  row.num concatenated_1 concatenated_2 first_1 first_2 second_1 second_2
1       1      1222-3223      3333-1221    1222    3333     3223     1221
2       2      1111-2212      2232-2113    1111    2232     2212     2113

Answer 4

library(tidyverse)

[编辑：原始解决方案未正确使用starts_with]

此解决方案使用ends_with()选择适当的列，然后使用unite将它们与-分隔符组合在一起：

# Example data from the question, entered row by row.
df <- tribble(
  ~first.1, ~second.1, ~first.2, ~second.2,
  1222, 3223, 3333, 1221,
  1111, 2212, 2232, 2113
)

# For each suffix, keep only the columns ending in it and collapse them into
# a single "-"-separated column.
df1 <- unite(select(df, ends_with("1")), concatenated.1, sep = "-")
df2 <- unite(select(df, ends_with("2")), concatenated.2, sep = "-")

# Put the new columns next to the original ones.
cbind(df, df1, df2)

Answer 5

您可以使用 stringi 程序包中的函数 stri_join，它的速度非常快。

library(data.table)
library(stringi)

# Example data from the question, read from an inline multi-line string.
df <- fread("first.1 second.1 first.2 second.2 
             1222 3223 3333 1221 
             1111 2212 2232 2113")

# Names of the columns to create, one per pair.
cols <- sprintf("concatenated_%d", 1:2)

# Pair up the "first" and "second" columns and join each pair with "-",
# adding the results by reference.
df[, (cols) := Map(
  function(lhs, rhs) stri_join(lhs, rhs, sep = "-"),
  list(first.1, first.2),
  list(second.1, second.2)
)]

# Convert back to a plain data.frame in place.
setDF(df)

first.1 second.1 first.2 second.2 concatenated_1 concatenated_2
1    1222     3223    3333     1221      1222-3223      3333-1221
2    1111     2212    2232     2113      1111-2212      2232-2113

连接具有相同后缀的变量对

5 个答案: