粘贴(合并)两个大小不同的数据框中的字符串

时间:2017-03-08 15:21:15

标签: r match extract subset paste

我想把两个数据框 n 和 p 中的字符串粘贴在一起(两者的 dput 输出见文末)。它们的大小不同:nrow(n) = 25,nrow(p) = 20。有两个因子:factor1(二元)和 factor2(整数)。

head(n,3)                       head(p,3)
string   factor1  factor2        string    factor1  factor2
--       --       --             --        --       --
h        f1       5              i         f1       1
h        f1       6              c         f1       2
h        f1       7              c         f1       3
tail(n,3)                       tail(p,3)
string   factor1  factor2        string    factor1  factor2
--       --       --             --        --       --
a        f2       27             h         f2       18
g        f2       28             i         f2       19
b        f2       29             i         f2       20

在这里,我想创建一个数据框

  1. 不遗漏任何因子组合
  2. 在一组因子相同时粘贴n和p的字符串
  3. 如果只有一组唯一的因子可用,请粘贴一个值

我尝试过的代码:

    output <- paste (p - n) # error n an p different length
    output <- merge (p,n, all=T) # merge into one df
    output <- tapply(output, 1, paste) # same error
    output <- tapply(output[which((output$factor == output$factor & output$factor2 == output$factor2 ))], 1, paste) # nonsensical
    

    抱歉缺少“最小代码”......

    ----

    预期输出:

    head(output)                   tail(output)
    string   factor   factor2        string    factor   factor2
    --       --       --             --        --       --
    i        f1       1              g         f2       24
    c        f1       2              e         f1       25
    c        f1       3              j         f1       26
    g        f1       4              a         f2       27
    fh       f1       5              g         f2       28  
    ih       f1       6              b         f2       29  
    

    -----

    > dput(n)
        structure(list(string = structure(c(7L, 7L, 7L, 4L, 5L, 2L, 2L, 
    1L, 4L, 1L, 1L, 2L, 3L, 1L, 4L, 1L, 8L, 8L, 2L, 6L, 5L, 8L, 1L, 
    6L, 2L), .Label = c("a", "b", "c", "d", "e", "g", "h", "j"), class = "factor"), 
        factor = c("f1", "f1", "f1", "f1", "f1", "f1", "f1", "f1", 
        "f1", "f1", "f2", "f2", "f2", "f2", "f2", "f2", "f2", "f2", 
        "f2", "f2", "f1", "f1", "f2", "f2", "f2"), factor2 = 5:29), .Names = c("string", 
    "factor", "factor2"), row.names = c(NA, -25L), class = "data.frame")
    
    > dput(p)
         structure(list(string = structure(c(5L, 1L, 1L, 3L, 2L, 5L, 5L, 
    6L, 4L, 6L, 6L, 5L, 4L, 6L, 6L, 6L, 6L, 4L, 5L, 5L), .Label = c("c", 
    "f", "g", "h", "i", "j"), class = "factor"), factor = c("f1", 
    "f1", "f1", "f1", "f1", "f1", "f1", "f1", "f1", "f1", "f2", "f2", 
    "f2", "f2", "f2", "f2", "f2", "f2", "f2", "f2"), factor2 = 1:20), .Names = c("string", 
    "factor", "factor2"), row.names = c(NA, -20L), class = "data.frame")
    

1 个答案:

答案 0 :(得分:0)

使用 dplyr 和 purrr,我们可以先用 full_join 合并两个数据框,然后对每一行把两个字符串组成向量、去掉其中的 NA,再用 paste 拼接:

library(tidyverse)
# Full join keeps every (factor, factor2) combination found in either `n` or
# `p`; the clashing `string` columns come back as string.x (from n) and
# string.y (from p), with NA where a key exists on one side only.
full_join(n, p, by = c('factor', 'factor2')) %>% 
  # map2_chr() (rather than map2()) returns a plain character vector, so
  # `string` is an ordinary character column instead of a list column.
  # For each row, drop the NA side and concatenate p's string before n's.
  mutate(string = map2_chr(as.character(string.x), as.character(string.y), 
                           ~paste0(na.omit(c(.y, .x)), collapse = ''))) %>% 
  select(-string.x, -string.y)
   factor factor2 string
1      f1       5     fh
2      f1       6     ih
3      f1       7     ih
4      f1       8     jd
5      f1       9     he
6      f1      10     jb
7      f1      11      b
8      f1      12      a
9      f1      13      d
10     f1      14      a
11     f2      15     ja
12     f2      16     jb
13     f2      17     jc
14     f2      18     ha
15     f2      19     id
16     f2      20     ia
17     f2      21      j
18     f2      22      j
19     f2      23      b
20     f2      24      g
21     f1      25      e
22     f1      26      j
23     f2      27      a
24     f2      28      g
25     f2      29      b
26     f1       1      i
27     f1       2      c
28     f1       3      c
29     f1       4      g
30     f2      11      j
31     f2      12      i
32     f2      13      h
33     f2      14      j

在基础 R(base R)中:

# Outer merge keeps every (factor, factor2) key from either data frame; the
# clashing `string` columns become string.x (from n) and string.y (from p),
# with NA where a key exists on one side only.
np <- merge(n, p, by = c('factor', 'factor2'), all = TRUE)
# For each row, drop the NA side and concatenate p's string before n's.
# USE.NAMES = FALSE stops mapply() from naming the result after its first
# (character) argument, so `string` is a plain unnamed character vector.
np$string <- mapply(function(x, y) paste0(na.omit(c(x, y)), collapse = ''), 
                    as.character(np$string.y), as.character(np$string.x),
                    USE.NAMES = FALSE)
# Drop the helper columns by name rather than by position.
np[, !names(np) %in% c('string.x', 'string.y')]