如何在 R 中根据另一个数据框的规则为每一行赋值

时间:2016-08-05 18:08:41

标签: r

我有一个数据框:

# Input data: one row per observation; `group` selects which rule row of df2
# applies, X1..X7 hold single-letter codes.
df <- read.table(text="group X1  X2  X3  X4  X5  X6  X7
P1  H   H   H   H   H   H   H
P1  C   D   C   D   B   C   C
P1  D   C   B   A   C   D   H
P1  D   C   B   A   C   D   D
P1  C   D   C   D   B   C   D
P2  C   D   B   D   C   D   C
P2  H   H   H   H   H   H   H
P2  D   C   C   A   B   C   D
P3  C   D   C   D   B   C   C
P3  H   H   H   H   H   H   H
P3  C   D   C   D   B   C   C
P3  D   C   B   A   C   D   D", header = TRUE, stringsAsFactors = FALSE)

我有另一个数据框:

# Rule table: for each Group, `col` lists the columns of df to concatenate,
# `R` and `S` are the space-separated strings that map to labels "R" and "S".
df2 <- read.table(text="Group    col R   S
P1  'X2 X4 X7'  'C A D' 'D D C'
P2  'X2 X3 X4 X6'   'C C A C'   'D B D D'
P3  'X3 X5 X6 X7'   'B C D D'   'C B C C'", header = TRUE, stringsAsFactors = FALSE)

我想添加一个名为 "assign" 的列,用来保存基于 df2 得到的分配结果。例如,如果 df$group == "P1",则只拼接 df 中由 df2$col 的 "P1" 行列出的那些列。如果这些列全都是同一个字母 "H",则在 "assign" 列中赋值 "H";如果拼接结果与 df2$R 列中的字符串匹配,则赋值 "R";如果与 df2$S 列中的字符串匹配,则赋值 "S";如果以上三种情况都不符合,则赋值 "U"。我已经在 "P1" 组上测试了我的脚本,但我不知道如何把得到的值写回 df 并完成循环。感谢任何帮助。我希望结果如下:

# Desired output: the input rows plus an `assign` column holding H / R / S / U.
# Row 3 keeps the input's trailing "H" (X2 X4 X7 = "C A H"), which matches
# neither R nor S for P1 and is therefore labelled "U".
df <- read.table(text="group 1 2 3 4 5 6 7 assign
P1 H H H H H H H H
P1 C D C D B C C S
P1 D C B A C D H U
P1 D C B A C D D R
P1 C D C D B C D U
P2 C D B D C D C S
P2 H H H H H H H H
P2 D C C A B C D R
P3 C D C D B C C S
P3 H H H H H H H H
P3 C D C D B C C S
P3 D C B A C D D R
", header = TRUE, stringsAsFactors = FALSE) 

1 个答案:

答案 0 :(得分:1)

您可以使用data.table并分三步解决问题:

  1. 合并两个 data.table

  2. 这是关键步骤:构建一个用于后续匹配的模式(pattern)。很酷的是,在 data.table 中我们可以按组(by)为 .SDcols 指定数量不固定的列

  3. 构建assign变量

以下是代码:

    # Label every row of df with H / R / S / U according to the per-group rules
    # in df2. Needs the data.table package; df and df2 are the data frames
    # defined in the question.
    library(data.table)

    # data
    dt <- data.table(df)
    dt2 <- data.table(df2)

    # df calls its key column `group` while df2 calls it `Group`;
    # merge(by = "Group") needs the same name on both sides, so rename it in
    # the dt copy (df itself is left untouched).
    if ("group" %in% names(dt)) {
      setnames(dt, "group", "Group")
    }

    # add col_idx, a list(!) of column indices, to dt2 for each Group
    dt3 <- dt2[, list(col_name = strsplit(col, " ")[[1]]), by = Group]
    dt3[, col_idx := match(col_name, names(dt))]
    dt3 <- dt3[, list(col_idx = list(col_idx)), by = Group]
    dt2 <- merge(dt2, dt3, by = "Group")

    # solution: attach the rule columns (col, R, S, col_idx) to every data row
    dt <- merge(x = dt,
                y = dt2,
                by = "Group")

    # column names of the merged table, used to translate col_idx back to names
    idx_matching_table <- names(dt)

    # a: using strings
    # For each (Group, col) pair, paste the columns named in `col` row by row
    # (space separated) so the result is directly comparable with R and S.
    dt[,
       j = pattern := {
         .SD[, do.call("paste", c(.SD)), .SDcols = strsplit(col, " ")[[1]]]
       },
       by = list(Group, col)]

    # b: using indices
    dt[,
       j = pattern_2 := {
         # .SD has fewer cols than dt (the by-columns are dropped), so find
         # out what the integer index of col_idx in .SD is:
         col_idx_sd <- match(idx_matching_table[col_idx[[1]]], names(.SD))
         .SD[, do.call("paste", c(.SD)), .SDcols = col_idx_sd]
       },
       by = list(Group, col)]
    dt[, identical(pattern, pattern_2)] # TRUE

    # Default label is "U"; the more specific labels overwrite it.
    dt[, assign := "U"]
    # Anchored so that only patterns made up entirely of "H" match; the
    # unanchored '[H ]+H' also matched e.g. "C A H" through its " H" tail.
    dt[pattern %like% "^H( H)*$", assign := "H"]
    dt[pattern == R, assign := "R"]
    dt[pattern == S, assign := "S"]
    

编辑:我将 apply(.SD, 1, paste, collapse = ' ') 替换为 do.call('paste', c(.SD)),以避免把数据强制转换为 matrix。