遍历执行group_by和邻居分配的列

时间:2019-05-13 13:55:05

标签: r iteration tidyverse spread

这可能是一个常见问题,但是我找不到适合我问题的答案。

我的数据中有一列,其中一些行已经有值,而另一些行则是NA。
如果某个NA行所属的组中至少有一行已被赋值,则该NA行也可以被赋值。
此数据还有其他可用于分组的列,并且这些列具有层次结构,这意味着我们应该按特定顺序、每次只按其中一列进行分组。有利的一点是,这些分组列的名称是从`1`到`0.2`。
我认为这是一种深度优先算法。

structure(list(ID = c("WP_012391491.1/58-334", "WP_045025307.1/57-335", 
"WP_065911868.1/57-334", "WP_094130548.1/57-334", "WP_041093274.1/57-335", 
"WP_087741863.1/58-335", "WP_048735837.1/58-335", "WP_024526760.1/58-335", 
"YP_006375059.1/60-339", "4RK1_A", "WP_081134210.1/58-337", "WP_067481377.1/58-337", 
"WP_023519081.1/58-337", "WP_005918334.1/58-332", "WP_011673819.1/58-332", 
"WP_101874263.1/58-332", "YP_004891129.1/58-332", "WP_021730312.1/58-332", 
"WP_105451130.1/58-332", "WP_105448628.1/58-332", "4RK0_A", "NP_816580.1/58-331", 
"WP_014215863.1/58-333", "WP_014074009.1/57-334", "WP_014939645.1/57-334", 
"WP_057909529.1/57-334", "WP_035168530.1/57-335", "YP_001328142.1/20-321", 
"NP_386672.1/20-321", "NP_437689.1/21-322", "WP_064322056.1/23-326", 
"WP_095444766.1/23-326", "WP_022561933.1/19-321", "WP_060691636.1/22-324", 
"WP_062000852.1/16-317", "4RY9_A", "WP_011809835.1/29-330", "WP_067873970.1/47-338", 
"WP_067195222.1/48-334", "WP_108390182.1/48-337", "WP_088455092.1/48-337", 
"WP_013585042.1/48-336", "WP_094181221.1/110-393", "WP_013118380.1/59-348", 
"WP_078845346.1/48-338", "WP_058920843.1/26-338", "WP_020937919.1/34-340", 
"WP_029381425.1/47-337", "WP_013152136.1/51-349", "WP_079255911.1/49-343", 
"WP_078638163.1/49-341", "WP_064731434.1/47-343", "WP_031033051.1/49-344", 
"WP_095681865.1/49-343", "WP_079255907.1/54-335", "WP_077275989.1/55-331", 
"WP_071455952.1/54-331", "WP_068799469.1/44-336", "WP_108392182.1/49-339", 
"WP_012865347.1/56-345", "WP_014104198.1/48-350", "WP_007397977.1/51-337", 
"WP_087607280.1/49-337", "WP_028700814.1/24-338", "WP_028700389.1/48-340", 
"NP_350138.1/55-335", "3G85_A"), PDB = c(FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
TRUE), ligandId = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, "CL MSE RIB", 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "RIB", NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "GOL TLZ", NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "GOL MSE"), `1` = c(138240, 
50687, 1378, 126023, 237469, 124229, 41453, 217657, 91126, 183150, 
107179, 6274, 262561, 176386, 148744, 74195, 93951, 264634, 95933, 
96095, 183149, 188356, 250907, 252918, 243804, 66122, 207410, 
87437, 181530, 187508, 16752, 81323, 265938, 58828, 13021, 638, 
638, 28566, 4878, 89140, 122445, 155228, 125955, 164812, 120428, 
55374, 257779, 219988, 170251, 116415, 120125, 14694, 208562, 
78746, 116429, 115371, 21301, 27608, 88975, 166418, 252468, 197645, 
123797, 220728, 220653, 182317, 183297), `0.9` = c(115828, 171070, 
47923, 71525, 156529, 98825, 165289, 189247, 31455, 31455, 88210, 
61492, 196793, 131804, 143158, 10327, 10327, 10327, 26521, 26521, 
31058, 31058, 123582, 121348, 186492, 34988, 162176, 25258, 25258, 
138442, 15902, 15902, 196359, 62175, 46251, 147381, 147381, 56135, 
59451, 76614, 102076, 130255, 71453, 114643, 96504, 58676, 197451, 
192775, 114280, 23991, 23991, 53944, 190660, 69970, 97189, 94703, 
43808, 33373, 76685, 111928, 125148, 137147, 98782, 193109, 193143, 
26538, 26538), `0.8` = c(110007, 126908, 41980, 81956, 146463, 
7523, 7523, 7523, 13169, 13169, 13169, 54211, 144263, 30337, 
144486, 68026, 68026, 68026, 69168, 69168, 67041, 67041, 93235, 
93313, 91652, 132429, 139245, 64638, 64638, 62222, 81172, 81172, 
141549, 44105, 47435, 96620, 96620, 54519, 55356, 70400, 84802, 
119491, 82179, 108083, 39098, 131355, 142185, 137368, 106528, 
8338, 8338, 8338, 8338, 83518, 39373, 26484, 28621, 51215, 70429, 
105638, 23426, 23426, 85745, 137611, 137684, 57248, 57248), `0.7` = c(67870, 
102232, 35151, 42345, 92357, 99040, 99040, 99040, 6533, 6533, 
6533, 6533, 6533, 20017, 20017, 47416, 47416, 47416, 48130, 48130, 
82070, 82070, 73319, 73626, 7056, 7056, 7056, 46979, 46979, 81437, 
13586, 13586, 13586, 27032, 26586, 85055, 85055, 13402, 13402, 
15242, 15242, 77617, 42498, 67680, 6951, 6951, 6951, 112867, 
67415, 12983, 12983, 12983, 12983, 12983, 54691, 23945, 25470, 
33887, 49979, 65311, 19335, 19335, 19335, 113208, 113217, 81718, 
81718), `0.6` = c(8366, 8366, 23618, 32642, 70323, 78974, 78974, 
78974, 1269, 1269, 1269, 1269, 1269, 1269, 1269, 1269, 1269, 
1269, 1269, 1269, 1269, 1269, 56028, 56694, 83944, 83944, 83944, 
11860, 11860, 11860, 15130, 15130, 15130, 15130, 25412, 64954, 
64954, 24654, 24654, 38268, 38268, 59264, 32603, 51690, 10955, 
10955, 10955, 10955, 50901, 18836, 18836, 18836, 18836, 18836, 
42361, 6461, 6461, 28216, 38284, 50584, 61467, 61467, 61467, 
11132, 11132, 62398, 62398), `0.5` = c(568, 568, 568, 568, 568, 
568, 568, 568, 28283, 28283, 28283, 28283, 28283, 28283, 28283, 
28283, 28283, 28283, 28283, 28283, 28283, 28283, 43941, 44445, 
66203, 66203, 66203, 1080, 1080, 1080, 1080, 1080, 1080, 1080, 
1080, 1080, 1080, 3884, 3884, 3884, 3884, 3884, 4264, 4264, 8047, 
8047, 8047, 8047, 8047, 14712, 14712, 14712, 14712, 14712, 15093, 
15671, 15671, 21991, 29850, 39515, 48308, 48308, 48308, 68304, 
68304, 48996, 48996), `0.4` = c(2884, 2884, 2884, 2884, 2884, 
2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 
2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 13941, 
13941), `0.3` = c(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1561, 1561, 1561, 1561, 1561, 
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 
1962, 1962), `0.2` = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), class = c("tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -67L))

为解决这个问题,我编写了一个函数并多次调用。

我希望不必多次调用该函数就能完成此操作,并且在可能的情况下无需编写函数。

我的解决方案可以工作,但是要花费很长时间。


那么,有人知道更好的策略吗?

这是输出:

#' Propagate ligand ids to unassigned rows within one grouping column
#'
#' Rows are grouped by `group_col`. In every group that contains at least one
#' `PDB == TRUE` row and at least one non-missing `ligandId`, each row whose
#' `ligandId` is `NA` (and which is not itself a `PDB == TRUE` row) receives
#' the space-separated union of all ligand ids known in that group. Rows that
#' already hold a value, and `PDB == TRUE` rows, are never modified.
#'
#' @param data A data frame with a logical `PDB` column and a `ligandId`
#'   column holding space-separated ligand ids (or `NA`).
#' @param group_col Column to group by, given as a bare name (use backticks
#'   for non-syntactic names such as `0.9`) or as a string.
#' @return `data`, ungrouped, with `ligandId` filled in where possible.
assingn_lig <- function(data, group_col) {
  # Fill the missing ligand ids of a single group.
  fill_group <- function(ligand_id, is_pdb) {
    has_structure <- is_pdb %in% TRUE # NA-safe: a missing PDB flag counts as FALSE
    known <- ligand_id[!is.na(ligand_id)]

    # Nothing to propagate: either no PDB entry in the group, or the PDB
    # entries carry no ligand. Returning early keeps those rows NA instead of
    # overwriting them with the empty string that paste() gives for no input.
    if (!any(has_structure) || length(known) == 0L) {
      return(ligand_id)
    }

    pooled <- paste(
      unique(unlist(stringr::str_split(known, " "))),
      collapse = " "
    )
    ligand_id[is.na(ligand_id) & !has_structure] <- pooled
    ligand_id
  }

  # ensym() accepts both a bare column name and a string.
  group_sym <- rlang::ensym(group_col)
  grouped <- dplyr::group_by(data, !!group_sym)
  filled <- dplyr::mutate(grouped, ligandId = fill_group(ligandId, PDB))
  dplyr::ungroup(filled)
}

# Run assingn_lig() once per grouping column, in the same order as before
# (`1` first, `0.2` last); the result of each pass is the input of the next.
# The column name is injected with `!!` so it reaches ensym() as a string.
Reduce(
  function(current, level) assingn_lig(current, group_col = !!level),
  c("1", "0.9", "0.8", "0.7", "0.6", "0.5", "0.4", "0.3", "0.2"),
  tmp2
)

感谢您的帮助。 预先感谢。

0 个答案:

没有答案