在数据框中的一组内使用dplyr进行操作

时间:2019-03-23 03:10:49

标签: r dplyr

下面,我有以下数据集(以制表符分隔,作为可重现的示例)。我正在使用dplyr函数集将四个列添加到我的数据框中。这些列是标志,用于标记某个观察值是否是其所在组中的最后一个,分组依据分别为1)ID,2)gene和3)gene + exon。后两个我或多或少都已经得到了,但是由于某种原因我无法得到第一个,即如果某个观察值是其ID组中的最后一个,则将其标记。

我的代码为2)gene和3)gene + exon中的最后观察生成标志:

df2 <- df %>% dplyr::group_by(gene) %>% dplyr::mutate(lastObsFlagG = ifelse(is.na(replace(gene, n(), 1)),1,0)) %>% ungroup %>% dplyr::group_by(gene,exon) %>% dplyr::mutate(lastObsFlagGE = ifelse(is.na(replace(gene, n(), 1)),1,0)) %>% data.frame()

对于1)ID,我已经尝试了按ID分组并以row_number()==1作为条件的写法(该代码片段出现在下文“数据集”标题之下),但这在最后一次观察中没有给出值1。


数据集

# Asker's attempt at the per-ID flag. This is a fragment: the data frame that
# is piped into group_by() is not shown here.
# Note that the condition row_number() == 1 is TRUE on the FIRST row of each
# ID group, and that the flag is stored as the strings "1"/"0" rather than as
# numbers.
dplyr::group_by(ID) %>%
dplyr::mutate(lastObsFlagID = ifelse(row_number()==1, "1", "0"))

更新

"ID" "gene" "exon" "mutation" "TCGA-AN-A046" "OR4F5" "E1" 69767 "TCGA-A2-A0CP" "SAMD11" "E2" 925952 "TCGA-A8-A08H" "NOC2L" "E5" 956126 "TCGA-GM-A2DM" "NOC2L" "E4" 956911 "TCGA-GM-A2DM" "NOC2L" "E4" 956912 "TCGA-D8-A1XM" "KLHL17" "E3" 961658 "TCGA-BH-A18G" "KLHL17" "E5" 962441 "TCGA-3C-AALI" "KLHL17" "E8" 963353 "TCGA-AC-A62Y" "KLHL17" "E9" 964004 "TCGA-AR-A2LE" "PLEKHN1" "E1" 966556 "TCGA-E2-A14N" "PLEKHN1" "E5" 970728 "TCGA-AO-A0J4" "PLEKHN1" "E12" 973506 "TCGA-D8-A1J9" "HES4" "E3" 999551 "TCGA-EW-A1PH" "ISG15" "E2" 1014276 "TCGA-A2-A0T0" "AGRN" "E2" 1022338 "TCGA-GM-A2DD" "AGRN" "E3" 1035303 "TCGA-5L-AAT1" "AGRN" "E4" 1040690 "TCGA-OL-A5RW" "AGRN" "E8" 1043314 "TCGA-D8-A27M" "AGRN" "E25" 1049355 "TCGA-AR-A1AI" "AGRN" "E29" 1050430 "TCGA-5L-AAT0" "AGRN" "E36" 1055374 "TCGA-5L-AAT0" "AGRN" "E36" 1055376 "TCGA-C8-A8HP" "AGRN" "E36" 1055442 "TCGA-A7-A4SD" "TTLL10" "E13" 1184971 "TCGA-BH-A1F0" "SDF4" "E4" 1223283 "TCGA-AO-A128" "SDF4" "E4" 1223330 "TCGA-E9-A1R0" "SDF4" "E2" 1228592 "TCGA-A2-A04P" "UBE2J2" "E7" 1255246 "TCGA-C8-A274" "UBE2J2" "E7" 1255342 "TCGA-5L-AAT1" "SCNN1D" "E1" 1281422 "TCGA-AO-A128" "SCNN1D" "E6" 1287116 "TCGA-E2-A15R" "SCNN1D" "E7" 1287596 "TCGA-AC-A62V" "SCNN1D" "E11" 1290543 "TCGA-BH-A18V" "ACAP3" "E22" 1294187 "TCGA-A7-A6VX" "ACAP3" "E6" 1300640 "TCGA-GM-A2DB" "ACAP3" "E3" 1303170 "TCGA-EW-A1IY" "ACAP3" "E3" 1303176 "TCGA-D8-A1XQ" "CPSF3L" "E9" 1313879 "TCGA-5L-AAT1" "CPSF3L" "E9" 1313888 "TCGA-C8-A26Y" "CPSF3L" "E7" 1314919 "TCGA-D8-A1XK" "CPSF3L" "E2" 1321057 "TCGA-AO-A128" "TAS1R3" "E2" 1331863 "TCGA-A8-A07P" "TAS1R3" "E6" 1334323 "TCGA-A7-A0DA" "DVL1" "E14" 1338066 "TCGA-C8-A8HQ" "DVL1" "E10" 1339589 "TCGA-BH-A18T" "DVL1" "E8" 1340130 "TCGA-C8-A12V" "MXRA8" "E6" 1354445 "TCGA-C8-A3M8" "AURKAIP1" "E2" 1374747 "TCGA-BH-A0B6" "CCNL2" "E11" 1387308 "TCGA-A8-A09Z" "CCNL2" "E4" 1395413 "TCGA-AC-A23H" "MRPL20" "E4" 1402084 "TCGA-BH-A1FU" "MRPL20" "E4" 1402116 "TCGA-BH-A0W4" "MRPL20" "E4" 1402194 "TCGA-AR-A1AH" "MRPL20" "E4" 1402205 
"TCGA-A8-A06Q" "ANKRD65" "E1" 1420868 "TCGA-AC-A8OQ" "ATAD3C" "E1" 1450566 "TCGA-A2-A25A" "ATAD3C" "E11" 1462661 "TCGA-AR-A5QQ" "ATAD3B" "E7" 1482563 "TCGA-AO-A1KS" "ATAD3B" "E12" 1487900 "TCGA-AO-A124" "ATAD3B" "E15" 1490662 "TCGA-A7-A56D" "ATAD3B" "E16" 1495857 "TCGA-D8-A27N" "ATAD3B" "E16" 1495961 "TCGA-AR-A2LH" "ATAD3A" "E5" 1518929 "TCGA-EW-A1OY" "ATAD3A" "E12" 1525277 "TCGA-AO-A128" "ATAD3A" "E16" 1533987 "TCGA-BH-A1FM" "SSU72" "E3" 1544993 "TCGA-C8-A12Y" "MIB2" "E4" 1623874 "TCGA-AC-A23H" "MIB2" "E7" 1625321 "TCGA-B6-A0RV" "MIB2" "E10" 1626960 "TCGA-E2-A1LG" "MIB2" "E19" 1629667 "TCGA-C8-A1HI" "SLC35E2B" "E8" 1668398 "TCGA-A2-A0CR" "SLC35E2B" "E8" 1668406 "TCGA-PL-A8LZ" "SLC35E2B" "E7" 1669727 "TCGA-C8-A3M7" "CDK11A" "E17" 1703847 "TCGA-D8-A1JP" "CDK11A" "E17" 1703884 "TCGA-C8-A1HK" "CDK11A" "E17" 1703915 "TCGA-D8-A27G" "CDK11A" "E15" 1704265 "TCGA-5L-AAT1" "CDK11A" "E13" 1705003 "TCGA-D8-A1JE" "CDK11A" "E11" 1707419 "TCGA-BH-A18P" "CDK11A" "E5" 1716352 "TCGA-D8-A27G" "CDK11A" "E5" 1716387 "TCGA-EW-A6SD" "CDK11A" "E5" 1716477 "TCGA-BH-A0W4" "CDK11A" "E3" 1721601 "TCGA-AN-A049" "CDK11A" "E3" 1721603 "TCGA-D8-A1XK" "SLC35E2" "E3" 1739030 "TCGA-LL-A5YM" "SLC35E2" "E1" 1745772 "TCGA-A7-A26H" "NADK" "E5" 1756596 "TCGA-AO-A128" "GNB1" "E5" 1815804 "TCGA-A2-A3Y0" "CALML6" "E4" 1916819 "TCGA-AR-A0U3" "CALML6" "E4" 1916828 "TCGA-AR-A5QQ" "GABRD" "E3" 2025399 "TCGA-A2-A0CP" "PRKCZ" "E8" 2148880 "TCGA-A2-A3Y0" "PRKCZ" "E13" 2172304 "TCGA-E2-A15E" "PRKCZ" "E13" 2172364 "TCGA-BH-A18N" "C1orf86" "E2" 2194056 "TCGA-OL-A5D6" "SKI" "E1" 2229410 "TCGA-BH-A2L8" "SKI" "E3" 2303317 "TCGA-D8-A4Z1" "SKI" "E5" 2304504 "TCGA-BH-A0B6" "SKI" "E5" 2304579 "TCGA-GM-A2D9" "MORN1" "E14" 2321445 "TCGA-AR-A0TR" "MORN1" "E12" 2336532 "TCGA-BH-A18G" "MORN1" "E8" 2372498 "TCGA-AN-A0FV" "MORN1" "E8" 2372505 "TCGA-AN-A0FV" "MORN1" "E8" 2372506 "TCGA-BH-A0HF" "MORN1" "E8" 2372521 "TCGA-E2-A1IG" "MORN1" "E8" 2372562 "TCGA-AQ-A04J" "MORN1" "E7" 2374471 "TCGA-BH-A1F5" "MORN1" "E4" 2387496 
"TCGA-A8-A09Z" "MORN1" "E3" 2388258 "TCGA-BH-A0HA" "MORN1" "E3" 2388291 "TCGA-B6-A0IA" "RER1" "E2" 2397118 "TCGA-E2-A10C" "PEX10" "E3" 2408786 "TCGA-AC-A3HN" "PLCH2" "E1" 2476493 "TCGA-AC-A8OP" "PLCH2" "E3" 2479760 "TCGA-A2-A0YK" "PLCH2" "E5" 2484610 "TCGA-B6-A0IK" "PLCH2" "E11" 2491251 "TCGA-C8-A135" "PLCH2" "E18" 2498827 "TCGA-E2-A15E" "PLCH2" "E19" 2499127 "TCGA-PE-A5DE" "PLCH2" "E22" 2504427 "TCGA-LD-A74U" "PANK4" "E16" 2510680 "TCGA-S3-AA10" "PANK4" "E15" 2511386 "TCGA-C8-A1HM" "PANK4" "E10" 2515609 "TCGA-AC-A23H" "PANK4" "E7" 2519284 "TCGA-BH-A18N" "PANK4" "E4" 2520820 "TCGA-BH-A0HF" "PANK4" "E4" 2520821 "TCGA-BH-A0HF" "PANK4" "E3" 2521173 "TCGA-5L-AAT1" "PANK4" "E3" 2521297 "TCGA-B6-A0RN" "PANK4" "E2" 2521747 "TCGA-5L-AAT1" "PANK4" "E1" 2526579 "TCGA-C8-A12V" "TNFRSF14" "E3" 2558349 "TCGA-OL-A66P" "TNFRSF14" "E6" 2561704 "TCGA-A2-A25F" "TNFRSF14" "E6" 2561714 "TCGA-AC-A23H" "TNFRSF14" "E8" 2563296 "TCGA-A7-A6VV" "MMEL1" "E20" 2592868 "TCGA-D8-A1J8" "MMEL1" "E20" 2592908 "TCGA-A2-A0EU" "MMEL1" "E15" 2596056 "TCGA-AR-A1AJ" "MMEL1" "E11" 2603941 "TCGA-E2-A572" "MMEL1" "E10" 2604271 "TCGA-EW-A1J5" "MMEL1" "E10" 2604275 "TCGA-E2-A574" "MMEL1" "E9" 2605620 "TCGA-AC-A23H" "MMEL1" "E5" 2609733 "TCGA-B6-A0IA" "MMEL1" "E5" 2609757 "TCGA-AN-A0XS" "ACTRT2" "E1" 3021531 "TCGA-E9-A1RC" "ACTRT2" "E1" 3022445 "TCGA-AC-A6IW" "ACTRT2" "E1" 3022597 "TCGA-A2-A4S3" "ACTRT2" "E1" 3022696 "TCGA-A2-A0EX" "PRDM16" "E4" 3385231 "TCGA-B6-A0WZ" "PRDM16" "E5" 3396498 "TCGA-E9-A22E" "PRDM16" "E6" 3402898 "TCGA-BH-A8FZ" "PRDM16" "E9" 3411750 "TCGA-BH-A0HF" "PRDM16" "E9" 3412534 "TCGA-A8-A09V" "PRDM16" "E15" 3431074 "TCGA-5L-AAT1" "ARHGEF16" "E2" 3463558 "TCGA-AR-A1AI" "ARHGEF16" "E4" 3467276 "TCGA-E9-A1R4" "ARHGEF16" "E13" 3479535 "TCGA-AC-A5XS" "MEGF6" "E37" 3490581 "TCGA-AC-A23H" "MEGF6" "E7" 3514608 "TCGA-A2-A0SY" "MEGF6" "E2" 3602584 "TCGA-AC-A23H" "TPRG1L" "E2" 3625459 "TCGA-AR-A1AS" "WRAP73" "E1" 3649987 "TCGA-BH-A0DX" "TP73" "E2" 3682353 "TCGA-A1-A0SF" "TP73" "E3" 3683095 
"TCGA-5L-AAT1" "TP73" "E3" 3683125 "TCGA-A8-A095" "TP73" "E4" 3707673 "TCGA-BH-A0HA" "TP73" "E8" 3727733 "TCGA-E9-A1R5" "CCDC27" "E1" 3752488 "TCGA-BH-A0C0" "CCDC27" "E3" 3755474 "TCGA-GM-A5PV" "CCDC27" "E6" 3762664 "TCGA-A2-A3Y0" "CCDC27" "E9" 3766586 "TCGA-BH-A1F6" "LRRC47" "E7" 3781140 "TCGA-B6-A0WZ" "LRRC47" "E7" 3781267 "TCGA-BH-A0B6" "LRRC47" "E2" 3787022 "TCGA-AR-A2LE" "CEP104" "E19" 3823467 "TCGA-A2-A0EN" "CEP104" "E12" 3833888 "TCGA-A8-A09Z" "CEP104" "E11" 3835070 "TCGA-AC-A4ZE" "DFFB" "E7" 3883608 "TCGA-AR-A1AH" "C1orf174" "E4" 3890056 "TCGA-BH-A18G" "AJAP1" "E2" 4711920 "TCGA-BH-A1F2" "AJAP1" "E3" 4769894 "TCGA-AC-A23H" "AJAP1" "E4" 4772372 "TCGA-C8-A12L" "AJAP1" "E4" 4772486 "TCGA-AC-A23H" "NPHP4" "E28" 5864419 "TCGA-BH-A1FD" "NPHP4" "E28" 5864478 "TCGA-EW-A1IZ" "NPHP4" "E27" 5865131 "TCGA-EW-A1J5" "NPHP4" "E27" 5865172 "TCGA-A2-A04P" "NPHP4" "E26" 5866378 "TCGA-D8-A1XZ" "NPHP4" "E24" 5867849 "TCGA-4H-AAAK" "NPHP4" "E20" 5877217 "TCGA-D8-A1JD" "NPHP4" "E19" 5880178 "TCGA-GM-A2D9" "NPHP4" "E17" 5890905 "TCGA-E2-A1LA" "NPHP4" "E17" 5890987 "TCGA-D8-A1J8" "NPHP4" "E17" 5891009 "TCGA-E2-A1B4" "NPHP4" "E16" 5904787 "TCGA-EW-A1P3" "NPHP4" "E13" 5907125 "TCGA-BH-A0EE" "NPHP4" "E12" 5909177 "TCGA-A2-A0ER" "NPHP4" "E11" 5927714 "TCGA-BH-A0HF" "NPHP4" "E11" 5927761 "TCGA-AO-A128" "NPHP4" "E9" 5947174 "TCGA-AC-A23H" "NPHP4" "E4" 5969131 "TCGA-AC-A23H" "NPHP4" "E2" 5986185 lastObsFlagG看起来像这样。同样,lastObsFlagGE(未显示)应主要为1,但如果观察值不在该唯一ID的最后一行(例如,第4行应具有lastObsFlagID = 0,但第5行应具有lastObsFlagID = 1),则应具有0。

lastObsFlagID

1 个答案:

答案 0 :(得分:1)

我们可以使用相同的逻辑为不同的组生成1/0标志。在这里,我们检查当前组中的row_number()是否等于该组的行数n()(即该行是否为组内的最后一行),这个比较返回TRUE/FALSE,然后使用as.integer将其转换为1/0。

library(dplyr)
library(data.table)

# Add three 1/0 flags marking the last row of each group.
# Within a grouped mutate(), row_number() is the row's position inside its
# group and n() is the group size, so they are equal only on the group's
# last row; as.integer() turns that TRUE/FALSE into 1/0.
df %>%
  # Flag the last observation of every gene.
  group_by(gene) %>%
  mutate(lastObsFlagG = as.integer(n() == row_number())) %>%
  # Flag the last observation of every gene + exon combination.
  group_by(gene, exon) %>%
  mutate(lastObsFlagGE = as.integer(n() == row_number())) %>%
  # Drop the gene/exon grouping before building the run-based grouping on ID.
  ungroup() %>%
  # rleid() (data.table) assigns one id per run of consecutive identical ID
  # values; that id is used as a temporary grouping column.
  group_by(group = rleid(ID)) %>%
  mutate(lastObsFlagID = as.integer(n() == row_number())) %>%
  ungroup() %>%
  # Remove the temporary run-id column again.
  select(-group)


#   ID           gene    exon  mutation lastObsFlagG lastObsFlagGE lastObsFlagID
#  <fct>        <fct>   <fct>    <int>        <int>         <int>         <int>
# 1 TCGA-AN-A046 OR4F5   E1       69767            1             1             1
# 2 TCGA-A2-A0CP SAMD11  E2      925952            1             1             1
# 3 TCGA-A8-A08H NOC2L   E5      956126            0             1             1
# 4 TCGA-GM-A2DM NOC2L   E4      956911            0             0             0
# 5 TCGA-GM-A2DM NOC2L   E4      956912            1             1             1
# 6 TCGA-D8-A1XM KLHL17  E3      961658            0             1             1
# 7 TCGA-BH-A18G KLHL17  E5      962441            0             1             1
# 8 TCGA-3C-AALI KLHL17  E8      963353            0             1             1
# 9 TCGA-AC-A62Y KLHL17  E9      964004            1             1             1
#10 TCGA-AR-A2LE PLEKHN1 E1      966556            0             1             1
# … with 190 more rows