根据R中的几种条件保留行

时间:2019-02-25 23:32:45

标签: r dplyr tibble

给出以下data.frame

# Reproducible dump of the example data `t2`; the structure(...) expression
# below is what needs to be evaluated (and assigned to `t2`) to rebuild it.
#
# Contents, as encoded in the literal itself:
#   * 44 rows (row.names = c(NA, -44L)), two consecutive rows per `rs.` value.
#   * rs.     - character identifier, e.g. "S1A_494392059".
#   * marker  - character codes "0" / "2" (strings, not numbers).
#   * n       - integer counts.
#   * prop    - double proportions.
#   * BASE    - single-letter character (A/C/G/T).
#   * alleles - character of the form "X/Y", e.g. "C/G".
#   * class is grouped_df/tbl_df/tbl/data.frame with vars = "rs.", i.e. the
#     object is grouped by `rs.` (22 groups of size 2 per `group_sizes`).
#
# NOTE(review): the vars/indices/group_sizes/labels attributes look like the
# grouping metadata written by an older dplyr release -- confirm the object
# is accepted as a valid grouped_df by the installed dplyr version.
dput(t2)
structure(list(rs. = c("S1A_494392059", "S1A_494392059", "S1A_497201550", 
"S1A_497201550", "S1A_499864157", "S1A_499864157", "S1B_566171302", 
"S1B_566171302", "S1B_642616640", "S1B_642616640", "S2B_24883552", 
"S2B_24883552", "S2B_75832544", "S2B_75832544", "S2B_784544719", 
"S2B_784544719", "S4B_644330895", "S4B_644330895", "S5A_548234618", 
"S5A_548234618", "S5B_24292046", "S5B_24292046", "S5B_47584429", 
"S5B_47584429", "S5B_513712393", "S5B_513712393", "S5D_550192169", 
"S5D_550192169", "S6B_17686703", "S6B_17686703", "S6B_459374225", 
"S6B_459374225", "S7A_12011058", "S7A_12011058", "S7A_7938818", 
"S7A_7938818", "S7B_124548883", "S7B_124548883", "S7B_576927863", 
"S7B_576927863", "S7B_605313385", "S7B_605313385", "S7B_733461150", 
"S7B_733461150"), marker = c("0", "2", "0", "2", "0", "2", "0", 
"2", "0", "2", "0", "2", "0", "2", "0", "2", "0", "2", "0", "2", 
"0", "2", "0", "2", "0", "2", "0", "2", "0", "2", "0", "2", "0", 
"2", "0", "2", "0", "2", "0", "2", "0", "2", "0", "2"), n = c(653L, 
1463L, 943L, 1110L, 960L, 1100L, 708L, 1335L, 148L, 1060L, 208L, 
1938L, 785L, 1254L, 402L, 1695L, 722L, 1326L, 872L, 1176L, 694L, 
1381L, 619L, 1432L, 581L, 1462L, 383L, 1707L, 235L, 1894L, 458L, 
1636L, 794L, 1281L, 589L, 1484L, 163L, 1979L, 740L, 920L, 868L, 
1215L, 573L, 1521L), prop = c(0.298992673992674, 0.669871794871795, 
0.431776556776557, 0.508241758241758, 0.43956043956044, 0.503663003663004, 
0.324175824175824, 0.611263736263736, 0.0677655677655678, 0.485347985347985, 
0.0952380952380952, 0.887362637362637, 0.359432234432234, 0.574175824175824, 
0.184065934065934, 0.776098901098901, 0.330586080586081, 0.607142857142857, 
0.399267399267399, 0.538461538461538, 0.317765567765568, 0.632326007326007, 
0.283424908424908, 0.655677655677656, 0.266025641025641, 0.669413919413919, 
0.1753663003663, 0.781593406593407, 0.107600732600733, 0.867216117216117, 
0.20970695970696, 0.749084249084249, 0.363553113553114, 0.586538461538462, 
0.269688644688645, 0.67948717948718, 0.0746336996336996, 0.906135531135531, 
0.338827838827839, 0.421245421245421, 0.397435897435897, 0.556318681318681, 
0.262362637362637, 0.696428571428571), BASE = c("C", "C", "C", 
"C", "T", "T", "A", "A", "G", "G", "A", "A", "G", "G", "A", "A", 
"G", "G", "A", "A", "A", "A", "C", "C", "A", "A", "T", "T", "G", 
"G", "C", "C", "A", "A", "G", "G", "A", "A", "T", "T", "A", "A", 
"T", "T"), alleles = c("C/G", "C/G", "C/T", "C/T", "C/T", "C/T", 
"G/A", "G/A", "A/G", "A/G", "A/G", "A/G", "A/G", "A/G", "G/A", 
"G/A", "G/C", "G/C", "A/G", "A/G", "C/A", "C/A", "T/C", "T/C", 
"A/G", "A/G", "T/C", "T/C", "G/A", "G/A", "C/T", "C/T", "G/A", 
"G/A", "G/C", "G/C", "G/A", "G/A", "C/T", "C/T", "A/G", "A/G", 
"T/C", "T/C")), row.names = c(NA, -44L), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), vars = "rs.", drop = TRUE, indices = list(
0:1, 2:3, 4:5, 6:7, 8:9, 10:11, 12:13, 14:15, 16:17, 18:19, 
20:21, 22:23, 24:25, 26:27, 28:29, 30:31, 32:33, 34:35, 36:37, 
38:39, 40:41, 42:43), group_sizes = c(2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L), biggest_group_size = 2L, labels = structure(list(rs. = 
c("S1A_494392059", 
"S1A_497201550", "S1A_499864157", "S1B_566171302", "S1B_642616640", 
"S2B_24883552", "S2B_75832544", "S2B_784544719", "S4B_644330895", 
"S5A_548234618", "S5B_24292046", "S5B_47584429", "S5B_513712393", 
"S5D_550192169", "S6B_17686703", "S6B_459374225", "S7A_12011058", 
"S7A_7938818", "S7B_124548883", "S7B_576927863", "S7B_605313385", 
"S7B_733461150")), row.names = c(NA, -22L), class = "data.frame", vars = 
"rs.", drop = TRUE))

名为 `t2` 的数据框如下所示(仅显示前 6 行):

# A tibble: 6 x 6
# Groups:   rs. [3]
  rs.           marker     n  prop BASE  alleles
  <chr>         <chr>  <int> <dbl> <chr> <chr>
1 S1A_494392059 0        653 0.299 C     C/G
2 S1A_494392059 2       1463 0.670 C     C/G
3 S1A_497201550 0        943 0.432 C     C/T
4 S1A_497201550 2       1110 0.508 C     C/T
5 S1A_499864157 0        960 0.440 T     C/T
6 S1A_499864157 2       1100 0.504 T     C/T

我真的很想拥有一种巧妙的方法来实现以下条件:

如果 `t2$BASE` 等于 `t2$alleles` 的第一个字符,请保留 `t2$marker` 等于 2 的行。否则,如果 `t2$BASE` 等于 `t2$alleles` 的第三个字符,请保留 `t2$marker` 等于 0 的行。逐行应用这些条件后,所需的 data.frame 的行数应为初始数据框行数的一半。

1 个答案:

答案 0 :(得分:1)

可以尝试:

library(dplyr)

# Keep the rows of `t2` whose `marker` code agrees with the position of
# `BASE` inside `alleles` (a string of the form "X/Y"):
#   * BASE equals character 1 of `alleles` -> keep the row with marker "2"
#   * BASE equals character 3 of `alleles` -> keep the row with marker "0"
#
# Both predicates are evaluated row by row, so grouping adds nothing here;
# ungroup() drops any grouping `t2` carries and the result is a plain tibble.
# Re-apply group_by(rs.) afterwards if later steps need per-`rs.` groups.
#
# `marker` holds character codes, so it is compared with string literals
# instead of relying on implicit numeric-to-character coercion, and the two
# alternatives are parenthesised so the result does not hinge on `&` binding
# tighter than `|`.
t2 %>%
  ungroup() %>%
  filter(
    (BASE == substr(alleles, 1, 1) & marker == "2") |
      (BASE == substr(alleles, 3, 3) & marker == "0")
  )