使用最小条件子集data.table

时间:2014-01-18 01:28:50

标签: r data.table

这个问题可能有一个非常简单的解决方案,但我无法通过Google搜索或data.table常见问题解答找到它。

我有一个像这样的data.table:

> test
    chr     bp          ID   REF       ALT        AF  AC   AN                EFFECT   IMPACT FUNCLASS CODING       GENE      pos effRank
 1:   1 860416  rs61464428     G         A 0.5000000  14   28              UPSTREAM MODIFIER          CODING     SAMD11 1:860416      21
 2:   1 860416  rs61464428     G         A 0.5000000  14   28              UPSTREAM MODIFIER          CODING     SAMD11 1:860416      21
 3:   1 860416  rs61464428     G         A 0.5000000  14   28            DOWNSTREAM MODIFIER          CODING AL645608.1 1:860416      22
 4:   1 860461  rs57465118     G         A 1.0000000  62   62              UPSTREAM MODIFIER          CODING     SAMD11 1:860461      21
 5:   1 860461  rs57465118     G         A 1.0000000  62   62              UPSTREAM MODIFIER          CODING     SAMD11 1:860461      21
 6:   1 860461  rs57465118     G         A 1.0000000  62   62            DOWNSTREAM MODIFIER          CODING AL645608.1 1:860461      22
 7:   1 860521  rs57924093     C         A 0.9840000  61   62              UPSTREAM MODIFIER          CODING     SAMD11 1:860521      21
 8:   1 860521  rs57924093     C         A 0.9840000  61   62              UPSTREAM MODIFIER          CODING     SAMD11 1:860521      21
 9:   1 860521  rs57924093     C         A 0.9840000  61   62            DOWNSTREAM MODIFIER          CODING AL645608.1 1:860521      22
10:   1 861261 rs144896029     G         A 0.0027270   3 1100              UPSTREAM MODIFIER          CODING     SAMD11 1:861261      21
11:   1 861261 rs144896029     G         A 0.0027270   3 1100            DOWNSTREAM MODIFIER          CODING AL645608.1 1:861261      22
12:   1 861332                 G         A 0.0009074   1 1102 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING AL645608.1 1:861332      11
13:   1 861332                 G         A 0.0009074   1 1102 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING     SAMD11 1:861332      11
14:   1 861332                 G         A 0.0009074   1 1102 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING     SAMD11 1:861332      11
15:   1 861332                 G         A 0.0009074   1 1102 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING     SAMD11 1:861332      11
16:   1 861332                 G         A 0.0009074   1 1102              UPSTREAM MODIFIER          CODING     SAMD11 1:861332      21
17:   1 865455                 C         G 0.0033190   3  904              UPSTREAM MODIFIER          CODING     SAMD11 1:865455      21
18:   1 865628  rs41285790     G         A 0.0027780   3 1080 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING     SAMD11 1:865628      11
19:   1 865628  rs41285790     G         A 0.0027780   3 1080 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING     SAMD11 1:865628      11
20:   1 865628  rs41285790     G         A 0.0027780   3 1080 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING     SAMD11 1:865628      11
21:   1 865628  rs41285790     G         A 0.0027780   3 1080     SYNONYMOUS_CODING      LOW   SILENT CODING AL645608.1 1:865628      14
22:   1 865628  rs41285790     G         A 0.0027780   3 1080              UPSTREAM MODIFIER          CODING     SAMD11 1:865628      21
23:   1 866437 rs139076934     C         T 0.0009074   1 1102     SYNONYMOUS_CODING      LOW   SILENT CODING AL645608.1 1:866437      14
24:   1 866437 rs139076934     C         T 0.0009074   1 1102     SYNONYMOUS_CODING      LOW   SILENT CODING     SAMD11 1:866437      14
25:   1 866437 rs139076934     C         T 0.0009074   1 1102     SYNONYMOUS_CODING      LOW   SILENT CODING     SAMD11 1:866437      14
26:   1 866437 rs139076934     C         T 0.0009074   1 1102     SYNONYMOUS_CODING      LOW   SILENT CODING     SAMD11 1:866437      14
27:   1 866461 rs148884928     G         A 0.0009074   1 1102     SYNONYMOUS_CODING      LOW   SILENT CODING     SAMD11 1:866461      14
28:   1 866461 rs148884928     G         A 0.0009074   1 1102     SYNONYMOUS_CODING      LOW   SILENT CODING     SAMD11 1:866461      14
29:   1 866461 rs148884928     G         A 0.0009074   1 1102     SYNONYMOUS_CODING      LOW   SILENT CODING     SAMD11 1:866461      14
30:   1 866461 rs148884928     G         A 0.0009074   1 1102              UPSTREAM MODIFIER          CODING AL645608.1 1:866461      21
31:   1 866511  rs71576583 CCCCT CCCCTCCCT 1.0000000 148  148              UPSTREAM MODIFIER          CODING AL645608.1 1:866511      21
32:   1 871057                 C         T 0.0009074   1 1102              UPSTREAM MODIFIER          CODING     SAMD11 1:871057      21
33:   1 871057                 C         T 0.0009074   1 1102              UPSTREAM MODIFIER          CODING AL645608.1 1:871057      21
34:   1 871057                 C         T 0.0009074   1 1102              UPSTREAM MODIFIER          CODING     SAMD11 1:871057      21
35:   1 871215  rs28419423     C         G 0.0036300   4 1102     SYNONYMOUS_CODING      LOW   SILENT CODING     SAMD11 1:871215      14
36:   1 871215  rs28419423     C         G 0.0036300   4 1102     SYNONYMOUS_CODING      LOW   SILENT CODING     SAMD11 1:871215      14
37:   1 871215  rs28419423     C         G 0.0036300   4 1102              UPSTREAM MODIFIER          CODING     SAMD11 1:871215      21
38:   1 871215  rs28419423     C         G 0.0036300   4 1102              UPSTREAM MODIFIER          CODING     SAMD11 1:871215      21
39:   1 871215  rs28419423     C         G 0.0036300   4 1102              UPSTREAM MODIFIER          CODING AL645608.1 1:871215      21
40:   1 871215  rs28419423     C         G 0.0036300   4 1102            DOWNSTREAM MODIFIER          CODING     SAMD11 1:871215      22
41:   1 871287                 C         G 0.0009107   1 1098              UPSTREAM MODIFIER          CODING     SAMD11 1:871287      21
42:   1 871287                 C         G 0.0009107   1 1098              UPSTREAM MODIFIER          CODING     SAMD11 1:871287      21
43:   1 871287                 C         G 0.0009107   1 1098              UPSTREAM MODIFIER          CODING AL645608.1 1:871287      21
44:   1 871287                 C         G 0.0009107   1 1098            DOWNSTREAM MODIFIER          CODING     SAMD11 1:871287      22
45:   1 871334   rs4072383     G         T 0.6680000 474  710              UPSTREAM MODIFIER          CODING     SAMD11 1:871334      21
46:   1 871334   rs4072383     G         T 0.6680000 474  710              UPSTREAM MODIFIER          CODING     SAMD11 1:871334      21
47:   1 871334   rs4072383     G         T 0.6680000 474  710              UPSTREAM MODIFIER          CODING AL645608.1 1:871334      21
48:   1 871334   rs4072383     G         T 0.6680000 474  710            DOWNSTREAM MODIFIER          CODING     SAMD11 1:871334      22
49:   1 874415  rs74047412     C         T 0.0018250   2 1096              UPSTREAM MODIFIER          CODING     SAMD11 1:874415      21
50:   1 874415  rs74047412     C         T 0.0018250   2 1096              UPSTREAM MODIFIER          CODING     SAMD11 1:874415      21
    chr     bp          ID   REF       ALT        AF  AC   AN                EFFECT   IMPACT FUNCLASS CODING       GENE      pos effRank

如您所见,对于某些列,许多行中的值都是重复的。我想要做的是根据effRank变量的值(min)删除重复的行。我已将密钥设置为chrbpeffRank。所以表格应该根据这三列进行排序。我有点亲近。以下命令返回我想要的行,但不返回我想要的所有列。

> test[,min(effRank), by=pos]
         pos V1
 1: 1:860416 21
 2: 1:860461 21
 3: 1:860521 21
 4: 1:861261 21
 5: 1:861332 11
 6: 1:865455 21
 7: 1:865628 11
 8: 1:866437 14
 9: 1:866461 14
10: 1:866511 21
11: 1:871057 21
12: 1:871215 14
13: 1:871287 21
14: 1:871334 21
15: 1:874415 21

我需要的是一种方法,使上面的命令返回data.table中的所有列,而不仅仅是表达式中提到的列。否则,完美的工作。任何帮助表示赞赏。对于那些自己做出示例的人来说,dput的输出是低于的。

干杯, 戴维

> dput(test)
structure(list(chr = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), bp = c(860416L, 860416L, 860416L, 
860461L, 860461L, 860461L, 860521L, 860521L, 860521L, 861261L, 
861261L, 861332L, 861332L, 861332L, 861332L, 861332L, 865455L, 
865628L, 865628L, 865628L, 865628L, 865628L, 866437L, 866437L, 
866437L, 866437L, 866461L, 866461L, 866461L, 866461L, 866511L, 
871057L, 871057L, 871057L, 871215L, 871215L, 871215L, 871215L, 
871215L, 871215L, 871287L, 871287L, 871287L, 871287L, 871334L, 
871334L, 871334L, 871334L, 874415L, 874415L), ID = structure(c(10L, 
10L, 10L, 8L, 8L, 8L, 9L, 9L, 9L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 
1L, 7L, 7L, 7L, 7L, 7L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 11L, 
1L, 1L, 1L, 5L, 5L, 5L, 5L, 5L, 5L, 1L, 1L, 1L, 1L, 6L, 6L, 6L, 
6L, 12L, 12L), .Label = c("", "rs139076934", "rs144896029", "rs148884928", 
"rs28419423", "rs4072383", "rs41285790", "rs57465118", "rs57924093", 
"rs61464428", "rs71576583", "rs74047412"), class = "factor"), 
    REF = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 
    1L, 3L, 3L, 3L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 1L, 1L), .Label = c("C", 
    "CCCCT", "G"), class = "factor"), ALT = structure(c(1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 
    1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 2L, 4L, 
    4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 
    4L, 4L, 4L), .Label = c("A", "CCCCTCCCT", "G", "T"), class = "factor"), 
    AF = c(0.5, 0.5, 0.5, 1, 1, 1, 0.984, 0.984, 0.984, 0.002727, 
    0.002727, 0.0009074, 0.0009074, 0.0009074, 0.0009074, 0.0009074, 
    0.003319, 0.002778, 0.002778, 0.002778, 0.002778, 0.002778, 
    0.0009074, 0.0009074, 0.0009074, 0.0009074, 0.0009074, 0.0009074, 
    0.0009074, 0.0009074, 1, 0.0009074, 0.0009074, 0.0009074, 
    0.00363, 0.00363, 0.00363, 0.00363, 0.00363, 0.00363, 0.0009107, 
    0.0009107, 0.0009107, 0.0009107, 0.668, 0.668, 0.668, 0.668, 
    0.001825, 0.001825), AC = c(14L, 14L, 14L, 62L, 62L, 62L, 
    61L, 61L, 61L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 
    3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 148L, 1L, 1L, 1L, 
    4L, 4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 474L, 474L, 474L, 
    474L, 2L, 2L), AN = c(28L, 28L, 28L, 62L, 62L, 62L, 62L, 
    62L, 62L, 1100L, 1100L, 1102L, 1102L, 1102L, 1102L, 1102L, 
    904L, 1080L, 1080L, 1080L, 1080L, 1080L, 1102L, 1102L, 1102L, 
    1102L, 1102L, 1102L, 1102L, 1102L, 148L, 1102L, 1102L, 1102L, 
    1102L, 1102L, 1102L, 1102L, 1102L, 1102L, 1098L, 1098L, 1098L, 
    1098L, 710L, 710L, 710L, 710L, 1096L, 1096L), EFFECT = structure(c(4L, 
    4L, 1L, 4L, 4L, 1L, 4L, 4L, 1L, 4L, 1L, 2L, 2L, 2L, 2L, 4L, 
    4L, 2L, 2L, 2L, 3L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 
    4L, 4L, 4L, 3L, 3L, 4L, 4L, 4L, 1L, 4L, 4L, 4L, 1L, 4L, 4L, 
    4L, 1L, 4L, 4L), .Label = c("DOWNSTREAM", "NON_SYNONYMOUS_CODING", 
    "SYNONYMOUS_CODING", "UPSTREAM"), class = "factor"), IMPACT = structure(c(3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 
    3L, 2L, 2L, 2L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 
    3L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L), .Label = c("LOW", "MODERATE", "MODIFIER"
    ), class = "factor"), FUNCLASS = structure(c(1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 
    2L, 2L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 
    1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L), .Label = c("", "MISSENSE", "SILENT"), class = "factor"), 
    CODING = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "CODING", class = "factor"), 
    GENE = structure(c(2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 
    1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 
    2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 
    2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L), .Label = c("AL645608.1", 
    "SAMD11"), class = "factor"), pos = structure(c(1L, 1L, 1L, 
    2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 7L, 
    7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 10L, 11L, 
    11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 
    14L, 14L, 14L, 14L, 15L, 15L), .Label = c("1:860416", "1:860461", 
    "1:860521", "1:861261", "1:861332", "1:865455", "1:865628", 
    "1:866437", "1:866461", "1:866511", "1:871057", "1:871215", 
    "1:871287", "1:871334", "1:874415"), class = "factor"), effRank = c(21L, 
    21L, 22L, 21L, 21L, 22L, 21L, 21L, 22L, 21L, 22L, 11L, 11L, 
    11L, 11L, 21L, 21L, 11L, 11L, 11L, 14L, 21L, 14L, 14L, 14L, 
    14L, 14L, 14L, 14L, 21L, 21L, 21L, 21L, 21L, 14L, 14L, 21L, 
    21L, 21L, 22L, 21L, 21L, 21L, 22L, 21L, 21L, 21L, 22L, 21L, 
    21L)), .Names = c("chr", "bp", "ID", "REF", "ALT", "AF", 
"AC", "AN", "EFFECT", "IMPACT", "FUNCLASS", "CODING", "GENE", 
"pos", "effRank"), row.names = c(NA, -50L), class = c("data.table", 
"data.frame"), .internal.selfref = <pointer: 0x0000000004260788>, sorted = c("chr", 
"bp", "effRank"))

1 个答案:

答案 0 :(得分:8)

您可以使用内部变量.I,它给出行号。然后subset使用这些值,如下所示:

DT[DT[, .I[which.min(effRank)], pos]$V1]

如果您按如下方式将其写成两行,则更容易理解:

tmp <- DT[, .I[which.min(effRank)], pos]
DT[tmp$V1]

第一行生成一列V1,其中包含按j分组的最小位置(来自pos表达式)的所有行号。然后你只需将它们分组。