创建标识组内最小字符的列并标记并列(Tie)情况

时间:2015-11-06 13:59:08

标签: r data.table dplyr subset

我有10名受试者的配对数据(其中有些缺失值,也有些并列的情况)。我的目标是为每位受试者选出disc_grade最佳(A > B > C)的那只eye,并在下面的数据框中相应地标记并列情况。

我卡在了如何用R代码为每位受试者选出disc_grade最佳的那一行。

# Example data: 10 patients, two rows each (right eye "R", then left eye "L"),
# with a letter grade per eye; NA marks a missing grade.
df <- structure(
  list(
    patientID = rep(as.numeric(1:10), each = 2),
    eye = rep(c("R", "L"), times = 10),
    disc_grade = c(
      NA, "B", "C", "B", "B", "C", "B", "C", "B", "A",
      "B", "B", "C", "B", NA, NA, "B", "C", "B", "C"
    )
  ),
  class = c("tbl_df", "data.frame"),
  row.names = c(NA, -20L)
)

所需的输出是:

   patientID   eye disc_grade
2          1   L          B
4          2   L          B
5          3   R          B
7          4   R          B
10         5   L          A
11         6   Tie        B
14         7   L          B
17         9   R          B
19        10   R          B

4 个答案:

答案 0 :(得分:3)

这似乎有效:

# Pick each patient's best-graded eye; report "Tie" when both eyes share the
# best grade.
df %>%
  group_by(patientID) %>%
  # Grades compare as strings ("A" < "B" < "C"), so the minimum is the best.
  filter(disc_grade == min(disc_grade, na.rm = TRUE)) %>%
  summarise(
    eye = if (n() > 1) "Tie" else eye,
    disc_grade = disc_grade[1L]
  )

  patientID   eye disc_grade
      (dbl) (chr)      (chr)
1         1     L          B
2         2     L          B
3         3     R          B
4         4     R          B
5         5     L          A
6         6   Tie          B
7         7     L          B
8         9     R          B
9        10     R          B

第8组会产生一个警告,但由于filter处理NA的方式(条件结果为NA的行会被丢弃),我们仍然得到了预期的结果。

使用data.table:

# Per patient: keep the row(s) holding the smallest (best) grade, then reduce
# them to a single row whose eye is "Tie" unless exactly one row was kept.
setDT(df)[,
  {
    best <- .SD[disc_grade == min(disc_grade, na.rm = TRUE)]
    best[, .(eye = if (.N == 1) eye else "Tie", disc_grade = disc_grade[1])]
  },
  by = patientID
]

同样会产生一个警告,但这次第8组确实会得到一行结果,因为`[`不会忽略NA。要解决这个问题,可以在操作之前或之后过滤掉NA(如其他答案所示)。若要在主操作内部直接处理,我能想到的最好办法相当繁琐:

# Per patient: keep the best-graded row(s) and emit one summary row, skipping
# patients that have no usable grade.
setDT(df)[, 
  # which() converts the logical test into row positions and drops NA results,
  # so a patient whose grades are all NA selects zero rows here.
  .SD[ which(disc_grade == min(disc_grade, na.rm=TRUE)) ][,
    # An `if` without `else` evaluates to NULL when no row was selected; the
    # intent is that such a patient contributes no output row.
    if (.N >= 1) list( eye = if (.N==1) eye else "Tie", disc_grade = disc_grade[1] )
  ]
, by=patientID]

答案 1 :(得分:2)

library(data.table)

# 1. Drop rows with a missing grade.
# 2. Per patient, relabel the eye as "Tie" when more than one row remains and
#    all remaining rows share a single grade.
# 3. Sort by grade (A best, then B, then C) and keep the first row per patient.
# 4. Restore patient order.
na.omit(setDT(df))[
  , eye := if (uniqueN(disc_grade) == 1 && .N > 1) "Tie" else eye,
  by = patientID
][
  order(factor(disc_grade, levels = c("A", "B", "C"))),
  .SD[1L],
  by = patientID
][order(patientID)]
#    patientID eye disc_grade
# 1:         1   L          B
# 2:         2   L          B
# 3:         3   R          B
# 4:         4   R          B
# 5:         5   L          A
# 6:         6 Tie          B
# 7:         7   L          B
# 8:         9   R          B
# 9:        10   R          B

使用data.table的一个选项(R代码见上方):
// NOTE(review): this Java aggregation snippet looks unrelated to the R question
// discussed around it — confirm whether it belongs here.
// Builds an aggregation over Foo: a projection stage (including a computed
// "sales" = value * amount) followed by a grouping on "field3" that sums
// "sales" into "totalSales".
Iterator<Foo> aggregate = datastore.createAggregation(Foo.class)
       .project(projection("_id").suppress(),
           projection("field1", "_id"),
           projection("field2"), projection("field3"),
           projection("sales", multiply(projection("value"), projection("amount"))))
      .group("field3", grouping("totalSales", sum("sales")));

答案 2 :(得分:1)

library(dplyr)

# Example data: 10 patients, two rows each (right eye "R", then left eye "L"),
# with a letter grade per eye; NA marks a missing grade.
df <- structure(
  list(
    patientID = rep(as.numeric(1:10), each = 2),
    eye = rep(c("R", "L"), times = 10),
    disc_grade = c(
      NA, "B", "C", "B", "B", "C", "B", "C", "B", "A",
      "B", "B", "C", "B", NA, NA, "B", "C", "B", "C"
    )
  ),
  class = c("tbl_df", "data.frame"),
  row.names = c(NA, -20L)
)



# Pick each patient's best-graded eye, labelling the eye "tie" when both eyes
# share the best grade.
df %>%
  # Drop missing grades first so min() never sees an all-NA group.
  filter(!is.na(disc_grade)) %>%
  group_by(patientID) %>%
  # Grades compare as "A" < "B" < "C", so the minimum is the best grade.
  filter(disc_grade == min(disc_grade)) %>%
  # The condition is a single value per group, so plain if/else is used
  # instead of the vectorised ifelse(); two surviving rows mean a tie.
  mutate(eye_upd = if (n() > 1) "tie" else eye) %>%
  select(patientID, eye_upd, disc_grade) %>%
  # Tied patients now have two identical rows; collapse them to one.
  distinct()

#    patientID eye_upd disc_grade
#        (dbl)   (chr)     (fctr)
# 1         1       L          B
# 2         2       L          B
# 3         3       R          B
# 4         4       R          B
# 5         5       L          A
# 6         6     tie          B
# 7         7       L          B
# 8         9       R          B
# 9        10       R          B

答案 3 :(得分:0)

肯定有更好的方法可以做到这一点,但这可以完成工作......需要更多的咖啡......

# Keep an untouched copy so the per-patient best grade can be joined back to
# the eye(s) that carry it.
df_orig <- df

library(dplyr)

df %>%
  filter(!is.na(disc_grade)) %>%
  group_by(patientID) %>%
  # Grades compare as "A" < "B" < "C", so the minimum is the best grade.
  summarise(best = min(disc_grade)) %>%
  # Recover every eye whose grade equals the patient's best grade.
  left_join(
    df_orig,
    by = c("patientID" = "patientID", "best" = "disc_grade")
  ) %>%
  group_by(patientID) %>%
  # The condition is one value per group, so plain if/else replaces ifelse();
  # more than one matching eye means a tie.
  mutate(eye = if (n() > 1) "tie" else eye) %>%
  # Keep one row per patient. slice() retains all columns, whereas
  # distinct(patientID) in current dplyr drops `eye` and `best`, which made
  # the select() below fail.
  slice(1L) %>%
  select(patientID, eye, best)

注意:由于类型转换,我可以直接使用min(disc_grade)。可以看看as.numeric(as.factor(df$disc_grade))的结果。