查找每个唯一值的第一次出现并返回其行号

时间:2018-11-06 14:05:58

标签: r dplyr unique

我有这个数据框:

# Example data for the question: a tibble (class "tbl_df") with 79 rows and
# three columns. Name and StimulusName are character columns; Fixation is an
# integer column that contains NA values.
df <- structure(list(Name = c("Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", 
                          "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", 
                          "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", 
                          "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", 
                          "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2", "Sub2"), 
                 StimulusName = c("Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", 
                                  "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", 
                                  "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", 
                                  "Stim2", "Stim2", "Stim2", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", 
                                  "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim1", "Stim2", 
                                  "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", 
                                  "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2", "Stim2"), 
                 Fixation = c(NA, NA, 1L, 1L, NA, NA, 2L, 2L, 3L, 3L, NA, NA, NA, NA, NA, 4L, 4L, 5L, 5L, NA, NA, NA, NA, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
                              2L, NA, NA, NA, 3L, 3L, 3L, NA, NA, NA, NA, NA, NA, 1L, 1L, 1L, 1L, 2L, 2L, NA, NA, 3L, 3L, 3L, 4L, 4L, 4L, NA, NA, 1L, 1L, NA, 
                              NA, 2L, 2L, 3L, 3L, NA, NA, NA, NA, NA, 4L, 4L, 5L, 5L, NA)), 
            row.names = c(NA, -79L), class = c("tbl_df", "tbl", "data.frame"))

共有3列:Name、StimulusName 和 Fixation。

我希望能够返回列 Fixation 中每个唯一值第一次出现(first instance)时所在的行号,并按 Name 和 StimulusName 分组。

这是我到目前为止尝试过的(基于在其他地方找到的部分解决方案):

StimulusName

但是当我将其与dplyr链一起使用时,它不会返回原始行号,而是根据分组重新开始行计数:

# Return the positions (1-based, relative to `Values`) at which each distinct
# value occurs for the first time. NA is treated as a value of its own.
# Note: positions are relative to the vector passed in, so when this is called
# on a per-group slice the numbering restarts at 1 for every group.
Unique_Indices <- function(Values) {
  is_first_occurrence <- !duplicated(Values)
  positions <- seq_along(Values)
  positions[is_first_occurrence]
}

错误的输出如下所示:

enter image description here

我使用的 dplyr 链如下:

library(tidyr)

# This doesn't work
Unique_Index <- df %>% 
  group_by(Name, StimulusName) %>% 
  summarise(Indices = list(Unique_Indices(Fixation))) %>% 
  unnest()

您可以看到,由于使用了 group_by 指令,Indices 在移至下一个 StimulusName 时不再包含原始行号。有没有什么方法可以在使用 group_by 的同时保留 df 中的原始行号?

3 个答案:

答案 0 :(得分:3)

data.table 中有一个特殊变量 .I,它保存的是行号,因此只需对 .I 取子集即可。输出中的 Indices 应该与 @kath 评论里的代码所输出的 rowname 相同。

library(data.table)

# Convert df to a data.table in place.
setDT(df)

# .I holds the row numbers of the whole table, so subsetting it inside each
# (Name, StimulusName) group yields the original row number of the first
# occurrence of every Fixation value (NA counts as a value of its own).
df[, list(Indices = .I[!duplicated(Fixation)]),
   by = list(Name, StimulusName)]


#     Name StimulusName Indices
#  1: Sub1        Stim1       1
#  2: Sub1        Stim1       3
#  3: Sub1        Stim1       7
#  4: Sub1        Stim1       9
#  5: Sub1        Stim1      16
#  6: Sub1        Stim1      18
#  7: Sub1        Stim2      20
#  8: Sub1        Stim2      24
#  9: Sub1        Stim2      28
# 10: Sub1        Stim2      37
# 11: Sub2        Stim1      40
# 12: Sub2        Stim1      46
# 13: Sub2        Stim1      50
# 14: Sub2        Stim1      54
# 15: Sub2        Stim1      57
# 16: Sub2        Stim2      60
# 17: Sub2        Stim2      62
# 18: Sub2        Stim2      66
# 19: Sub2        Stim2      68
# 20: Sub2        Stim2      75
# 21: Sub2        Stim2      77
#     Name StimulusName Indices

答案 1 :(得分:3)

只要先把行名转换成一个单独的列以保留原始索引,就可以直接按组筛选出非重复的 Fixation 值。

library(dplyr)
library(tibble)

# Copy the row names into a "rowname" column *before* grouping so that the
# original row position survives the per-group filtering; then keep only the
# first row of each distinct Fixation value within every group.
df %>%
  rownames_to_column(var = "rowname") %>%
  group_by(Name, StimulusName) %>%
  filter(!duplicated(Fixation))

# A tibble: 21 x 4
# Groups:   Name, StimulusName [4]
#    rowname Name  StimulusName Fixation
#    <chr>   <chr> <chr>           <int>
#  1 1       Sub1  Stim1              NA
#  2 3       Sub1  Stim1               1
#  3 7       Sub1  Stim1               2
#  4 9       Sub1  Stim1               3
#  5 16      Sub1  Stim1               4
#  6 18      Sub1  Stim1               5
#  7 20      Sub1  Stim2              NA
#  8 24      Sub1  Stim2               1
#  9 28      Sub1  Stim2               2
# 10 37      Sub1  Stim2               3
# ... with 11 more rows

在罗纳克·沙(Ronak Shah)的建议下,仅dplyr的解决方案如下所示:

# dplyr-only variant: number the rows in an integer Index column while the data
# is still ungrouped, then keep the first row of each distinct Fixation value
# within every (Name, StimulusName) group.
df %>%
  mutate(Index = seq_len(n())) %>%
  group_by(Name, StimulusName) %>%
  filter(!duplicated(Fixation))

答案 2 :(得分:0)

这里是 base R 的一个选项:

# Keep the first occurrence of every distinct row, drop the third column
# (Fixation) and put the original row numbers in its place. Note that the
# resulting "Fixation" column therefore holds row indices, not fixation values.
cbind(
  df[!duplicated(df), -3, drop = FALSE],
  Fixation = which(!duplicated(df))
)
#   Name StimulusName Fixation
#1  Sub1        Stim1        1
#2  Sub1        Stim1        3
#3  Sub1        Stim1        7
#4  Sub1        Stim1        9
#5  Sub1        Stim1       16
#6  Sub1        Stim1       18
#7  Sub1        Stim2       20
#8  Sub1        Stim2       24
#9  Sub1        Stim2       28
#10 Sub1        Stim2       37
#11 Sub2        Stim1       40
#12 Sub2        Stim1       46
#13 Sub2        Stim1       50
#14 Sub2        Stim1       54
#15 Sub2        Stim1       57
#16 Sub2        Stim2       60
#17 Sub2        Stim2       62
#18 Sub2        Stim2       66
#19 Sub2        Stim2       68
#20 Sub2        Stim2       75
#21 Sub2        Stim2       77