如何计算R中表中每行的唯一值?

时间:2014-08-21 21:41:27

标签: r

我的表g是:

g
      ID     GROUP
1     123      A
2     656      A
3     456      A
4     123      A
5     456      B
6     789      A
7     453      B
8     123      C
9     720      D
10    456      E
11    453      A
12    863      F

我想知道有多少个唯一的GROUP与每个ID相关。


我想获得如下输出。 第x列应该给我一定数量的唯一GROUP

      ID       x
1    123       2      # as there are 2 unique GROUPs: A(twice) and C
2    453       2      # as it is B and A - 2 unique GROUPS
3    456       3      # as it is A, B, E
4    656       1      # as it is A
5    720       1      # as it is D
6    789       1      # as it is A
7    863       1      # as it is F

----------------------------------------------- ----------------------------

尝试解决上述问题的例子:

1

agg<-aggregate(g$GROUP, by=list(ID=g$ID), unique)
agg
      ID        x
1    123     2, 6     # amount of digits in column x tells me how many GROUPs 
2    453     1, 2     # are related to single ID. 
3    456  2, 1, 4     # Numbers stand for: B  A  F  E  D  C
4    656        2                        # 1  2  3  4  5  6
5    720        5                       
6    789        2
7    863        3

agg$x
$`0`
[1] A C
Levels: B A F E D C
$`1`
[1] B A
...
...

2

ggg <- aggregate(g$GROUP, by=list(ID=g$ID), paste, collapse=",")

     ID           x
1    123      A,A,C      # I want to count unique values (A,C)=2
2    453        B,A
3    456      A,B,E
4    656          A
5    720          D
6    789          A
7    863          F

5 个答案:

答案 0 :(得分:3)

library(data.table)

dt = as.data.table(g)

# using `keyby` instead of `by` to get sorted results
dt[, length(unique(GROUP)), keyby = ID]
#    ID V1
#1: 123  2
#2: 453  2
#3: 456  3
#4: 656  1
#5: 720  1
#6: 789  1
#7: 863  1


# or if you want them combined in a string like in OP:
dt[, paste(unique(GROUP), collapse = ","), keyby = ID]
#    ID    V1
#1: 123   A,C
#2: 453   B,A
#3: 456 A,B,E
#4: 656     A
#5: 720     D
#6: 789     A
#7: 863     F

答案 1 :(得分:3)

table(unique(g)$ID)
#123 453 456 656 720 789 863 
#  2   2   3   1   1   1   1 

可替换地,

data.frame(table(unique(g)$ID))
#  Var1 Freq
#1  123    2
#2  453    2
#3  456    3
#4  656    1
#5  720    1
#6  789    1
#7  863    1

答案 2 :(得分:3)

使用sqldf package的一个小技巧:

# I had to rename the 'group'  column to avoid problems
> g <- data.frame(
    id=c(123, 656, 456, 123, 456, 789, 453, 123, 720, 456, 453, 863),
    group_id=c('A', 'A', 'A', 'A', 'B', 'A', 'B', 'C', 'D', 'E', 'A', 'F')
  )

> library(sqldf)

> sqldf('select id, count(distinct group_id) as unique_groups from g group by ID')
   id unique_groups
1 123             2
2 453             2
3 456             3
4 656             1
5 720             1
6 789             1
7 863             1

答案 3 :(得分:3)

在问题中使用aggregate

with(g, aggregate(GROUP ~ ID, FUN = function(x) length(unique(x))))
#   ID GROUP
#1 123     2
#2 453     2
#3 456     3
#4 656     1
#5 720     1
#6 789     1
#7 863     1

dplyr替代方案:

library(dplyr)
g %>% group_by(ID) %>% summarise(count = n_distinct(GROUP))
#Source: local data frame [7 x 2]
#
#   ID count
#1 123     2
#2 453     2
#3 456     3
#4 656     1
#5 720     1
#6 789     1
#7 863     1

答案 4 :(得分:1)

tapply也可以在这里用于简单的代码和干净的输出:

with(g, tapply(GROUP, ID, function(x)length(unique(x))))
123 453 456 656 720 789 863 
  2   2   3   1   1   1   1