获取R

时间:2015-12-10 09:43:30

标签: r

假设我有类似这样的数据

Id Name Price sales Profit Month Category Mode Supplier
1    A     2     5      8     1        X    K     John
1    A     2     6      9     2        X    K     John
1    A     2     5      8     3        X    K     John
2    B     2     4      6     1        X    L      Sam
2    B     2     3      4     2        X    L      Sam
2    B     2     5      7     3        X    L      Sam
3    C     2     5     11     1        X    M     John
3    C     2     5     11     2        X    L     John
3    C     2     5     11     3        X    K     John
4    D     2     8     10     1        Y    M     John
4    D     2     8     10     2        Y    K     John
4    D     2     5      7     3        Y    K     John
5    E     2     5      9     1        Y    M      Sam
5    E     2     5      9     2        Y    L      Sam
5    E     2     5      9     3        Y    M      Sam
6    F     2     4      7     1        Z    M     Kyle
6    F     2     5      8     2        Z    L     Kyle
6    F     2     5      8     3        Z    M     Kyle

如果我应用table函数,它只会结合行,结果将是

    K L M
  X 4 4 1
  Y 2 1 3
  Z 0 1 2

现在如果我不想要所有行的总和但只需要那些具有唯一Id的行的总和怎么办? 所以它看起来像

    K L M
  X 2 2 1
  Y 1 1 2
  Z 0 1 1

由于

2 个答案:

答案 0 :(得分:6)

如果df是您的data.frame:

# Subset original data.frame to keep columns of interest
df1 <- df[,c("Id", "Category", "Mode")]
# Remove duplicated rows
df1 <- df1[!duplicated(df1),]
# Create table
with(df1, table(Category, Mode))
#         Mode
# Category K L M
#        X 2 2 1
#        Y 1 1 2
#        Z 0 1 1

或使用unique

在一行中
table(unique(df[c("Id", "Category", "Mode")])[-1])
df <- structure(list(Id = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 
4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L), Name = structure(c(1L, 1L, 1L, 
2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L), .Label = c("A", 
"B", "C", "D", "E", "F"), class = "factor"), Price = c(2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), sales = c(5L, 6L, 5L, 4L, 3L, 5L, 5L, 5L, 5L, 8L, 8L, 5L, 
5L, 5L, 5L, 4L, 5L, 5L), Profit = c(8L, 9L, 8L, 6L, 4L, 7L, 11L, 
11L, 11L, 10L, 10L, 7L, 9L, 9L, 9L, 7L, 8L, 8L), Month = c(1L, 
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 
3L), Category = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("X", "Y", "Z"
), class = "factor"), Mode = structure(c(1L, 1L, 1L, 2L, 2L, 
2L, 3L, 2L, 1L, 3L, 1L, 1L, 3L, 2L, 3L, 3L, 2L, 3L), .Label = c("K", 
"L", "M"), class = "factor"), Supplier = structure(c(1L, 1L, 
1L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L
), .Label = c("John", "Kyle", "Sam"), class = "factor")), .Names = c("Id", 
"Name", "Price", "sales", "Profit", "Month", "Category", "Mode", 
"Supplier"), class = "data.frame", row.names = c(NA, -18L))

答案 1 :(得分:2)

我们可以尝试

library(data.table)
dcast(unique(setDT(df1[c('Category', 'Mode', 'Id')])), 
            Category~Mode, value.var='Id', length)
#   Category K L M
#1:        X 2 2 1
#2:        Y 1 1 2
#3:        Z 0 1 1

dplyr

library(dplyr)
df1 %>% 
  distinct(Id, Category, Mode) %>% 
  group_by(Category, Mode) %>%
  tally() %>% 
  spread(Mode, n, fill=0)
#   Category     K     L     M
#      (chr) (dbl) (dbl) (dbl)
#1        X     2     2     1
#2        Y     1     1     2
#3        Z     0     1     1

或者@David Arenburg建议,上面的变体是

df1 %>% 
   distinct(Id, Category, Mode) %>% 
   select(Category, Mode) %>%
   table()