我有关于国家/地区的数据,并希望对其进行总结并创建表格。
> head(data)
country year score members
A 1989 0 7
A 1990 0 7
A 1991 0 7
A 1992 0 7
A 1993 0 7
A 1994 0 7
该表应显示国家/地区之间的关系"得分"以及"成员的数量" - 换句话说,我想看看有多少分数为0,1或2的州有"成员"(范围从1到7)。
我想这样设置:
score members==1 members==2 members==3 members==4 members==5 members==6 members==7
0 1 0
1 2 0
2 0 1 and so on..
为此,我运行以下命令:
library(dplyr)
table <- data %>%
group_by(score) %>%
summarise(
m1 = sum(members==1, na.rm=TRUE),
m2 = sum(members==2, na.rm=TRUE),
m3 = sum(members==3, na.rm=TRUE),
m4 = sum(members==4, na.rm=TRUE),
m5 = sum(members==5, na.rm=TRUE),
m6 = sum(members==6, na.rm=TRUE),
m7 = sum(members==7, na.rm=TRUE)
)
这给出了:
score m1 m2 m3 m4 m5 m6 m7
0 0 2 0 0 0 3 30
1 15 3 11 11 3 18 3
2 3 0 2 2 0 6 9
.
.
我需要一点帮助。如您所见,它已计算出观察总数,而我只想计算每个国家一次。
如何汇总这些数据以获得每个成员级别的国家/地区总数?
以下是我的可重复性数据示例:
data <-
structure(list(country = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L), .Label = c("A", "B", "C", "D", "E", "F"), class = "factor"),
year = c(1989L, 1990L, 1991L, 1992L, 1993L, 1994L, 1995L,
1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L,
2005L, 2006L, 2007L, 2008L, 2010L, 1989L, 1990L, 1991L, 1992L,
1993L, 1994L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L,
2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L,
2011L, 1989L, 1991L, 1993L, 1994L, 1995L, 1996L, 1997L, 1999L,
2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L,
2010L, 1989L, 1990L, 1991L, 1992L, 1993L, 1994L, 1995L, 1996L,
1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L,
2006L, 2007L, 2008L, 2009L, 2010L, 2011L, 1991L, 1992L, 1993L,
1994L, 1995L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L,
2004L, 2005L, 2006L, 2007L, 2008L, 2010L, 1991L, 1992L, 1993L,
1994L, 1995L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L,
2004L, 2005L, 2006L, 2007L, 2008L, 2010L), score = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
2L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), members = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L)), .Names = c("country", "year", "score",
"members"), class = "data.frame", row.names = c(NA, -121L))
答案 0 :(得分:4)
我相信你需要这个:
library(reshape2)
dcast(aggregate(country~score+members, data=data, FUN=function(x) length(unique(x))),
score~members, value.var="country", fill=0L)
# score 1 2 3 4 5 6 7
#1 0 0 1 0 0 0 1 2
#2 1 1 1 2 2 1 3 2
#3 2 1 0 1 2 0 1 1
或者,用dplyr
/ tidyr
方式:
data %>%
group_by(members, score) %>%
summarise(n=n_distinct(country)) %>%
spread(members, n, fill=0L)
## A tibble: 3 x 8
# score 1 2 3 4 5 6 7
#* <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 0 0 1 0 0 0 1 2
#2 1 1 1 2 2 1 3 2
#3 2 1 0 1 2 0 1 1
答案 1 :(得分:3)
由于OP正在使用dplyr
方法,我们可以通过将“得分”,“成员”分组以获取元素数量(n()
),然后spread
来实现此目的(来自tidyr
)将其重塑为“宽”格式。
library(dplyr)
library(tidyr)
data %>%
group_by(score, members) %>%
summarise(n = n()) %>%
mutate(members = paste0("m", members)) %>%
spread(members, n, fill = 0)
# score m1 m2 m3 m4 m5 m6 m7
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 0 0 2 0 0 0 3 30
#2 1 15 3 11 11 3 18 3
#3 2 3 0 2 2 0 6 9
如果我们还需要按“国家/地区”获取计数,只需在group_by
data %>%
group_by(country, score, members) %>%
summarise(n = n()) %>%
mutate(members = paste0("m", members)) %>%
spread(members, n, fill = 0)
如果预期输出是其他帖子中显示的输出,则使用data.table
的选项是将'data.frame'转换为'data.table'(setDT(data
)和{ {1}}从'long'到'wide',将dcast
指定为'value.var'变量的fun.aggregate
,即'{1}}返回uniqueN
的'country' “国家/地区”列中的uniqueN
个元素。 length
指定对于那些不可用的组合占用0。默认情况下,它返回NA。
unique
答案 2 :(得分:2)
问题的关键似乎是每年都有重复的行?在这种情况下,您可以使用distinct
删除它们,然后它就是一个简单的交叉表。您可以使用magrittr的%$%
展示管道:
library(dplyr)
library(magrittr)
data %>%
distinct(country, score, members) %$%
table(score, members)
members
score 1 2 3 4 5 6 7
0 0 1 0 0 0 1 2
1 1 1 2 2 1 3 2
2 1 0 1 2 0 1 1
或者来自janitor包的常规管道和crosstab
:
library(dplyr)
library(janitor)
data %>%
distinct(country, score, members) %>%
crosstab(score, members)
score 1 2 3 4 5 6 7
1 0 0 1 0 0 0 1 2
2 1 1 1 2 2 1 3 2
3 2 1 0 1 2 0 1 1