我在汇总数据时遇到了一些问题。我想按性别汇总每年的诊断数。我尝试使用 aggregate 函数,但得到的输出中没有 Gender == "UNSPECIFIED" 的信息:
aggregate( cbind(Year.10,Year.11,Year.12,Year.13) ~ Gender, data = dummy , sum)
> Gender Year.10 Year.11 Year.12 Year.13
1 FEMALE 1176290 1113480 1039570 1021810
2 MALE 674020 783150 571170 588660
我也尝试用 tapply 来做,输出中确实列出了每个性别,但男性(MALE)和未指定(UNSPECIFIED)得到的却是 NA 值。
# NOTE: na.rm = FALSE sits outside the tapply() call, so it is an argument of
# with(), not of sum(). sum() keeps its default and returns NA for any Gender
# group whose Year.10 values contain an NA.
with(dummy, tapply(Year.10, Gender, FUN = sum), na.rm = FALSE)
> FEMALE MALE UNSPECIFIED
1181980 NA NA
我该如何按性别汇总每年的诊断数?又该如何汇总每年所有性别合计的诊断数?
dput(dummy[sample(1:nrow(dummy), 15, replace=FALSE),])
structure(list(Gender = structure(c(1L, 2L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 3L, 3L, 3L, 3L, 3L), .Label = c("FEMALE", "MALE",
"UNSPECIFIED"), class = "factor"), Age = structure(c(4L, 7L,
7L, 1L, 3L, 5L, 1L, 2L, 2L, 6L, 6L, 8L, 4L, 3L, 2L), .Label = c("0-2",
"3-9", "10-19", "20-39", "40-59", "60-64", "65+", "UNSP", "0-2",
"3-9", "10-19", "20-39", "40-59", "60-64", "65+", "UNSP", "3-9",
"10-19", "20-39", "40-59", "60-64", "65+", "UNSP"), class = "factor"),
Year.10 = c(380610L, 63360L, 111790L, 16900L, 58440L, 484770L,
13700L, 41520L, 46890L, 80090L, 2470L, 10920L, NA, 5690L,
NA), Year.11 = c(387080L, 84750L, 84380L, 22380L, 92870L,
439860L, 2470L, 38460L, 30150L, 76670L, NA, NA, 5590L, NA,
2150L), Year.12 = c(291930L, 64810L, 123950L, 2260L, 50900L,
454200L, 7820L, 21550L, 18020L, 92750L, NA, 5500L, NA, NA,
NA), Year.13 = c(371290L, 79150L, 71890L, 5860L, 84110L,
412650L, 2100L, 33680L, 19060L, 60710L, NA, 5700L, 11720L,
NA, NA), Gender.1 = structure(c(1L, 2L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 3L, 3L, 3L, 3L, 3L), .Label = c("FEMALE", "MALE",
"UNSPECIFIED"), class = "factor"), Age.1 = structure(c(4L,
7L, 7L, 1L, 3L, 5L, 1L, 2L, 2L, 6L, 7L, 2L, 5L, 4L, 3L), .Label = c("0-2",
"3-9", "10-19", "20-39", "40-59", "60-64", "65+", "UNSP"), class = c("ordered",
"factor")), Year.10.1 = c(356070L, 52240L, 106280L, 16900L,
52680L, 460340L, 13700L, 35910L, 41370L, 80090L, NA, NA,
12850L, NA, 5690L), Year.11.1 = c(372370L, 79200L, 73110L,
22380L, 87280L, 417800L, 2470L, 38460L, 30150L, 76670L, NA,
NA, NA, 5590L, NA), Year.12.1 = c(280720L, 59190L, 123950L,
2260L, 50900L, 447400L, 7820L, 21550L, 18020L, 92750L, 3610L,
NA, 5560L, NA, NA), Year.13.1 = c(356500L, 62130L, 62110L,
5860L, 78440L, 395700L, 2100L, 30310L, 19060L, 49240L, NA,
NA, NA, 11720L, NA)), .Names = c("Gender", "Age", "Year.10",
"Year.11", "Year.12", "Year.13", "Gender.1", "Age.1", "Year.10.1",
"Year.11.1", "Year.12.1", "Year.13.1"), row.names = c(4L, 15L,
7L, 9L, 3L, 5L, 1L, 10L, 2L, 6L, 21L, 23L, 19L, 18L, 17L), class = "data.frame")
数据头部(head):
Gender Age Year.10 Year.11 Year.12 Year.13 Gender.1 Age.1 Year.10.1 Year.11.1 Year.12.1 Year.13.1
1 FEMALE 0-2 13700 2470 7820 2100 FEMALE 0-2 13700 2470 7820 2100
2 FEMALE 3-9 46890 30150 18020 19060 FEMALE 3-9 41370 30150 18020 19060
3 FEMALE 10-19 58440 92870 50900 84110 FEMALE 10-19 52680 87280 50900 78440
数据尾部(tail):
Gender Age Year.10 Year.11 Year.12 Year.13 Gender.1 Age.1 Year.10.1 Year.11.1 Year.12.1
14 MALE 60-64 54780 54400 47960 40600 MALE 60-64 54780 54400 47960
15 MALE 65+ 63360 84750 64810 79150 MALE 65+ 52240 79200 59190
16 MALE UNSP NA NA NA 5670 MALE UNSP NA NA NA
17 UNSPECIFIED 3-9 NA 2150 NA NA UNSPECIFIED 10-19 5690 NA NA
18 UNSPECIFIED 10-19 5690 NA NA NA UNSPECIFIED 20-39 NA 5590 NA
19 UNSPECIFIED 20-39 NA 5590 NA 11720 UNSPECIFIED 40-59 12850 NA 5560
20 UNSPECIFIED 40-59 12850 NA 5560 NA UNSPECIFIED 60-64 2470 NA NA
21 UNSPECIFIED 60-64 2470 NA NA NA UNSPECIFIED 65+ NA NA 3610
22 UNSPECIFIED 65+ NA NA 3610 NA UNSPECIFIED UNSP 10920 NA NA
23 UNSPECIFIED UNSP 10920 NA 5500 5700 UNSPECIFIED 3-9 NA NA NA
答案 0 :(得分:0)
使用dplyr
# Sum every Year.* column per Gender, dropping NAs so that a group containing
# missing values still gets a total instead of NA.
library(dplyr)

# Keep columns 1:6 only; columns 7 to 12 are not part of this summary.
dummy1 <- dummy[, 1:6]

dummy1 %>%
  group_by(Gender) %>%
  # across() replaces the deprecated summarise_each()/funs() pair; with a
  # single unnamed function the output columns keep their original names.
  summarise(across(starts_with("Year"), function(x) sum(x, na.rm = TRUE)))
# Source: local data frame [3 x 5]
# Gender Year.10 Year.11 Year.12 Year.13
#1 FEMALE 1176290 1113480 1039570 1021810
#2 MALE 121780 145590 88620 118690
#3 UNSPECIFIED 19080 7740 5500 17420
或使用aggregate
# Base-R alternative: per-Gender sums of columns 3:6 of dummy1.
# na.rm = TRUE is forwarded by aggregate() to sum().
aggregate(
  x = dummy1[, 3:6],
  by = list(Gender = dummy1[, "Gender"]),
  FUN = sum,
  na.rm = TRUE
)
# Gender Year.10 Year.11 Year.12 Year.13
#1 FEMALE 1176290 1113480 1039570 1021810
#2 MALE 121780 145590 88620 118690
#3 UNSPECIFIED 19080 7740 5500 17420
对 tail 数据 dummyN 中的 Age 和 Age.1 列进行排序:
# Rebuild Age.1 and Age as factors with an explicit level order
# (age bands listed from "0-2" up to "65+", with "UNSP" last).
dummyN[c("Age.1", "Age")] <- lapply(
  dummyN[c("Age.1", "Age")],
  factor,
  levels = c("0-2", "3-9", "10-19", "20-39", "40-59", "60-64", "65+", "UNSP")
)
或者您也可以使用 ordered()(参见 ?ordered)来获取有序因子。
# Show only the Gender, Age and Age.1 columns, sorted ascending by Gender,
# then Age, then Age.1. For descending age order use instead:
#   arrange(Gender, desc(Age), desc(Age.1))
dummyN %>%
  select(Gender, Age, Age.1) %>%
  arrange(Gender, Age, Age.1)
# Gender Age Age.1
#1 MALE 60-64 60-64
#2 MALE 65+ 65+
#3 MALE UNSP UNSP
#4 UNSPECIFIED 3-9 10-19
#5 UNSPECIFIED 10-19 20-39
#6 UNSPECIFIED 20-39 40-59
#7 UNSPECIFIED 40-59 60-64
#8 UNSPECIFIED 60-64 65+
#9 UNSPECIFIED 65+ UNSP
#10 UNSPECIFIED UNSP 3-9