对数据框中的列求和

时间:2014-10-12 03:48:23

标签: r sum

我在汇总数据时遇到了一些问题。我想按性别汇总每年的诊断数。我尝试使用 aggregate 函数,但得到的输出中没有 Gender == "UNSPECIFIED" 的信息:

# Sums the four Year columns of `dummy` within each Gender level (formula interface).
# NOTE(review): aggregate()'s formula method defaults to na.action = na.omit, which
# would drop every row that has an NA in any of the Year columns before summing --
# likely why UNSPECIFIED is absent from the output below; confirm against ?aggregate.
aggregate( cbind(Year.10,Year.11,Year.12,Year.13) ~ Gender, data = dummy , sum)

> Gender Year.10 Year.11 Year.12 Year.13
  1 FEMALE 1176290 1113480 1039570 1021810
  2   MALE  674020  783150  571170  588660

我又尝试用 tapply 来做,这次输出中出现了 UNSPECIFIED,但 MALE 和 UNSPECIFIED 的结果都是 NA。

# Sums Year.10 within each Gender level of `dummy`.
# Note: na.rm = FALSE sits outside the tapply() call, so it is an argument to
# with(), not to sum; NA values in Year.10 therefore still propagate into the
# per-group sums.
with(dummy, tapply(Year.10, Gender, FUN = sum), na.rm = FALSE)

>  FEMALE        MALE UNSPECIFIED 
  1181980          NA          NA 

如何按性别汇总每年的诊断数?又如何汇总每年所有性别合计的诊断数?

# Print a 15-row random sample of `dummy` (rows drawn without replacement) in
# dput() form so the data can be reproduced. seq_len() is used instead of
# 1:nrow() so an empty data frame does not yield the index vector c(1, 0).
dput(dummy[sample(seq_len(nrow(dummy)), 15, replace = FALSE), ])

structure(list(Gender = structure(c(1L, 2L, 1L, 2L, 1L, 1L, 1L, 
2L, 1L, 1L, 3L, 3L, 3L, 3L, 3L), .Label = c("FEMALE", "MALE", 
"UNSPECIFIED"), class = "factor"), Age = structure(c(4L, 7L, 
7L, 1L, 3L, 5L, 1L, 2L, 2L, 6L, 6L, 8L, 4L, 3L, 2L), .Label = c("0-2", 
"3-9", "10-19", "20-39", "40-59", "60-64", "65+", "UNSP", "0-2", 
"3-9", "10-19", "20-39", "40-59", "60-64", "65+", "UNSP", "3-9", 
"10-19", "20-39", "40-59", "60-64", "65+", "UNSP"), class = "factor"), 
Year.10 = c(380610L, 63360L, 111790L, 16900L, 58440L, 484770L, 
13700L, 41520L, 46890L, 80090L, 2470L, 10920L, NA, 5690L, 
NA), Year.11 = c(387080L, 84750L, 84380L, 22380L, 92870L, 
439860L, 2470L, 38460L, 30150L, 76670L, NA, NA, 5590L, NA, 
2150L), Year.12 = c(291930L, 64810L, 123950L, 2260L, 50900L, 
454200L, 7820L, 21550L, 18020L, 92750L, NA, 5500L, NA, NA, 
NA), Year.13 = c(371290L, 79150L, 71890L, 5860L, 84110L, 
412650L, 2100L, 33680L, 19060L, 60710L, NA, 5700L, 11720L, 
NA, NA), Gender.1 = structure(c(1L, 2L, 1L, 2L, 1L, 1L, 1L, 
2L, 1L, 1L, 3L, 3L, 3L, 3L, 3L), .Label = c("FEMALE", "MALE", 
"UNSPECIFIED"), class = "factor"), Age.1 = structure(c(4L, 
7L, 7L, 1L, 3L, 5L, 1L, 2L, 2L, 6L, 7L, 2L, 5L, 4L, 3L), .Label = c("0-2", 
"3-9", "10-19", "20-39", "40-59", "60-64", "65+", "UNSP"), class = c("ordered", 
"factor")), Year.10.1 = c(356070L, 52240L, 106280L, 16900L, 
52680L, 460340L, 13700L, 35910L, 41370L, 80090L, NA, NA, 
12850L, NA, 5690L), Year.11.1 = c(372370L, 79200L, 73110L, 
22380L, 87280L, 417800L, 2470L, 38460L, 30150L, 76670L, NA, 
NA, NA, 5590L, NA), Year.12.1 = c(280720L, 59190L, 123950L, 
2260L, 50900L, 447400L, 7820L, 21550L, 18020L, 92750L, 3610L, 
NA, 5560L, NA, NA), Year.13.1 = c(356500L, 62130L, 62110L, 
5860L, 78440L, 395700L, 2100L, 30310L, 19060L, 49240L, NA, 
NA, NA, 11720L, NA)), .Names = c("Gender", "Age", "Year.10", 
"Year.11", "Year.12", "Year.13", "Gender.1", "Age.1", "Year.10.1", 
"Year.11.1", "Year.12.1", "Year.13.1"), row.names = c(4L, 15L, 
7L, 9L, 3L, 5L, 1L, 10L, 2L, 6L, 21L, 23L, 19L, 18L, 17L), class = "data.frame")

数据的前几行(head):

 Gender   Age Year.10 Year.11 Year.12 Year.13 Gender.1 Age.1 Year.10.1 Year.11.1 Year.12.1     Year.13.1
1 FEMALE   0-2   13700    2470    7820    2100   FEMALE   0-2     13700      2470      7820      2100
2 FEMALE   3-9   46890   30150   18020   19060   FEMALE   3-9     41370     30150     18020     19060
3 FEMALE 10-19   58440   92870   50900   84110   FEMALE 10-19     52680     87280     50900     78440

数据的后几行(tail):

        Gender   Age Year.10 Year.11 Year.12 Year.13    Gender.1 Age.1 Year.10.1 Year.11.1 Year.12.1
14        MALE 60-64   54780   54400   47960   40600        MALE 60-64     54780     54400     47960
15        MALE   65+   63360   84750   64810   79150        MALE   65+     52240     79200     59190
16        MALE  UNSP      NA      NA      NA    5670        MALE  UNSP        NA        NA        NA
17 UNSPECIFIED   3-9      NA    2150      NA      NA UNSPECIFIED 10-19      5690        NA        NA
18 UNSPECIFIED 10-19    5690      NA      NA      NA UNSPECIFIED 20-39        NA      5590        NA
19 UNSPECIFIED 20-39      NA    5590      NA   11720 UNSPECIFIED 40-59     12850        NA      5560
20 UNSPECIFIED 40-59   12850      NA    5560      NA UNSPECIFIED 60-64      2470        NA        NA
21 UNSPECIFIED 60-64    2470      NA      NA      NA UNSPECIFIED   65+        NA        NA      3610
22 UNSPECIFIED   65+      NA      NA    3610      NA UNSPECIFIED  UNSP     10920        NA        NA
23 UNSPECIFIED  UNSP   10920      NA    5500    5700 UNSPECIFIED   3-9        NA        NA        NA

1 个答案:

答案 0 :(得分:0)

使用dplyr

library(dplyr)

# Keep only the first six columns (Gender, Age and the four Year columns);
# columns 7 to 12 are not part of this question.
dummy1 <- dummy[, 1:6]

# Sum every Year column within each Gender, ignoring NA values so that groups
# containing missing counts still get a total.
# across() replaces the deprecated summarise_each()/funs() pair and keeps the
# original column names (Year.10, Year.11, ...).
dummy1 %>%
  group_by(Gender) %>%
  summarise(across(starts_with("Year"), ~ sum(.x, na.rm = TRUE)))
# Source: local data frame [3 x 5]

#        Gender Year.10 Year.11 Year.12 Year.13
#1      FEMALE 1176290 1113480 1039570 1021810
#2        MALE  121780  145590   88620  118690
#3 UNSPECIFIED   19080    7740    5500   17420

或使用aggregate

 # Base-R alternative: total columns 3 to 6 of dummy1 (the Year columns) for
 # each Gender, skipping NA values.
 aggregate(
   x = dummy1[, 3:6],
   by = list(Gender = dummy1[, "Gender"]),
   FUN = sum,
   na.rm = TRUE
 )
 #        Gender Year.10 Year.11 Year.12 Year.13
 #1      FEMALE 1176290 1113480 1039570 1021810
 #2        MALE  121780  145590   88620  118690
 #3 UNSPECIFIED   19080    7740    5500   17420

更新

对 dummyN(tail 数据)中的 Age 和 Age.1 列进行排序:

 # Rebuild Age.1 and Age as factors with an explicit level order, so that
 # sorting follows the age bands rather than the existing level order.
 dummyN[c("Age.1", "Age")] <- lapply(
   dummyN[c("Age.1", "Age")],
   factor,
   levels = c("0-2", "3-9", "10-19", "20-39", "40-59", "60-64", "65+", "UNSP")
 )

或者也可以使用 ordered() 得到有序因子(参见 ?ordered)。

  # Sort by Gender, then Age, then Age.1, and keep just those three columns.
  # For descending age order use: arrange(Gender, desc(Age), desc(Age.1))
  dummyN %>%
    arrange(Gender, Age, Age.1) %>%
    select(Gender, Age, Age.1)
   #        Gender   Age Age.1
   #1         MALE 60-64 60-64
   #2         MALE   65+   65+
   #3         MALE  UNSP  UNSP
   #4  UNSPECIFIED   3-9 10-19
   #5  UNSPECIFIED 10-19 20-39
   #6  UNSPECIFIED 20-39 40-59
   #7  UNSPECIFIED 40-59 60-64
   #8  UNSPECIFIED 60-64   65+
   #9  UNSPECIFIED   65+  UNSP
   #10 UNSPECIFIED  UNSP   3-9