R中的自动化部门

时间:2014-11-01 22:04:15

标签: r dplyr

这是我目前的手动解决方案。我想知道如何实现我的问题的自动化解决方案。

smarties <- aggregate(Mention_DRGU ~ USC5_CLASS + AGE + year , data = chanko , sum )

keto <- filter(smarties, AGE == "0-2" & year == "2010")[,4]/Actual$ZERO.TWO[1]
keto1 <- filter(smarties, AGE == "0-2" & year == "2011")[,4]/Actual$ZERO.TWO[2]
keto2 <- filter(smarties, AGE == "0-2" & year == "2012")[,4]/Actual$ZERO.TWO[3]
keto3 <- filter(smarties, AGE == "0-2" & year == "2013")[,4]/Actual$ZERO.TWO[4]
blarg <- matrix(c(keto,keto1, keto2,keto3), nrow=9, ncol = 1)
cbind(filter(smarties, AGE == "0-2")[,1:3],blarg)

我一直在阅读与我类似的其他用户问题,我遇到了this我试图实现,但我已经开始使用`tapply函数了。

基本上,如果年龄和年份相似,我希望它与实际data.frame中的相应列分开。

temp <- split(smarties, f = list(smarties$AGE, smarties$year))

dput(Actual)

    structure(list(Region = c("Canada", "Canada", "Canada", "Canada"
), Year = c("2010", "2011", "2012", "2013"), Level = c("ID", 
"PD", "PR", "PP"), Sex = c("3", "3", "3", "3"), Total = c(34005, 
34343, 34754, 35158), X0.2 = c(1139, 1139, 1140, 1145), X3.10 = c(2900, 
2935, 2980, 3024), X11.19 = c(3835, 3792, 3740, 3684), X20.39 = c(9247, 
9325, 9474, 9618), X40.59 = c(10106, 10150, 10185, 10197), X60.64 = c(1982, 
2050, 2071, 2110), X65. = c(4736, 4887, 5099, 5310)), .Names = c("Region", 
"Year", "Level", "Sex", "Total", "X0.2", "X3.10", "X11.19", "X20.39", 
"X40.59", "X60.64", "X65."), row.names = c(NA, -4L), class = "data.frame")

dput(smarties)

structure(list(USC5_CLASS = structure(c(5L, 6L, 7L, 3L, 5L, 6L, 
3L, 5L, 6L, 7L, 12L, 3L, 5L, 6L, 7L, 12L, 2L, 3L, 5L, 6L, 7L, 
8L, 11L, 12L, 3L, 5L, 6L, 7L, 8L, 12L, 2L, 5L, 6L, 7L, 12L, 5L, 
8L, 5L, 6L, 3L, 4L, 5L, 6L, 7L, 8L, 3L, 5L, 6L, 7L, 3L, 4L, 5L, 
6L, 7L, 12L, 2L, 3L, 4L, 5L, 6L, 7L, 9L, 11L, 12L, 3L, 5L, 6L, 
7L, 12L, 2L, 3L, 5L, 6L, 7L, 12L, 3L, 6L, 3L, 5L, 6L, 3L, 5L, 
6L, 2L, 3L, 5L, 6L, 7L, 9L, 12L, 1L, 2L, 3L, 5L, 6L, 7L, 9L, 
12L, 3L, 5L, 6L, 7L, 12L, 2L, 3L, 5L, 6L, 7L, 8L, 12L, 12L, 5L, 
6L, 3L, 5L, 6L, 3L, 5L, 6L, 7L, 2L, 3L, 5L, 6L, 7L, 10L, 12L, 
2L, 3L, 5L, 6L, 7L, 12L, 3L, 5L, 6L, 7L, 8L, 12L, 3L, 5L, 6L, 
7L, 12L, 5L, 6L), .Label = c("15111 TETRACYCLINE", "15112 TETRACYCLINE CONGENERS", 
"15130 CEPHALOSPORIN", "15141 ERYTHROMYCIN", "15142 EXTENDED SPEC MACROLIDES", 
"15152 AMOXICILLIN", "15153 OTHER BRD SPEC PENICILL", "15180 TRIMETHOPRIM COMBS", 
"15190 BRD/MED SPEC OTHER", "15210 PENICILLIN V & VK", "15230 ANTI-STAPH PENICILLIN", 
"15810 QUINOLONES, ORAL"), class = "factor"), AGE = structure(c(1L, 
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 
7L, 7L, 8L, 8L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 
4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 1L, 1L, 2L, 2L, 2L, 3L, 
3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 1L, 1L, 
2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 
5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 8L, 
8L), .Label = c("0-2", "3-9", "10-19", "20-39", "40-59", "60-64", 
"65+", "UNSP"), class = "factor"), year = c(2010L, 2010L, 2010L, 
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2011L, 2011L, 
2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 
2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 
2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 
2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 
2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 
2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 
2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 
2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L), Mention_DRGU = c(5760L, 
19230L, 5610L, 4700L, 36660L, 16070L, 7600L, 32910L, 39850L, 
2230L, 5690L, 49240L, 214790L, 144420L, 27840L, 60320L, 5520L, 
98940L, 216040L, 103030L, 27440L, 10840L, 5500L, 145650L, 5610L, 
44470L, 16510L, 7910L, 2470L, 38280L, 5520L, 61050L, 22340L, 
10390L, 28960L, 5460L, 5460L, 22380L, 2470L, 5980L, 5550L, 11370L, 
23830L, 5840L, 5590L, 14680L, 68290L, 47860L, 2110L, 73700L, 
1960L, 177290L, 177440L, 47750L, 103110L, 16650L, 57340L, 5530L, 
214590L, 143790L, 50590L, 3280L, 5680L, 104760L, 28860L, 47150L, 
5550L, 28680L, 5680L, 10970L, 5590L, 50260L, 28380L, 11480L, 
28800L, 2260L, 7820L, 13750L, 11120L, 14700L, 16840L, 55900L, 
11060L, 5680L, 39920L, 166720L, 156230L, 19990L, 5540L, 60910L, 
5540L, 10960L, 16780L, 205220L, 153830L, 30970L, 3500L, 110530L, 
5540L, 50360L, 44410L, 18140L, 16760L, 5400L, 11080L, 39180L, 
31650L, 16840L, 5530L, 55590L, 1390L, 5860L, 2100L, 2100L, 28660L, 
15460L, 5770L, 22890L, 56150L, 5540L, 17010L, 45740L, 114940L, 
224230L, 37560L, 5670L, 60150L, 11310L, 39620L, 162890L, 178530L, 
57780L, 74110L, 16900L, 15500L, 16850L, 17090L, 5610L, 14490L, 
17020L, 11350L, 45100L, 5700L, 17100L, 11340L, 5670L)), .Names = c("USC5_CLASS", 
"AGE", "year", "Mention_DRGU"), row.names = c(NA, -146L), class = "data.frame")

2 个答案:

答案 0 :(得分:1)

这不是可靠的编程,但似乎有效:

# remove rows not to be analyzed
smarties <- smarties[smarties$AGE != 'UNSP', ]

# get row and column indices
smarties$agenum <- as.numeric(smarties$AGE)
smarties$colnum <- smarties$agenum + 5
smarties$yearnum <- as.numeric(as.factor(smarties$year))

# send col and row index (x and y) to find denominator from Actual df,
# and numerator z
smarties$result <- mapply(FUN = function(x,y,z,df){
  z/df[x,y]
},x = smarties$yearnum,y = smarties$colnum,z = smarties$Mention_DRGU,MoreArgs = list(df = Actual))

这取决于按正确顺序出现的年龄和年份,以及按正确顺序排列的列。

<强>更新

这很容易打破,因为它使用了名字。您可以使用适用于您的任何*适用。

# remove rows not to be analyzed
smarties <- smarties[smarties$AGE != 'UNSP', ]

# rename variable names and fix weird names
smarties$agecol <- paste0('X',gsub(pattern = '\\-|\\+',replacement = '.',x = smarties$AGE))
smarties[smarties$agecol == 'X3.9','agecol'] <- 'X3.10'
smarties[smarties$agecol == 'X10.19','agecol'] <- 'X11.19'

# mapply
smarties$mresult <- mapply(FUN = function(x,y,z,df){
  z/df[df$Year == x,names(df) == y]
},x = smarties$year,y = smarties$agecol,z = smarties$Mention_DRGU,MoreArgs = list(df = Actual))

# or apply
smarties$appresult <- apply(smarties,1,function(x,df){
  as.numeric(x['Mention_DRGU'])/df[df$Year == x['year'],names(df) == x['agecol']]
},df = Actual)

更新2 通过将数据更改为一致的格式,这是一种更好的方法。

修复数据集:

# tidy data aka wide to long format
require(tidyr)
smarties <- smarties[smarties$AGE != 'UNSP', ]
Actual <- gather(Actual,age,secretvalue,-c(Region,Year,Sex,Level,Total))

# fix age mismatches (should actual type them in)
Actual$age <- levels(smarties$AGE)[as.numeric(Actual$age)]

# make names in both datasets the same
names(Actual) <- tolower(names(Actual))
names(smarties) <- tolower(names(smarties))

重组数据后,解决方案很简单

# merge appropriate data since it is 1:1
smartiesmerged <- merge(smarties,Actual[ ,c('year','age','secretvalue')],by = c('year','age'),all.x = T)

# do the calculation
smartiesmerged$result <- smartiesmerged$mention_drgu / smartiesmerged$secretvalue

答案 1 :(得分:0)

我想我用dplyr找到了一种不那么简单的手动方式。

blarb <- melt(Actual)

Lol <- filter(blarb, variable == "X0.2", Year == "2010")
smarties %>%
  filter(AGE == "0-2", year == 2010) %>%
  mutate(Popul = Mention_DRGU/Lol$value)

不幸的是,我仍然必须为每个年龄和年份做这件事,有人知道我是否可以使用group_by进一步自动化这个?