根据条件计算平均值

时间:2018-04-24 11:19:46

标签: r dataframe average

我有一张数据表

Country ClaimId ClaimItem   ClaimAmt
IN      C1      1           100
IN      C1      2           200
US      C2      1           100
US      C2      2           100
US      C2      3           100
US      C3      1           100
US      C3      2           100
UK      C4      1           100
UK      C4      2           200
UK      C1      1           100
UK      C1      2           200

在这里,我想计算每个国家/地区中每项索赔的平均金额(即该国家/地区的 ClaimAmt 总和除以不同 ClaimId 的个数),以便我的预期表格看起来像

Country ClaimId ClaimItem   ClaimAmt  Avg
IN      C1      1           100       300
IN      C1      2           200       300
US      C2      1           100       250
US      C2      2           100       250
US      C2      3           100       250
US      C3      1           100       250
US      C3      2           100       250
UK      C4      1           100       300
UK      C4      2           200       300
UK      C1      1           100       300
UK      C1      2           200       300

有什么办法可以得到预期的表格吗?谢谢。

以下是样本

> dput(claims)
structure(list(Country = structure(c(1L, 1L, 3L, 3L, 3L, 3L, 
3L, 2L, 2L, 2L, 2L), .Label = c("IN", "UK", "US"), class = "factor"), 
    ClaimId = structure(c(1L, 1L, 2L, 2L, 2L, 3L, 3L, 4L, 4L, 
    1L, 1L), .Label = c("C1", "C2", "C3", "C4"), class = "factor"), 
    ClaimItem = c(1L, 2L, 1L, 2L, 3L, 1L, 2L, 1L, 2L, 1L, 2L), 
    ClaimAmt = c(100L, 200L, 100L, 100L, 100L, 100L, 100L, 100L, 
    200L, 100L, 200L)), .Names = c("Country", "ClaimId", "ClaimItem", 
"ClaimAmt"), class = "data.frame", row.names = c(NA, -11L))

3 个答案:

答案 0 :(得分:2)

以下是data.table的解决方案:

# Sample data from the question: one row per claim item, holding the country
# and claim it belongs to and the amount claimed for that item.
claims <- data.frame(
  Country = factor(
    c("IN", "IN", "US", "US", "US", "US", "US", "UK", "UK", "UK", "UK"),
    levels = c("IN", "UK", "US")
  ),
  ClaimId = factor(
    c("C1", "C1", "C2", "C2", "C2", "C3", "C3", "C4", "C4", "C1", "C1"),
    levels = c("C1", "C2", "C3", "C4")
  ),
  ClaimItem = c(1L, 2L, 1L, 2L, 3L, 1L, 2L, 1L, 2L, 1L, 2L),
  ClaimAmt = c(
    100L, 200L, 100L, 100L, 100L, 100L, 100L, 100L, 200L, 100L, 200L
  )
)

# Turn `claims` into a data.table, then add `Avg` per Country: the total
# ClaimAmt divided by the number of distinct ClaimId values in that country.
library(data.table)
setDT(claims)
claims[, Avg := sum(ClaimAmt) / uniqueN(ClaimId), by = Country]
# The `:=` call above does not print; the empty `[]` displays the result.
claims[]

# > claims[, Avg:=sum(ClaimAmt)/uniqueN(ClaimId), Country][]
#     Country ClaimId ClaimItem ClaimAmt Avg
#  1:      IN      C1         1      100 300
#  2:      IN      C1         2      200 300
#  3:      US      C2         1      100 250
#  4:      US      C2         2      100 250
#  5:      US      C2         3      100 250
#  6:      US      C3         1      100 250
#  7:      US      C3         2      100 250
#  8:      UK      C4         1      100 300
#  9:      UK      C4         2      200 300
# 10:      UK      C1         1      100 300
# 11:      UK      C1         2      200 300

答案 1 :(得分:1)

考虑使用两次基础 R 的 ave 调用并取其比值:按 Country 分组的 ClaimAmt 之和,除以按 Country 分组的唯一 ClaimId 的个数

# Avg = per-country total of ClaimAmt divided by the per-country number of
# distinct ClaimId values; ave() repeats each group result on every row of
# the group, so both vectors line up with the rows of `claims`.
claims$Avg <- local({
  country_total <- ave(claims$ClaimAmt, claims$Country, FUN = sum)
  # ClaimId is converted to its integer codes before counting distinct values.
  distinct_claims <- ave(
    as.integer(claims$ClaimId),
    claims$Country,
    FUN = function(ids) length(unique(ids))
  )
  country_total / distinct_claims
})
claims

#    Country ClaimId ClaimItem ClaimAmt Avg
# 1       IN      C1         1      100 300
# 2       IN      C1         2      200 300
# 3       US      C2         1      100 250
# 4       US      C2         2      100 250
# 5       US      C2         3      100 250
# 6       US      C3         1      100 250
# 7       US      C3         2      100 250
# 8       UK      C4         1      100 300
# 9       UK      C4         2      200 300
# 10      UK      C1         1      100 300
# 11      UK      C1         2      200 300

答案 2 :(得分:0)

# dplyr provides %>%, group_by(), mutate() and ungroup(); it must be attached
# for this snippet to run in a fresh session.
library(dplyr)

# Mean ClaimAmt within each (Country, ClaimId) pair, repeated on every row of
# that pair. Note that this is the mean of the item amounts inside one claim,
# not the per-country total divided by the number of distinct claims.
df <- claims %>%
  group_by(Country, ClaimId) %>%
  mutate(Avg = mean(ClaimAmt)) %>%
  ungroup()  # drop the grouping so later verbs on `df` are not grouped

as.data.frame(df)
    Country ClaimId ClaimItem ClaimAmt Avg
 1       IN      C1         1      100 150
 2       IN      C1         2      200 150
 3       US      C2         1      100 100
 4       US      C2         2      100 100
 5       US      C2         3      100 100
 6       US      C3         1      100 100
 7       US      C3         2      100 100
 8       UK      C4         1      100 150
 9       UK      C4         2      200 150
 10      UK      C1         1      100 150
 11      UK      C1         2      200 150

**更正:**

 # Average claim amount per distinct claim for one country.
 #
 # df:      data frame with columns Country, ClaimId and ClaimAmt.
 # country: a single country value used to select rows of `df`.
 # Returns sum(ClaimAmt) / number of distinct ClaimId among the selected rows.
 avg_test <- function(df, country) {
   # Work on the `df` argument rather than the global `claims`, so the
   # function gives the right answer for whatever data frame is passed in.
   rows <- df[df$Country == country, c("ClaimAmt", "ClaimId")]
   sum(rows$ClaimAmt) / length(unique(rows$ClaimId))
 }

# The whole data frame is needed on every call, so it is captured by the
# anonymous function and only the country varies. Iterating over `claims`
# itself would walk its columns, not its rows. vapply() also guarantees one
# numeric value per row.
claims$Avg <- vapply(
  as.character(claims$Country),
  function(country) avg_test(claims, country),
  numeric(1),
  USE.NAMES = FALSE
)

 > claims
    Country ClaimId ClaimItem ClaimAmt Avg
 1       IN      C1         1      100 300
 2       IN      C1         2      200 300
 3       US      C2         1      100 250
 4       US      C2         2      100 250
 5       US      C2         3      100 250
 6       US      C3         1      100 250
 7       US      C3         2      100 250
 8       UK      C4         1      100 300
 9       UK      C4         2      200 300
 10      UK      C1         1      100 300
 11      UK      C1         2      200 300