我面临着计算长格式数据均值的问题。
我使用的数据集很长很复杂, 所以我将重现4个案例。
dta = structure(list(idno = c(12501, 12501, 12501, 12501, 12501, 12502,
12502, 12502, 12502, 12502, 12502, 12502, 12502, 12502, 12502,
12502, 12502, 12591, 12591, 12591, 12591, 12591, 12591, 12591,
12591, 12591, 12591, 12591, 12591, 12591, 12591, 12591, 12592,
12592, 12592, 12592, 12592), isex = structure(c(1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L), .Label = c("FEMALE", "MALE"), class = "factor"), children = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), variable = structure(c(1L,
2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
144L, 1L, 2L, 3L, 4L, 5L, 135L, 136L, 137L, 138L, 139L, 140L,
141L, 142L, 143L, 144L, 1L, 2L, 3L, 4L, 5L), .Label = c("act1.001",
"act1.002", "act1.003", "act1.004", "act1.005", "act1.006", "act1.007",
"act1.008", "act1.009", "act1.010", "act1.011", "act1.012", "act1.013",
"act1.014", "act1.015", "act1.016", "act1.017", "act1.018", "act1.019",
"act1.020", "act1.021", "act1.022", "act1.023", "act1.024", "act1.025",
"act1.026", "act1.027", "act1.028", "act1.029", "act1.030", "act1.031",
"act1.032", "act1.033", "act1.034", "act1.035", "act1.036", "act1.037",
"act1.038", "act1.039", "act1.040", "act1.041", "act1.042", "act1.043",
"act1.044", "act1.045", "act1.046", "act1.047", "act1.048", "act1.049",
"act1.050", "act1.051", "act1.052", "act1.053", "act1.054", "act1.055",
"act1.056", "act1.057", "act1.058", "act1.059", "act1.060", "act1.061",
"act1.062", "act1.063", "act1.064", "act1.065", "act1.066", "act1.067",
"act1.068", "act1.069", "act1.070", "act1.071", "act1.072", "act1.073",
"act1.074", "act1.075", "act1.076", "act1.077", "act1.078", "act1.079",
"act1.080", "act1.081", "act1.082", "act1.083", "act1.084", "act1.085",
"act1.086", "act1.087", "act1.088", "act1.089", "act1.090", "act1.091",
"act1.092", "act1.093", "act1.094", "act1.095", "act1.096", "act1.097",
"act1.098", "act1.099", "act1.100", "act1.101", "act1.102", "act1.103",
"act1.104", "act1.105", "act1.106", "act1.107", "act1.108", "act1.109",
"act1.110", "act1.111", "act1.112", "act1.113", "act1.114", "act1.115",
"act1.116", "act1.117", "act1.118", "act1.119", "act1.120", "act1.121",
"act1.122", "act1.123", "act1.124", "act1.125", "act1.126", "act1.127",
"act1.128", "act1.129", "act1.130", "act1.131", "act1.132", "act1.133",
"act1.134", "act1.135", "act1.136", "act1.137", "act1.138", "act1.139",
"act1.140", "act1.141", "act1.142", "act1.143", "act1.144"), class = "factor"),
value = structure(c(8L, 9L, 9L, 9L, 9L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("a Sleep",
"c Work", "e Travel/Commute", "f Cooking", "g Housework",
"h Odd jobs", "i Eating", "j Child care", "k Care for others",
"m Leisure", "u TV/Radio", "v Others", "t Visiting/Socialising",
"p Telephone/Online Communication", "b Personal care", "d Studies/library",
"s Religious", "l Shopping", "q Computing/Internet", "n Highbrow",
"o Sport", "r Civic"), class = "factor"), sleep = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10)), .Names = c("idno", "isex", "children", "variable",
"value", "sleep"), row.names = c(1L, 2L, 3L, 4L, 5L, 145L, 146L,
147L, 148L, 149L, 150L, 151L, 152L, 153L, 154L, 155L, 288L, 289L,
290L, 291L, 292L, 293L, 423L, 424L, 425L, 426L, 427L, 428L, 429L,
430L, 431L, 432L, 433L, 434L, 435L, 436L, 437L), class = "data.frame")
前5行看起来像这样。
idno isex children variable value sleep
1 12501 FEMALE 0 act1.001 j Child care 0
2 12501 FEMALE 0 act1.002 k Care for others 0
3 12501 FEMALE 0 act1.003 k Care for others 0
4 12501 FEMALE 0 act1.004 k Care for others 0
5 12501 FEMALE 0 act1.005 k Care for others 0
idno是个人识别符,然后是性别,孩子,时间戳,活动以及我为睡眠创建假人的最后一行。
如果我使用聚合
aggregate(sleep ~ isex, data = dta, FUN = sum)
150 / 2 # 2 women in my dataset
50 / 2 # 2 men in my dataset
我得到mean
女性睡眠时间为75分钟,男性为25分钟。
使用summarise
,我可以按性别计算此类所有活动的sum
。我们可以看到睡眠的总和是正确的。
dta %>%
group_by(isex, value, add = T) %>%
summarise(smn = n()*10)
isex value smn
1 FEMALE a Sleep 150
2 FEMALE j Child care 10
3 FEMALE k Care for others 40
4 MALE a Sleep 50
5 MALE j Child care 120
我的问题是,我现在需要将此smn
和除以正确的n
组(因此,在此最小示例2和2中)。
所以简单地使用mean
无效
dta %>%
group_by(isex, value, add = T) %>%
summarise(smn = n()*10, mean(smn) )
使用distinct
并未向我提供活动
dta %>%
distinct(idno) %>%
group_by(isex, value, add = T) %>%
summarise(smn = n()*10)
所以我需要使用distinct
来获得正确的总和来将我的活动总数除以计算这一活动总和。因此distinct
应该在第一个summarise
之后使用。我不知道该怎么做。
有任何线索吗?
所需的输出
isex value smn mean
1 FEMALE a Sleep 150 75
2 FEMALE j Child care 10 5
3 FEMALE k Care for others 40 20
4 MALE a Sleep 50 25
5 MALE j Child care 120 60
有一件事:我的问题是,在我的原始数据集中,男性/女性(或其他群体)的数量不均匀,因此我想避免手动划分。
答案 0 :(得分:1)
嗯,我想将group_by
重新用于add=TRUE
是一种方式:
dta %>%
group_by(isex) %>%
mutate( ng = n_distinct(idno) ) %>%
group_by(value,add=TRUE) %>%
summarise( smn = 10*n(), mean = 10*n()/ng[1] )
# isex value smn mean
# 1 FEMALE a Sleep 150 75
# 2 FEMALE j Child care 10 5
# 3 FEMALE k Care for others 40 20
# 4 MALE a Sleep 50 25
# 5 MALE j Child care 120 60
也可能有一些方法可以使用其中一个dplyr连接函数。