我有大量数据如下
structure(list(chr = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "2",
"20", "21", "22", "3", "4", "5", "6", "7", "8", "9", "X", "Y"
), class = "factor"), leftPos = c(1L, 15001L, 30001L, 45001L,
60001L, 75001L, 90001L, 105001L, 120001L, 135001L, 150001L, 165001L,
180001L, 195001L, 210001L, 225001L, 240001L, 255001L, 270001L,
285001L, 300001L, 315001L, 330001L, 345001L, 360001L, 375001L,
390001L, 405001L, 420001L, 435001L, 450001L, 465001L, 480001L,
495001L, 510001L, 525001L, 540001L, 555001L, 570001L, 585001L,
600001L, 615001L, 630001L, 645001L, 660001L, 675001L, 690001L,
705001L, 720001L, 735001L, 750001L, 765001L, 780001L, 795001L,
810001L, 825001L, 840001L, 855001L, 870001L, 885001L, 900001L,
915001L, 930001L, 945001L, 960001L, 975001L, 990001L, 1005001L,
1020001L, 1035001L), Means = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
0.057, 0.162, -0.088, -0.271, 0.038, 0.089, -0.091, -0.223, 0.091,
-0.023, -0.008, NA, -0.152, -0.228)), .Names = c("chr", "leftPos",
"Means"), row.names = c(NA, 70L), class = "data.frame")
我想对数据进行分组:将 leftPos 按每 1000000 划分为一个区间,并计算每个区间内 Means 列的平均值。
所以我使用了这段代码:
# Bin the rows of NadSWGSv by chromosome and by position, average the Means
# column within each bin, and replace leftPos with a value derived from the
# bin number. The result overwrites NadSWGSv.
NadSWGSv <- NadSWGSv %>%
# binnum is the integer quotient of leftPos by 1500000 (%/% is integer division).
# NOTE(review): the surrounding text asks for bins of 1000000 and the mutate()
# step below multiplies by 120000 -- confirm which bin width is intended.
group_by(chr, binnum = (leftPos) %/% 1500000) %>%
# mean() is called without na.rm = TRUE, so any bin that contains at least one
# NA in Means yields NA for that bin.
summarise(Means = mean(Means)) %>%
# Rebuild a position column from the bin number: bin 0 -> 120000, bin 1 -> 240000, ...
mutate(leftPos = (binnum+1) * 120000) %>%
# Keep only the new position and the per-bin mean.
select(leftPos, Means)
但在我期望得到平均值的地方,它却给了我许多 NA。我不知道为什么。
structure(list(chr = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "2",
"20", "21", "22", "3", "4", "5", "6", "7", "8", "9", "X", "Y"
), class = "factor"), leftPos = c(120000, 240000, 360000, 480000,
6e+05, 720000, 840000, 960000, 1080000, 1200000, 1320000, 1440000,
1560000, 1680000, 1800000, 1920000, 2040000, 2160000, 2280000,
2400000, 2520000, 2640000, 2760000, 2880000, 3e+06, 3120000,
3240000, 3360000, 3480000, 3600000, 3720000, 3840000, 3960000,
4080000, 4200000, 4320000, 4440000, 4560000, 4680000, 4800000,
4920000, 5040000, 5160000, 5280000, 5400000, 5520000, 5640000,
5760000, 5880000, 6e+06, 6120000, 6240000, 6360000, 6480000,
6600000, 6720000, 6840000, 6960000, 7080000, 7200000, 7320000,
7440000, 7560000, 7680000, 7800000, 7920000, 8040000, 8160000,
8280000, 8400000), Means = c(NA, NA, NA, NA, NA, NA, -0.07272,
NA, NA, NA, NA, NA, -0.000940000000000001, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
0.00673, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, -0.11982, NA, NA, -0.10338, -0.17146, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, -0.09175, NA, NA, NA, NA)), .Names = c("chr",
"leftPos", "Means"), class = c("grouped_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -70L), vars = list(chr), drop = TRUE, indices = list(
0:69), group_sizes = 70L, biggest_group_size = 70L, labels = structure(list(
chr = structure(1L, .Label = c("1", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "2", "20", "21", "22",
"3", "4", "5", "6", "7", "8", "9", "X", "Y"), class = "factor")), class = "data.frame", row.names = c(NA,
-1L), .Names = "chr", vars = list(chr)))
答案 0 :(得分:2)
您需要在 summarise 调用中使用 Means = mean(Means, na.rm = TRUE)。
任何含有 NA 的向量,其平均值默认为 NA。我们可以通过向 mean 函数传递参数 na.rm = TRUE 来计算非 NA 值的平均值:
mean(c(1, 2, NA))
[1] NA
mean(c(1, 2, NA), na.rm = TRUE)
[1] 1.5
在您的情况下,Means 列中含有 NA,因此得到的结果是 NA。