在dplyr中用mutate()添加hist()$density

时间:2018-01-08 11:34:47

标签: r dplyr histogram mutate

我有一个示例数据集

    # Example data set: a 100-row data.frame (row names 2..101) with five
    # columns -- TimeStamp (POSIXct, tzone "GMT", values 600 seconds apart),
    # MeanWindSpeed, Direction, TurbInt and Temperature.
    # NOTE(review): measurement units are not stated anywhere in this snippet.
    temp=structure(list(TimeStamp = structure(c(1360368000, 1360368600, 
1360369200, 1360369800, 1360370400, 1360371000, 1360371600, 1360372200, 
1360372800, 1360373400, 1360374000, 1360374600, 1360375200, 1360375800, 
1360376400, 1360377000, 1360377600, 1360378200, 1360378800, 1360379400, 
1360380000, 1360380600, 1360381200, 1360381800, 1360382400, 1360383000, 
1360383600, 1360384200, 1360384800, 1360385400, 1360386000, 1360386600, 
1360387200, 1360387800, 1360388400, 1360389000, 1360389600, 1360390200, 
1360390800, 1360391400, 1360392000, 1360392600, 1360393200, 1360393800, 
1360394400, 1360395000, 1360395600, 1360396200, 1360396800, 1360397400, 
1360398000, 1360398600, 1360399200, 1360399800, 1360400400, 1360401000, 
1360401600, 1360402200, 1360402800, 1360403400, 1360404000, 1360404600, 
1360405200, 1360405800, 1360406400, 1360407000, 1360407600, 1360408200, 
1360408800, 1360409400, 1360410000, 1360410600, 1360411200, 1360411800, 
1360412400, 1360413000, 1360413600, 1360414200, 1360414800, 1360415400, 
1360416000, 1360416600, 1360417200, 1360417800, 1360418400, 1360419000, 
1360419600, 1360420200, 1360420800, 1360421400, 1360422000, 1360422600, 
1360423200, 1360423800, 1360424400, 1360425000, 1360425600, 1360426200, 
1360426800, 1360427400), class = c("POSIXct", "POSIXt"), tzone = "GMT"), 
    MeanWindSpeed = c(10, 10, 9.7, 9.8, 9.1, 9.1, 9.3, 9.3, 9.8, 
    9.8, 10.3, 10.4, 10.2, 11, 11.4, 12.1, 11.9, 11.5, 11.3, 
    11.1, 10.9, 11, 11.1, 11.1, 11.1, 11.5, 11.1, 11.1, 10.8, 
    10.7, 10.9, 11.3, 11.6, 11.1, 10.7, 10.2, 10, 9.5, 9.3, 9.2, 
    9.8, 10.4, 11.1, 11.5, 11.8, 11.5, 11.4, 11.1, 11.2, 10.8, 
    9.5, 9, 8.3, 8, 7.6, 8.8, 9.6, 10.7, 10.1, 9.7, 10.1, 10.1, 
    9.9, 9.6, 9.5, 9.3, 9.7, 9.3, 9.3, 9, 9.3, 8.9, 9.4, 9.8, 
    9.8, 9.2, 9.9, 9.3, 9.9, 10, 9.8, 9.2, 8.9, 8.4, 7.9, 8.2, 
    8.1, 8.3, 8.5, 8.6, 8.3, 8.8, 8.1, 8.4, 8, 8.5, 8.6, 8.8, 
    8.2, 8.7), Direction = c(19, 21, 21, 19, 18, 20, 22, 19, 
    19, 15, 13, 11, 8, 10, 11, 12, 14, 16, 17, 17, 17, 17, 18, 
    17, 19, 21, 20, 20, 18, 18, 16, 16, 17, 10, 12, 17, 19, 23, 
    22, 25, 25, 24, 25, 25, 25, 24, 23, 26, 26, 27, 30, 29, 29, 
    28, 29, 31, 31, 31, 31, 30, 33, 32, 32, 30, 31, 33, 33, 32, 
    31, 30, 32, 34, 37, 35, 34, 35, 30, 30, 27, 24, 25, 23, 26, 
    28, 24, 27, 27, 31, 21, 18, 16, 19, 24, 22, 21, 24, 26, 17, 
    20, 16), TurbInt = c(0.01, 0.02, 0.0309, 0.0204, 0.033, 0.022, 
    0.0323, 0.0215, 0.0204, 0.0204, 0.0194, 0.0192, 0.0196, 0.0182, 
    0.0175, 0.0165, 0.0168, 0.0087, 0.0177, 0.009, 0.0183, 0.0182, 
    0.018, 0.009, 0.018, 0.0348, 0.027, 0.018, 0.0185, 0.028, 
    0.0183, 0.0088, 0.0172, 0.018, 0.028, 0.0196, 0.04, 0.0316, 
    0.0215, 0.0217, 0.0204, 0.0288, 0.027, 0.0261, 0.0254, 0.0261, 
    0.0351, 0.027, 0.0268, 0.0278, 0.0421, 0.0556, 0.0602, 0.075, 
    0.0921, 0.1136, 0.0833, 0.0841, 0.0792, 0.0619, 0.0693, 0.0594, 
    0.0606, 0.0833, 0.0632, 0.0753, 0.0722, 0.0538, 0.086, 0.1111, 
    0.0645, 0.1011, 0.0745, 0.102, 0.0918, 0.0978, 0.0808, 0.086, 
    0.101, 0.1, 0.1122, 0.1087, 0.1011, 0.119, 0.1013, 0.122, 
    0.1481, 0.1325, 0.0941, 0.1163, 0.1084, 0.125, 0.1235, 0.119, 
    0.125, 0.1176, 0.1163, 0.0795, 0.122, 0.1034), Temperature = c(19.8, 
    19.5, 19.3, 19.3, 19.2, 19.1, 18.8, 18.7, 18.5, 18.3, 18.4, 
    18.1, 17.9, 17.8, 17.8, 17.9, 17.7, 17.6, 17.6, 17.4, 17.1, 
    17.1, 16.9, 16.9, 16.9, 16.9, 16.9, 16.7, 16.6, 16.6, 16.5, 
    16.2, 16.1, 16, 16, 15.8, 15.6, 15.3, 15.2, 15.3, 15.3, 15.3, 
    15, 14.8, 14.9, 14.9, 14.8, 14.8, 15, 15.6, 16, 16.5, 17.2, 
    17.9, 18.6, 19.3, 19.8, 20.1, 20.5, 21.1, 21.1, 21.4, 21.7, 
    22.2, 22.8, 23.3, 23.6, 23.8, 24.3, 24.9, 24.9, 25.5, 25.8, 
    26.2, 26.6, 27, 27.1, 27.5, 28.2, 28.4, 28.8, 28.9, 29.1, 
    29.5, 29.9, 29.9, 30, 30.2, 30.2, 30.5, 30.6, 30.6, 30.7, 
    30.8, 30.7, 30.7, 30.8, 31, 30.9, 30.9)), .Names = c("TimeStamp", 
"MeanWindSpeed", "Direction", "TurbInt", "Temperature"), row.names = 2:101, class = "data.frame")

我选择风速和湍流强度数据并对风速进行分级:

# Keep only the wind-speed and turbulence-intensity columns, assign every row
# to a 1-wide wind-speed bin (breaks at -0.5, 0.5, ..., 25.5; labels 0..25),
# drop rows containing NA, and leave the result grouped by bin.
dist_turb <- temp %>%
  dplyr::select(matches("MeanWindSpeed|TurbInt")) %>%
  dplyr::mutate(
    tibin = cut(
      MeanWindSpeed,
      breaks = seq(-0.5, 25.5, 1),
      labels = seq(0, 25, 1)
    )
  ) %>%
  na.omit() %>%
  dplyr::group_by(tibin)

要查看每个风速分箱中湍流强度的分布,我可以这样做:

 # Draw one density line of TurbInt per wind-speed bin (coloured by bin, not
 # stacked), pass the ggplot object through ggplotly(), and print the result.
 p <- ggplotly(
   ggplot(dist_turb, aes(x = TurbInt)) +
     stat_density(
       aes(group = tibin, color = tibin),
       position = "identity",
       geom = "line",
       adjust = 6
     )
 )
 p

我怎样才能获得这些密度分布的数据?我尝试在dplyr的mutate()中使用hist(),但它会返回错误:

# Failing attempt: same binning pipeline as before, followed by a grouped
# mutate() that tries to store hist()$density as a new column `den`.
# NOTE: this is the call that raises the reported error -- hist()$density
# yields one value per histogram bin, and that count differs from the number
# of rows in each tibin group.
dist_turb= temp%>%
     dplyr::select(matches("MeanWindSpeed|TurbInt")) %>%
     dplyr::mutate(tibin = cut(MeanWindSpeed, breaks = seq(-0.5, 25.5, 1), labels = seq(0, 25, 1))) %>% na.omit() %>%
     dplyr::group_by(tibin)  %>%
     dplyr::mutate(den = hist(TurbInt, breaks = 6,plot=FALSE)$density) %>% na.omit()

1 个答案:

答案 0 :(得分:0)

问题在于hist()$density返回的元素个数(由直方图实际的分箱数决定,breaks = 6只是一个建议值)与每组的行数不同,而mutate()要求结果的长度等于该组的行数(或为1)。可以先查看每组的行数:

# Count the rows that fall into each wind-speed bin. These group sizes are the
# lengths a grouped mutate() would require from hist()$density.
# (Fixes the namespace typo `dplry::` -> `dplyr::`, which made the call fail
# with "there is no package called 'dplry'".)
temp %>%
  dplyr::select(matches("MeanWindSpeed|TurbInt")) %>%
  dplyr::mutate(tibin = cut(MeanWindSpeed, breaks = seq(-0.5, 25.5, 1),
                            labels = seq(0, 25, 1))) %>%
  na.omit() %>%
  dplyr::group_by(tibin) %>%
  dplyr::summarise(n = n())
# A tibble: 5 x 2
#  tibin      n
#  <fctr> <int>
#1 8         15
#2 9         27
#3 10        27
#4 11        27
#5 12         4

一种方法是在summarise()(或mutate())中把结果包装成list,得到一个列表列:

# For each wind-speed bin, wrap the full vector of hist() densities of TurbInt
# in list(), so every group contributes exactly one list-column entry `den`
# regardless of how many density values hist() returns.
res1 <- temp %>%
  dplyr::select(matches("MeanWindSpeed|TurbInt")) %>%
  dplyr::mutate(
    tibin = cut(
      MeanWindSpeed,
      breaks = seq(-0.5, 25.5, 1),
      labels = seq(0, 25, 1)
    )
  ) %>%
  na.omit() %>%
  dplyr::group_by(tibin) %>%
  dplyr::summarise(
    den = list(hist(TurbInt, breaks = 6, plot = FALSE)$density)
  )

如果我们想要以“长”格式提取列,请使用unnest

# Expand the list-column `den` into "long" format: one row per density value,
# with tibin repeated alongside. The column is named explicitly because
# tidyr >= 1.0 warns when unnest() is called without one; passing it
# positionally also works with older tidyr, where it is taken via `...`.
res1 %>%
  unnest(den)