根据二元变量(On/Off)的连续取值对数据进行分组,并计算均值和标准差(sd)

时间:2017-12-27 12:06:11

标签: r dataframe

我有一个数据框 df,如下所示:

        # Print large numeric values in full instead of scientific notation.
        options(scipen = 999)

        # Example data: 12 rows sharing one imei value, an On/Off flag (ign),
        # and unixTime values spaced 60000 apart.
        df <- data.frame(
          imei = rep(35745407328, times = 12),
          # Runs of ign: 3 x Off, 2 x On, 2 x Off, 5 x On.
          ign = rep(c("Off", "On", "Off", "On"), times = c(3, 2, 2, 5)),
          unixTime = seq(from = 1514313014000, by = 60000, length.out = 12)
        )

DF

        ----------------------------------
        imei            ign unixTime
        ----------------------------------
        35745407328     Off 1514313014000
        ----------------------------------
        35745407328     Off 1514313074000
        ----------------------------------
        35745407328     Off 1514313134000 
        ----------------------------------
        35745407328     On  1514313194000 
        ----------------------------------
        35745407328     On  1514313254000
        ----------------------------------
        35745407328     Off 1514313314000
        ----------------------------------
        35745407328     Off 1514313374000
        ----------------------------------
        35745407328     On  1514313434000
        ----------------------------------
        35745407328     On  1514313494000
        ----------------------------------
        35745407328     On  1514313554000
        ----------------------------------
        35745407328     On  1514313614000
        ----------------------------------
        35745407328     On  1514313674000
        ----------------------------------

我想根据 'ign' 的连续取值对上述数据进行分组(每当 ign 的值发生变化,就开始一个新的组),

然后对每个组计算 unixTime 的平均值和标准偏差,

以及该组中 unixTime 的第一个值与最后一个值之差。

用于计算平均值、sd 和 diff 的分组如下:

        ----------------------------------
         imei           ign unixTime
        ----------------------------------
        35745407328     Off 1514313014000
        ----------------------------------
        35745407328     Off 1514313074000
        ----------------------------------
        35745407328     Off 1514313134000 
        ----------------------------------

        ----------------------------------
        35745407328     On  1514313194000 
        ----------------------------------
        35745407328     On  1514313254000
        ----------------------------------

        ----------------------------------
        35745407328     Off 1514313314000
        ----------------------------------
        35745407328     Off 1514313374000
        ----------------------------------

        ----------------------------------
        35745407328     On  1514313434000
        ----------------------------------
        35745407328     On  1514313494000
        ----------------------------------
        35745407328     On  1514313554000
        ----------------------------------
        35745407328     On  1514313614000
        ----------------------------------
        35745407328     On  1514313674000
        ----------------------------------

请帮我解决这个问题

如果这个问题已经有答案,请给我链接。谢谢。

1 个答案:

答案 0 :(得分:1)

使用 data.table 的解决方案。

library(data.table)

# Convert df to a data.table by reference.
setDT(df)

# Number each run of consecutive identical `ign` values (1, 2, 3, ...).
df[, Group := rleid(ign)]

# Add per-run statistics of unixTime as new columns; every row of a run gets
# the same values. Diff is first minus last, so it is negative whenever
# unixTime increases within a run.
df[, `:=`(
  Mean = mean(unixTime),
  SD   = sd(unixTime),
  Diff = first(unixTime) - last(unixTime)
), by = Group]

# `:=` updated df in place; df2 is a second name for that same data.table.
df2 <- df

# The trailing [] forces the table to print.
df2[]
#            imei ign      unixTime Group          Mean       SD    Diff
#  1: 35745407328 Off 1514313014000     1 1514313074000 60000.00 -120000
#  2: 35745407328 Off 1514313074000     1 1514313074000 60000.00 -120000
#  3: 35745407328 Off 1514313134000     1 1514313074000 60000.00 -120000
#  4: 35745407328  On 1514313194000     2 1514313224000 42426.41  -60000
#  5: 35745407328  On 1514313254000     2 1514313224000 42426.41  -60000
#  6: 35745407328 Off 1514313314000     3 1514313344000 42426.41  -60000
#  7: 35745407328 Off 1514313374000     3 1514313344000 42426.41  -60000
#  8: 35745407328  On 1514313434000     4 1514313554000 94868.33 -240000
#  9: 35745407328  On 1514313494000     4 1514313554000 94868.33 -240000
# 10: 35745407328  On 1514313554000     4 1514313554000 94868.33 -240000
# 11: 35745407328  On 1514313614000     4 1514313554000 94868.33 -240000
# 12: 35745407328  On 1514313674000     4 1514313554000 94868.33 -240000

使用 dplyr 的解决方案(其中 rleid 来自 data.table):

library(dplyr)
library(data.table)  # provides rleid()

df2 <- df %>%
  # Label each run of consecutive identical `ign` values.
  mutate(Group = rleid(ign)) %>%
  group_by(Group) %>%
  # Per-run statistics of unixTime, repeated on every row of the run.
  mutate(Mean = mean(unixTime)) %>%
  mutate(SD = sd(unixTime)) %>%
  # First minus last: negative when unixTime increases within a run.
  mutate(Diff = first(unixTime) - last(unixTime)) %>%
  # Drop the grouping so later operations act on the whole table.
  ungroup()
df2
#           imei    ign      unixTime Group          Mean       SD    Diff
#          <dbl> <fctr>         <dbl> <int>         <dbl>    <dbl>   <dbl>
#  1 35745407328    Off 1514313014000     1 1514313074000 60000.00 -120000
#  2 35745407328    Off 1514313074000     1 1514313074000 60000.00 -120000
#  3 35745407328    Off 1514313134000     1 1514313074000 60000.00 -120000
#  4 35745407328     On 1514313194000     2 1514313224000 42426.41  -60000
#  5 35745407328     On 1514313254000     2 1514313224000 42426.41  -60000
#  6 35745407328    Off 1514313314000     3 1514313344000 42426.41  -60000
#  7 35745407328    Off 1514313374000     3 1514313344000 42426.41  -60000
#  8 35745407328     On 1514313434000     4 1514313554000 94868.33 -240000
#  9 35745407328     On 1514313494000     4 1514313554000 94868.33 -240000
# 10 35745407328     On 1514313554000     4 1514313554000 94868.33 -240000
# 11 35745407328     On 1514313614000     4 1514313554000 94868.33 -240000
# 12 35745407328     On 1514313674000     4 1514313554000 94868.33 -240000