在R中的某些观察之前选择组

时间:2018-08-22 13:04:18

标签: r dplyr data.table

# Example data: 26 observations of x1 with a two-level factor `group`
# (levels "female", "male"). The rows come in three consecutive runs:
# 11 male, then 7 female, then 8 male.
data <- data.frame(
  x1 = c(
    88L, 88L, 94L, 82L, 68L, 72L, 43L, 84L, 65L, 91L, 65L,
    80L, 82L, 63L, 67L, 58L, 100L, 32L,
    75L, 66L, 30L, 12L, 97L, 58L, 14L, 64L
  ),
  group = factor(
    rep(c("male", "female", "male"), times = c(11L, 7L, 8L)),
    levels = c("female", "male")
  )
)

此数据中存在分组变量(性别:男性和女性)。我需要对排在女性之前的所有男性计算平均值和25%分位数。排在女性之后的男性我不处理,女性我也不处理。因此期望的输出为:

x1  group   mean    25%
88  male    76,36   66,5
88  male    76,36   66,5
94  male    76,36   66,5
82  male    76,36   66,5
68  male    76,36   66,5
72  male    76,36   66,5
43  male    76,36   66,5
84  male    76,36   66,5
65  male    76,36   66,5
91  male    76,36   66,5
65  male    76,36   66,5
80  female      
82  female      
63  female      
67  female      
58  female      
100 female      
32  female      
75  male        
66  male        
30  male        
12  male        
97  male        
58  male        
14  male        
64  male        

如何做到?

编辑

x1  group
88  male
88  male
94  male
82  male
68  male
72  male
43  male
84  male
65  male
91  male
65  male
80  female
82  female
63  female
67  female
58  female
100 female
32  female
**76,36 male
**76,36 male
30  male
12  male
**76,36 male
58  male
14  male
64  male

这是期望的结果。

4 个答案:

答案 0 :(得分:4)

使用data.table,您可以只更新rleid(group) == 1的那些行,即group取值的第一个连续段(第一组男性)所在的行。

library(data.table)
# Convert the question's data frame `data` to a data.table by reference.
setDT(data)

# rleid(group) numbers the consecutive runs of identical `group` values, so
# run 1 is the leading block of rows. Only those rows are assigned the mean
# and the 25% quantile of x1; the new columns are NA for every other row.
data[rleid(group) == 1L, `:=`(mean = mean(x1), Q25 = quantile(x1, 0.25))]

结果

#      x1  group     mean  Q25
#  1:  88   male 76.36364 66.5
#  2:  88   male 76.36364 66.5
#  3:  94   male 76.36364 66.5
#  4:  82   male 76.36364 66.5
#  5:  68   male 76.36364 66.5
#  6:  72   male 76.36364 66.5
#  7:  43   male 76.36364 66.5
#  8:  84   male 76.36364 66.5
#  9:  65   male 76.36364 66.5
# 10:  91   male 76.36364 66.5
# 11:  65   male 76.36364 66.5
# 12:  80 female       NA   NA
# 13:  82 female       NA   NA
# 14:  63 female       NA   NA
# 15:  67 female       NA   NA
# 16:  58 female       NA   NA
# 17: 100 female       NA   NA
# 18:  32 female       NA   NA
# 19:  75   male       NA   NA
# 20:  66   male       NA   NA
# 21:  30   male       NA   NA
# 22:  12   male       NA   NA
# 23:  97   male       NA   NA
# 24:  58   male       NA   NA
# 25:  14   male       NA   NA
# 26:  64   male       NA   NA
#      x1  group     mean  Q25

答案 1 :(得分:2)

library(dplyr)
library(data.table)

# Group by gender and by the run id of consecutive `group` values, then
# compute the statistics from the rows of the first male run only. In every
# other group the selection is empty, which yields NaN for the mean and NA
# for the quantile.
data %>%
  group_by(group, group2 = rleid(group)) %>%
  mutate(
    first_male = group == "male" & group2 == 1,  # rows of the first male run
    MEAN = mean(x1[first_male]),
    Q25 = quantile(x1[first_male], 0.25)
  ) %>%
  ungroup() %>%
  select(-group2, -first_male) %>%               # drop the helper columns
  data.frame()                                   # plain data.frame for display

#     x1  group     MEAN  Q25
# 1   88   male 76.36364 66.5
# 2   88   male 76.36364 66.5
# 3   94   male 76.36364 66.5
# 4   82   male 76.36364 66.5
# 5   68   male 76.36364 66.5
# 6   72   male 76.36364 66.5
# 7   43   male 76.36364 66.5
# 8   84   male 76.36364 66.5
# 9   65   male 76.36364 66.5
# 10  91   male 76.36364 66.5
# 11  65   male 76.36364 66.5
# 12  80 female      NaN   NA
# 13  82 female      NaN   NA
# 14  63 female      NaN   NA
# 15  67 female      NaN   NA
# 16  58 female      NaN   NA
# 17 100 female      NaN   NA
# 18  32 female      NaN   NA
# 19  75   male      NaN   NA
# 20  66   male      NaN   NA
# 21  30   male      NaN   NA
# 22  12   male      NaN   NA
# 23  97   male      NaN   NA
# 24  58   male      NaN   NA
# 25  14   male      NaN   NA
# 26  64   male      NaN   NA

要根据您提到的逻辑更新x1列,可以使用以下方法:

# Same statistics as before, followed by an update of x1: in the male run
# with run id 3, values above the first male run's Q25 are replaced by the
# first male run's MEAN.
data %>%
  group_by(group, group2 = rleid(group)) %>%
  mutate(
    first_male = group == "male" & group2 == 1,  # rows of the first male run
    MEAN = mean(x1[first_male]),
    Q25 = quantile(x1[first_male], 0.25)
  ) %>%
  ungroup() %>%
  mutate(
    x1 = ifelse(
      # unique(...[!is.na(...)]) extracts the single non-missing statistic
      group == "male" & group2 == 3 & x1 > unique(Q25[!is.na(Q25)]),
      unique(MEAN[!is.na(MEAN)]),
      x1
    )
  ) %>%
  select(-group2, -first_male) %>%               # drop the helper columns
  data.frame()

#     x1  group     MEAN  Q25
# 1   88.00000   male 76.36364 66.5
# 2   88.00000   male 76.36364 66.5
# 3   94.00000   male 76.36364 66.5
# 4   82.00000   male 76.36364 66.5
# 5   68.00000   male 76.36364 66.5
# 6   72.00000   male 76.36364 66.5
# 7   43.00000   male 76.36364 66.5
# 8   84.00000   male 76.36364 66.5
# 9   65.00000   male 76.36364 66.5
# 10  91.00000   male 76.36364 66.5
# 11  65.00000   male 76.36364 66.5
# 12  80.00000 female      NaN   NA
# 13  82.00000 female      NaN   NA
# 14  63.00000 female      NaN   NA
# 15  67.00000 female      NaN   NA
# 16  58.00000 female      NaN   NA
# 17 100.00000 female      NaN   NA
# 18  32.00000 female      NaN   NA
# 19  76.36364   male      NaN   NA
# 20  66.00000   male      NaN   NA
# 21  30.00000   male      NaN   NA
# 22  12.00000   male      NaN   NA
# 23  76.36364   male      NaN   NA
# 24  58.00000   male      NaN   NA
# 25  14.00000   male      NaN   NA
# 26  64.00000   male      NaN   NA

我添加的额外代码(mutate)仅针对位于女性之后的男性(group2 == 3),并且仅当x1大于分位数时才更新x1。

答案 2 :(得分:0)

这是另一种dplyr方法,它按rleid()组进行汇总,并使用left_join()附加结果列:

library(dplyr)
# Label each consecutive run of `group` with a run id, summarise the first run
# (rleid == 1) into its mean and 25% quantile, and join that one-row summary
# back onto all rows. Rows belonging to other runs find no match and get NA.
# The join key is named explicitly so dplyr does not have to guess it (and
# does not emit its "Joining, by = ..." message).
result <- data %>%
  group_by(rleid = data.table::rleid(group)) %>%
  left_join(
    .,
    filter(., rleid == 1) %>%
      summarise(mean = mean(x1), q25 = quantile(x1, 0.25)),
    by = "rleid"
  ) %>%
  ungroup() %>%
  select(-rleid)   # the run id was only needed as join key
result %>%
  print(n = Inf)   # make sure to print all rows
# A tibble: 26 x 4
      x1 group   mean   q25
   <int> <fct>  <dbl> <dbl>
 1    88 male    76.4  66.5
 2    88 male    76.4  66.5
 3    94 male    76.4  66.5
 4    82 male    76.4  66.5
 5    68 male    76.4  66.5
 6    72 male    76.4  66.5
 7    43 male    76.4  66.5
 8    84 male    76.4  66.5
 9    65 male    76.4  66.5
10    91 male    76.4  66.5
11    65 male    76.4  66.5
12    80 female  NA    NA  
13    82 female  NA    NA  
14    63 female  NA    NA  
15    67 female  NA    NA  
16    58 female  NA    NA  
17   100 female  NA    NA  
18    32 female  NA    NA  
19    75 male    NA    NA  
20    66 male    NA    NA  
21    30 male    NA    NA  
22    12 male    NA    NA  
23    97 male    NA    NA  
24    58 male    NA    NA  
25    14 male    NA    NA  
26    64 male    NA    NA

请注意,除非结果被分配回data,否则data不会被修改。

答案 3 :(得分:0)

这是另一种data.table方法,既可以回答OP的原始问题,也可以回答OP在评论(此处和此处)中提出的其他问题。

对于这两个问题,我们都需要先计算第一组男性的聚合值,然后通过引用更新data:第一个问题使用 update join,第二个问题使用 update non-equi join。

计算第一组男性的聚集体

library(data.table)
# Convert to data.table by reference and tag every consecutive run of `group`
# with a run id; the id serves as join key in the later steps.
setDT(data)
data[, rleid := rleid(group)]
# Store x1 as double so that it has the same type as mean(x1).
data[, x1 := as.double(x1)]
# Mean and 25% quantile of the first run only; the unnamed results become the
# columns V1 and V2.
agg <- data[rleid == 1, list(mean(x1), quantile(x1, 0.25)), by = rleid]
agg
   rleid       V1   V2
1:     1 76.36364 66.5

原始问题:为第一个男性群体添加统计数据

这是通过更新联接来实现的

# Update join: rows of data whose rleid matches a row of agg receive that
# row's V1 and V2 as the new columns mean and q25; unmatched rows get NA.
data[agg, `:=`(mean = V1, q25 = V2), on = "rleid"]
data[]
     x1  group rleid     mean  q25
 1:  88   male     1 76.36364 66.5
 2:  88   male     1 76.36364 66.5
 3:  94   male     1 76.36364 66.5
 4:  82   male     1 76.36364 66.5
 5:  68   male     1 76.36364 66.5
 6:  72   male     1 76.36364 66.5
 7:  43   male     1 76.36364 66.5
 8:  84   male     1 76.36364 66.5
 9:  65   male     1 76.36364 66.5
10:  91   male     1 76.36364 66.5
11:  65   male     1 76.36364 66.5
12:  80 female     2       NA   NA
13:  82 female     2       NA   NA
14:  63 female     2       NA   NA
15:  67 female     2       NA   NA
16:  58 female     2       NA   NA
17: 100 female     2       NA   NA
18:  32 female     2       NA   NA
19:  75   male     3       NA   NA
20:  66   male     3       NA   NA
21:  30   male     3       NA   NA
22:  12   male     3       NA   NA
23:  97   male     3       NA   NA
24:  58   male     3       NA   NA
25:  14   male     3       NA   NA
26:  64   male     3       NA   NA
     x1  group rleid     mean  q25

请注意,data已通过引用进行更新,即未复制。

其他问题:修改第二个男性组中的选定值

OP要求将第二个男性组中超过第一个男性组q25的任何x1值,替换为第一个男性组计算出的平均值。请注意,第二个男性组由rleid == 3L标识,因为女性组介于两者之间。

这可以通过 update non-equi join 实现。连接条件仅选择属于rleid == 3L且x1大于q25的那些行。

# Update non-equi join: the lookup table built from agg carries rleid = 3
# together with V1 and V2. Rows of data match when their rleid equals 3 and
# their x1 is greater than V2; for those rows x1 is overwritten with V1.
# The trailing [] prints the updated table.
data[agg[, .(rleid = 3, V1, V2)], on = .(rleid, x1 > V2), x1 := V1][]
# remove helper column no longer needed
data[, rleid := NULL]
# print the final table
data[]
           x1  group     mean  q25
 1:  88.00000   male 76.36364 66.5
 2:  88.00000   male 76.36364 66.5
 3:  94.00000   male 76.36364 66.5
 4:  82.00000   male 76.36364 66.5
 5:  68.00000   male 76.36364 66.5
 6:  72.00000   male 76.36364 66.5
 7:  43.00000   male 76.36364 66.5
 8:  84.00000   male 76.36364 66.5
 9:  65.00000   male 76.36364 66.5
10:  91.00000   male 76.36364 66.5
11:  65.00000   male 76.36364 66.5
12:  80.00000 female       NA   NA
13:  82.00000 female       NA   NA
14:  63.00000 female       NA   NA
15:  67.00000 female       NA   NA
16:  58.00000 female       NA   NA
17: 100.00000 female       NA   NA
18:  32.00000 female       NA   NA
19:  76.36364   male       NA   NA
20:  66.00000   male       NA   NA
21:  30.00000   male       NA   NA
22:  12.00000   male       NA   NA
23:  76.36364   male       NA   NA
24:  58.00000   male       NA   NA
25:  14.00000   male       NA   NA
26:  64.00000   male       NA   NA
           x1  group     mean  q25

请注意,第19和23行已根据要求进行了更新。同样,data通过引用进行更新。