Question

我有一个类似于此的数据集：

head(df,20)
   mmpd tot
1     0   0
2    mm   0
3    mm   1
4     0   0
5     0   0
6    mm   0
7    mm   1
8    mm   3
9    mm   1
10    0   0
11    0   0
12    0   0
13    0   0
14   mm   0
15   mm   0
16    0   0
17    0   0
18   mm   4
19   mm   1
20   mm   0

我想计算 df$tot 中与 df$mmpd 里每一段连续的 mm 相对应的那些值的平均值。因此，对于示例数据集，我希望得到以下一串数字：0.5、1.25、0、1.667。df$mmpd 始终由长度大于 1 的连续 mm 段或连续的 0 段组成，并且该列既可以以一段 0 开头，也可以以一段 mm 开头。

如果没有for循环，有没有办法做到这一点？

Answer 1

使用data.table

library(data.table) # v 1.9.5+
# setDT() converts df to a data.table by reference. rleid() numbers each run of
# consecutive identical mmpd values, so grouping by it averages tot per run.
# The final step keeps only the "mm" runs and leaves the helper run id out.
setDT(df)[
  , .(my = mean(tot)), by = .(indx = rleid(mmpd), mmpd)
][
  mmpd == "mm", .(mmpd, my)
]
   mmpd       my
#1:   mm 0.500000
#2:   mm 1.250000
#3:   mm 0.000000
#4:   mm 1.666667

显然，有很多方法可以做到（见r search along a vector and calculate the mean）。 data.table方法速度最快，适用于此。

注意：rleid 也可以在 data.table 语法之外使用。下面的写法更接近“传统”的 R 语法，并得到相同的结果。

# Attach a run id (one id per stretch of identical mmpd values) to df,
# average tot for every (run id, mmpd) pair, then keep only the "mm" runs.
subset(
  aggregate(
    tot ~ indx + mmpd,
    data = cbind(df, indx = rleid(df$mmpd)),
    FUN = mean
  ),
  subset = mmpd == "mm"
)

比较几种生成 rleid 的方式的速度（myrleid 来自 @JasonAizkalns 的回答）。

> set.seed(1); x<-sample(1:2,100000,replace=T); 
  microbenchmark(rleid(x),
                 myrleid2=cumsum(c(1,diff(x)!=0)),
                 myrleid(x))
Unit: milliseconds
       expr      min       lq     mean   median       uq       max neval cld
   rleid(x) 1.422263 1.500873 1.586482 1.571315 1.662982  1.938254   100 a  
   myrleid2 3.860290 3.908308 4.369646 3.962497 4.177673 15.674611   100  b 
 myrleid(x) 7.282868 7.386515 7.753515 7.444008 7.654126 18.864898   100   c

对于非数字x：

>  set.seed(1); x<-sample(c('a','b'),100000,replace=T); 
>  microbenchmark(rleid(x),myrleid2=cumsum(c(1,diff(as.numeric(factor(x)))!=0)),myrleid(x))
Unit: milliseconds
       expr       min        lq      mean    median       uq       max neval cld
   rleid(x)  1.465466  1.571662  1.684568  1.606614  1.66080  2.900983   100 a
   myrleid2  8.705447  9.276787 12.393393  9.907403 10.35032 61.080374   100  b
 myrleid(x) 11.970271 13.176144 18.779256 13.790767 14.09626 69.845587   100   c

Answer 2

使用此数据：

# Example data from the question: mmpd is a factor with levels "0" and "mm",
# tot is an integer column, and the row names are the strings "1" to "20".
df <- data.frame(
  mmpd = factor(
    c("0", "mm", "mm", "0", "0", "mm", "mm", "mm", "mm", "0",
      "0", "0", "0", "mm", "mm", "0", "0", "mm", "mm", "mm"),
    levels = c("0", "mm")
  ),
  tot = c(0L, 0L, 1L, 0L, 0L, 0L, 1L, 3L, 1L, 0L,
          0L, 0L, 0L, 0L, 0L, 0L, 0L, 4L, 1L, 0L),
  row.names = as.character(1:20)
)

添加分组列（只需保证它对 "mm" 值有效即可）：

# The counter increases on every non-"mm" row and stays constant across a run
# of "mm" rows, so all rows of one "mm" run share the same group value.
df[["group"]] <- cumsum(df[["mmpd"]] != "mm")

对 "mm" 数据子集按组计算均值：

# Mean of tot within each "mm" run. The grouping vector must be taken from the
# data frame (df$group); a bare `group` object is never created, so referring
# to it fails with "object 'group' not found".
tapply(df$tot[df$mmpd == "mm"], INDEX = df$group[df$mmpd == "mm"], FUN = mean)
#        1        3        7        9 
# 0.500000 1.250000 0.000000 1.666667

这些组编号本身没有多大意义（相邻两组编号之差等于它们之间 0 的个数），不过你本来也没有要求它们，而结果是正确的 ;)

Answer 3

使用基础 R —— 自己实现一个 rleid() 函数，其灵感来自 data.table 包中的同名函数：

#' Run-length ids in base R
#'
#' Numbers each run of consecutive identical values in `x`: the first run gets
#' 1, the next run 2, and so on.
#'
#' @param x An atomic vector or a factor.
#' @return An integer vector the same length as `x` holding the run id of
#'   every element.
myrleid <- function(x) {
  # rle() rejects factors ("'x' must be a vector of an atomic type"), so run
  # the comparison on the underlying integer codes instead.
  if (is.factor(x)) {
    x <- as.integer(x)
  }
  run_lengths <- rle(x)$lengths
  rep(seq_along(run_lengths), times = run_lengths)
}

然后使用此函数创建group变量并使用aggregate：

# Label every run of identical mmpd values, then average tot per run using
# only the rows that belong to "mm" runs.
df[["group"]] <- myrleid(df[["mmpd"]])
aggregate(tot ~ group, data = subset(df, mmpd == "mm"), FUN = mean)

#   group      tot
# 1     2 0.500000
# 2     4 1.250000
# 3     6 0.000000
# 4     8 1.666667

在 R 中，如何根据另一列中的连续字符串段，求某一列对应连续数值的均值

3 个答案: