Question

我正在使用 R，想根据 A 列中的值，从一个现有数据框生成一个保存平均结果的新数据框。为了说明我的目标，下面是一些示例数据：

## Reproducible example data: 100 rows with a binary column A, a uniform
## random column B rounded to 4 decimals, and an integer column C drawn
## from 1..1000. The columns are generated in the order A, B, C so the
## random number stream is consumed in a fixed sequence.
set.seed(1981)
df <- data.frame(
  A = sample(c(0, 1), size = 100, replace = TRUE),
  B = round(runif(100), digits = 4),
  C = sample(seq_len(1000), size = 100, replace = TRUE)
)
## show the first 30 rows
head(df, n = 30)

A   B       C
0   0.6739  459
1   0.5466  178
0   0.154   193
0   0.41    206
1   0.7526  791
1   0.3104  679
1   0.739   434
1   0.421   171
0   0.3653  577
1   0.4035  739
0   0.8796  147
0   0.9138  37
0   0.7257  350
1   0.2125  779
0   0.1502  495
1   0.2972  504
0   0.2406  245
1   0.0325  613
0   0.8642  539
1   0.1096  630
1   0.2113  363
1   0.277   974
0   0.0485  755
1   0.0553  412
0   0.509   24
0   0.2934  795
0   0.0725  413
0   0.8723  606
0   0.3192  591
1   0.5557  177

我需要对 B 列和 C 列求平均值来缩减数据量：只要 A 列中的值连续保持相同，就把这些行合并求平均，但每组最多 3 行。如果 A 的值连续为 1 或 0 的行数超过 3 行，多出的行将顺延到新数据框的下一行，如下所示。

新数据框需要以下列：

Value of A  B.Av         C.Av   No. of rows used
0           0.6739       459            1
1           0.5466       178            1
0           0.282        199.5          2
1           0.600666667  634.6666667    3
1           0.421        171            1
0           0.3653       577            1
1           0.4035       739            1
0           0.8397       178            3
1           0.2125       779            1
0           0.1502       495            1
1           0.2972       504            1
0           0.2406       245            1
1           0.0325       613            1
0           0.8642       539            1
1           0.1993       655.6666667    3
0           0.0485       755            1
1           0.0553       412            1
0           0.291633333  410.6666667    3
0           0.59575      598.5          2
1           0.5557       177            1

我在 Stack Overflow 上搜索过，但没有找到类似的场景，因此非常感谢任何帮助。

Answer 1

这是一个基础R解决方案：

## Split a single run of `l` identical values `v` into consecutive chunks of
## at most `max_len` rows (3 by default).
##
## Arguments:
##   l        length of the run (a single number, e.g. one element of
##            rle(x)$lengths)
##   v        the value repeated in the run (e.g. one element of
##            rle(x)$values)
##   max_len  largest chunk size allowed; defaults to 3
##
## Returns a numeric matrix with one row per chunk and two columns:
## "values" (always `v`) and "lengths" (the size of that chunk). A run that
## fits in a single chunk yields a one-row matrix, so results for several
## runs can always be stacked with rbind().
##
## NOTE: the dotted name looks like an S3 method of split() for a class
## named "3"; it is kept unchanged because callers refer to it by this name.
split.3 <- function(l, v, max_len = 3) {
  stopifnot(length(l) == 1L, length(v) == 1L, max_len >= 1)
  ## number of chunks; one row is returned even when l is 0
  n_chunks <- max(1, ceiling(l / max_len))
  ## every chunk is full except possibly the last, which takes the remainder
  chunk_len <- rep(max_len, n_chunks)
  chunk_len[n_chunks] <- l - max_len * (n_chunks - 1)
  cbind(values = rep(v, n_chunks), lengths = chunk_len)
}

## compute the run-length encoding of column A
rl <- rle(df$A)

## apply split.3 to every run of the run-length encoding
## the first column of vl holds the values of column A
## the second column of vl holds the corresponding run length limited to 3
## Map() is used instead of mapply(): mapply() simplifies its result to a
## matrix whenever every run yields an output of the same length (e.g. when
## no run is longer than 3), and do.call() then fails because it needs a list
vl <- do.call(rbind, Map(split.3, rl$lengths, rl$values))

## compute the begin and end row indices of df for each chunk to average
fin <- cumsum(vl[, 2])
beg <- fin - vl[, 2] + 1

## compute the averages, one output row per chunk
## the columns are built in a single data.frame() call rather than by
## rbind()-ing one-row data frames, and seq_along() avoids the 1:0 pitfall
out <- data.frame(
  `Value of A` = unname(vl[, 1]),
  B.Av = vapply(
    seq_along(beg),
    function(i) mean(df$B[beg[i]:fin[i]]),
    numeric(1)
  ),
  C.Av = vapply(
    seq_along(beg),
    function(i) mean(df$C[beg[i]:fin[i]]),
    numeric(1)
  ),
  `No. of rows used` = unname(fin - beg + 1)
)
##   Value.of.A      B.Av     C.Av No..of.rows.used
##1           0 0.6739000 459.0000                1
##2           1 0.5466000 178.0000                1
##3           0 0.2820000 199.5000                2
##4           1 0.6006667 634.6667                3
##5           1 0.4210000 171.0000                1
##6           0 0.3653000 577.0000                1
##7           1 0.4035000 739.0000                1
##8           0 0.8397000 178.0000                3
##9           1 0.2125000 779.0000                1
##10          0 0.1502000 495.0000                1
##11          1 0.2972000 504.0000                1
##12          0 0.2406000 245.0000                1
##13          1 0.0325000 613.0000                1
##14          0 0.8642000 539.0000                1
##15          1 0.1993000 655.6667                3
##16          0 0.0485000 755.0000                1
##17          1 0.0553000 412.0000                1
##18          0 0.2916333 410.6667                3
##19          0 0.5957500 598.5000                2
##20          1 0.5557000 177.0000                1

Answer 2

以下是data.table解决方案：

library(data.table)
setDT(df)

## tag every run of consecutive identical A values with its own id
## (the column is added to df by reference)
df[, rleid := rleid(A)]

## inside each run, number the rows and bucket them into windows of three
## (also added to df by reference)
df[, threeWindow := (seq_len(.N) - 1) %/% 3, by = rleid]

## average every remaining column per (run, window) and record the group
## size N, then drop the two grouping columns from the result and print it
df[, c(.N, lapply(.SD, mean)), by = .(rleid, threeWindow)][
  , c("rleid", "threeWindow") := NULL][]

 #   N A         B        C
 #1: 1 0 0.6739000 459.0000
 #2: 1 1 0.5466000 178.0000
 #3: 2 0 0.2820000 199.5000
 #4: 3 1 0.6006667 634.6667
 #5: 1 1 0.4210000 171.0000
 #6: 1 0 0.3653000 577.0000
 #7: 1 1 0.4035000 739.0000
 #8: 3 0 0.8397000 178.0000
 #9: 1 1 0.2125000 779.0000
#10: 1 0 0.1502000 495.0000
#11: 1 1 0.2972000 504.0000
#12: 1 0 0.2406000 245.0000
#13: 1 1 0.0325000 613.0000
#14: 1 0 0.8642000 539.0000
#15: 3 1 0.1993000 655.6667
#16: 1 0 0.0485000 755.0000
#17: 1 1 0.0553000 412.0000
#18: 3 0 0.2916333 410.6667
#19: 2 0 0.5957500 598.5000
#20: 1 1 0.5557000 177.0000

基于R中其他列值和行数的平均列

2 个答案: