我有一个data.table,并希望按组计算统计数据。
R) set.seed(1)
R) DT=data.table(a=rnorm(100),b=rnorm(100))
这些组应由
定义R) quantile(DT$a,probs=seq(.1,.9,.1))
10% 20% 30% 40% 50% 60% 70% 80% 90%
-1.05265747329 -0.61386923071 -0.37534201964 -0.07670312896 0.11390916079 0.37707993057 0.58121734252 0.77125359976 1.18106507751
我如何计算每箱的b
的平均值,比如b=-.5
我在[-0.61386923071,-0.37534201964]
内,所以在3
答案 0 :(得分:9)
怎么样:
> DT[, mean(b), keyby=cut(a,quantile(a,probs=seq(.1,.9,.1)))]
cut V1
1: NA -0.31359818
2: (-1.05,-0.614] -0.14103182
3: (-0.614,-0.375] -0.33474492
4: (-0.375,-0.0767] 0.20827735
5: (-0.0767,0.114] 0.14890251
6: (0.114,0.377] 0.16685304
7: (0.377,0.581] 0.07086979
8: (0.581,0.771] 0.17950572
9: (0.771,1.18] -0.04951607
要看看那个NA(并检查结果),我接下来做了:
> DT[, list(mean(b),.N,list(a)), keyby=cut(a,quantile(a,probs=seq(.1,.9,.1)))]
cut V1 N V3
1: NA -0.31359818 20 1.59528080213779,1.51178116845085,-2.2146998871775,-1.98935169586337,-1.47075238389927,1.35867955152904,
2: (-1.05,-0.614] -0.14103182 10 -0.626453810742332,-0.835628612410047,-0.820468384118015,-0.621240580541804,-0.68875569454952,-0.70749515696212,
3: (-0.614,-0.375] -0.33474492 10 -0.47815005510862,-0.41499456329968,-0.394289953710349,-0.612026393250771,-0.443291873218433,-0.589520946188072,
4: (-0.375,-0.0767] 0.20827735 10 -0.305388387156356,-0.155795506705329,-0.102787727342996,-0.164523596253587,-0.253361680136508,-0.112346212150228,
5: (-0.0767,0.114] 0.14890251 10 -0.0449336090152309,-0.0161902630989461,0.0745649833651906,-0.0561287395290008,-0.0538050405829051,-0.0593133967111857,
6: (0.114,0.377] 0.16685304 10 0.183643324222082,0.329507771815361,0.36458196213683,0.341119691424425,0.188792299514343,0.153253338211898,
7: (0.377,0.581] 0.07086979 10 0.487429052428485,0.575781351653492,0.389843236411431,0.417941560199702,0.387671611559369,0.556663198673657,
8: (0.581,0.771] 0.17950572 10 0.738324705129217,0.593901321217509,0.61982574789471,0.763175748457544,0.696963375404737,0.768532924515416,
9: (0.771,1.18] -0.04951607 10 1.12493091814311,0.943836210685299,0.821221195098089,0.918977371608218,0.782136300731067,1.10002537198388,
除此之外:我已经返回了一个list
列(每个单元格本身就是一个向量),可以快速查看进入二进制位的值,只是为了检查。 data.table
在打印时显示逗号(并且每个单元格仅显示前6个项目),但V3
的每个单元格实际上都有一个数字向量。
因此,第一个和最后一个break
之外的值被编码为NA。对我来说,如何告诉cut
不这样做并不明显。所以我刚刚添加了-Inf和+ Inf:
> DT[,list(mean(b),.N),keyby=cut(a,c(-Inf,quantile(a,probs=seq(.1,.9,.1)),+Inf))]
cut V1 N
1: (-Inf,-1.05] -0.16938368 10
2: (-1.05,-0.614] -0.14103182 10
3: (-0.614,-0.375] -0.33474492 10
4: (-0.375,-0.0767] 0.20827735 10
5: (-0.0767,0.114] 0.14890251 10
6: (0.114,0.377] 0.16685304 10
7: (0.377,0.581] 0.07086979 10
8: (0.581,0.771] 0.17950572 10
9: (0.771,1.18] -0.04951607 10
10: (1.18, Inf] -0.45781268 10
那更好。或者:
> DT[, list(mean(b),.N), keyby=cut(a,quantile(a,probs=seq(0,1,.1)),include=TRUE)]
cut V1 N
1: [-2.21,-1.05] -0.16938368 10
2: (-1.05,-0.614] -0.14103182 10
3: (-0.614,-0.375] -0.33474492 10
4: (-0.375,-0.0767] 0.20827735 10
5: (-0.0767,0.114] 0.14890251 10
6: (0.114,0.377] 0.16685304 10
7: (0.377,0.581] 0.07086979 10
8: (0.581,0.771] 0.17950572 10
9: (0.771,1.18] -0.04951607 10
10: (1.18,2.4] -0.45781268 10
这样你就可以看到最小值和最大值,而不是显示-Inf和+ Inf。请注意,您需要将include=TRUE
传递给cut
,否则将返回11个区域,第一个区域只有1个区域。
答案 1 :(得分:3)
我做了很多类型的事情,所以我在我的R包中为它写了一个非常灵活的bin_data()方法 - mltools。它完全以data.table
为基础,并使用新的non-equi joins。
要回答您的具体问题,请将Bin1设置为DT
中的列,然后按Bin1分组
library(data.table)
library(mltools)
DT[, Bin1 := bin_data(vals=a, bins=seq(.1, .9, .1), binType="quantile")]
DT[, list(mean(b)), keyby=Bin1]
Bin1 V1
1: NA -0.31359818
2: [-1.05265747329296, -0.613869230708978) -0.14103182
3: [-0.613869230708978, -0.375342019639661) -0.33474492
4: [-0.375342019639661, -0.0767031289639095) 0.20827735
5: [-0.0767031289639095, 0.113909160788544) 0.14890251
6: [0.113909160788544, 0.377079930573521) 0.16685304
7: [0.377079930573521, 0.581217342522697) 0.07086979
8: [0.581217342522697, 0.771253599758546) 0.17950572
9: [0.771253599758546, 1.1810650775142] -0.04951607
按分位数
制作10个等间距的分档DT[, Bin2 := bin_data(vals=a, bins=10, binType="quantile")]
DT[, list(mean(b)), keyby=Bin2]
Bin2 V1
1: [-2.2146998871775, -1.05265747329296) -0.16938368
2: [-1.05265747329296, -0.613869230708978) -0.14103182
3: [-0.613869230708978, -0.375342019639661) -0.33474492
4: [-0.375342019639661, -0.0767031289639095) 0.20827735
5: [-0.0767031289639095, 0.113909160788544) 0.14890251
6: [0.113909160788544, 0.377079930573521) 0.16685304
7: [0.377079930573521, 0.581217342522697) 0.07086979
8: [0.581217342522697, 0.771253599758546) 0.17950572
9: [0.771253599758546, 1.1810650775142) -0.04951607
10: [1.1810650775142, 2.40161776050478] -0.45781268
使最后一个边界左右关闭
DT[, Bin3 := bin_data(vals=a, bins=10, binType="quantile", boundaryType="lcro)")]
DT[, list(mean(b)), keyby=Bin2]
1: NA 0.42510038
2: [-2.2146998871775, -1.05265747329296) -0.16938368
3: [-1.05265747329296, -0.613869230708978) -0.14103182
4: [-0.613869230708978, -0.375342019639661) -0.33474492
5: [-0.375342019639661, -0.0767031289639095) 0.20827735
6: [-0.0767031289639095, 0.113909160788544) 0.14890251
7: [0.113909160788544, 0.377079930573521) 0.16685304
8: [0.377079930573521, 0.581217342522697) 0.07086979
9: [0.581217342522697, 0.771253599758546) 0.17950572
10: [0.771253599758546, 1.1810650775142) -0.04951607
11: [1.1810650775142, 2.40161776050478) -0.55591413
指定您自己的显式分档(请注意返回空分档)
bin_data(dt=DT, binCol="a", bins=seq(-5, 5, 1), returnDT=TRUE)
Bin a b
1: [-5, -4) NA NA
2: [-4, -3) NA NA
3: [-3, -2) -2.214700 -0.65069635
4: [-2, -1) -1.989352 -0.17955653
5: [-2, -1) -1.470752 -0.03763417
---
100: [1, 2) 1.586833 -1.20808279
101: [2, 3) 2.401618 0.42510038
102: [2, 3) 2.172612 0.20753834
103: [3, 4) NA NA
104: [4, 5] NA NA
使用可变大小的分档
bin_data(dt=DT, binCol="a", bins=data.table(LB=c(-5, 0, 1), RB=c(0, 1, Inf)), returnDT=TRUE)
Bin a b
1: [-5, 0) -0.626453811 -0.62036668
2: [-5, 0) -0.835628612 -0.91092165
3: [-5, 0) -0.820468384 1.76728727
4: [-5, 0) -0.305388387 1.68217608
5: [-5, 0) -0.621240581 1.43228224
---
95: [1, Inf] 2.172611670 0.20753834
96: [1, Inf] 1.178086997 0.21992480
97: [1, Inf] 1.063099837 1.46458731
98: [1, Inf] 1.207867806 0.40201178
99: [1, Inf] 1.160402616 -0.73174817
100: [1, Inf] 1.586833455 -1.20808279
Bin a b