R:按因子分组并使用不等宽区间(bin widths)进行频率计数

时间:2015-08-18 13:14:11

标签: r count dplyr

我有一个很大的数据集(超过100万行),其中有一个小样本:

# dput() output of the sample: a data.frame with 66 rows holding the numeric
# column Feret and four factor columns (sample.type, leaf.side, canopy,
# treatment). The row names are the original row indices in the full data set.
# NOTE: the expression is not assigned to a name here; the question's code
# refers to the data as `df_temp`, the answers refer to it as `df`.
structure(list(Feret = c(0.017, 0.016, 2.12, 0.016, 0.02, 0.023, 
0.017, 0.021, 0.02, 0.016, 0.027, 0.052, 0.061, 0.033, 0.041, 
0.017, 6.561, 7.123, 0.027, 0.018, 0.024, 4.099, 0.022, 0.025, 
0.037, 0.037, 0.018, 0.039, 0.027, 0.053, 0.016, 0.107, 0.52, 
0.041, 0.038, 0.039, 0.03, 0.071, 0.022, 0.118, 0.032, 0.018, 
0.027, 0.035, 8.113, 0.078, 4.089, 0.035, 0.057, 6.905, 2.5, 
0.282, 0.045, 0.039, 0.071, 0.037, 0.029, 0.027, 0.016, 0.02, 
0.026, 0.025, 0.026, 0.016, 0.016, 0.021), sample.type = structure(c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L), .Label = c("flower", "leaf"), class = "factor"), leaf.side = structure(c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L), .Label = c("lower", "upper"), class = "factor"), canopy = structure(c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L), .Label = c("bottom", "top"), class = "factor"), treatment = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L), .Label = c("blue", "green", "grey", "white", "yel-green"
), class = "factor")), .Names = c("Feret", "sample.type", "leaf.side", 
"canopy", "treatment"), row.names = c(500000L, 500001L, 500002L, 
500003L, 500004L, 500005L, 500006L, 500007L, 500008L, 500009L, 
500010L, 800000L, 800001L, 800002L, 800003L, 800004L, 800005L, 
800006L, 800007L, 800008L, 800009L, 800010L, 1000L, 1001L, 1002L, 
1003L, 1004L, 1005L, 1006L, 1007L, 1008L, 1009L, 1010L, 10000L, 
10001L, 10002L, 10003L, 10004L, 10005L, 10006L, 10007L, 10008L, 
10009L, 10010L, 100000L, 100001L, 100002L, 100003L, 100004L, 
100005L, 100006L, 100007L, 100008L, 100009L, 100010L, 1160000L, 
1160001L, 1160002L, 1160003L, 1160004L, 1160005L, 1160006L, 1160007L, 
1160008L, 1160009L, 1160010L), class = "data.frame")

我一直在尝试使用以下区间断点(bin 宽度不等)为“Feret”变量创建频率计数:

# Break points of the (unequal-width) bins used to count Feret values;
# consecutive pairs of values delimit one bin each.
bins <- c(
  0.01, 0.03,
  0.1, 0.3,
  1, 3,
  10
)

然后使用:

# Count how many Feret values fall into each interval delimited by `bins`.
# plot = FALSE: only the counts are used below, so do not draw the histogram
# (hist() plots by default, which is an unneeded side effect here).
freq <- hist(df_temp$Feret, breaks = bins, plot = FALSE)
# Label every bin as "lower - upper" by pairing each break with its successor.
ranges <- paste(head(bins, -1), bins[-1], sep = " - ")
# Show the raw per-bin counts.
freq$counts
# One row per bin: its label and the number of observations that fell into it.
df5 <- data.frame(ranges = ranges, frequency = freq$counts)
df5

但我真正需要做的是按各个因子(“sample.type”,“leaf.side”,“canopy”,“treatment”)将 data.frame 拆分成子集,并提取每个子集的频率计数。我可以通过手动创建每个子集这种繁琐的方式做到这一点,但我想用更好的方式来完成。我尝试过用循环创建子集,然后对每个子集应用 hist() 函数,但这需要很长时间。有没有使用 dplyr 或 apply 系列函数的更好方法?我更希望把结果放在表格中,这样之后可以根据需要绘图。

3 个答案:

答案 0 :(得分:0)

以下代码段应该符合您的要求:

我将您的样本加载到df

library("dplyr")
# For every combination of the four grouping factors, bin Feret with cut()
# and tabulate the bins with table(); do() collects the per-group tables into
# one result whose extra columns are Var1 (the bin) and Freq (the count).
df %>% group_by(sample.type, leaf.side, canopy, treatment) %>%
  dplyr::select(Feret) %>%
  # TRUE rather than T: T is an ordinary variable that can be reassigned,
  # TRUE is a reserved word.
  do(data.frame(table(cut(.$Feret, breaks = bins, include.lowest = TRUE))))

建议您参阅 dplyr 的文档。简而言之,x %>% f 等价于 f(x),x %>% f(a) 等价于 f(x, a)。

请注意,dplyr::select 就是 select;但我多次遇到命名空间冲突的问题(其他包中也有同名函数),所以现在总是显式指定包名。

与使用 hist 相比,table(cut(df$Feret, breaks=bins)) 是完成同样操作的更好方法。使用 cut 可以创建一个因子变量(如果数据有可能恰好等于最小的断点,请记得加上 include.lowest = TRUE),再使用 table 统计每个水平的频数。

这给出了:

   sample.type leaf.side canopy treatment        Var1 Freq
1       flower     upper    top     green (0.01,0.03]    0
2       flower     upper    top     green  (0.03,0.1]    6
3       flower     upper    top     green   (0.1,0.3]    1
4       flower     upper    top     green     (0.3,1]    0
5       flower     upper    top     green       (1,3]    1
6       flower     upper    top     green      (3,10]    3
7       flower     upper    top     white (0.01,0.03]    4
8       flower     upper    top     white  (0.03,0.1]    4
9       flower     upper    top     white   (0.1,0.3]    0
10      flower     upper    top     white     (0.3,1]    0
11      flower     upper    top     white       (1,3]    0
12      flower     upper    top     white      (3,10]    3
13        leaf     lower bottom     white (0.01,0.03]    5
14        leaf     lower bottom     white  (0.03,0.1]    4
15        leaf     lower bottom     white   (0.1,0.3]    1
16        leaf     lower bottom     white     (0.3,1]    1
17        leaf     lower bottom     white       (1,3]    0
18        leaf     lower bottom     white      (3,10]    0
19        leaf     lower    top      grey (0.01,0.03]   10
20        leaf     lower    top      grey  (0.03,0.1]    1
21        leaf     lower    top      grey   (0.1,0.3]    0
22        leaf     lower    top      grey     (0.3,1]    0
23        leaf     lower    top      grey       (1,3]    0
24        leaf     lower    top      grey      (3,10]    0
25        leaf     upper bottom     white (0.01,0.03]    4
26        leaf     upper bottom     white  (0.03,0.1]    6
27        leaf     upper bottom     white   (0.1,0.3]    1
28        leaf     upper bottom     white     (0.3,1]    0
29        leaf     upper bottom     white       (1,3]    0
30        leaf     upper bottom     white      (3,10]    0
31        leaf     upper    top      blue (0.01,0.03]   10
32        leaf     upper    top      blue  (0.03,0.1]    0
33        leaf     upper    top      blue   (0.1,0.3]    0
34        leaf     upper    top      blue     (0.3,1]    0
35        leaf     upper    top      blue       (1,3]    1
36        leaf     upper    top      blue      (3,10]    0

(实际上,它不会像这样打印,因为这是一个tbl,但你可以使用print.data.frame以旧方式打印tbl。)

从这里可以直接提取您想要的信息。

答案 1 :(得分:0)

首先定义一个带有因子名称的字符向量:

# Names of the grouping (factor) columns, one histogram set is built per name.
factors <- c(
  "sample.type",
  "leaf.side",
  "canopy",
  "treatment"
)

然后使用此向量将hist()函数应用于每个因子(假设数据存储在名为df的数据框对象中):

# Build a two-level list: the outer level has one element per factor column
# (named after the entries of `factors`), the inner level has one histogram
# object per level of that factor, computed on Feret without plotting.
res <- sapply(
  factors,
  simplify = FALSE,
  FUN = function(factor_name) {
    # Split the Feret/factor pair of columns by the levels of this factor.
    by_level <- split(df[, c("Feret", factor_name)], df[[factor_name]])
    # The argument expression `group$Feret` is kept as is: hist() records the
    # deparsed expression in the $xname component of its result.
    lapply(by_level, function(group) hist(group$Feret, breaks = bins, plot = FALSE))
  }
)

现在,您得到了一个列表:每个因子对应一个元素,每个元素本身又是一个列表,其中该因子的每个水平(level)对应一个元素:

> names(res)
[1] "sample.type" "leaf.side"   "canopy"      "treatment"  
> names(res$sample.type)
[1] "flower" "leaf"
> res$sample.type$flower
$breaks
[1]  0.01  0.03  0.10  0.30  1.00  3.00 10.00

$counts
[1]  4 10  1  0  1  6

$density
[1] 9.09090909 6.49350649 0.22727273 0.00000000 0.02272727 0.03896104

$mids
[1] 0.020 0.065 0.200 0.650 2.000 6.500

$xname
[1] "group$Feret"

$equidist
[1] FALSE

attr(,"class")
[1] "histogram"
> 

您可以将其格式化为适合绘图的内容。

答案 2 :(得分:0)

如果我们不关心没有观测值(计数为零)的区间(bin),只需要:

# Group by the four factors plus the Feret bin (the cut() result, stored in
# the column `groups`), then count the rows of every group into `freq`.
# Written as nested calls instead of a pipe chain; the computation is the same.
summarise(
  group_by(
    df,
    sample.type, leaf.side, canopy, treatment,
    groups = cut(Feret, bins)
  ),
  freq = n()
)

输出:

   sample.type leaf.side canopy treatment      groups freq
1       flower     upper    top     green  (0.03,0.1]    6
2       flower     upper    top     green   (0.1,0.3]    1
3       flower     upper    top     green       (1,3]    1
4       flower     upper    top     green      (3,10]    3
5       flower     upper    top     white (0.01,0.03]    4
6       flower     upper    top     white  (0.03,0.1]    4
7       flower     upper    top     white      (3,10]    3
8         leaf     lower bottom     white (0.01,0.03]    5
9         leaf     lower bottom     white  (0.03,0.1]    4
10        leaf     lower bottom     white   (0.1,0.3]    1
11        leaf     lower bottom     white     (0.3,1]    1
12        leaf     lower    top      grey (0.01,0.03]   10
13        leaf     lower    top      grey  (0.03,0.1]    1
14        leaf     upper bottom     white (0.01,0.03]    4
15        leaf     upper bottom     white  (0.03,0.1]    6
16        leaf     upper bottom     white   (0.1,0.3]    1
17        leaf     upper    top      blue (0.01,0.03]   10
18        leaf     upper    top      blue       (1,3]    1