Question

我有一个df：

 # Example data: one row per observed interaction, labelled by region and plot.
 df <- data.frame(
   region = c("1", "1", "1", "1", "1", "1", "1", "1", "2", "2"),
   plot = c("1", "1", "1", "2", "2", "2", "3", "3", "3", "3"),
   interact = c(
     "A_B", "C_D", "C_D",
     "E_F", "C_D", "C_D",
     "D_E", "D_E", "C_B", "A_B"
   )
 )

我想按plot对数据进行子集化。对于每个plot子集，我想计算每个唯一interact类型的频率。输出应如下所示：

# Expected result: one row per (plot, interact) pair with its frequency.
# Both D_E observations of plot 3 lie in region "1" in the input data, so the
# D_E row is labelled region "1"; only C_B and A_B belong to region "2".
df <- data.frame(
  region = c("1", "1", "1", "1", "1", "2", "2"),
  plot = c("1", "1", "2", "2", "3", "3", "3"),
  interact = c("A_B", "C_D", "E_F", "C_D", "D_E", "C_B", "A_B"),
  freq = c(1, 2, 1, 2, 2, 1, 1)
)

然后我想创建一个函数，为df的每个plot子集计算以下内容：

 # Inverse Simpson diversity for one plot subset of `df` (needs a `freq` column).
 # Total number of interactions in the subset.
 sum <- sum(df$freq)
 # Proportion of each interaction type. Every row must contribute: wrapping
 # freq in unique() would merge types that happen to share the same count.
 prop <- df$freq / sum
 # Squared proportions.
 prop2 <- prop^2
 # Simpson's D: sum of the squared proportions.
 D <- sum(prop2)
 # Inverse Simpson index.
 simp <- 1 / D

我想要使用的函数类似于下面这个页面中介绍的函数：http://rfunctions.blogspot.com.ng/2012/02/diversity-indices-simpsons-diversity.html。但是，该页面中的版本是针对宽格式数据集的，而我的数据集是长格式的。

最后，我希望得到一个df，其中包含每个plot对应的多样性值：

  result<- 
         Plot    div
          1      1.8
          2      1.8
          3      2.6

Answer 1

我使用dplyr但是plot3的结果是不同的，我不知道为什么。您能否为每次计算提供结果或检查我的结果并让我知道错误在哪里？

另外，如果您有兴趣计算多样性指数，可以了解一下vegan包，尤其是其中的diversity()函数。

# Example data from the question: one row per observed interaction.
df <- data.frame(
  region = rep(c("1", "2"), times = c(8, 2)),
  plot = rep(c("1", "2", "3"), times = c(3, 3, 4)),
  interact = c(
    "A_B", "C_D", "C_D", "E_F", "C_D",
    "C_D", "D_E", "D_E", "C_B", "A_B"
  )
)

library(dplyr)

# Count how often each interaction type occurs within every region/plot pair.
df1 <- df %>%
  group_by(region, plot, interact) %>%
  summarise(freq = n())

# Per plot: total number of interactions, each type's share of that total,
# and the squared share.
df2 <- df1 %>%
  group_by(plot) %>%
  mutate(
    sum = sum(freq),
    prop = freq / sum,
    prop2 = prop^2
  )
df2

# A tibble: 7 x 7
# Groups:   plot [3]
  region   plot interact  freq   sum      prop     prop2
  <fctr> <fctr>   <fctr> <int> <int>     <dbl>     <dbl>
1      1      1      A_B     1     3 0.3333333 0.1111111
2      1      1      C_D     2     3 0.6666667 0.4444444
3      1      2      C_D     2     3 0.6666667 0.4444444
4      1      2      E_F     1     3 0.3333333 0.1111111
5      1      3      D_E     2     4 0.5000000 0.2500000
6      2      3      A_B     1     4 0.2500000 0.0625000
7      2      3      C_B     1     4 0.2500000 0.0625000


# Per plot: D is the sum of squared proportions, simp is its reciprocal
# (the inverse Simpson index).
df2 %>%
  group_by(plot) %>%
  summarise(D = sum(prop2), simp = 1 / D)

# A tibble: 3 x 3
    plot         D     simp
  <fctr>     <dbl>    <dbl>
1      1 0.5555556 1.800000
2      2 0.5555556 1.800000
3      3 0.3750000 2.666667

以下是使用vegan包中的diversity()函数的方法。

首先，你需要使用spread来创建一个"矩阵"，把所有的interact类型分别作为单独的列：

library(vegan)
library(tidyr)
library(dplyr)

# Frequency of each interaction type per plot (long format).
df5 <- df %>%
  group_by(plot, interact) %>%
  summarise(freq = n())

# One row per plot and one column per interaction type; a type that does not
# occur in a plot is filled with 0.
df6 <- df5 %>%
  spread(interact, freq, fill = 0)
df6

# A tibble: 3 x 6
# Groups:   plot [3]
    plot   A_B   C_B   C_D   D_E   E_F
* <fctr> <dbl> <dbl> <dbl> <dbl> <dbl>
1      1     1     0     2     0     0
2      2     0     0     2     0     1
3      3     1     1     0     2     0

然后计算多样性：传入去掉第1列（即plot列）的数据矩阵df6。最后，您可以将计算出的多样性作为新的一列添加到df6中。

# Inverse Simpson index for every row of df6, computed on the count columns
# only (the first column is dropped), then stored as a new column.
simp <- diversity(df6[, -1], index = "invsimpson")
df6$simp <- simp
df6

# A tibble: 3 x 7
# Groups:   plot [3]
    plot   A_B   C_B   C_D   D_E   E_F     simp
* <fctr> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>
1      1     1     0     2     0     0 1.800000
2      2     0     0     2     0     1 1.800000
3      3     1     1     0     2     0 2.666667

使用dplyr包的do()和broom包的tidy()，写法甚至更短：

# Frequency of each interaction type per plot (long format).
df5 <- df %>%
  group_by(plot, interact) %>%
  summarise(freq = n())

library(broom)

# Widen df5 to one column per interaction type (missing types filled with 0),
# then compute the inverse Simpson index on everything but the first column;
# tidy() wraps the result so do() can return it as a data frame.
df5 %>%
  spread(interact, freq, fill = 0) %>%
  do(tidy(diversity(.[, -1], index = "invsimpson")))

子集数据帧和应用函数，用于计算每个因子级别的频率

1 个答案: