Question

我是R语言的初学者，所以以下内容对我来说非常复杂。

我有以下data.frame，其中包含纽约市5个行政区和2012-2015年的数据。每年有两类：P和Q。

数据

 input_df = data.frame(
      Manhattan=c(1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0), 
      Brooklyn=c(0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0), 
      Queens=c(1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0), 
      The_Bronx=c(1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0), 
      Staten_Island=c(0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0), 
      "2012"=c("P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q"), 
      "2013"=c("P", "P", "P", "P", "P", "P", "P", "P", "Q", "Q", "P", "P", "P", "P", "Q", "Q", "Q", "Q", "Q"), 
      "2014"=c("P", "P", "P", "Q", "Q", "P", "P", "Q", "Q", "Q", "Q", "Q", "P", "Q", "P", "P", "P", "Q", "Q"), 
      "2015"=c("P", "P", "P", "P", "P", "Q", "Q", "Q", "P", "Q", "P", "P", "Q", "Q", "Q", "Q", "Q", "Q", "Q"), 
 check.names=FALSE)

我想使用fisher.test系统地确定类别P中的事件（“ 1”）是否同时发生于类别Q中，而不是类别Q（反之亦然）。 >

例如，在2012年，P类中的曼哈顿和布鲁克林事件是否同时发生（同一行中均为“ 1”）而不是Q类事件？ P等于10的4，Q等于9的0，因此fisher.test(matrix(c(4,6,0,9), nrow=2))$p.value等于0.08668731。

有没有办法系统地做到这一点？有关简单的开始和理想的输出data.frame，请参见下文。我将对所有接近此输出的内容感到满意。谢谢。

代码（仅是开始）

 library(reshape2)
 input_df <- melt(input_df, measure.vars = 6:9) # transform the data
 # can maybe use: function x {fisher.test(matrix(x, nrow=2))}
 # how to proceed?

理想的输出

 # ideally hoping to get output similar to this:
 output_df = data.frame(
 borough_1=c("Manhattan", "Manhattan", "Manhattan", "Manhattan", "Manhattan", "Manhattan", "etc"), 
 borough_2=c("Brooklyn", "Brooklyn", "Brooklyn", "Brooklyn", "Queens", "Queens", "etc"),
 year=c("2012", "2013", "2014", "2015", "2012", "2013", "etc"), 
 P_both_boroughs_1=c("4", "2", "1", "2", "4", "4", "etc"), 
 P_not_both_boroughs_1=c("6", "11", "8", "6", "6", "8", "etc"), 
 Q_both_boroughs_1=c("0", "2", "3", "2", "1", "1", "etc"), 
 Q_not_both_boroughs_1=c("9", "5", "7", "9", "8", "6", "etc"), 
 fisher.test.pval=c("0.086687307", "0.586790506", "0.582043344", "1", "0.303405573", "0.602683179", "etc"), 
 check.names=FALSE)

编辑@ user2974951

user2974951，您能否请我在以下替代方法input_df上平稳地运行相同的代码？如果我使用此input_df，不幸的是它将引发错误，因为tmp3不再是2x2的表。我将衷心感谢您的帮助。谢谢。

 input_df = data.frame(
      Manhattan=c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0), 
      Brooklyn=c(0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0), 
      Queens=c(1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0), 
      The_Bronx=c(1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0), 
      Staten_Island=c(0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0), 
      "2012"=c("P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q"), 
      "2013"=c("P", "P", "P", "P", "P", "P", "P", "P", "Q", "Q", "P", "P", "P", "P", "Q", "Q", "Q", "Q", "Q"), 
      "2014"=c("P", "P", "P", "Q", "Q", "P", "P", "Q", "Q", "Q", "Q", "Q", "P", "Q", "P", "P", "P", "Q", "Q"), 
      "2015"=c("P", "P", "P", "P", "P", "Q", "Q", "Q", "P", "Q", "P", "P", "Q", "Q", "Q", "Q", "Q", "Q", "Q"), 
 check.names=FALSE)

Answer 1

我将按以下方式解决此问题。首先，我将加载用于分析的软件包

# packages
library(dplyr)
library(tidyr)
library(purrr)

并创建数据集。

# data
input_df <- tibble(
  Manhattan = c(1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0),
  Brooklyn = c(0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0),
  Queens = c(1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0),
  The_Bronx = c(1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0),
  Staten_Island = c(0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0),
  "2012" = c("P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q"),
  "2013" = c("P", "P", "P", "P", "P", "P", "P", "P", "Q", "Q", "P", "P", "P", "P", "Q", "Q", "Q", "Q", "Q"),
  "2014" = c("P", "P", "P", "Q", "Q", "P", "P", "Q", "Q", "Q", "Q", "Q", "P", "Q", "P", "P", "P", "Q", "Q"),
  "2015" = c("P", "P", "P", "P", "P", "Q", "Q", "Q", "P", "Q", "P", "P", "Q", "Q", "Q", "Q", "Q", "Q", "Q")
)
head(input_df)
#> # A tibble: 6 x 9
#>   Manhattan Brooklyn Queens The_Bronx Staten_Island `2012` `2013` `2014`
#>       <dbl>    <dbl>  <dbl>     <dbl>         <dbl> <chr>  <chr>  <chr> 
#> 1         1        0      1         1             0 P      P      P     
#> 2         1        0      1         1             0 P      P      P     
#> 3         0        0      0         0             0 P      P      P     
#> 4         1        1      0         0             0 P      P      Q     
#> 5         1        0      1         0             0 P      P      Q     
#> 6         1        1      1         0             0 P      P      P     
#> # ... with 1 more variable: `2015` <chr>

然后，我将您的数据集从宽结构更改为长结构。 year和borough列的值分别为2012，...，2015和Manhattan，...，Staten_Island category和flag在数据集中采用borough和year的组合的相应值。我需要此结构用于后续功能。

# tidying
tidy_input_df <- input_df %>%
  gather("year", "category", `2012`:`2015`) %>%
  gather("borough", "flag", -category, -year)
tidy_input_df
#> # A tibble: 380 x 4
#>    year  category borough    flag
#>    <chr> <chr>    <chr>     <dbl>
#>  1 2012  P        Manhattan     1
#>  2 2012  P        Manhattan     1
#>  3 2012  P        Manhattan     0
#>  4 2012  P        Manhattan     1
#>  5 2012  P        Manhattan     1
#>  6 2012  P        Manhattan     1
#>  7 2012  P        Manhattan     1
#>  8 2012  P        Manhattan     0
#>  9 2012  P        Manhattan     1
#> 10 2012  P        Manhattan     1
#> # ... with 370 more rows

我还需要一个包含所有行政区名称的向量

borough <- unique(tidy_input_df$borough)

现在，我必须以一种这样的方式来修改您的数据集，即每年都有两列，其中包含两个自治市（即曼哈顿-布鲁克林，曼哈顿-皇后区等）的每个可能的对，并带有相应的值。由于我每年都需要重复相同的过程，因此我将数据嵌套在年份中

nested_input_df <- nest(tidy_input_df, -year)
nested_input_df
#> # A tibble: 4 x 2
#>   year  data             
#>   <chr> <list>           
#> 1 2012  <tibble [95 x 3]>
#> 2 2013  <tibble [95 x 3]>
#> 3 2014  <tibble [95 x 3]>
#> 4 2015  <tibble [95 x 3]>

并创建一个执行上面我描述的过程的新功能。我现在可以使用here中所述的nest-map方法。

该函数的第一部分在数据框中创建一个新列，该列表示类别和自治市镇的每种组合的唯一ID，而代码的第二部分创建一个新数据框，一次包含2个自治市镇的所有组合并关联标志和类别的相应值（即0/1和P / Q）。

create_boroughs_combinations <- function(data, borough) {
  # Create a unique ID for all combinations of category
  # and borough
  data <- data %>%
    group_by(category, borough) %>%
    mutate(ID = 1:n()) %>%
    ungroup()

  # Create all combinations of n boroughs taken 2 at a time. 
  t(combn(length(borough), 2)) %>%
  # transorm that matrix in a tibble
    as_tibble(.name_repair = ~ c("borough_1", "borough_2")) %>%
  # associate each matrix value to the corresponding borough name
    mutate(borough_1 = borough[borough_1], borough_2 = borough[borough_2]) %>%
  # join the two dataframes wrt the name of the first borough
    inner_join(data, by = c("borough_1" = "borough")) %>%
  # joint the two dataframes wrt the name of the second column, the category
  # and the unique ID
    inner_join(data, by = c("borough_2" = "borough", "category", "ID")) %>%
  # create a new variable that checks if the incidents occurred at the same time
    mutate(equal = factor(flag.x == 1 & flag.y == 1, levels = c(TRUE, FALSE)))
}

现在，我可以使用nested_input函数将该函数应用于map。我必须使用map，因为我需要每年分别应用该功能。这就是结果。 flag.x是第一个自治市镇的flag的值，而flag.y是第二个自治市镇的flag的值。

unnested_input_df <- nested_input_df %>%
  mutate(data = map(data, create_boroughs_combinations, borough = borough)) %>%
  unnest()
unnested_input_df
#> # A tibble: 760 x 8
#>    year  borough_1 borough_2 category flag.x    ID flag.y equal
#>    <chr> <chr>     <chr>     <chr>     <dbl> <int>  <dbl> <fct>
#>  1 2012  Manhattan Brooklyn  P             1     1      0 FALSE
#>  2 2012  Manhattan Brooklyn  P             1     2      0 FALSE
#>  3 2012  Manhattan Brooklyn  P             0     3      0 FALSE
#>  4 2012  Manhattan Brooklyn  P             1     4      1 TRUE 
#>  5 2012  Manhattan Brooklyn  P             1     5      0 FALSE
#>  6 2012  Manhattan Brooklyn  P             1     6      1 TRUE 
#>  7 2012  Manhattan Brooklyn  P             1     7      0 FALSE
#>  8 2012  Manhattan Brooklyn  P             0     8      0 FALSE
#>  9 2012  Manhattan Brooklyn  P             1     9      1 TRUE 
#> 10 2012  Manhattan Brooklyn  P             1    10      1 TRUE 
#> # ... with 750 more rows

现在，我可以使用相同的想法并创建一个新函数，该函数可以估计fisher检验的pvalue并将其应用于年份和各行政区的每个组合。我再次嵌套数据：

nested_input_df <- unnested_input_df %>%
  nest(-year, -borough_1, -borough_2)
nested_input_df
#> # A tibble: 40 x 4
#>    year  borough_1 borough_2     data             
#>    <chr> <chr>     <chr>         <list>           
#>  1 2012  Manhattan Brooklyn      <tibble [19 x 5]>
#>  2 2012  Manhattan Queens        <tibble [19 x 5]>
#>  3 2012  Manhattan The_Bronx     <tibble [19 x 5]>
#>  4 2012  Manhattan Staten_Island <tibble [19 x 5]>
#>  5 2012  Brooklyn  Queens        <tibble [19 x 5]>
#>  6 2012  Brooklyn  The_Bronx     <tibble [19 x 5]>
#>  7 2012  Brooklyn  Staten_Island <tibble [19 x 5]>
#>  8 2012  Queens    The_Bronx     <tibble [19 x 5]>
#>  9 2012  Queens    Staten_Island <tibble [19 x 5]>
#> 10 2012  The_Bronx Staten_Island <tibble [19 x 5]>
#> # ... with 30 more rows

定义功能：

run_fisher_test <- function(data) {
  data <- data %>%
    select(category, equal)

  fisher.test(table(data))$p.value
}

应用它，结果如下：

result <- nested_input_df %>%
  mutate(p.value = map_dbl(data, run_fisher_test)) %>%
  select(-data)
result
#> # A tibble: 40 x 4
#>    year  borough_1 borough_2     p.value
#>    <chr> <chr>     <chr>           <dbl>
#>  1 2012  Manhattan Brooklyn       0.0867
#>  2 2012  Manhattan Queens         0.303 
#>  3 2012  Manhattan The_Bronx      0.303 
#>  4 2012  Manhattan Staten_Island  1     
#>  5 2012  Brooklyn  Queens         1     
#>  6 2012  Brooklyn  The_Bronx      1     
#>  7 2012  Brooklyn  Staten_Island  1     
#>  8 2012  Queens    The_Bronx      0.350 
#>  9 2012  Queens    Staten_Island  1     
#> 10 2012  The_Bronx Staten_Island  1     
#> # ... with 30 more rows

^{由reprex package（v0.3.0）}

创建于2019-09-10

我希望这很清楚。如果您有任何疑问，请对此帖子发表评论。我知道这不是最简单的方法，但我真的很喜欢nest-map方法，如果您理解它，它就非常灵活。

Answer 2

这是我尝试使用for循环

res=vector("list",4)
names(res)=colnames(input_df)[6:9]
for (k in 1:4) { #years
  res[[k]]=matrix(NA,5,5)
  rownames(res[[k]])=colnames(res[[k]])=colnames(input_df)[1:5]
  for (i in 1:4) { #first in par
    for (j in (i+1):5) { #second in pair
      tmp1=which(input_df[,k+5]=="P")
      tmp2=which(input_df[,k+5]=="Q")
      tmp3=table(input_df[tmp1,i],input_df[tmp1,j]) #table for P
      tmp4=table(input_df[tmp2,i],input_df[tmp2,j]) #table for Q
      tmp5=matrix(c(tmp3[2,2],sum(tmp3)-tmp3[2,2],
                    tmp4[2,2],sum(tmp4)-tmp4[2,2]),nrow=2,byrow=T)
      res[[k]][i,j]=fisher.test(tmp5)$p.value
    }
  }
}

以及所有p值的输出

res

$`2012`
              Manhattan   Brooklyn    Queens The_Bronx Staten_Island
Manhattan            NA 0.08668731 0.3034056 0.3034056             1
Brooklyn             NA         NA 1.0000000 1.0000000             1
Queens               NA         NA        NA 0.3498452             1
The_Bronx            NA         NA        NA        NA             1
Staten_Island        NA         NA        NA        NA            NA

$`2013`
              Manhattan  Brooklyn    Queens  The_Bronx Staten_Island
Manhattan            NA 0.6026832 0.6026832 0.30469556     0.3684211
Brooklyn             NA        NA 1.0000000 0.03611971     0.3684211
Queens               NA        NA        NA 1.00000000     1.0000000
The_Bronx            NA        NA        NA         NA     0.1228070
Staten_Island        NA        NA        NA         NA            NA

$`2014`
              Manhattan  Brooklyn    Queens The_Bronx Staten_Island
Manhattan            NA 0.5820433 0.1408669 0.6284830             1
Brooklyn             NA        NA 0.2105263 1.0000000             1
Queens               NA        NA        NA 0.3498452             1
The_Bronx            NA        NA        NA        NA             1
Staten_Island        NA        NA        NA        NA            NA

$`2015`
              Manhattan Brooklyn    Queens The_Bronx Staten_Island
Manhattan            NA        1 0.6026832 0.6026832     0.4210526
Brooklyn             NA       NA 0.4853801 1.0000000     0.4210526
Queens               NA       NA        NA 0.3188854     1.0000000
The_Bronx            NA       NA        NA        NA     1.0000000
Staten_Island        NA       NA        NA        NA            NA

或者，如果要在一个数据帧中全部包含其他信息，则

res=matrix(NA,4*choose(5,2),8)
colnames(res)=c("borough_1","borough_2","year","P_both_boroughs_1",
                "P_not_both_boroughs_1","Q_both_boroughs_1",
                "Q_not_both_boroughs_1","fisher.test.pval")
m=1
for (k in 1:4) { #years
  for (i in 1:4) { #first in par
    for (j in (i+1):5) { #second in pair
      tmp1=which(input_df[,k+5]=="P")
      tmp2=which(input_df[,k+5]=="Q")
      tmp3=table(input_df[tmp1,i],input_df[tmp1,j]) #table for P
      tmp4=table(input_df[tmp2,i],input_df[tmp2,j]) #table for Q
      tmp5=matrix(c(tmp3[2,2],sum(tmp3)-tmp3[2,2],
                    tmp4[2,2],sum(tmp4)-tmp4[2,2]),nrow=2,byrow=T)
      res[m,]=c(colnames(input_df)[i],
                colnames(input_df)[j],
                colnames(input_df)[k+5],
                tmp5[1,1],tmp5[1,2],tmp5[2,1],tmp5[2,2],
                fisher.test(tmp5)$p.value)
      m=m+1
    }
  }
}

和输出的前几行

data.frame(res)

  borough_1     borough_2 year P_both_boroughs_1 P_not_both_boroughs_1
1 Manhattan      Brooklyn 2012                 4                     6
2 Manhattan        Queens 2012                 4                     6
3 Manhattan     The_Bronx 2012                 4                     6
4 Manhattan Staten_Island 2012                 1                     9
5  Brooklyn        Queens 2012                 1                     9
6  Brooklyn     The_Bronx 2012                 2                     8
  Q_both_boroughs_1 Q_not_both_boroughs_1   fisher.test.pval
1                 0                     9 0.0866873065015479
2                 1                     8  0.303405572755418
3                 1                     8  0.303405572755418
4                 0                     9                  1
5                 1                     8                  1
6                 1                     8                  1

编辑：作为对缺失级别的修复，您可以使用自己的表格功能

myTable=function(t1,t2) {
  res=matrix(NA,2,2)
  res[1,1]=sum(t1==0 & t2==0)
  res[1,2]=sum(t1==0 & t2==1)
  res[2,1]=sum(t1==1 & t2==0)
  res[2,2]=sum(t1==1 & t2==1)
  return(res)
}

使用它代替table。

R

2 个答案: