我是R语言的初学者,所以以下内容对我来说非常复杂。
我有以下data.frame
,其中包含纽约市5个行政区和2012-2015年的数据。每年有两类:P和Q。
数据
input_df = data.frame(
Manhattan=c(1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0),
Brooklyn=c(0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0),
Queens=c(1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0),
The_Bronx=c(1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0),
Staten_Island=c(0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0),
"2012"=c("P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q"),
"2013"=c("P", "P", "P", "P", "P", "P", "P", "P", "Q", "Q", "P", "P", "P", "P", "Q", "Q", "Q", "Q", "Q"),
"2014"=c("P", "P", "P", "Q", "Q", "P", "P", "Q", "Q", "Q", "Q", "Q", "P", "Q", "P", "P", "P", "Q", "Q"),
"2015"=c("P", "P", "P", "P", "P", "Q", "Q", "Q", "P", "Q", "P", "P", "Q", "Q", "Q", "Q", "Q", "Q", "Q"),
check.names=FALSE)
我想使用fisher.test
系统地确定类别P中的事件(“ 1”)是否同时发生于类别Q中,而不是类别Q(反之亦然)。 >
例如,在2012年,P类中的曼哈顿和布鲁克林事件是否同时发生(同一行中均为“ 1”)而不是Q类事件? P等于10的4,Q等于9的0,因此fisher.test(matrix(c(4,6,0,9), nrow=2))$p.value
等于0.08668731
。
有没有办法系统地做到这一点?有关简单的开始和理想的输出data.frame
,请参见下文。我将对所有接近此输出的内容感到满意。谢谢。
代码(仅是开始)
library(reshape2)
input_df <- melt(input_df, measure.vars = 6:9) # transform the data
# can maybe use: function x {fisher.test(matrix(x, nrow=2))}
# how to proceed?
理想的输出
# ideally hoping to get output similar to this:
output_df = data.frame(
borough_1=c("Manhattan", "Manhattan", "Manhattan", "Manhattan", "Manhattan", "Manhattan", "etc"),
borough_2=c("Brooklyn", "Brooklyn", "Brooklyn", "Brooklyn", "Queens", "Queens", "etc"),
year=c("2012", "2013", "2014", "2015", "2012", "2013", "etc"),
P_both_boroughs_1=c("4", "2", "1", "2", "4", "4", "etc"),
P_not_both_boroughs_1=c("6", "11", "8", "6", "6", "8", "etc"),
Q_both_boroughs_1=c("0", "2", "3", "2", "1", "1", "etc"),
Q_not_both_boroughs_1=c("9", "5", "7", "9", "8", "6", "etc"),
fisher.test.pval=c("0.086687307", "0.586790506", "0.582043344", "1", "0.303405573", "0.602683179", "etc"),
check.names=FALSE)
编辑@ user2974951
user2974951,您能否请我在以下替代方法input_df
上平稳地运行相同的代码?如果我使用此input_df
,不幸的是它将引发错误,因为tmp3
不再是2x2的表。我将衷心感谢您的帮助。谢谢。
input_df = data.frame(
Manhattan=c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0),
Brooklyn=c(0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0),
Queens=c(1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0),
The_Bronx=c(1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0),
Staten_Island=c(0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0),
"2012"=c("P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q"),
"2013"=c("P", "P", "P", "P", "P", "P", "P", "P", "Q", "Q", "P", "P", "P", "P", "Q", "Q", "Q", "Q", "Q"),
"2014"=c("P", "P", "P", "Q", "Q", "P", "P", "Q", "Q", "Q", "Q", "Q", "P", "Q", "P", "P", "P", "Q", "Q"),
"2015"=c("P", "P", "P", "P", "P", "Q", "Q", "Q", "P", "Q", "P", "P", "Q", "Q", "Q", "Q", "Q", "Q", "Q"),
check.names=FALSE)
答案 0 :(得分:2)
我将按以下方式解决此问题。首先,我将加载用于分析的软件包
# packages
library(dplyr)
library(tidyr)
library(purrr)
并创建数据集。
# data
input_df <- tibble(
Manhattan = c(1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0),
Brooklyn = c(0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0),
Queens = c(1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0),
The_Bronx = c(1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0),
Staten_Island = c(0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0),
"2012" = c("P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q"),
"2013" = c("P", "P", "P", "P", "P", "P", "P", "P", "Q", "Q", "P", "P", "P", "P", "Q", "Q", "Q", "Q", "Q"),
"2014" = c("P", "P", "P", "Q", "Q", "P", "P", "Q", "Q", "Q", "Q", "Q", "P", "Q", "P", "P", "P", "Q", "Q"),
"2015" = c("P", "P", "P", "P", "P", "Q", "Q", "Q", "P", "Q", "P", "P", "Q", "Q", "Q", "Q", "Q", "Q", "Q")
)
head(input_df)
#> # A tibble: 6 x 9
#> Manhattan Brooklyn Queens The_Bronx Staten_Island `2012` `2013` `2014`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
#> 1 1 0 1 1 0 P P P
#> 2 1 0 1 1 0 P P P
#> 3 0 0 0 0 0 P P P
#> 4 1 1 0 0 0 P P Q
#> 5 1 0 1 0 0 P P Q
#> 6 1 1 1 0 0 P P P
#> # ... with 1 more variable: `2015` <chr>
然后,我将您的数据集从宽结构更改为长结构。 year
和borough
列的值分别为2012
,...,2015
和Manhattan
,...,Staten_Island
category
和flag
在数据集中采用borough
和year
的组合的相应值。我需要此结构用于后续功能。
# tidying
tidy_input_df <- input_df %>%
gather("year", "category", `2012`:`2015`) %>%
gather("borough", "flag", -category, -year)
tidy_input_df
#> # A tibble: 380 x 4
#> year category borough flag
#> <chr> <chr> <chr> <dbl>
#> 1 2012 P Manhattan 1
#> 2 2012 P Manhattan 1
#> 3 2012 P Manhattan 0
#> 4 2012 P Manhattan 1
#> 5 2012 P Manhattan 1
#> 6 2012 P Manhattan 1
#> 7 2012 P Manhattan 1
#> 8 2012 P Manhattan 0
#> 9 2012 P Manhattan 1
#> 10 2012 P Manhattan 1
#> # ... with 370 more rows
我还需要一个包含所有行政区名称的向量
borough <- unique(tidy_input_df$borough)
现在,我必须以一种这样的方式来修改您的数据集,即每年都有两列,其中包含两个自治市(即曼哈顿-布鲁克林,曼哈顿-皇后区等)的每个可能的对,并带有相应的值。由于我每年都需要重复相同的过程,因此我将数据嵌套在年份中
nested_input_df <- nest(tidy_input_df, -year)
nested_input_df
#> # A tibble: 4 x 2
#> year data
#> <chr> <list>
#> 1 2012 <tibble [95 x 3]>
#> 2 2013 <tibble [95 x 3]>
#> 3 2014 <tibble [95 x 3]>
#> 4 2015 <tibble [95 x 3]>
并创建一个执行上面我描述的过程的新功能。我现在可以使用here中所述的nest
-map
方法。
该函数的第一部分在数据框中创建一个新列,该列表示类别和自治市镇的每种组合的唯一ID,而代码的第二部分创建一个新数据框,一次包含2个自治市镇的所有组合并关联标志和类别的相应值(即0/1和P / Q)。
create_boroughs_combinations <- function(data, borough) {
# Create a unique ID for all combinations of category
# and borough
data <- data %>%
group_by(category, borough) %>%
mutate(ID = 1:n()) %>%
ungroup()
# Create all combinations of n boroughs taken 2 at a time.
t(combn(length(borough), 2)) %>%
# transorm that matrix in a tibble
as_tibble(.name_repair = ~ c("borough_1", "borough_2")) %>%
# associate each matrix value to the corresponding borough name
mutate(borough_1 = borough[borough_1], borough_2 = borough[borough_2]) %>%
# join the two dataframes wrt the name of the first borough
inner_join(data, by = c("borough_1" = "borough")) %>%
# joint the two dataframes wrt the name of the second column, the category
# and the unique ID
inner_join(data, by = c("borough_2" = "borough", "category", "ID")) %>%
# create a new variable that checks if the incidents occurred at the same time
mutate(equal = factor(flag.x == 1 & flag.y == 1, levels = c(TRUE, FALSE)))
}
现在,我可以使用nested_input
函数将该函数应用于map
。我必须使用map
,因为我需要每年分别应用该功能。这就是结果。 flag.x
是第一个自治市镇的flag
的值,而flag.y
是第二个自治市镇的flag
的值。
unnested_input_df <- nested_input_df %>%
mutate(data = map(data, create_boroughs_combinations, borough = borough)) %>%
unnest()
unnested_input_df
#> # A tibble: 760 x 8
#> year borough_1 borough_2 category flag.x ID flag.y equal
#> <chr> <chr> <chr> <chr> <dbl> <int> <dbl> <fct>
#> 1 2012 Manhattan Brooklyn P 1 1 0 FALSE
#> 2 2012 Manhattan Brooklyn P 1 2 0 FALSE
#> 3 2012 Manhattan Brooklyn P 0 3 0 FALSE
#> 4 2012 Manhattan Brooklyn P 1 4 1 TRUE
#> 5 2012 Manhattan Brooklyn P 1 5 0 FALSE
#> 6 2012 Manhattan Brooklyn P 1 6 1 TRUE
#> 7 2012 Manhattan Brooklyn P 1 7 0 FALSE
#> 8 2012 Manhattan Brooklyn P 0 8 0 FALSE
#> 9 2012 Manhattan Brooklyn P 1 9 1 TRUE
#> 10 2012 Manhattan Brooklyn P 1 10 1 TRUE
#> # ... with 750 more rows
现在,我可以使用相同的想法并创建一个新函数,该函数可以估计fisher检验的pvalue并将其应用于年份和各行政区的每个组合。我再次嵌套数据:
nested_input_df <- unnested_input_df %>%
nest(-year, -borough_1, -borough_2)
nested_input_df
#> # A tibble: 40 x 4
#> year borough_1 borough_2 data
#> <chr> <chr> <chr> <list>
#> 1 2012 Manhattan Brooklyn <tibble [19 x 5]>
#> 2 2012 Manhattan Queens <tibble [19 x 5]>
#> 3 2012 Manhattan The_Bronx <tibble [19 x 5]>
#> 4 2012 Manhattan Staten_Island <tibble [19 x 5]>
#> 5 2012 Brooklyn Queens <tibble [19 x 5]>
#> 6 2012 Brooklyn The_Bronx <tibble [19 x 5]>
#> 7 2012 Brooklyn Staten_Island <tibble [19 x 5]>
#> 8 2012 Queens The_Bronx <tibble [19 x 5]>
#> 9 2012 Queens Staten_Island <tibble [19 x 5]>
#> 10 2012 The_Bronx Staten_Island <tibble [19 x 5]>
#> # ... with 30 more rows
定义功能:
run_fisher_test <- function(data) {
data <- data %>%
select(category, equal)
fisher.test(table(data))$p.value
}
应用它,结果如下:
result <- nested_input_df %>%
mutate(p.value = map_dbl(data, run_fisher_test)) %>%
select(-data)
result
#> # A tibble: 40 x 4
#> year borough_1 borough_2 p.value
#> <chr> <chr> <chr> <dbl>
#> 1 2012 Manhattan Brooklyn 0.0867
#> 2 2012 Manhattan Queens 0.303
#> 3 2012 Manhattan The_Bronx 0.303
#> 4 2012 Manhattan Staten_Island 1
#> 5 2012 Brooklyn Queens 1
#> 6 2012 Brooklyn The_Bronx 1
#> 7 2012 Brooklyn Staten_Island 1
#> 8 2012 Queens The_Bronx 0.350
#> 9 2012 Queens Staten_Island 1
#> 10 2012 The_Bronx Staten_Island 1
#> # ... with 30 more rows
由reprex package(v0.3.0)
创建于2019-09-10我希望这很清楚。如果您有任何疑问,请对此帖子发表评论。我知道这不是最简单的方法,但我真的很喜欢nest
-map
方法,如果您理解它,它就非常灵活。
答案 1 :(得分:1)
这是我尝试使用for循环
res=vector("list",4)
names(res)=colnames(input_df)[6:9]
for (k in 1:4) { #years
res[[k]]=matrix(NA,5,5)
rownames(res[[k]])=colnames(res[[k]])=colnames(input_df)[1:5]
for (i in 1:4) { #first in par
for (j in (i+1):5) { #second in pair
tmp1=which(input_df[,k+5]=="P")
tmp2=which(input_df[,k+5]=="Q")
tmp3=table(input_df[tmp1,i],input_df[tmp1,j]) #table for P
tmp4=table(input_df[tmp2,i],input_df[tmp2,j]) #table for Q
tmp5=matrix(c(tmp3[2,2],sum(tmp3)-tmp3[2,2],
tmp4[2,2],sum(tmp4)-tmp4[2,2]),nrow=2,byrow=T)
res[[k]][i,j]=fisher.test(tmp5)$p.value
}
}
}
以及所有p值的输出
res
$`2012`
Manhattan Brooklyn Queens The_Bronx Staten_Island
Manhattan NA 0.08668731 0.3034056 0.3034056 1
Brooklyn NA NA 1.0000000 1.0000000 1
Queens NA NA NA 0.3498452 1
The_Bronx NA NA NA NA 1
Staten_Island NA NA NA NA NA
$`2013`
Manhattan Brooklyn Queens The_Bronx Staten_Island
Manhattan NA 0.6026832 0.6026832 0.30469556 0.3684211
Brooklyn NA NA 1.0000000 0.03611971 0.3684211
Queens NA NA NA 1.00000000 1.0000000
The_Bronx NA NA NA NA 0.1228070
Staten_Island NA NA NA NA NA
$`2014`
Manhattan Brooklyn Queens The_Bronx Staten_Island
Manhattan NA 0.5820433 0.1408669 0.6284830 1
Brooklyn NA NA 0.2105263 1.0000000 1
Queens NA NA NA 0.3498452 1
The_Bronx NA NA NA NA 1
Staten_Island NA NA NA NA NA
$`2015`
Manhattan Brooklyn Queens The_Bronx Staten_Island
Manhattan NA 1 0.6026832 0.6026832 0.4210526
Brooklyn NA NA 0.4853801 1.0000000 0.4210526
Queens NA NA NA 0.3188854 1.0000000
The_Bronx NA NA NA NA 1.0000000
Staten_Island NA NA NA NA NA
或者,如果要在一个数据帧中全部包含其他信息,则
res=matrix(NA,4*choose(5,2),8)
colnames(res)=c("borough_1","borough_2","year","P_both_boroughs_1",
"P_not_both_boroughs_1","Q_both_boroughs_1",
"Q_not_both_boroughs_1","fisher.test.pval")
m=1
for (k in 1:4) { #years
for (i in 1:4) { #first in par
for (j in (i+1):5) { #second in pair
tmp1=which(input_df[,k+5]=="P")
tmp2=which(input_df[,k+5]=="Q")
tmp3=table(input_df[tmp1,i],input_df[tmp1,j]) #table for P
tmp4=table(input_df[tmp2,i],input_df[tmp2,j]) #table for Q
tmp5=matrix(c(tmp3[2,2],sum(tmp3)-tmp3[2,2],
tmp4[2,2],sum(tmp4)-tmp4[2,2]),nrow=2,byrow=T)
res[m,]=c(colnames(input_df)[i],
colnames(input_df)[j],
colnames(input_df)[k+5],
tmp5[1,1],tmp5[1,2],tmp5[2,1],tmp5[2,2],
fisher.test(tmp5)$p.value)
m=m+1
}
}
}
和输出的前几行
data.frame(res)
borough_1 borough_2 year P_both_boroughs_1 P_not_both_boroughs_1
1 Manhattan Brooklyn 2012 4 6
2 Manhattan Queens 2012 4 6
3 Manhattan The_Bronx 2012 4 6
4 Manhattan Staten_Island 2012 1 9
5 Brooklyn Queens 2012 1 9
6 Brooklyn The_Bronx 2012 2 8
Q_both_boroughs_1 Q_not_both_boroughs_1 fisher.test.pval
1 0 9 0.0866873065015479
2 1 8 0.303405572755418
3 1 8 0.303405572755418
4 0 9 1
5 1 8 1
6 1 8 1
编辑:作为对缺失级别的修复,您可以使用自己的表格功能
myTable=function(t1,t2) {
res=matrix(NA,2,2)
res[1,1]=sum(t1==0 & t2==0)
res[1,2]=sum(t1==0 & t2==1)
res[2,1]=sum(t1==1 & t2==0)
res[2,2]=sum(t1==1 & t2==1)
return(res)
}
使用它代替table
。