我正在处理一份凌乱的选民文件。请看下面这个tibble
:
library(dplyr)
library(tidyr)
# Toy voter file: one row per voter, two filler demographic columns, and up to
# five election codes per voter (election_1 through election_5; NA = unused).
dat <- tibble(
  id = factor(LETTERS[1:5]),
  demographic_info1 = round(rnorm(n = 5), digits = 2),
  demographic_info2 = round(rnorm(n = 5), digits = 2),
  election_1 = c(NA, "GN2016", "GN2016", "SE2016", "GN2008"),
  election_2 = c(NA, "MT2014", "GN2012", "GN2016", "GN2004"),
  election_3 = c(rep(NA, 3), "MT2014", "GN2000"),
  election_4 = c(rep(NA, 3), "GN2012", NA),
  election_5 = c(rep(NA, 3), "MT2010", NA)
)
看起来像:
# A tibble: 5 x 8
id demographic_info1 demographic_info2 election_1 election_2 election_3 election_4 election_5
<fctr> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
1 A -1.50 0.81 <NA> <NA> <NA> <NA> <NA>
2 B -1.84 -0.64 GN2016 MT2014 <NA> <NA> <NA>
3 C 1.66 -0.10 GN2016 GN2012 <NA> <NA> <NA>
4 D 0.91 -0.08 SE2016 GN2016 MT2014 GN2012 MT2010
5 E 0.04 -1.15 GN2008 GN2004 GN2000 <NA> <NA>
id
是投票人的唯一标识符。 demographic_info
列是填充符,只是为了证明我在重新整理数据时保留这些值。 election_1
到election_5
列是我感兴趣的。该文件的结构是:记录每个人最近参与的5次选举。election_1
是最近的一次,election_5
是最早的一次。
请注意,选民A
从未投过票,而选民D
每次都投票。我想要做的是把这些列转换成多个变量:SE2016
,GN2016
,MT2014
,GN2012
等;也就是说,election_1
到election_5
中的所有值。我希望每个变量都是TRUE
或FALSE
的逻辑变量,用来表示该人是否在那次选举中到场投票。我试过这段代码:
# Attempted reshape: gather the election_* columns into long form, flag every
# row TRUE, then spread the election codes out into one logical column each.
# NOTE: on this data the final spread() stops with
# "Error: Duplicate identifiers for rows ...".
dat %>% # start from the wide voter table
gather(election, race, election_1:election_5) %>% # long form: election = source column name, race = election code
mutate(temp=TRUE) %>% # constant TRUE flag to use as the spread value
select(-election) %>% # drop the source column name; only the election code is needed
spread(race, temp, fill=FALSE) # one column per election code; missing combinations become FALSE
但是,spread
会引发错误:
Error: Duplicate identifiers for rows (1, 6, 11, 16, 21), (12, 17, 22), (13, 18, 23), (20, 25)
这是因为race
变量的每个值都对应多个条目。我尝试过在spread
之前先使用group_by(id)
,但仍然抛出同样的错误。
我希望生成的tibble
看起来像:
# A tibble: 5 x 11
id demographic_info1 demographic_info2 SE2016 GN2016 MT2014 GN2012 MT2010 GN2008 GN2004 GN2000
<fctr> <dbl> <dbl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
1 A -0.91 -0.56 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
2 B 1.24 -1.78 FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
3 C 0.61 0.11 FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
4 D 2.43 -0.53 TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
5 E -1.40 -1.23 FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
答案 0 :(得分:2)
tidyr提供了一些处理此问题的语法。
# set up
library(dplyr)
library(tidyr)
# Same toy voter file as in the question: five voters, two filler demographic
# columns, and up to five election codes each (NA marks an unused slot).
dat <- tibble(
  id = factor(LETTERS[1:5]),
  demographic_info1 = round(rnorm(n = 5), digits = 2),
  demographic_info2 = round(rnorm(n = 5), digits = 2),
  election_1 = c(NA, "GN2016", "GN2016", "SE2016", "GN2008"),
  election_2 = c(NA, "MT2014", "GN2012", "GN2016", "GN2004"),
  election_3 = c(rep(NA, 3), "MT2014", "GN2000"),
  election_4 = c(rep(NA, 3), "GN2012", NA),
  election_5 = c(rep(NA, 3), "MT2010", NA)
)
我们最终想要的是每个选民(5)x选举(8)配对的TRUE
或FALSE
。当我们把数据收集(gather)成长格式时,我们只能看到数据集中实际出现过的选民x选举组合。
# Long format: one row per voter x election slot. The slot name is discarded
# and every row is flagged voted = TRUE (rows with an NA election included).
d_votes <- dat %>%
  gather(key = "variable", value = "election", election_1:election_5) %>%
  mutate(variable = NULL, voted = TRUE)
d_votes
#> # A tibble: 25 x 5
#> id demographic_info1 demographic_info2 election voted
#> <fctr> <dbl> <dbl> <chr> <lgl>
#> 1 A 0.76 -0.23 <NA> TRUE
#> 2 B -0.80 0.08 GN2016 TRUE
#> 3 C -0.33 1.60 GN2016 TRUE
#> 4 D -0.50 -1.27 SE2016 TRUE
#> 5 E -1.03 0.59 GN2008 TRUE
#> 6 A 0.76 -0.23 <NA> TRUE
#> 7 B -0.80 0.08 MT2014 TRUE
#> 8 C -0.33 1.60 GN2012 TRUE
#> 9 D -0.50 -1.27 GN2016 TRUE
#> 10 E -1.03 0.59 GN2004 TRUE
#> # ... with 15 more rows
# Tally how many gathered rows fall under each election code (NA included).
d_votes %>%
  count(election)
#> # A tibble: 9 x 2
#> election n
#> <chr> <int>
#> 1 GN2000 1
#> 2 GN2004 1
#> 3 GN2008 1
#> 4 GN2012 2
#> 5 GN2016 3
#> 6 MT2010 1
#> 7 MT2014 2
#> 8 SE2016 1
#> 9 <NA> 13
我们需要产生选民和选举的每一个组合。 tidyr的expand()
函数会生成来自不同数据列/向量的变量的所有组合。(它的作用类似于base R中的函数expand.grid()
,所以expand()
这个名称很容易让人联想到它。)
# Build every voter x election pairing. nesting() keeps each voter's
# demographic columns tied to its id rather than crossing them independently.
# d_votes still carries NA "elections" (unused election_* slots). Depending on
# the tidyr version, expand() may keep NA as a value of its own, which would
# add a bogus NA election per voter, so drop those rows explicitly.
d_possible_votes <- d_votes %>%
  expand(
    nesting(id, demographic_info1, demographic_info2),
    election
  ) %>%
  drop_na(election)
d_possible_votes
#> # A tibble: 40 x 4
#> id demographic_info1 demographic_info2 election
#> <fctr> <dbl> <dbl> <chr>
#> 1 A 0.76 -0.23 GN2000
#> 2 A 0.76 -0.23 GN2004
#> 3 A 0.76 -0.23 GN2008
#> 4 A 0.76 -0.23 GN2012
#> 5 A 0.76 -0.23 GN2016
#> 6 A 0.76 -0.23 MT2010
#> 7 A 0.76 -0.23 MT2014
#> 8 A 0.76 -0.23 SE2016
#> 9 B -0.80 0.08 GN2000
#> 10 B -0.80 0.08 GN2004
#> # ... with 30 more rows
请注意,我们现在有8个选举x 5 ids = 40行。
我们使用nesting()
函数将每个(id
,demographic_info1
,demographic_info2
)集合/行视为一个单元;人口统计信息在ids中嵌套。展开提供了(id
,demographic_info1
,demographic_info2
)x election
的所有40种组合。
如果我们将观察到的投票加入到可能的投票中,则voted
列会填充TRUE
或NA
值。 tidyr的replace_na()
函数可以纠正那些NA
值。
# Attach the observed votes to the full voter x election grid, then turn the
# unmatched pairings (voted is NA after the join) into FALSE.
d_possible_votes <- replace_na(
  left_join(d_possible_votes, d_votes),
  list(voted = FALSE)
)
#> Joining, by = c("id", "demographic_info1", "demographic_info2", "election")
d_possible_votes
#> # A tibble: 40 x 5
#> id demographic_info1 demographic_info2 election voted
#> <fctr> <dbl> <dbl> <chr> <lgl>
#> 1 A 0.76 -0.23 GN2000 FALSE
#> 2 A 0.76 -0.23 GN2004 FALSE
#> 3 A 0.76 -0.23 GN2008 FALSE
#> 4 A 0.76 -0.23 GN2012 FALSE
#> 5 A 0.76 -0.23 GN2016 FALSE
#> 6 A 0.76 -0.23 MT2010 FALSE
#> 7 A 0.76 -0.23 MT2014 FALSE
#> 8 A 0.76 -0.23 SE2016 FALSE
#> 9 B -0.80 0.08 GN2000 FALSE
#> 10 B -0.80 0.08 GN2004 FALSE
#> # ... with 30 more rows
现在,我们可以对election列使用spread(),得到所需的数据框。
# Widen: one logical column per election code, one row per voter.
d_possible_votes %>%
  spread(key = election, value = voted)
#> # A tibble: 5 x 11
#> id demographic_info1 demographic_info2 GN2000 GN2004 GN2008 GN2012 GN2016 MT2010 MT2014 SE2016
#> * <fctr> <dbl> <dbl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
#> 1 A 0.76 -0.23 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> 2 B -0.80 0.08 FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE
#> 3 C -0.33 1.60 FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
#> 4 D -0.50 -1.27 FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
#> 5 E -1.03 0.59 TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
这种“生成标识符组合、连接实际数据、修正缺失值”的模式非常常见,以至于tidyr专门提供了一个函数complete()
来一次完成这三个步骤。
# complete() expands to every voter x election pairing, joins the observed
# rows, and fills the missing `voted` values with FALSE in one step.
# d_votes contains rows whose election is NA (unused election_* slots, repeated
# several times per voter). Depending on the tidyr version these rows survive
# complete() and would make spread() fail on duplicate identifiers or emit an
# extra NA column, so remove them before widening. Every voter already has a
# row for each real election at that point, so no voter is lost.
d_votes %>%
  complete(
    nesting(id, demographic_info1, demographic_info2),
    election,
    fill = list(voted = FALSE)
  ) %>%
  drop_na(election) %>%
  spread(election, voted)
#> # A tibble: 5 x 11
#> id demographic_info1 demographic_info2 GN2000 GN2004 GN2008 GN2012 GN2016 MT2010 MT2014 SE2016
#> * <fctr> <dbl> <dbl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
#> 1 A 0.76 -0.23 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> 2 B -0.80 0.08 FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE
#> 3 C -0.33 1.60 FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
#> 4 D -0.50 -1.27 FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
#> 5 E -1.03 0.59 TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
答案 1 :(得分:1)
由于'id'存在重复,我们可以先按'id'使用group_by
创建一个序列变量,然后再进行spread:
# Number the gathered rows within each voter so spread() sees unique
# identifiers. That alone leaves one output row per gathered row (25 here)
# plus a column for the NA election code, so afterwards the per-row flags are
# collapsed with any() back to a single row per voter.
dat %>%
  gather(election, race, election_1:election_5) %>%
  select(-election) %>%
  mutate(temp = TRUE) %>%
  # Group by all voter-level columns so they survive the final summarise.
  group_by(id, demographic_info1, demographic_info2) %>%
  mutate(i1 = row_number()) %>%
  spread(race, temp, fill = FALSE) %>%
  # Drop the helper index and the column spread() creates for NA codes.
  select(-i1, -`<NA>`) %>%
  # TRUE if the voter has that election in any of their rows.
  summarise_all(any) %>%
  ungroup()
答案 2 :(得分:0)
问题是NA
值存在重复条目。我通过只保留唯一(unique)
的行,然后按id
进行分组,解决了akrun答案中的重复标识符和多行问题:
# Collapse repeated rows (the duplicated NA entries) so each voter/race pair
# occurs once, then spread races into logical columns and drop the NA column.
dat %>%
  gather(key = election, value = race, election_1:election_5) %>%
  select(-election) %>%
  mutate(temp = TRUE) %>%
  unique() %>% # keep one copy of each row, removing the duplicate NA entries
  group_by(id) %>%
  spread(key = race, value = temp, fill = FALSE) %>%
  select(-`<NA>`)
# A tibble: 5 x 11
# Groups: id [5]
id demographic_info1 demographic_info2 GN2000 GN2004 GN2008 GN2012 GN2016 MT2010 MT2014 SE2016
* <fctr> <dbl> <dbl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
1 A -1.19 -0.94 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
2 B 1.41 -0.62 FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE
3 C -0.21 1.62 FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
4 D 1.51 0.09 FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
5 E 0.65 -2.09 TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE