如何根据多个长度不同的字符串向量(其元素都出现在现有列中),用 mutate()
创建一个新变量?
我有一个数据框,其中包含来自多个大洲的许多国家。我想用 mutate()
创建一个名为 continent
的新变量。
# Preview the first 10 rows of the data.
head(chocolate_data_common_beans3, n = 10)
company_location cocoa_percent rating
1 France 63 3.75
2 Fiji 72 3.50
3 Ecuador 55 2.75
4 U.S.A. 75 2.75
5 U.S.A. 70 2.75
6 U.S.A. 55 2.75
7 Canada 72 3.75
8 U.S.A. 85 3.50
9 Australia 78 3.75
10 Austria 70 3.75
这是每个大洲的所有值。
# company_location values grouped by continent.
# Each vector lists the location strings that should map to that continent.
africa <- c("South Africa", "Sao Tome", "Madagascar", "Ghana")
asia <- c(
  "Vietnam", "South Korea", "Singapore", "Russia", "Philippines",
  "Japan", "Israel", "India"
)
europe <- c(
  "Wales", "U.K.", "Switzerland", "Sweden", "Spain", "Scotland",
  "Portugal", "Poland", "Netherlands", "Lithuania", "Italy",
  "Ireland", "Iceland", "Hungary", "Germany", "France", "Finland",
  "Denmark", "Czech Republic", "Belgium", "Austria", "Amsterdam"
)
south_america <- c(
  "Venezuela", "Suriname", "Peru", "Ecuador",
  "Colombia", "Chile", "Brazil", "Bolivia", "Argentina"
)
# Costa Rica is in Central America, so it is grouped with North America
# rather than South America.
# "Niacragua" is kept next to "Nicaragua": presumably the misspelling occurs
# in the raw data -- confirm before removing it.
north_america <- c(
  "U.S.A.", "St. Lucia", "Puerto Rico", "Nicaragua",
  "Niacragua", "Mexico", "Martinique", "Honduras",
  "Guatemala", "Grenada", "Dominican Republic", "Costa Rica", "Canada"
)
oceania <- c("New Zealand", "Fiji", "Australia")
我尝试使用case_when
创建continent
列,但是由于向量长度不同,我收到了错误消息。
# create new column of continents
# NOTE(review): this is the attempt that fails. Every condition tests the
# continent vector against the column (e.g. `africa %in% company_location`),
# so its length is that of the continent vector rather than the number of
# rows, and the conditions have different lengths.
chocolate_data_common_beans2 <- chocolate_data_common_beans2 %>%
mutate(continent = case_when(
africa %in% company_location ~ "Africa",
asia %in% company_location ~ "Asia",
europe %in% company_location ~ "Europe",
south_america %in% company_location ~ "South America",
north_america %in% company_location ~ "North America",
oceania %in% company_location ~ "Oceania"
))
我该怎么做?
您可以在我的 Kaggle 笔记本中查看所有代码。
答案 0 :(得分:1)
%in%
的工作方式与此相反(这在语言上也更有意义:您要问的是“这个值是否是该列表的成员”):
# Put the column on the left of %in% so each condition yields one logical
# value per row; the matching condition supplies the continent label.
mutate(
  df,
  continent = case_when(
    company_location %in% africa ~ "Africa",
    company_location %in% asia ~ "Asia",
    company_location %in% europe ~ "Europe",
    company_location %in% south_america ~ "South America",
    company_location %in% north_america ~ "North America",
    company_location %in% oceania ~ "Oceania"
  )
)
company_location cocoa_percent rating continent
1 France 63 3.75 Europe
2 Fiji 72 3.50 Oceania
3 Ecuador 55 2.75 South America
4 U.S.A. 75 2.75 North America
5 U.S.A. 70 2.75 North America
6 U.S.A. 55 2.75 North America
7 Canada 72 3.75 North America
8 U.S.A. 85 3.50 North America
9 Australia 78 3.75 Oceania
10 Austria 70 3.75 Europe
答案 1 :(得分:0)
我们可以在创建key/val
数据集之后进行联接
library(tidyverse)
# Build a long key/value lookup with stack() (columns `values` and `ind`),
# join it onto the chocolate data by location, then rename `ind` to
# `continent`.
right_join(
  stack(list(
    Africa = africa,
    Asia = asia,
    Europe = europe,
    `South America` = south_america,
    `North America` = north_america,
    Oceania = oceania
  )),
  chocolate_data_common_beans2,
  by = c(values = "company_location")
) %>%
  rename(continent = ind)
# values continent cocoa_percent rating
#1 France Europe 63 3.75
#2 Fiji Oceania 72 3.50
#3 Ecuador South America 55 2.75
#4 U.S.A. North America 75 2.75
#5 U.S.A. North America 70 2.75
#6 U.S.A. North America 55 2.75
#7 Canada North America 72 3.75
#8 U.S.A. North America 85 3.50
#9 Australia Oceania 78 3.75
#10 Austria Europe 70 3.75
或者使用enframe
代替stack
# Same lookup built with enframe(): one row per continent holding a
# list-column of locations, which unnest() expands to one row per location
# before joining onto the chocolate data.
list(
  Africa = africa, Asia = asia, Europe = europe,
  `South America` = south_america, `North America` = north_america,
  Oceania = oceania
) %>%
  enframe(name = "continent", value = "company_location") %>%
  # Name the list-column explicitly: calling unnest() without `cols` is
  # deprecated since tidyr 1.0.0 and emits a warning.
  unnest(cols = company_location) %>%
  # Spell out the key instead of relying on the implicit natural join, which
  # prints a message and would silently change if a shared column were added.
  right_join(chocolate_data_common_beans2, by = "company_location")
注意:此方法的优点是不使用多个嵌套条件来更改值。我们只需要一个join
。
在稍大的数据集上进行基准测试:
# Enlarge the sample for benchmarking: every row index is repeated 1e5 times.
dfN <- chocolate_data_common_beans2[
  rep(seq_len(nrow(chocolate_data_common_beans2)), each = 1e5), ,
  drop = FALSE
]
library(microbenchmark)
# Benchmark candidate: builds a continent/location lookup table and joins it
# onto the global `dfN`. Takes no arguments; returns the joined data.
akrun <- function() {
  list(
    Africa = africa, Asia = asia, Europe = europe,
    `South America` = south_america, `North America` = north_america,
    Oceania = oceania
  ) %>%
    enframe(name = "continent", value = "company_location") %>%
    # `cols` is required: unnest() without it is deprecated since tidyr 1.0.0.
    unnest(cols = company_location) %>%
    # Explicit key avoids the natural-join message on every benchmark run.
    right_join(dfN, by = "company_location")
}
# Benchmark candidate: labels each row of the global `dfN` with its continent
# through a chain of membership tests. Takes no arguments; returns `dfN` with
# an added `continent` column.
iod <- function() {
  mutate(
    dfN,
    continent = case_when(
      company_location %in% africa ~ "Africa",
      company_location %in% asia ~ "Asia",
      company_location %in% europe ~ "Europe",
      company_location %in% south_america ~ "South America",
      company_location %in% north_america ~ "North America",
      company_location %in% oceania ~ "Oceania"
    )
  )
}
# Time both approaches with 10 evaluations each. With unit = "relative" the
# timings are reported as ratios (the output below shows akrun() at 1 in
# every column).
microbenchmark(akrun(), iod(), times = 10L, unit = "relative")
# expr min lq mean median uq max neval cld
# akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
# iod() 6.332611 6.201221 5.953432 6.125145 5.567748 5.751538 10 b
# Reproducible sample data: company location, cocoa percentage (integer) and
# rating for ten chocolate bars. Row names are the character strings "1".."10".
chocolate_data_common_beans2 <- data.frame(
  company_location = c(
    "France", "Fiji", "Ecuador", "U.S.A.", "U.S.A.",
    "U.S.A.", "Canada", "U.S.A.", "Australia", "Austria"
  ),
  cocoa_percent = c(63L, 72L, 55L, 75L, 70L, 55L, 72L, 85L, 78L, 70L),
  rating = c(3.75, 3.5, 2.75, 2.75, 2.75, 2.75, 3.75, 3.5, 3.75, 3.75),
  row.names = as.character(1:10),
  stringsAsFactors = FALSE
)