使用mutate()根据另一列的取值和若干长度不同的字符串向量创建新列

时间:2018-11-01 15:35:37

标签: r dplyr

我有若干长度不等的字符串向量,其元素都是现有列中出现的取值。如何利用这些向量通过mutate()创建一个新变量?

我有一个来自多个大洲的许多国家的数据框。我想mutate()一个名为continent的新变量。

# Preview the first 10 rows of the data frame.
# NOTE(review): this call uses chocolate_data_common_beans3, while the later
# snippets use chocolate_data_common_beans2 -- confirm which object is meant.
head(chocolate_data_common_beans3, n = 10)

company_location cocoa_percent rating
1  France           63            3.75  
2  Fiji             72            3.50  
3  Ecuador          55            2.75  
4  U.S.A.           75            2.75  
5  U.S.A.           70            2.75  
6  U.S.A.           55            2.75  
7  Canada           72            3.75  
8  U.S.A.           85            3.50  
9  Australia        78            3.75  
10 Austria          70            3.75

这是每个大洲的所有值。

# Lookup vectors: the company_location values assigned to each continent.
# Spellings and groupings are reproduced exactly as given (e.g. "Niacragua",
# "Amsterdam"); presumably they mirror the raw data -- verify before changing.
africa <- c(
  "South Africa", "Sao Tome", "Madagascar", "Ghana"
)

asia <- c(
  "Vietnam", "South Korea", "Singapore", "Russia",
  "Philippines", "Japan", "Israel", "India"
)

europe <- c(
  "Wales", "U.K.", "Switzerland", "Sweden", "Spain", "Scotland",
  "Portugal", "Poland", "Netherlands", "Lithuania", "Italy", "Ireland",
  "Iceland", "Hungary", "Germany", "France", "Finland", "Denmark",
  "Czech Republic", "Belgium", "Austria", "Amsterdam"
)

south_america <- c(
  "Venezuela", "Suriname", "Peru", "Ecuador", "Costa Rica",
  "Colombia", "Chile", "Brazil", "Bolivia", "Argentina"
)

north_america <- c(
  "U.S.A.", "St. Lucia", "Puerto Rico", "Nicaragua", "Niacragua", "Mexico",
  "Martinique", "Honduras", "Guatemala", "Grenada", "Dominican Republic",
  "Canada"
)

oceania <- c(
  "New Zealand", "Fiji", "Australia"
)

我尝试使用case_when创建continent列,但是由于向量长度不同,我收到了错误消息。

# create new column of continents
# NOTE: this is the attempt that fails. `africa %in% company_location` tests
# each element of `africa` against the column, so every condition has the
# length of its continent vector (4, 8, 22, ...) instead of one value per row,
# and case_when() receives conditions of mismatched lengths.
chocolate_data_common_beans2 <- chocolate_data_common_beans2 %>%
    mutate(continent = case_when(
    africa %in% company_location ~ "Africa",
    asia %in% company_location ~ "Asia",
    europe %in% company_location ~ "Europe",
    south_america %in% company_location ~ "South America",
    north_america %in% company_location ~ "North America",
    oceania %in% company_location ~ "Oceania"
    ))

我该怎么做?

您可以在我的Kaggle笔记本中查看全部代码。

2 个答案:

答案 0 :(得分:1)

%in%的用法正好相反(这在语言上也更说得通:您要问的是“这个值是否是该列表的成员”):

# Put the column on the left-hand side of %in%: each row's company_location
# is tested for membership in a continent vector, so every condition yields
# one TRUE/FALSE per row. Rows matching none of the vectors get NA.
df %>%
  mutate(
    continent = case_when(
      company_location %in% africa ~ "Africa",
      company_location %in% asia ~ "Asia",
      company_location %in% europe ~ "Europe",
      company_location %in% south_america ~ "South America",
      company_location %in% north_america ~ "North America",
      company_location %in% oceania ~ "Oceania"
    )
  )

   company_location cocoa_percent rating     continent
1            France            63   3.75        Europe
2              Fiji            72   3.50       Oceania
3           Ecuador            55   2.75 South America
4            U.S.A.            75   2.75 North America
5            U.S.A.            70   2.75 North America
6            U.S.A.            55   2.75 North America
7            Canada            72   3.75 North America
8            U.S.A.            85   3.50 North America
9         Australia            78   3.75       Oceania
10          Austria            70   3.75        Europe

答案 1 :(得分:0)

我们可以在创建key/val数据集之后进行联接

library(tidyverse)
# Turn the named list into a two-column lookup with stack() (`values` holds
# the location, `ind` the continent name), right-join it to the chocolate
# data so every data row is kept, then rename `ind` to `continent`.
list(
  Africa = africa,
  Asia = asia,
  Europe = europe,
  `South America` = south_america,
  `North America` = north_america,
  Oceania = oceania
) %>%
  stack() %>%
  right_join(
    chocolate_data_common_beans2,
    by = c(values = "company_location")
  ) %>%
  rename(continent = ind)
#       values     continent cocoa_percent rating
#1     France        Europe            63   3.75
#2       Fiji       Oceania            72   3.50
#3    Ecuador South America            55   2.75
#4     U.S.A. North America            75   2.75
#5     U.S.A. North America            70   2.75
#6     U.S.A. North America            55   2.75
#7     Canada North America            72   3.75
#8     U.S.A. North America            85   3.50
#9  Australia       Oceania            78   3.75
#10   Austria        Europe            70   3.75

或者使用enframe代替stack

# Same lookup-and-join idea using enframe(): the named list becomes a tibble
# with a `continent` column and a list-column `company_location`, which is
# unnested to one row per location and right-joined to the chocolate data.
list(
  Africa = africa,
  Asia = asia,
  Europe = europe,
  `South America` = south_america,
  `North America` = north_america,
  Oceania = oceania
) %>%
  enframe(name = "continent", value = "company_location") %>%
  # Name the list-column explicitly: calling unnest() without `cols` is
  # deprecated in tidyr >= 1.0.0 and emits a warning.
  unnest(cols = company_location) %>%
  # An explicit key avoids the "Joining, by = ..." message and guards against
  # joining on any other column name the two tables might share.
  right_join(chocolate_data_common_beans2, by = "company_location")

注意:此方法的优点是不使用多个嵌套条件来更改值。我们只需要一个join

基准

在稍大的数据集上

# Enlarge the sample for benchmarking: every row index is repeated 1e5 times
# (`each = `), so copies of the same row stay adjacent in dfN.
dfN <- chocolate_data_common_beans2[
  rep(seq_len(nrow(chocolate_data_common_beans2)), each = 1e5),
]
library(microbenchmark)

# Join-based approach used in the benchmark.
#
# Builds a continent / company_location lookup table from the continent
# vectors and right-joins it onto the global `dfN`, keeping every row of
# `dfN`. Takes no arguments; returns the joined table, which carries a
# `continent` column.
akrun <- function() {
  list(
    Africa = africa,
    Asia = asia,
    Europe = europe,
    `South America` = south_america,
    `North America` = north_america,
    Oceania = oceania
  ) %>%
    enframe(name = "continent", value = "company_location") %>%
    # Name the list-column explicitly: calling unnest() without `cols` is
    # deprecated in tidyr >= 1.0.0 and would warn on every benchmark run.
    unnest(cols = company_location) %>%
    # An explicit key avoids the "Joining, by = ..." message on each call.
    right_join(dfN, by = "company_location")
}

# case_when-based approach used in the benchmark.
#
# Adds a `continent` column to the global `dfN` by testing each row's
# company_location for membership in the continent vectors, in order.
# Takes no arguments; returns `dfN` with the extra column.
iod <- function() {
  dfN %>%
    mutate(
      continent = case_when(
        company_location %in% africa ~ "Africa",
        company_location %in% asia ~ "Asia",
        company_location %in% europe ~ "Europe",
        company_location %in% south_america ~ "South America",
        company_location %in% north_america ~ "North America",
        company_location %in% oceania ~ "Oceania"
      )
    )
}
# Time both approaches 10 times each; the recorded output below reports the
# timings on a relative scale (akrun() shows as 1).
microbenchmark(
  akrun(),
  iod(),
  times = 10L,
  unit = "relative"
)
#  expr      min       lq     mean   median       uq      max neval cld
# akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    10  a 
#   iod() 6.332611 6.201221 5.953432 6.125145 5.567748 5.751538    10   b

数据

# Reproducible 10-row sample of the chocolate data: company location
# (character), cocoa percentage (integer) and rating (double), with
# character row names "1" to "10".
chocolate_data_common_beans2 <- structure(
  list(
    company_location = c(
      "France", "Fiji", "Ecuador", "U.S.A.", "U.S.A.",
      "U.S.A.", "Canada", "U.S.A.", "Australia", "Austria"
    ),
    cocoa_percent = c(63L, 72L, 55L, 75L, 70L, 55L, 72L, 85L, 78L, 70L),
    rating = c(3.75, 3.5, 2.75, 2.75, 2.75, 2.75, 3.75, 3.5, 3.75, 3.75)
  ),
  class = "data.frame",
  row.names = as.character(1:10)
)