Question

我的 data.frame 中有一列 location，其中同时包含国家和城市。我想通过与 library(maps) 中的 world.cities$country.etc（或其他任何国家名称集合）进行匹配来提取前者（即国家）。

考虑以下示例：

# Example data: each location mixes a country name with a city name,
# in no fixed order and with varying separators.
df <- data.frame(
  location = c("Aarup, Denmark", "Switzerland", "Estonia: Aaspere"),
  other_col = c(2, 3, 4)
)

我尝试使用此代码

# Attempt from the question -- this does NOT work as written:
#   * `which x` is not valid R syntax (`which` needs parentheses), so the
#     call fails to parse.
#   * The anonymous function is passed positionally after `location`, which
#     is where extract() (presumably tidyr::extract()) takes its `regex`
#     argument, a regular expression string with capture groups, not a
#     function.
df %>% extract(location,
               into = c("country", "rest_location"),
               remove = FALSE,
               function(x) x[which x %in% world.cities$country.etc])

但是并不成功；我期望这样的事情：

          location other_col     country rest_location
1   Aarup, Denmark         2     Denmark       Aarup, 
2      Switzerland         3 Switzerland              
3 Estonia: Aaspere         4     Estonia     : Aaspere

Answer 1

您可以以此为起点：

library(tidyverse)

df %>%
  rownames_to_column() %>%
  separate_rows(location) %>%
  mutate(gr = location %in% world.cities$country.etc) %>%
  mutate(gr = ifelse(gr, "country", "rest_location")) %>%
  spread(gr, location) %>%
  right_join(df %>% rownames_to_column(), by = c("rowname", "other_col")) %>%
  select(location, other_col, country, rest_location)

结果：

          location other_col     country rest_location
1   Aarup, Denmark         2     Denmark         Aarup
2      Switzerland         3 Switzerland          <NA>
3 Estonia: Aaspere         4     Estonia       Aaspere

值得注意的是，这仅在 location 列中只有两个“单词”的情况下有效。如有必要，您必须指定适当的分隔符（separate_rows() 的 sep 参数）。

Answer 2

我们可以将所有国家名称粘贴在一起创建一个模式，使用 str_extract_all 获取 location 中与该模式匹配的所有国家名称，再删除与国家名称匹配的单词以得到 rest_location。

library(maps)
library(stringr)

all_countries <- str_c(unique(world.cities$country.etc), collapse = "|")
df$country <- sapply(str_extract_all(df$location, all_countries), toString)
df$rest_location <- str_remove_all(df$location, all_countries)
#OR can also do
#df$rest_location <- str_remove_all(df$location, df$country)

df
#          location other_col     country rest_location
#1   Aarup, Denmark         2     Denmark       Aarup, 
#2      Switzerland         3 Switzerland              
#3 Estonia: Aaspere         4     Estonia     : Aaspere

这里对 country 使用了 sapply 和 toString，因为如果 location 中有多个国家/地区名称，它们都将连接成一个字符串。

Answer 3

Base R 解法（除用于提供国家名称数据的 maps 包外，不依赖其他包）：

# Import the library: 

library(maps)

# Split each location into whitespace-separated tokens. as.character() guards
# against `location` being a factor (the data.frame() default before R 4.0),
# which strsplit() rejects.
country_city_vec <- strsplit(as.character(df$location), "\\s+")

# One row per token: other_col is repeated once for every token of its
# location, and punctuation is stripped so that e.g. "Aarup," and "Estonia:"
# become bare words.
rolled_out_df <- data.frame(
  other_col = rep(df$other_col, lengths(country_city_vec)),
  location = gsub("[[:punct:]]", "", unlist(country_city_vec)),
  stringsAsFactors = FALSE
)

# Keep only the tokens that are country names and join them back onto df.
# all.x = TRUE keeps rows without any recognised country (country is NA).
# NOTE(review): other_col is the join key, so this assumes it uniquely
# identifies the rows of df -- confirm for real data.
# NOTE(review): because tokens are single whitespace-delimited words, a
# country name made of several words cannot match here.
is_country <- rolled_out_df$location %in% world.cities$country.etc
matched_with_world_df <- merge(
  df,
  setNames(rolled_out_df[is_country, ], c("other_col", "country")),
  by = "other_col",
  all.x = TRUE
)

# Build the alternation pattern from the countries actually found. NA entries
# (rows without a match) must be dropped first: paste0() would otherwise turn
# them into the literal alternative "NA" and strip that text from locations.
found_countries <- unique(matched_with_world_df$country)
found_countries <- found_countries[!is.na(found_countries)]

# Extract the city/location drilldown: remove the country names, then any
# leftover punctuation and surrounding whitespace.
rest_location <- as.character(matched_with_world_df$location)
if (length(found_countries) > 0) {
  rest_location <- gsub(
    paste0(found_countries, collapse = "|"),
    "",
    rest_location
  )
}
matched_with_world_df$rest_location <- trimws(
  gsub("[[:punct:]]", "", rest_location),
  "both"
)

从列

3 个答案: