Question

我有一个包含超过20,000个观测值的数据集，基本上看起来像这样：

# Minimal example data: one row per user, with a free-text location string
# that contains a city name somewhere inside it.
df <- data.frame(
  user = c("ABC", "DEF", "GHI"),
  location = c(
    "Chicago, the windy city",
    "Oxford University",
    "Paris"
  )
)

我想添加三个额外的列city，long，lat，并使用城市名称和地理位置（经度和纬度）填充这些列。

因此我想使用maps包及其world.cities数据库：

library(maps)
data(world.cities)

如果location中的城市名称书写规范，那么添加城市名称和地理位置并不困难。但是，其中大多数还带有其他字符串（例如“Chicago, the windy city”）。

如何根据world.cities数据库仅提取城市名称，并将实际城市名称写入city列，将地理位置写入long和lat？

Answer 1

正如@Heroka在评论中所提到的，如果城市名称始终是location中的第一个单词，您可以使用stringi提取第一个单词，用left_join与world.cities数据连接，并在匹配结果中筛选出人口最多的那一条。

library(stringi)
library(dplyr)

# Take the first word of each location as the city name, look it up in
# world.cities, and keep the most populous candidate for every input row.
df %>%
  # row_id identifies each original row, so several users living in the same
  # city are kept as separate rows instead of being collapsed into one.
  mutate(row_id = row_number(),
         city = stri_extract_first_words(location)) %>%
  left_join(world.cities, by = c("city" = "name")) %>%
  # arrange() always sorts NA last, so a real match outranks "no match" and
  # rows without any match are retained with NA coordinates.
  arrange(row_id, desc(pop)) %>%
  distinct(row_id, .keep_all = TRUE) %>%
  select(-row_id)

给出了：

#Source: local data frame [3 x 8]
#Groups: city [3]
#
#    user                location    city country.etc     pop   lat   long capital
#  (fctr)                  (fctr)   (chr)       (chr)   (int) (dbl)  (dbl)   (int)
#1    ABC Chicago, the windy city Chicago         USA 2830144 41.84 -87.68       0
#2    DEF       Oxford University  Oxford          UK  157568 51.76  -1.26       0
#3    GHI                   Paris   Paris      France 2141839 48.86   2.34       1

更新

如果城市名称并非始终是location中的第一个单词，您可以先尝试将location中的单词与字典（此处为world.cities的name列）进行匹配，然后使用返回TRUE的匹配项作为您的位置名称。下面是一个快速实现（我在您的data.frame中添加了“University College London”这一案例）：

> df
#  user                  location
#1  ABC   Chicago, the windy city
#2  DEF         Oxford University
#3  GHI                     Paris
#4  JKL University College London

对于每一行，我们提取location中的所有单词并将其存储在列表lst中，然后遍历该列表，找出与world.cities中的name相匹配的单词所在的位置并存储在p中，最后从lst中提取p所对应位置的元素，并将其存储在city中：

# Split each location into words, use the first word that is a known city
# name as `city`, then attach the most populous matching world.cities entry.
df %>%
  mutate(
    # row_id identifies each original row so that users sharing a city are
    # not collapsed into a single row later on.
    row_id = row_number(),
    lst = stri_extract_all_words(location),
    # Position of the first word that is a city name; NA when no word
    # matches. vapply() guarantees one integer per row, whereas sapply()
    # returns a list as soon as a row has zero or several matches.
    p = vapply(lst, function(x) which(x %in% world.cities$name)[1],
               integer(1)),
    # Indexing with NA yields NA, so unmatched rows get city = NA.
    city = vapply(seq_along(lst), function(i) lst[[i]][p[i]],
                  character(1))
  ) %>%
  left_join(world.cities, by = c("city" = "name")) %>%
  # arrange() sorts NA last, so unmatched rows are kept (with NA
  # coordinates) and matched rows keep their largest candidate.
  arrange(row_id, desc(pop)) %>%
  distinct(row_id, .keep_all = TRUE) %>%
  select(-row_id)

您还可以在末尾添加 ... %>% select(-lst, -p) 来删除临时列lst和p。

更新2

这种写法在遇到格式不规范的单词时不会出错，但不适用于“New York”这类由多个单词组成的城市名：

# Turn every word of a location into a candidate city (NA when the word is
# not a city name), then keep the most populous candidate per input row.
df %>%
  mutate(
    # row_id identifies each original row; selecting per row (rather than
    # per city) keeps users who share a city and yields exactly one result
    # even when several words of one location are city names.
    row_id = row_number(),
    city = lapply(
      stri_extract_all_words(location),
      function(x) world.cities$name[match(x, world.cities$name)]
    )
  ) %>%
  tidyr::unnest(city) %>%
  left_join(world.cities, by = c("city" = "name")) %>%
  # Non-city words carry pop = NA after the join; arrange() sorts NA last,
  # so they are only kept when a row has no matching word at all. Such rows
  # are retained with city = NA instead of being dropped.
  arrange(row_id, desc(pop)) %>%
  distinct(row_id, .keep_all = TRUE) %>%
  select(-row_id)

更新3

这适用于所有情况：

> df
#  user                  location
#1  ABC   Chicago, the windy city
#2  DEF         Oxford University
#3  GHI                     Paris
#4  JKL                  New York
#5  MNO                  m0ntr3al
#6  PQR University College London

# Normalise locations: collapse every run of non-alphanumeric characters
# into a single space so punctuation does not get in the way of matching.
df$l <- gsub("[^[:alnum:]]+", " ", df$location)

# For each city name, collect the distinct normalised locations containing
# it. fixed = TRUE treats the name as a literal string, so regex
# metacharacters in a city name cannot cause errors or false matches;
# unique() keeps repeated locations from inflating the join below.
locs <- unique(df$l)
lst  <- lapply(
  world.cities$name,
  function(x) grep(x, locs, value = TRUE, fixed = TRUE)
)

# Long lookup table built with base R: `value` is a matched location and
# `L1` is the position of the matching city in world.cities.
m <- data.frame(
  value = unlist(lst),
  L1 = rep(seq_along(lst), lengths(lst)),
  stringsAsFactors = FALSE
)

df %>%
  # row_id identifies each original row so that users with identical
  # location strings are not collapsed into one row.
  mutate(row_id = row_number()) %>%
  left_join(m, by = c("l" = "value")) %>%
  # L1 is positional, so join on the row position of world.cities.
  left_join(world.cities %>% mutate(L1 = row_number()), by = "L1") %>%
  # Unmatched rows get pop = 0 so they rank below every real candidate.
  tidyr::replace_na(list(pop = 0)) %>%
  arrange(row_id, desc(pop)) %>%
  distinct(row_id, .keep_all = TRUE) %>%
  # Drop the helper columns l, row_id and L1.
  select(-(l:L1))

给出了：

#Source: local data frame [6 x 8]
#Groups: location [6]
#
#    user                  location     name country.etc     pop   lat   long capital
#  (fctr)                    (fctr)    (chr)       (chr)   (dbl) (dbl)  (dbl)   (int)
#1    ABC   Chicago, the windy city  Chicago         USA 2830144 41.84 -87.68       0
#2    DEF         Oxford University   Oxford          UK  157568 51.76  -1.26       0
#3    GHI                     Paris    Paris      France 2141839 48.86   2.34       1
#4    JKL                  New York New York         USA 8124427 40.67 -73.94       0
#5    MNO                  m0ntr3al       NA          NA       0    NA     NA      NA
#6    PQR University College London   London          UK 7489022 51.52  -0.10       1

将城市名称和地理位置数据添加到数据框

1 个答案: