我想将第一个数据集中的每个字符串,与第二个数据集中和它最接近(拼写相近)的字符串进行匹配。
数据如下:
数据集1:
California
Texas
Florida
New York
数据集2:
Californiia
callifoornia
T3xas
Te xas
texas
Fl0 rida
folrida
New york
new york
期望的结果是:
col_1 col_2 col_3 col_4
California Californiia callifoornia
Texas T3xas texas Te xas
Florida folrida Fl0 rida
New York New york new york
问题是:如何在 R 中实现这种模糊匹配,并得到上面期望的结果?
谢谢。
答案 0 :(得分:0)
# Fuzzy-match every state in dataset1 against the spellings in dataset2 and
# lay the matches for each state out side by side as col_2, col_3, ...
library(fuzzyjoin)
library(tidyverse)

dataset1 %>%
  # Keep every dataset1 row and attach each dataset2 string whose string
  # distance (stringdist's default method) is at most 3. Naming `by`
  # explicitly avoids relying on the implicit "Joining by" column guess.
  stringdist_left_join(dataset2, by = "states", max_dist = 3) %>%
  rename(col_1 = states.x) %>%
  group_by(col_1) %>%
  # Number the matches within each state so they land in col_2, col_3, ...
  mutate(col = paste0("col_", row_number() + 1)) %>%
  # Drop the grouping so the result is a plain tibble.
  ungroup() %>%
  # pivot_wider() replaces the superseded spread(); it keeps rows in order of
  # first appearance, i.e. the order of dataset1.
  pivot_wider(names_from = col, values_from = states.y)
# Expected output (rows follow the order of dataset1):
## A tibble: 4 x 4
#  col_1      col_2       col_3        col_4
#  <chr>      <chr>       <chr>        <chr>
#1 California Californiia callifoornia NA
#2 Texas      T3xas       Te xas       texas
#3 Florida    Fl0 rida    folrida      NA
#4 New York   New york    new york     NA
数据:
# Reference spellings: one row per canonical state name.
dataset1 <- data.frame(
  states = c(
    "California",
    "Texas",
    "Florida",
    "New York"
  ),
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE
)

# Observed spellings (typos, case changes, stray spaces) to be matched
# against dataset1.
dataset2 <- data.frame(
  stringsAsFactors = FALSE,
  states = c(
    "Californiia",
    "callifoornia",
    "T3xas",
    "Te xas",
    "texas",
    "Fl0 rida",
    "folrida",
    "New york",
    "new york"
  )
)
答案 1 :(得分:0)
我读了一些关于 stringdist 的文章,并想出了这个方案。这是一种变通办法,但我喜欢它。它肯定还有改进的空间:
# Label every observed string with the reference names it is close to, then
# write the resulting table to a CSV file.
library(stringdist)
library(janitor)

# NOTE(review): assumes both CSV files contain a `name` column -- confirm
# against the actual files.
ds1a <- read.csv('dataset1')
ds2a <- read.csv('dataset2')

# A cell counts as a match when its distance is strictly below this value.
max_dist <- 4

# Rows are the strings of ds2a, columns are the reference strings of ds1a;
# useNames = TRUE labels both dimensions with the strings themselves.
distancematrix <- stringdistmatrix(ds2a$name, ds1a$name, useNames = TRUE)

# Treat NA distances (e.g. from missing input strings) as "no match" rather
# than letting them break the replacement below.
is_close <- !is.na(distancematrix) & distancematrix < max_dist

# Replace every close cell with its column (reference) name and every other
# cell with an empty string. col() gives each cell's column index, so the
# lookup is done column-wise for the whole matrix at once.
matched <- ifelse(is_close, colnames(distancematrix)[col(distancematrix)], "")

# as.data.frame() keeps column names such as "New York" unchanged, whereas
# data.frame() would rewrite them to syntactic names.
df <- as.data.frame(matched, stringsAsFactors = FALSE)

# Drop columns holding a single constant value (e.g. reference names that
# matched nothing and are therefore all "").
df <- remove_constant(df)
write.csv(df, file="~/Desktop/df.csv")