我正在寻找一种使数据框的选定列匿名化的整洁方法。
我能想到的最好的办法是定义一个映射表,然后使用plyr::mapvalues()
,但是我不能全神贯注地对此进行概括以使其与dplyr::mutate_at()
一起使用(请参阅下面的伪代码)。
还是最好通过purrr::map2()
完成此操作?
library(magrittr)
df <- tibble::tribble(
~name, ~surname, ~value,
"John", "Doe", 10,
"Jane", "Doe", 20
)
seed <- 2093
cols_to_anon <- c("name", "surname")
recode_table <- cols_to_anon %>%
dplyr::syms() %>%
purrr::map(function(.x) {
uniques <- df %>%
dplyr::distinct(!!.x) %>%
dplyr::pull()
n <- length(uniques)
set.seed(seed)
original <- uniques[sample(1:n)]
set.seed(seed)
anon_1 <- sample(LETTERS, n, replace = TRUE)
set.seed(seed)
anon_2 <- sample(1:1000, n, replace = TRUE)
anon <- stringr::str_glue("{anon_1}{anon_2}")
tibble::tibble(original, anon)
}) %>%
purrr::set_names(cols_to_anon)
recode_table
#> $name
#> # A tibble: 2 x 2
#> original anon
#> <chr> <S3: glue>
#> 1 Jane W875
#> 2 John D149
#>
#> $surname
#> # A tibble: 1 x 2
#> original anon
#> <chr> <S3: glue>
#> 1 Doe W875
df_anon <- df %>%
dplyr::mutate(
name = plyr::mapvalues(name,
recode_table$name$original,
recode_table$name$anon
),
surname = plyr::mapvalues(surname,
recode_table$surname$original,
recode_table$surname$anon
)
)
df_anon
#> # A tibble: 2 x 3
#> name surname value
#> <chr> <chr> <dbl>
#> 1 D149 W875 10
#> 2 W875 W875 20
由reprex package(v0.2.1.9000)于2019-05-16创建
df_anon <- df %>%
dplyr::mutate_at(
dplyr::vars(one_of(cols_to_anon)),
~plyr::mapvalues(<col_name_i>,
mtable_list[[<col_name_i>]]$original,
mtable_list[[<col_name_i>]]$anon
)
)
with `<col_name_i>` being the name of the respective column that is to be anonymized
答案 0 :(得分:1)
一种方法是:
library(rlang)
library(stringr)
library(tidyverse)
df <- tibble::tribble(
~name, ~surname, ~value,
"John", "Doe", 10,
"Jane", "Doe", 20
)
df
my_selection <- exprs(name, surname)
map(df %>%
select(!!!my_selection),
~enframe(unique(.), name = NULL, value = "original") %>%
mutate(anon = str_c(sample(LETTERS, n(), replace = TRUE),
sample(1:1000, n(), replace = TRUE),
sep = ""))) -> recode_table
recode_table
# $name
# # A tibble: 2 x 2
# original anon
# <chr> <chr>
# 1 John F330
# 2 Jane O445
#
# $surname
# # A tibble: 1 x 2
# original anon
# <chr> <chr>
# 1 Doe N710
imap_dfc(recode_table,
~df %>%
select(..2) %>%
`colnames<-`("original") %>%
left_join(recode_table[[..2]], by = "original") %>%
select(-original) %>%
`colnames<-`(..2)) %>%
cbind(
df %>%
select(-c(!!!my_selection))) -> df_anon
df_anon
# name surname value
# 1 F330 N710 10
# 2 O445 N710 20