整洁地匿名化数据框的选定列

时间:2019-05-16 12:06:47

标签: r dplyr mapping

我正在寻找一种使数据框的选定列匿名化的整洁方法。

我能想到的最好的办法是定义一个映射表,然后使用plyr::mapvalues(),但是我不能全神贯注地对此进行概括以使其与dplyr::mutate_at()一起使用(请参阅下面的伪代码)。

还是最好通过purrr::map2()完成此操作?

library(magrittr)
df <- tibble::tribble(
  ~name,  ~surname, ~value,
  "John", "Doe",    10,
  "Jane", "Doe",    20
)

seed <- 2093
cols_to_anon <- c("name", "surname")
recode_table <- cols_to_anon %>%
  dplyr::syms() %>%
  purrr::map(function(.x) {
    uniques <- df %>%
      dplyr::distinct(!!.x) %>%
      dplyr::pull()
    n <- length(uniques)

    set.seed(seed)
    original <- uniques[sample(1:n)]

    set.seed(seed)
    anon_1 <- sample(LETTERS, n, replace = TRUE)
    set.seed(seed)
    anon_2 <- sample(1:1000, n, replace = TRUE)

    anon <- stringr::str_glue("{anon_1}{anon_2}")
    tibble::tibble(original, anon)
  }) %>%
  purrr::set_names(cols_to_anon)
recode_table
#> $name
#> # A tibble: 2 x 2
#>   original anon      
#>   <chr>    <S3: glue>
#> 1 Jane     W875      
#> 2 John     D149      
#> 
#> $surname
#> # A tibble: 1 x 2
#>   original anon      
#>   <chr>    <S3: glue>
#> 1 Doe      W875

df_anon <- df %>%
  dplyr::mutate(
    name = plyr::mapvalues(name,
      recode_table$name$original,
      recode_table$name$anon
    ),
    surname = plyr::mapvalues(surname,
      recode_table$surname$original,
      recode_table$surname$anon
    )
  )
df_anon
#> # A tibble: 2 x 3
#>   name  surname value
#>   <chr> <chr>   <dbl>
#> 1 D149  W875       10
#> 2 W875  W875       20

reprex package(v0.2.1.9000)于2019-05-16创建

“所需”解决方案的伪代码

df_anon <- df %>%
  dplyr::mutate_at(
    dplyr::vars(one_of(cols_to_anon)),
    ~plyr::mapvalues(<col_name_i>,
      mtable_list[[<col_name_i>]]$original,
      mtable_list[[<col_name_i>]]$anon
    )
  )

with `<col_name_i>` being the name of the respective column that is to be anonymized

1 个答案:

答案 0 :(得分:1)

一种方法是:

library(rlang)
library(stringr)
library(tidyverse)

df <- tibble::tribble(
  ~name,  ~surname, ~value,
  "John", "Doe",    10,
  "Jane", "Doe",    20
)
df

my_selection <- exprs(name, surname)

map(df %>%
      select(!!!my_selection),
    ~enframe(unique(.), name = NULL, value = "original") %>%
      mutate(anon = str_c(sample(LETTERS, n(), replace = TRUE),
                          sample(1:1000, n(), replace = TRUE),
                          sep = ""))) -> recode_table
recode_table
# $name
# # A tibble: 2 x 2
# original anon 
# <chr>    <chr>
#   1 John     F330 
# 2 Jane     O445 
# 
# $surname
# # A tibble: 1 x 2
# original anon 
# <chr>    <chr>
#   1 Doe      N710 

imap_dfc(recode_table,
     ~df %>% 
       select(..2) %>%
       `colnames<-`("original") %>%
       left_join(recode_table[[..2]], by = "original") %>%
       select(-original) %>%
       `colnames<-`(..2)) %>%
  cbind(
    df %>%
          select(-c(!!!my_selection))) -> df_anon
df_anon
# name surname value
# 1 F330    N710    10
# 2 O445    N710    20