Question

我有一个数据框，其中包含多个存放词形标记（词性标记）的列，如下面的模拟示例所示：

# Mock data: four columns (w1..w4), each holding one tag per row.
df <- data.frame(
  w1 = c("NN1", "NN0", "ADJ", "ADC", "NP0", "VVZ"),
  w2 = c("NN0", "NN2", "ADC", "NN0", "VBN", "NN1"),
  w3 = c("ADJ", "NN2", "NP0", "VVG", "ADS", "NN1"),
  w4 = c("NN2", "NN2", "ADJ", "ADJ", "ADS", "VVD")
)
# Show the data frame.
df

现在我想用更宽泛的类别对这些标记重新分类，例如用“名词”统称单数名词（标记为“NN1”）、复数名词（“NN2”）、专有名词（“NP0”）等。我可以逐列进行这种转换，如下所示：

# Recode each tag column into a new <column>_class column, one column at a
# time. The same rule is repeated for w1..w4:
#   tag starts with "N" -> "noun"
#   tag starts with "V" -> "verb"
#   anything else       -> "adjective" (the fallback, not an explicit "^A" test)
df$w1_class <- ifelse(grepl("^N", df$w1), "noun", 
                      ifelse(grepl("^V", df$w1), "verb", "adjective"))
# Same rule applied to w2.
df$w2_class <- ifelse(grepl("^N", df$w2), "noun", 
                      ifelse(grepl("^V", df$w2), "verb", "adjective"))
# Same rule applied to w3.
df$w3_class <- ifelse(grepl("^N", df$w3), "noun", 
                      ifelse(grepl("^V", df$w3), "verb", "adjective"))
# Same rule applied to w4.
df$w4_class <- ifelse(grepl("^N", df$w4), "noun", 
                      ifelse(grepl("^V", df$w4), "verb", "adjective"))

如果像我的真实数据框那样有更多这样的列和更多标记类型，这种高度重复的代码写起来就非常单调乏味。能否一次性完成对所有列的转换？

Answer 1

您可以将映射逻辑放在函数中，然后使用dplyr::mutate_all：

library(dplyr)

# Example data: four tag columns. stringsAsFactors = FALSE keeps the tags as
# character vectors rather than factors.
df <- data.frame(
  w1 = c("NN1", "NN0", "ADJ", "ADC", "NP0", "VVZ"),
  w2 = c("NN0", "NN2", "ADC", "NN0", "VBN", "NN1"),
  w3 = c("ADJ", "NN2", "NP0", "VVG", "ADS", "NN1"),
  w4 = c("NN2", "NN2", "ADJ", "ADJ", "ADS", "VVD"),
  stringsAsFactors = FALSE
)

#' Recode tags into broader word classes.
#'
#' Each rule in `mapping` is applied in order with `sub()`: every element of
#' `tags` that matches the rule's regular expression is replaced by the rule's
#' label. Elements that match no rule are returned unchanged.
#'
#' Because rules run sequentially on the running result, a label that itself
#' matches a later pattern would be rewritten by that later rule. The default
#' labels are lower-case and the default patterns are upper-case anchored, so
#' this cannot happen with the defaults.
#'
#' @param tags A vector of tags (anything `sub()` can coerce to character).
#' @param mapping A named character vector: names are regular expressions
#'   (matching the whole tag to replace), values are the replacement labels.
#'   Defaults to the noun / verb / adjective rules.
#' @return A character vector the same length as `tags`.
foo <- function(tags,
                mapping = c("^N.*" = "noun",
                            "^V.*" = "verb",
                            "^A.*" = "adjective")) {
  stopifnot(is.character(mapping), !is.null(names(mapping)))
  patterns <- names(mapping)
  # Index by position so duplicated pattern names still pair with their label.
  for (i in seq_along(mapping)) {
    tags <- sub(patterns[[i]], mapping[[i]], tags)
  }
  tags
}

# Recode every column with foo(), then append "_class" to each column name.
# The renaming step uses a plain function instead of dplyr::funs(), which has
# been deprecated since dplyr 0.8.0 and emits a deprecation warning.
out <- df %>%
  mutate_all(foo) %>%
  rename_all(function(nm) paste0(nm, "_class"))

Answer 2

使用dplyr::mutate_all和dplyr::case_when的一个解决方案可以是：

library(dplyr)

# Classify every column's tags by their first letter; tags that start with
# none of N / V / A fall through to "Other". An anonymous function is used
# instead of dplyr::funs(), which has been deprecated since dplyr 0.8.0.
df %>% mutate_all(function(tag) {
  case_when(
    grepl("^N", tag) ~ "noun",
    grepl("^V", tag) ~ "verb",
    grepl("^A", tag) ~ "adjective",
    TRUE             ~ "Other"
  )
})

#          w1        w2        w3        w4
# 1      noun      noun adjective      noun
# 2      noun      noun      noun      noun
# 3 adjective adjective      noun adjective
# 4 adjective      noun      verb adjective
# 5      noun      verb adjective adjective
# 6      verb      noun      noun      verb

数据：

# Input data used by the snippet above: four tag columns, six rows.
df <- data.frame(
  w1 = c("NN1", "NN0", "ADJ", "ADC", "NP0", "VVZ"),
  w2 = c("NN0", "NN2", "ADC", "NN0", "VBN", "NN1"),
  w3 = c("ADJ", "NN2", "NP0", "VVG", "ADS", "NN1"),
  w4 = c("NN2", "NN2", "ADJ", "ADJ", "ADS", "VVD")
)

在R中一次执行多列的转换

2 个答案: