R按子串分组

时间:2019-02-19 21:30:33

标签: r string substring

样本数据

# Sample data: `WANT` is the expected label for each `name`.
data <- data.frame(
  id = c(1, 2, 3, 4, 5),
  name = c("blue", "green", "red", "read", "HUE"),
  WANT = c("ue", "re", "re", "re", "ue")
)

说明:如果 name 包含 "ue",则 WANT = "ue";如果 name 包含 "re",则 WANT = "re"。不区分大小写。

这是我的尝试:

    # Asker's attempt, kept as posted (it does not run as written).
    # NOTE(review): `df` is not created in the visible snippets; the sample data
    # frame above is named `data`.
    df$attempt <- NA
# substr() needs `start` and `stop` positions, which are missing in the two
# calls below. substr() also extracts by position and `==` is case-sensitive,
# so this form cannot express "name contains the substring, ignoring case".
df$attempt[substr(df$name) == "ue"] <- "ue"
df$attempt[substr(df$name) == "re"] <- "re"

4 个答案:

答案 0 :(得分:2)

这里有几个版本

# Sample input: an id and a name for each of the five rows.
data <- data.frame(
  id = c(1, 2, 3, 4, 5),
  name = c("blue", "green", "red", "read", "HUE")
)

# Base R version.
# grepl(..., ignore.case = TRUE) matches regardless of letter case, so "HUE"
# is labelled "ue". "ue" is tested first, so it wins when a name contains both
# substrings. NA_character_ keeps the column character even when no name
# matches either pattern.
data$want <- ifelse(
  grepl("ue", data$name, ignore.case = TRUE),
  "ue",
  ifelse(grepl("re", data$name, ignore.case = TRUE), "re", NA_character_)
)
# tidyverse version
library(dplyr)

# Adds `want`: "ue" when `name` contains "ue", otherwise "re" when it contains
# "re", otherwise NA. Matching ignores case. NA_character_ keeps the column
# character even if no row matches.
data <- data %>%
  mutate(
    want = ifelse(
      grepl("ue", name, ignore.case = TRUE),
      "ue",
      ifelse(grepl("re", name, ignore.case = TRUE), "re", NA_character_)
    )
  )

答案 1 :(得分:2)

使用 stringr(tidyverse 的一部分)的解决方案。

library(tidyverse)

# Pull the first "ue"/"re" occurrence out of `name` (case-insensitively),
# then lower-case it so "UE" and "ue" end up with the same label.
data2 <- mutate(
  data,
  attempt = str_to_lower(
    str_extract(name, pattern = regex("ue|re", ignore_case = TRUE))
  )
)
data2
#   id  name WANT attempt
# 1  1  blue   ue      ue
# 2  2 green   re      re
# 3  3   red   re      re
# 4  4  read   re      re
# 5  5   HUE   ue      ue

数据

# Input used by this answer; `WANT` holds the expected label per row.
data <- data.frame(
  id = c(1, 2, 3, 4, 5),
  name = c("blue", "green", "red", "read", "HUE"),
  WANT = c("ue", "re", "re", "re", "ue")
)

答案 2 :(得分:0)

尝试在 mutate 中使用 ifelse。grepl("ue", name, ignore.case = TRUE) 用于检查 ue 或 UE 是否存在;同样的逻辑也适用于 re。

library(dplyr)

    # Adds `Attempt`: "ue" when `name` contains "ue" (any case), otherwise "re"
    # when it contains "re", otherwise NA. NA_character_ keeps the column
    # character even if no row matches.
    data <- data %>%
      mutate(
        Attempt = ifelse(
          grepl("ue", name, ignore.case = TRUE),
          "ue",
          ifelse(grepl("re", name, ignore.case = TRUE), "re", NA_character_)
        )
      )

答案 3 :(得分:0)

使用 purrr 和 dplyr

library(dplyr)
library(purrr)

# For each row, keep the `WANT` pattern as the group label when it occurs in
# `name` (case-insensitively), and return NA otherwise. map2_chr() requires
# every result to have length 1, so a non-matching row must yield
# NA_character_ rather than an empty vector. isTRUE() also covers an NA
# pattern, for which grepl() returns NA.
data %>%
  mutate(group = map2_chr(
    WANT, name,
    ~ if (isTRUE(grepl(.x, .y, ignore.case = TRUE))) .x else NA_character_
  ))

输出:

  id  name WANT group
1  1  blue   ue    ue
2  2 green   re    re
3  3   red   re    re
4  4  read   re    re
5  5   HUE   hu    hu

数据:

# Input for this answer. Strings are kept as character (not factor) so each
# `WANT` value can be passed to grepl() as a pattern.
data <- data.frame(
  id = c(1, 2, 3, 4, 5),
  name = c("blue", "green", "red", "read", "HUE"),
  WANT = c("ue", "re", "re", "re", "hu"),
  stringsAsFactors = FALSE
)