我尝试使用模式匹配,基于另一列的内容来添加一个新列。我已阅读过这篇帖子(this post),但没有得到想要的输出。
我想基于GreatGroup列创建一个新列(SubOrder)。 我尝试过以下方法:
# Original attempt from the question: build SubOrder by tagging entries that
# contain a given suborder name (case-insensitive).
# NOTE(review): myData is a data frame, so length(myData) is its number of
# columns, and grepl(..., myData) tests the data frame as a whole (one
# deparsed string per column) rather than the rows of the GreatGroup column.
SubOrder <- rep(NA_character_, length(myData))
SubOrder[grepl("udults", myData, ignore.case = TRUE)] <- "Udults"
SubOrder[grepl("aquults", myData, ignore.case = TRUE)] <- "Aquults"
SubOrder[grepl("aqualfs", myData, ignore.case = TRUE)] <- "aqualfs"
SubOrder[grepl("humods", myData, ignore.case = TRUE)] <- "humods"
SubOrder[grepl("udalfs", myData, ignore.case = TRUE)] <- "udalfs"
SubOrder[grepl("orthods", myData, ignore.case = TRUE)] <- "orthods"
# NOTE(review): this line repeats the "udalfs" assignment made two lines above.
SubOrder[grepl("udalfs", myData, ignore.case = TRUE)] <- "udalfs"
SubOrder[grepl("psamments", myData, ignore.case = TRUE)] <- "psamments"
SubOrder[grepl("udepts", myData, ignore.case = TRUE)] <- "udepts"
SubOrder[grepl("fluvents", myData, ignore.case = TRUE)] <- "fluvents"
SubOrder[grepl("aquods", myData, ignore.case = TRUE)] <- "aquods"
例如,我想在任何单词(例如Hapludults或Paleudults)中查找"udults",并且只返回"udults"。
编辑:如果有人想尝试alistaire在评论中提出的方法,这就是我要使用的搜索模式。 subOrderNames <- c("Udults", "Aquults", "Aqualfs", "Humods", "Udalfs", "Orthods", "Psamments", "Udepts", "fluvents")
以下示例数据。
myData <- dput(head(test))
structure(list(1:6, SID = c(200502L, 200502L, 200502L, 200502L,
200502L, 200502L), Groupdepth = c(11L, 12L, 13L, 14L, 21L, 22L
), AWC0to10 = c(0.12, 0.12, 0.12, 0.12, 0.12, 0.12), AWC10to20 = c(0.12,
0.12, 0.12, 0.12, 0.12, 0.12), AWC20to50 = c(0.12, 0.12, 0.12,
0.12, 0.12, 0.12), AWC50to100 = c(0.15, 0.15, 0.15, 0.15, 0.15,
0.15), Db3rdbar0to10 = c(1.43, 1.43, 1.43, 1.43, 1.43, 1.43),
Db3rdbar10to20 = c(1.43, 1.43, 1.43, 1.43, 1.43, 1.43), Db3rdbar20to50 = c(1.43,
1.43, 1.43, 1.43, 1.43, 1.43), Db3rdbar50to100 = c(1.43,
1.43, 1.43, 1.43, 1.43, 1.43), HydrcRatngPP = c(0L, 0L, 0L,
0L, 0L, 0L), OrgMatter0to10 = c(1.25, 1.25, 1.25, 1.25, 1.25,
1.25), OrgMatter10to20 = c(1.25, 1.25, 1.25, 1.25, 1.25,
1.25), OrgMatter20to50 = c(1.02, 1.02, 1.02, 1.02, 1.02,
1.02), OrgMatter50to100 = c(0.12, 0.12, 0.12, 0.12, 0.12,
0.12), Clay0to10 = c(8, 8, 8, 8, 8, 8), Clay10to20 = c(8,
8, 8, 8, 8, 8), Clay20to50 = c(9.4, 9.4, 9.4, 9.4, 9.4, 9.4
), Clay50to100 = c(40, 40, 40, 40, 40, 40), Sand0to10 = c(85,
85, 85, 85, 85, 85), Sand10to20 = c(85, 85, 85, 85, 85, 85
), Sand20to50 = c(83, 83, 83, 83, 83, 83), Sand50to100 = c(45.8,
45.8, 45.8, 45.8, 45.8, 45.8), pHwater0to20 = c(6.3, 6.3,
6.3, 6.3, 6.3, 6.3), Ksat0to10 = c(23, 23, 23, 23, 23, 23
), Ksat10to20 = c(23, 23, 23, 23, 23, 23), Ksat20to50 = c(19.7333,
19.7333, 19.7333, 19.7333, 19.7333, 19.7333), Ksat50to100 = c(9,
9, 9, 9, 9, 9), TaxClName = c("Fine, mixed, semiactive, mesic Oxyaquic Hapludults",
"Fine, mixed, semiactive, mesic Oxyaquic Hapludults", "Fine, mixed, semiactive, mesic Oxyaquic Hapludults",
"Fine, mixed, semiactive, mesic Oxyaquic Hapludults", "Fine, mixed, semiactive, mesic Oxyaquic Hapludults",
"Fine, mixed, semiactive, mesic Oxyaquic Hapludults"), GreatGroup = c("Hapludults",
"Hapludults", "Hapludults", "Hapludults", "Hapludults", "Hapludults"
)), .Names = c("", "SID", "Groupdepth", "AWC0to10", "AWC10to20",
"AWC20to50", "AWC50to100", "Db3rdbar0to10", "Db3rdbar10to20",
"Db3rdbar20to50", "Db3rdbar50to100", "HydrcRatngPP", "OrgMatter0to10",
"OrgMatter10to20", "OrgMatter20to50", "OrgMatter50to100", "Clay0to10",
"Clay10to20", "Clay20to50", "Clay50to100", "Sand0to10", "Sand10to20",
"Sand20to50", "Sand50to100", "pHwater0to20", "Ksat0to10", "Ksat10to20",
"Ksat20to50", "Ksat50to100", "TaxClName", "GreatGroup"), class = c("tbl_df",
"data.frame"), row.names = c(NA, -6L))
答案 0 :(得分:3)
一些选项,其中一些是我在上面的评论中发布的。
注意:所有选项都假设替换匹配模式的字符串只是模式。如果您想要其他内容,则可以轻松编辑它们以包含单独的替换值。
for
+ grepl
使用与原始代码相同的代码,但循环以避免重复代码:
# make a list of patterns
pat <- c('udults', 'aquults', 'aqualfs', 'humods', 'udalfs', 'orthods', 'psamments', 'udepts', 'fluvents', 'aquods')
# One slot per ROW of myData. length(myData) would return the number of
# columns of the data frame and produce a vector of the wrong length.
SubOrder <- rep(NA_character_, nrow(myData))
# Tag every row whose GreatGroup contains the pattern (case-insensitive).
# Rows matching none of the patterns stay NA.
for (p in pat) {
  SubOrder[grepl(p, myData$GreatGroup, ignore.case = TRUE)] <- p
}
for
+ gsub
通过复制myData$GreatGroup
然后使用gsub
更改新列来构建新列。在模式前后额外拼接的正则表达式(.*)会匹配同一字符串中模式前后的其余字符,因此整个字符串都会被替换为模式本身。
# Start from a copy of GreatGroup, then collapse each value that contains a
# pattern down to the pattern itself.
myData$SubOrder <- myData$GreatGroup
for (x in pat) {
  # Surrounding '.*' makes the regex consume the whole string, so gsub()
  # replaces the entire value rather than just the matched fragment.
  myData$SubOrder <- gsub(
    paste0('.*', x, '.*'),
    x,
    myData$SubOrder,
    ignore.case = TRUE
  )
}
请注意,与pat
中的某个字符串不匹配的值的值为GreatGroup
,而不是NA
。如果您希望它们为NA
,请使用
myData$SubOrder[!(myData$SubOrder %in% pat)] <- NA
stringr::str_replace_all
我最喜欢的是因为它不循环,虽然它需要stringr
包(无论如何都非常棒)。
从pat
创建一个命名列表,其中名称是要匹配的正则表达式,元素值是用来替换的字符串:
# Build the lookup: each element's value is a pattern from pat, and its name
# is that pattern wrapped in '.*' on both sides.
# NOTE(review): stringr documents a named character vector for multiple
# replacements in str_replace_all(); confirm the installed version accepts a list.
l <- as.list(pat)
names(l) <- paste0('.*', pat, '.*')
所以它看起来像
> l
$`.*udults.*`
[1] "udults"
$`.*aquults.*`
[1] "aquults"
$`.*aqualfs.*`
[1] "aqualfs"
......
然后使用str_replace_all
完成所有操作:
myData$SubOrder <- str_replace_all(myData$GreatGroup, l)
搞定。
注1: str_replace_all
没有ignore.case
选项,但您可以将myData$GreatGroup
包裹在tolower
中(简单)或重新配置正则表达式(硬)。
注2:与选项2 一样,它会将不匹配的条目作为GreatGroup
的值;如有需要,可以使用该选项末尾的那行代码把它们改回NA
。
答案 1 :(得分:1)
试试这个:
myData$SubOrder[grepl("udults", myData$TaxClName, ignore.case = TRUE) | grepl("udults", myData$GreatGroup, ignore.case = TRUE)] <- "Udults"
您可以根据需要为过滤器添加任意数量的列。
答案 2 :(得分:1)
我正在使用dplyr,但您可能需要创建一个巨大的嵌套ifelse语句...
library(dplyr)
# Nested ifelse(): each grepl() test falls through to the next one when FALSE.
# Every grepl() call must be closed before the 'yes' value of its ifelse().
# Rows that match none of the tested patterns end up with the final 'aquods'.
myData %>%
  mutate(SubOrder = ifelse(grepl('udults', GreatGroup, ignore.case = TRUE), 'Udults',
                    ifelse(grepl('aquults', GreatGroup, ignore.case = TRUE), 'Aquults',
                    ### All of the other ifelse statements
                    ifelse(grepl('fluvents', GreatGroup, ignore.case = TRUE), 'fluvents', 'aquods')
  )))
答案 3 :(得分:1)
您可以使用连续替换每个模式的函数来执行此操作,从而避免重复使用代码。请注意,使用此方法,如果给定字符串匹配多个模式,则替换序列中的第一个模式将是使用的模式。
# multi-grepl function adapted from http://stackoverflow.com/a/15254254/496488
#' Replace whole strings according to the first pattern they match
#'
#' Each element of `x` is tested against `pattern[1]`, `pattern[2]`, ... in
#' order. The element is replaced by the `replacement` that belongs to the
#' first pattern it matches; elements matching no pattern are returned
#' unchanged.
#'
#' @param pattern Character vector of regular expressions.
#' @param replacement Character vector of the same length as `pattern`.
#' @param x Character vector to search.
#' @param ... Further arguments passed on to `grepl()` (e.g. `ignore.case`).
#' @return A vector the same length as `x`.
mgrepl <- function(pattern, replacement, x, ...) {
  if (length(pattern) != length(replacement)) {
    stop("pattern and replacement do not have the same length.")
  }
  result <- x
  # Tracks elements that have not been claimed by an earlier pattern yet.
  unmatched <- rep(TRUE, length(x))
  # seq_along() keeps the loop empty when no patterns are supplied.
  for (i in seq_along(pattern)) {
    # Match against the ORIGINAL values, so that a replacement string written
    # by an earlier pattern can never be re-matched by a later one.
    hit <- unmatched & grepl(pattern[i], x, ...)
    result[hit] <- replacement[i]
    unmatched[hit] <- FALSE
  }
  result
}
# Patterns to look for, and the label each one maps to (same order)
pat <- c("udults", "aquults", "humods", "fluvents")
repl <- c("Udults", "Aquults", "humods", "fluvents")
# Label every GreatGroup value with its suborder
SubOrder <- mgrepl(pat, repl, myData$GreatGroup)
SubOrder
[1] "Udults" "Udults" "Udults" "Udults" "Udults" "Udults"
# Or, if you want to add this as a new column to the data:
myData$SubOrder = mgrepl(pat, repl, myData$GreatGroup)
另外一个注意事项:问题中代码的一个问题是您引用了整个数据框,而不是要替换的列:
SubOrder[grepl("udults", myData, ignore.case = TRUE)] <- "Udults"
应改为
SubOrder[grepl("udults", myData$GreatGroup, ignore.case = TRUE)] <- "Udults"
更新:关于您的评论,请参阅以下代码。该函数确实用“Udults”替换了两个值。
myData$GreatGroup[1] = "Paleudults"
myData$GreatGroup
[1] "Paleudults" "Hapludults" "Hapludults" "Hapludults" "Hapludults" "Hapludults"
mgrepl(pat, repl, myData$GreatGroup)
[1] "Udults" "Udults" "Udults" "Udults" "Udults" "Udults"