我正在尝试转换一个字符型数据框,其中的字符串有时在末尾包含一个数字范围或字母范围。如果指定了范围,则需要把范围(A:Z 或 0:9)中的每个值展开为单独的一行,并在每个值前面加上原字符串中位于范围之前的前缀字符。
# Sample input: each zone is paired with a postal string that may end in a
# single-character range such as "B-D" or "6-9".
test.data <- data.frame(
  zone = c("X", "X", "Y", "Y"),
  postal = c("A1V", "A2B-D", "B1V 1A1", "B2V 1B6-9"),
  stringsAsFactors = FALSE
)
> test.data
zone postal
X A1V
X A2B-D
Y B1V 1A1
Y B2V 1B6-9
到目前为止,我只能将范围隔离为单独的元素。
# Put a "~" marker in front of every "x-y" range, then split on that marker so
# the range ends up as its own element of each result vector.
strsplit(
  x = gsub(
    pattern = "([[:alnum:]]-[[:alnum:]])",
    replacement = "~\\1",
    x = test.data$postal
  ),
  split = "~"
)
理想情况下,我正在寻找类似的东西(注意B:D和6:9范围):
> desired.output
zone postal
X A1V
X A2B
X A2C
X A2D
Y B1V 1A1
Y B2V 1B6
Y B2V 1B7
Y B2V 1B8
Y B2V 1B9
答案 0 :(得分:0)
library(tidyr)
library(stringr)
# Expand a range given by its two endpoints into every value it covers.
#
# Arguments:
#   start, end  Endpoints of the range. Either both parse as integers
#               (via strtoi) or both are single upper-case letters A-Z.
#
# Returns a character vector running from start to end, inclusive.
#
# Stops with an error when start comes after end, or when the endpoints are
# neither two integers nor two upper-case letters.
range2sequence <- function(start, end) {
  s <- strtoi(start)
  e <- strtoi(end)
  # Numeric range: both endpoints parsed as integers.
  if (!is.na(s) && !is.na(e)) {
    # Reject descending ranges here too, so that numeric and letter ranges
    # follow the same rule instead of seq() silently counting downwards.
    if (s > e) {
      stop("start must be lower or equal than end.")
    }
    return(as.character(seq(s, e)))
  }
  s <- as.character(start)
  e <- as.character(end)
  # Letter range: locate both endpoints in the alphabet (NA when absent).
  s_pos <- match(s, LETTERS)
  e_pos <- match(e, LETTERS)
  if (!is.na(s_pos) && !is.na(e_pos)) {
    if (s_pos > e_pos) {
      stop("start must be lower or equal than end.")
    }
    return(LETTERS[s_pos:e_pos])
  }
  stop("start and end do not seem to define a valid range: ",
       start,
       " and ",
       end,
       ".")
}
# Expand strings that contain a single-character range ("A2B-D", "B2V 1B6-9")
# into one string per value in the range, each prefixed with the text that
# precedes the range. Strings without a range are returned unchanged.
#
# Arguments:
#   str  Character vector of strings to expand.
#
# Returns a list with one character vector per element of `str`.
#
# SIMPLIFY = FALSE keeps the result a list for every input. With the default
# simplification the result collapses to a character vector when no string
# expands, and to a matrix when all expansions have the same length.
expand_range <- Vectorize(function(str) {
  # Columns of the match: whole match, prefix, range start, range end.
  parts <- str_match(str, "([^-]+)([^-])-([^-])")
  if (is.na(parts[1])) {
    # No "x-y" range found: keep the string as it is.
    return(str)
  }
  # Only the prefix and the expanded values are kept; any text after the
  # range's end character is not part of the result.
  str_c(parts[2], range2sequence(parts[3], parts[4]))
}, SIMPLIFY = FALSE)
# Sample input: each zone is paired with a postal string that may end in a
# single-character range such as "B-D" or "6-9".
test.data <- data.frame(
  zone = c("X", "X", "Y", "Y"),
  postal = c("A1V", "A2B-D", "B1V 1A1", "B2V 1B6-9"),
  stringsAsFactors = FALSE
)
# Replace each postal string with its expanded values (a list column), then
# give every expanded value its own row. `cols` names the list column
# explicitly, as required by unnest() since tidyr 1.0.0.
desired.output <- test.data %>%
  transform(postal = expand_range(postal)) %>%
  unnest(cols = postal)
desired.output
#> zone postal
#> 1 X A1V
#> 2 X A2B
#> 3 X A2C
#> 4 X A2D
#> 5 Y B1V 1A1
#> 6 Y B2V 1B6
#> 7 Y B2V 1B7
#> 8 Y B2V 1B8
#> 9 Y B2V 1B9