按数字或字母范围展开字符串

时间:2019-01-30 22:46:37

标签: r sequence gsub

我正在尝试转换一个字符型数据框,其中的字符串末尾有时带有一个数字或字母范围。如果指定了范围,则需要把范围(A:Z | 0:9)中的每个值展开为单独的一行,并在每个值前面拼接上该范围之前的前缀字符。

# Sample input: a postal code may end in a one-character range such as
# "B-D" or "6-9". FALSE is spelled out because `F` is an ordinary variable
# that can be reassigned.
test.data <- data.frame(
  zone = c("X", "X", "Y", "Y"),
  postal = c("A1V", "A2B-D", "B1V 1A1", "B2V 1B6-9"),
  stringsAsFactors = FALSE
)

> test.data
zone postal
X    A1V
X    A2B-D
Y    B1V 1A1
Y    B2V 1B6-9

到目前为止,我只能将范围隔离为单独的元素。

# Prefix every "<char>-<char>" range with "~", then split on that marker so
# the range ends up as its own element.
strsplit(
  x = gsub(
    pattern = "([[:alnum:]]-[[:alnum:]])",
    replacement = "~\\1",
    x = test.data$postal
  ),
  split = "~"
)

理想情况下,我希望得到类似下面的结果(注意 B:D 和 6:9 这两个范围已被展开):

> desired.output
zone postal
X    A1V
X    A2B
X    A2C
X    A2D
Y    B1V 1A1
Y    B2V 1B6
Y    B2V 1B7
Y    B2V 1B8
Y    B2V 1B9 

1 个答案:

答案 0 :(得分:0)

library(tidyr)
library(stringr)

#' Expand a range given by its two end points into all of its members.
#'
#' Two kinds of range are supported: base-10 integers (e.g. "6" to "9") and
#' upper-case letters from `LETTERS` (e.g. "B" to "D").
#'
#' @param start First element of the range (a single string or number).
#' @param end Last element of the range (a single string or number).
#' @return A character vector holding every value from `start` to `end`,
#'   inclusive, in ascending order.
#' @details Signals an error when `start` comes after `end`, or when the two
#'   end points are neither both integers nor both upper-case letters.
range2sequence <- function(start, end) {
  # strtoi() returns NA for anything that is not an integer string, which is
  # how the numeric case is told apart from the letter case.
  start_num <- strtoi(start)
  end_num <- strtoi(end)

  if (!is.na(start_num) && !is.na(end_num)) {
    # seq() would silently count downwards for a reversed range; reject it so
    # numeric ranges follow the same ordering rule as letter ranges below.
    if (start_num > end_num) {
      stop("start must be lower or equal than end.")
    }
    return(as.character(seq(start_num, end_num)))
  }

  start_chr <- as.character(start)
  end_chr <- as.character(end)

  if (start_chr %in% LETTERS && end_chr %in% LETTERS) {
    start_pos <- match(start_chr, LETTERS)
    end_pos <- match(end_chr, LETTERS)
    if (start_pos > end_pos) {
      stop("start must be lower or equal than end.")
    }
    return(LETTERS[start_pos:end_pos])
  }

  # Mixed or unsupported end points (e.g. "6" to "D", lower-case letters).
  stop("start and end do not seem to define a valid range: ",
       start,
       " and ",
       end,
       ".")
}

#' Expand strings of the form "<prefix><start>-<end>" into one string per
#' value of the range, each with the prefix pasted in front.
#'
#' Strings that do not contain such a range are returned unchanged.
#'
#' @param str Character vector of codes, e.g. "A2B-D" or "B1V 1A1".
#' @return A list with one character vector per element of `str`.
#'   `SIMPLIFY = FALSE` keeps the result a list for every input; with the
#'   default simplification the shape would depend on the data (a plain
#'   vector when nothing expands, a matrix when every element expands to the
#'   same length), which breaks use as a list column.
expand_range <- Vectorize(function(str) {
  # Capture groups: (prefix)(range start)-(range end).
  sub_strs <- str_match(str, "([^-]+)([^-])-([^-])")

  # No match: str_match() reports NA, so hand the input back untouched.
  if (is.na(sub_strs[1])) {
    return(str)
  }

  prefix <- sub_strs[2]
  start <- sub_strs[3]
  end <- sub_strs[4]
  str_c(prefix, range2sequence(start, end))
}, SIMPLIFY = FALSE)

# Sample input: a postal code may end in a one-character range such as
# "B-D" or "6-9". FALSE is spelled out because `F` is an ordinary variable
# that can be reassigned.
test.data <- data.frame(
  zone = c("X", "X", "Y", "Y"),
  postal = c("A1V", "A2B-D", "B1V 1A1", "B2V 1B6-9"),
  stringsAsFactors = FALSE
)

# Replace each postal code with its expanded values (a list column), then
# unnest that column so every expanded value gets its own row. The column is
# named explicitly because calling unnest() without one is deprecated in
# tidyr >= 1.0.
desired.output <- test.data %>%
  transform(postal = expand_range(postal)) %>%
  unnest(postal)

desired.output
#>   zone  postal
#> 1    X     A1V
#> 2    X     A2B
#> 3    X     A2C
#> 4    X     A2D
#> 5    Y B1V 1A1
#> 6    Y B2V 1B6
#> 7    Y B2V 1B7
#> 8    Y B2V 1B8
#> 9    Y B2V 1B9