Question

我正在尝试转换一个字符型数据框，其中的字符串有时会在末尾包含一个数字或字母范围。如果指定了范围，则需要把该范围（A:Z 或 0:9）内的每个值拆分成单独的一行，并在每个值前面加上范围之前的那部分字符。

# Sample input: some postal codes end in a range such as "B-D" or "6-9".
# `FALSE` is spelled out because the shorthand `F` is an ordinary variable
# that can be reassigned.
test.data <- data.frame(
  zone = c("X", "X", "Y", "Y"),
  postal = c("A1V", "A2B-D", "B1V 1A1", "B2V 1B6-9"),
  stringsAsFactors = FALSE
)

> test.data
zone postal
X    A1V
X    A2B-D
Y    B1V 1A1
Y    B2V 1B6-9

到目前为止，我只能将范围隔离为单独的元素。

# Put a "~" marker in front of every "<char>-<char>" range, then split on
# that marker so the range becomes its own element.
strsplit(
  gsub("([[:alnum:]]-[[:alnum:]])", "~\\1", test.data$postal),
  split = "~"
)

理想情况下，我希望得到类似下面的结果（注意 B:D 和 6:9 这两个范围已被展开）：

> desired.output
zone postal
X    A1V
X    A2B
X    A2C
X    A2D
Y    B1V 1A1
Y    B2V 1B6
Y    B2V 1B7
Y    B2V 1B8
Y    B2V 1B9

Answer 1

library(tidyr)
library(stringr)

#' Expand a range, given by its two endpoints, into every value it covers.
#'
#' Two kinds of range are supported: integers (e.g. "6" to "9") and
#' upper-case letters (e.g. "B" to "D"). Both kinds must be ascending.
#'
#' @param start First value of the range, as a single string.
#' @param end Last value of the range, as a single string of the same kind
#'   as `start`.
#' @return A character vector with every value from `start` to `end`,
#'   inclusive.
range2sequence <- function(start, end) {
  stopifnot(length(start) == 1L, length(end) == 1L)

  # strtoi() yields NA for anything that is not an integer, which is how the
  # numeric and the letter case are told apart.
  s <- strtoi(start, base = 10L)
  e <- strtoi(end, base = 10L)

  # Numeric range: both endpoints parsed as integers.
  if (!is.na(s) && !is.na(e)) {
    # Reject descending ranges, exactly as the letter branch below does,
    # instead of silently returning a reversed sequence.
    if (s > e) {
      stop("start must be lower or equal than end.")
    }
    return(as.character(seq(s, e)))
  }

  # Letter range: both endpoints are single upper-case letters A-Z.
  s_pos <- match(as.character(start), LETTERS)
  e_pos <- match(as.character(end), LETTERS)
  if (!is.na(s_pos) && !is.na(e_pos)) {
    if (s_pos > e_pos) {
      stop("start must be lower or equal than end.")
    }
    return(LETTERS[seq(s_pos, e_pos)])
  }

  # Mixed kinds (e.g. "A" and "9") or unsupported characters.
  stop("start and end do not seem to define a valid range: ",
       start,
       " and ",
       end,
       ".")
}

# Expand every string that contains a trailing range ("A2B-D", "B2V 1B6-9")
# into one string per value of that range; strings without a range are
# returned unchanged.
#
# Returns a list with one character vector per input element.
# `SIMPLIFY = FALSE` is required: with the default, the result would collapse
# to a plain vector when no input has a range, and to a matrix when all
# inputs expand to the same number of values, so the shape would depend on
# the data.
expand_range <- Vectorize(function(str) {
  # Capture groups: [2] prefix kept in front of every expanded value,
  # [3] first value of the range, [4] last value of the range.
  sub_strs <- str_match(str, "([^-]+)([^-])-([^-])")

  # No range found (or `str` is NA): nothing to expand.
  if (is.na(sub_strs[1])) {
    return(str)
  }

  prefix <- sub_strs[2]
  start <- sub_strs[3]
  end <- sub_strs[4]
  str_c(prefix, range2sequence(start, end))
}, SIMPLIFY = FALSE)

# Sample input: some postal codes end in a range such as "B-D" or "6-9".
# `FALSE` is spelled out because the shorthand `F` is an ordinary variable
# that can be reassigned.
test.data <- data.frame(
  zone = c("X", "X", "Y", "Y"),
  postal = c("A1V", "A2B-D", "B1V 1A1", "B2V 1B6-9"),
  stringsAsFactors = FALSE
)

# Turn `postal` into a list-column holding the expanded values of each row,
# then give every expanded value its own row. The column to unnest is named
# explicitly because calling unnest() without one is deprecated in
# tidyr >= 1.0.
desired.output <- test.data %>%
  transform(postal = expand_range(postal)) %>%
  unnest(postal)

desired.output
#>   zone  postal
#> 1    X     A1V
#> 2    X     A2B
#> 3    X     A2C
#> 4    X     A2D
#> 5    Y B1V 1A1
#> 6    Y B2V 1B6
#> 7    Y B2V 1B7
#> 8    Y B2V 1B8
#> 9    Y B2V 1B9

用数字和字符范围分割字符

1 个答案: