Question

我正在处理HCUP数据，并且在一个列中有一系列值需要拆分成多列。以下是HCUP数据框供参考：

code            label
61000-61003     excision of CNS
0169T-0169T     ventricular shunt

所需的输出应为：

code            label
61000           excision of CNS
61001           excision of CNS
61002           excision of CNS
61003           excision of CNS
0169T           ventricular shunt

我遇到此问题的方法是使用包 splitstackshape 并使用此代码

library(data.table)
library(splitstackshape)

cSplit(hcup, "code", "-")[, list(code = code_1:code_2, by = label)]

这种方法会导致内存问题。有没有更好的方法解决这个问题？

一些意见：

除“T”外，数据有很多字母。
这封信可以在前面或最后但不在两个数字之间。
在一个范围内，“T”到“U”的字母没有变化

Answer 1

以下是使用dplyr中的all.is.numeric和Hmisc的解决方案：

library(dplyr)
library(Hmisc)
library(tidyr)
dat %>% separate(code, into=c("code1", "code2")) %>%
        rowwise %>%
        mutate(lists = ifelse(all.is.numeric(c(code1, code2)),
                         list(as.character(seq(from = as.numeric(code1), to = as.numeric(code2)))),
                         list(code1))) %>%
        unnest(lists) %>%
        select(code = lists, label)

Source: local data frame [5 x 2]

   code             label
  (chr)            (fctr)
1 61000   excision of CNS
2 61001   excision of CNS
3 61002   excision of CNS
4 61003   excision of CNS
5 0169T ventricular shunt

使用字符值修复范围的编辑。简单介绍一下简单：

dff %>% mutate(row = row_number()) %>%
        separate(code, into=c("code1", "code2")) %>%
        group_by(row) %>%
        summarise(lists = if(all.is.numeric(c(code1, code2)))
                              {list(str_pad(as.character(
                                   seq(from = as.numeric(code1), to = as.numeric(code2))),
                                         nchar(code1), pad="0"))}
                          else if(grepl("^[0-9]", code1))
                              {list(str_pad(paste0(as.character(
                                   seq(from = extract_numeric(code1), to = extract_numeric(code2))),
                                      strsplit(code1, "[0-9]+")[[1]][2]),
                                         nchar(code1), pad = "0"))}
                          else
                              {list(paste0(
                                      strsplit(code1, "[0-9]+")[[1]],
                                      str_pad(as.character(
                                    seq(from = extract_numeric(code1), to = extract_numeric(code2))),
                                         nchar(gsub("[^0-9]", "", code1)), pad="0")))},
                   label = first(label)) %>%
        unnest(lists) %>%
        select(-row)
Source: local data frame [15 x 2]

               label lists
               (chr) (chr)
1    excision of CNS 61000
2    excision of CNS 61001
3    excision of CNS 61002
4  ventricular shunt 0169T
5  ventricular shunt 0170T
6  ventricular shunt 0171T
7    excision of CNS 01000
8    excision of CNS 01001
9    excision of CNS 01002
10    some procedure A2543
11    some procedure A2544
12    some procedure A2545
13    some procedure A0543
14    some procedure A0544
15    some procedure A0545

数据：

dff <- structure(list(code = c("61000-61002", "0169T-0171T", "01000-01002", 
"A2543-A2545", "A0543-A0545"), label = c("excision of CNS", "ventricular shunt", 
"excision of CNS", "some procedure", "some procedure")), .Names = c("code", 
"label"), row.names = c(NA, 5L), class = "data.frame")

Answer 2

原始答案：请参阅下面的更新。

首先，我通过将第一行添加到底部来使您的示例数据更具挑战性。

dff <- structure(list(code = c("61000-61003", "0169T-0169T", "61000-61003"
), label = c("excision of CNS", "ventricular shunt", "excision of CNS"
)), .Names = c("code", "label"), row.names = c(NA, 3L), class = "data.frame")

dff
#          code             label
# 1 61000-61003   excision of CNS
# 2 0169T-0169T ventricular shunt
# 3 61000-61003   excision of CNS

我们可以使用序列运算符:来获取code列的序列，并使用tryCatch()包装，这样我们就可以避免错误，并保存无法排序的值。首先，我们使用短划线标记-拆分值，然后通过lapply()运行。

xx <- lapply(
    strsplit(dff$code, "-", fixed = TRUE), 
    function(x) tryCatch(x[1]:x[2], warning = function(w) x)
)
data.frame(code = unlist(xx), label = rep(dff$label, lengths(xx)))
#     code             label
# 1  61000   excision of CNS
# 2  61001   excision of CNS
# 3  61002   excision of CNS
# 4  61003   excision of CNS
# 5  0169T ventricular shunt
# 6  0169T ventricular shunt
# 7  61000   excision of CNS
# 8  61001   excision of CNS
# 9  61002   excision of CNS
# 10 61003   excision of CNS

我们正在尝试将序列运算符:应用于strsplit()中的每个元素，如果无法使用x[1]:x[2]，则只返回这些元素的值并继续否则序列x[1]:x[2]。然后，我们只需根据label中的结果长度复制xx列的值，即可获得新的label列。

更新：以下是我为响应您的修改而想出的内容。将上面的xx替换为

xx <- lapply(strsplit(dff$code, "-", TRUE), function(x) {
    s <- stringi::stri_locate_first_regex(x, "[A-Z]")
    nc <- nchar(x)[1L]
    fmt <- function(n) paste0("%0", n, "d")
    if(!all(is.na(s))) {
        ss <- s[1,1]
        fmt <- fmt(nc-1)
        if(ss == 1L) {
            xx <- substr(x, 2, nc)
            paste0(substr(x, 1, 1), sprintf(fmt, xx[1]:xx[2]))
        } else {
            xx <- substr(x, 1, ss-1)
            paste0(sprintf(fmt, xx[1]:xx[2]), substr(x, nc, nc))
        }
    } else {
        sprintf(fmt(nc), x[1]:x[2])
    }
})

是的，这很复杂。现在，如果我们将以下数据框df2作为测试用例

df2 <- structure(list(code = c("61000-61003", "0169T-0174T", "61000-61003", 
"T0169-T0174"), label = c("excision of CNS", "ventricular shunt", 
"excision of CNS", "ventricular shunt")), .Names = c("code", 
"label"), row.names = c(NA, 4L), class = "data.frame")

并从上面运行xx代码，我们可以得到以下结果。

data.frame(code = unlist(xx), label = rep(df2$label, lengths(xx)))
#     code             label
# 1  61000   excision of CNS
# 2  61001   excision of CNS
# 3  61002   excision of CNS
# 4  61003   excision of CNS
# 5  0169T ventricular shunt
# 6  0170T ventricular shunt
# 7  0171T ventricular shunt
# 8  0172T ventricular shunt
# 9  0173T ventricular shunt
# 10 0174T ventricular shunt
# 11 61000   excision of CNS
# 12 61001   excision of CNS
# 13 61002   excision of CNS
# 14 61003   excision of CNS
# 15 T0169 ventricular shunt
# 16 T0170 ventricular shunt
# 17 T0171 ventricular shunt
# 18 T0172 ventricular shunt
# 19 T0173 ventricular shunt
# 20 T0174 ventricular shunt

Answer 3

为此类代码创建排序规则：

seq_code <- function(from,to){

    ext = function(x, part) gsub("([^0-9]?)([0-9]*)([^0-9]?)", paste0("\\",part), x)

    pre = unique(sapply(list(from,to), ext, part = 1 ))
    suf = unique(sapply(list(from,to), ext, part = 3 ))

    if (length(pre) > 1 | length(suf) > 1){
        return("NO!")
    }

    num = do.call(seq, lapply(list(from,to), function(x) as.integer(ext(x, part = 2))))
    len = nchar(from)-nchar(pre)-nchar(suf)

    paste0(pre, sprintf(paste0("%0",len,"d"), num), suf)

}

使用@ jeremycg的例子：

setDT(dff)[,.(
  label = label[1], 
  code  = do.call(seq_code, tstrsplit(code,'-'))
), by=.(row=seq(nrow(dff)))]

给出了

    row             label  code
 1:   1   excision of CNS 61000
 2:   1   excision of CNS 61001
 3:   1   excision of CNS 61002
 4:   2 ventricular shunt 0169T
 5:   2 ventricular shunt 0170T
 6:   2 ventricular shunt 0171T
 7:   3   excision of CNS 01000
 8:   3   excision of CNS 01001
 9:   3   excision of CNS 01002
10:   4    some procedure A2543
11:   4    some procedure A2544
12:   4    some procedure A2545
13:   5    some procedure A0543
14:   5    some procedure A0544
15:   5    some procedure A0545

从@ jeremycg回复的数据：

dff <- structure(list(code = c("61000-61002", "0169T-0171T", "01000-01002", 
"A2543-A2545", "A0543-A0545"), label = c("excision of CNS", "ventricular shunt", 
"excision of CNS", "some procedure", "some procedure")), .Names = c("code", 
"label"), row.names = c(NA, 5L), class = "data.frame")

Answer 4

如果你足够耐心，你可能会将字符串解析为单独的部分，而不是eval / parse技巧，唉，我不是，所以：

fancy.seq = function(x) eval(parse(text=sub(', \\)', ')', sub('\\(, ', '(',
               sub('.*?([0-9]+)(.*)-(.*?)([1-9][0-9]*).*',
                   'paste0("\\3",
                           formatC(\\1:\\4, width=log10(\\4)+1, format="d", flag="0"),
                           "\\2")',
                   x)))))
# using example from jeremycg's answer
dt[, .(fancy.seq(code), label), by = 1:nrow(dt)]
#    nrow    V1             label
# 1:    1 61000   excision of CNS
# 2:    1 61001   excision of CNS
# 3:    1 61002   excision of CNS
# 4:    2 0169T ventricular shunt
# 5:    2 0170T ventricular shunt
# 6:    2 0171T ventricular shunt
# 7:    3 01000   excision of CNS
# 8:    3 01001   excision of CNS
# 9:    3 01002   excision of CNS
#10:    4 A2543    some procedure
#11:    4 A2544    some procedure
#12:    4 A2545    some procedure
#13:    5 A0543    some procedure
#14:    5 A0544    some procedure
#15:    5 A0545    some procedure

如果不清楚上面做了什么 - 只需在其中一个“代码”字符串上逐个运行sub命令。

Answer 5

不太优雅的方式：

# the data
hcup <- data.frame(code=c("61000-61003", "0169T-0169T"),
                   label=c("excision of CNS", "ventricular shunt"), stringsAsFactors = F)
hcup
>         code             label
>1 61000-61003   excision of CNS
>2 0169T-0169T ventricular shunt

# reshaping
# split the code ranges into separate columns
seq.ends <- cbind(do.call(rbind.data.frame, strsplit(hcup$code, "-")), hcup$label)
# create a list with a data.frame for each original line
new.list <- apply(seq.ends, 1, FUN=function(x){data.frame(code=if(grepl("\\d{5}", x[1])){
                     z<-x[1]:x[2]}else{z<-x[1]}, label=rep(x[3], length(z)),
                     stringsAsFactors = F)})
# collapse the list into a df
new.df <- do.call(rbind, lapply(new.list, data.frame, stringsAsFactors=F))

new.df
>     code             label
>1.1 61000   excision of CNS
>1.2 61001   excision of CNS
>1.3 61002   excision of CNS
>1.4 61003   excision of CNS
>2   0169T ventricular shunt

使用R将单个柱分成多个观察

5 个答案: