我在表的第二列中有一个代码列表,我想提取每个代码的一些元素,然后将它们存储在与每个代码关联的新列中。 每个代码由字母后跟一些数字组成。字母是P,F,I,R,C,在所有代码中以相同的顺序重复,但在每个代码中数字的数量是变化的。
例如:考虑以下代码:
P1F2I235R15C145 P1 F2 I235 R15 C145
P24F1I12R124C96 P24 F1 I12 R124 C96
那么,我该如何将每个代码拆分为其组成的子代码,并将这些部分存储到同一个表的新列中?谢谢。
答案 0 :(得分:4)
这是一个可能的 stringi 解决方案:
library(stringi)

# Example codes: the letters P, F, I, R, C in a fixed order, each followed by
# a variable number of digits.
x <- c("P1F2I235R15C145", "P24F1I12R124C96")

# Split at the zero-width position in front of every letter.
# stringi uses ICU regular expressions, which support look-ahead natively, so
# no `perl` switch is passed (that is an argument of base R regex functions,
# not of stri_split_regex()).
# omit_empty = TRUE discards empty pieces; simplify = TRUE returns a character
# matrix with one row per input code.
res <- stri_split_regex(x, "(?=[A-Za-z])", simplify = TRUE, omit_empty = TRUE)

# Put the original codes next to their components.
cbind.data.frame(x, res)
# x 1 2 3 4 5
# 1 P1F2I235R15C145 P1 F2 I235 R15 C145
# 2 P24F1I12R124C96 P24 F1 I12 R124 C96
答案 1 :(得分:3)
试试这个:
# Build a small example data frame holding the raw codes.
df <- data.frame(
  code = c("P1F2I235R15C145", "P24F1I12R124C96"),
  stringsAsFactors = FALSE
)

# Locate every "letter + digits" token inside each code.
token_hits <- gregexpr("[PFIRC][0-9]+", df$code)

# Extract the tokens (one character vector per code) and stack those vectors
# row-wise into a character matrix.
token_matrix <- do.call(rbind, regmatches(df$code, token_hits))

# Append the token columns to the original data frame.
cbind(df, token_matrix)
# code 1 2 3 4 5
#1 P1F2I235R15C145 P1 F2 I235 R15 C145
#2 P24F1I12R124C96 P24 F1 I12 R124 C96
@AnandaMatho 在评论中建议去掉各子代码前面的字母,并相应地为各列命名。例如:
# Match only the digit runs that directly follow one of the letters; the
# look-behind keeps the letter itself out of the match.
digit_hits <- gregexpr("(?<=[PFIRC])[0-9]+", df$code, perl = TRUE)
digit_matrix <- do.call(rbind, regmatches(df$code, digit_hits))

# Bind the pieces to the data frame and label each column with its letter.
res <- setNames(
  cbind(df, digit_matrix),
  c("Code", "P", "F", "I", "R", "C")
)
# Code P F I R C
#1 P1F2I235R15C145 1 2 235 15 145
#2 P24F1I12R124C96 24 1 12 124 96
答案 2 :(得分:1)
data.table 解决方案:
library(data.table)

# Example table with one column holding the raw codes.
dt <- data.table(code = c("P1F2I235R15C145", "P24F1I12R124C96"))

# For each letter, extract the first "letter + digits" token from every code
# and add it by reference as a column named after that letter.
# Note: regmatches() combined with regexpr() returns matched elements only,
# so this assumes every code contains all five letters.
dt[, c("P", "F", "I", "R", "C") :=
  lapply(
    c("P", "F", "I", "R", "C"),
    function(x) regmatches(code, regexpr(paste0(x, "[0-9]+"), code))
  )]

# Print the result.
dt
# code P F I R C
# 1: P1F2I235R15C145 P1 F2 I235 R15 C145
# 2: P24F1I12R124C96 P24 F1 I12 R124 C96
如果你最终决定去掉前面的字母,只需稍作调整:
# Same extraction as before, but the look-behind "(?<=<letter>)" leaves the
# letter out of the match, so only the digits are stored in each column.
# perl = TRUE is required because look-behind is a PCRE feature.
dt[, c("P", "F", "I", "R", "C") :=
  lapply(
    c("P", "F", "I", "R", "C"),
    function(x) {
      regmatches(
        code,
        regexpr(paste0("(?<=", x, ")[0-9]+"), code, perl = TRUE)
      )
    }
  )]

# Print the result.
dt
# code P F I R C
# 1: P1F2I235R15C145 1 2 235 15 145
# 2: P24F1I12R124C96 24 1 12 124 96
或者使用 data.table 的开发版本(v1.9.5+):
# Names of the new columns, one per letter in the code.
target_cols <- c("P", "F", "I", "R", "C")

# Split each code at every interior position that is followed by a letter
# and at least one digit; "(?<=.)" prevents a split at the very start of the
# string. tstrsplit() returns the pieces transposed, i.e. one vector per
# column, which are assigned by reference.
dt[, (target_cols) := tstrsplit(
  code,
  "(?<=.)(?=[[:alpha:]][0-9]+)",
  perl = TRUE
)]
# code P F I R C
# 1: P1F2I235R15C145 P1 F2 I235 R15 C145
# 2: P24F1I12R124C96 P24 F1 I12 R124 C96