您好,我已将 arules 规则转换为数据框以便进一步分析,但问题是我的数据框看起来像这样:
# Sample data: one association rule per row, formatted as "{lhs} => {rhs}".
df <- data.frame(
  rules = c(
    "{45107} => {62557}",
    "{17759} => {60521 }",
    "{53721} => {53720}",
    "{63830} => {17753}",
    "{45413} => {45412}",
    "{3885,59800,17759} => {4749}",
    "{17721,55906} => {9314}"
  )
)
rules
{45107} => {62557}
{17759} => {60521 }
{53721} => {53720}
{63830} => {17753}
{45413} => {45412}
{3885,59800,17759} => {4749}
{17721,55906} => {9314}
您能帮我将数据框转换为如下格式吗?
lhs1 lhs2 lhs3 rhs
45107 62557
17759 60521
53721 53720
63830 17753
45413 45412
3885 59800 17759 4749
17721 55906 9314
答案 0 :(得分:2)
你也可以这样做,这种方法的效率应该相当高。
library(splitstackshape) ## provides cSplit() and also loads data.table
# Drop braces and spaces so each rule reads "lhs=>rhs".
cleaned <- gsub("[{} ]", "", df$rules)
# Split every rule on "=>" and stack the pieces into a two-column matrix;
# data.table() names the resulting columns V1 (lhs) and V2 (rhs).
sides <- do.call(rbind, strsplit(cleaned, "=>"))
dt <- data.table(sides)
# Spread the comma-separated lhs over V1_1, V1_2, ... and re-attach the rhs.
lhs_wide <- cSplit(dt[, .(V1)], "V1", ",")
cbind(lhs_wide, dt[, .(V2)])
# V1_1 V1_2 V1_3 V2
# 1: 45107 NA NA 62557
# 2: 17759 NA NA 60521
# 3: 53721 NA NA 53720
# 4: 63830 NA NA 17753
# 5: 45413 NA NA 45412
# 6: 3885 59800 17759 4749
# 7: 17721 55906 NA 9314
答案 1 :(得分:1)
使用您的 data.frame `df`,并将 `=>` 之后的所有数字放入 `rhs`:
# Reshape rules of the form "{a,b} => {c}" into a numeric matrix with one
# column per lhs item (padded with NA) followed by a final "rhs" column.

# Strip spaces and braces, then split every rule into its lhs and rhs text.
rules <- as.character(df$rules)
sides <- strsplit(gsub("[ }{]", "", rules), "=>", fixed = TRUE)
lhs_items <- lapply(sides, function(side) {
  strsplit(side[1], ",", fixed = TRUE)[[1]]
})

# define the maximum number of "lhs" items; there are 2 options:
# option 1, if there are few rules and the maximum is obvious:
maxlhs <- 3
# option 2, if there are many rules and you don't want to count by hand.
# Count the lhs items directly: counting gregexpr(",", x) matches is wrong
# for rules without a comma (gregexpr() returns a length-one -1 when nothing
# matches) and would also count commas that appear on the rhs.
maxlhs <- max(lengths(lhs_items))

# build one row per rule: the lhs items, NA padding, then the rhs
rows <- Map(function(lhs, side) {
  c(lhs, rep(NA, maxlhs - length(lhs)), side[2])
}, lhs_items, sides)
newdf <- matrix(as.numeric(unlist(rows)), ncol = maxlhs + 1, byrow = TRUE)

# name the columns; sprintf() yields no "lhs" names at all when maxlhs is 0
colnames(newdf) <- c(sprintf("lhs%d", seq_len(maxlhs)), "rhs")
> newdf
lhs1 lhs2 lhs3 rhs
[1,] 45107 NA NA 62557
[2,] 17759 NA NA 60521
[3,] 53721 NA NA 53720
[4,] 63830 NA NA 17753
[5,] 45413 NA NA 45412
[6,] 3885 59800 17759 4749
[7,] 17721 55906 NA 9314
这样可以吗?还是您希望新的 data.frame 与问题中显示的格式完全一致?
答案 2 :(得分:1)
# your data
library(stringr)
data <- structure(
  list(
    rules = c(
      "{45107} => {62557}", "{17759} => {60521 }", "{53721} => {53720}",
      "{63830} => {17753}", "{45413} => {45412}",
      "{3885,59800,17759} => {4749}", "{17721,55906} => {9314}"
    )
  ),
  .Names = "rules",
  class = "data.frame",
  row.names = c(NA, -7L)
)
# extract all numbers of each rule (lhs items first, rhs last)
lhs <- str_extract_all(data$rules, "\\d+")
# widest rule: the largest count of numbers found in any single rule
mx <- max(lengths(lhs))
# pad each rule with NA between its lhs items and its rhs so that every row
# has mx entries, then stack the rows into a character matrix
do.call("rbind", lapply(lhs, function(nums) {
  n <- length(nums)
  c(nums[-n], rep(NA, mx - n), nums[n])
}))
[,1] [,2] [,3] [,4]
[1,] "45107" NA NA "62557"
[2,] "17759" NA NA "60521"
[3,] "53721" NA NA "53720"
[4,] "63830" NA NA "17753"
[5,] "45413" NA NA "45412"
[6,] "3885" "59800" "17759" "4749"
[7,] "17721" "55906" NA "9314"