在R中编辑Arules数据帧

时间:2014-11-26 07:27:32

标签: r arules

您好,我已将我的 arules 规则转换为数据框以便进一步分析,但问题是我的数据框看起来像这样:

# Sample data: one association rule per row, stored as "{lhs} => {rhs}" text.
df <- data.frame(
  rules = c(
    "{45107} => {62557}",
    "{17759} => {60521 }",
    "{53721} => {53720}",
    "{63830} => {17753}",
    "{45413} => {45412}",
    "{3885,59800,17759} => {4749}",
    "{17721,55906} => {9314}"
  )
)

    rules
{45107} => {62557}
{17759} => {60521 }
{53721} => {53720}
{63830} => {17753}
{45413} => {45412}
{3885,59800,17759} => {4749}
{17721,55906} => {9314}

您能帮我把数据框改成下面这种格式吗?

lhs1    lhs2    lhs3    rhs
45107                   62557
17759                   60521
53721                   53720
63830                   17753
45413                   45412
3885    59800   17759   4749
17721   55906           9314

3 个答案:

答案 0 :(得分:2)

你也可以这样做,这种方法的效率应该相当高。

library(splitstackshape)  # for cSplit(); also loads the data.table package

# Remove braces and blanks from every rule, then cut each rule at "=>" so the
# left-hand side ends up in column V1 and the right-hand side in column V2.
stripped_rules <- gsub("[{} ]", "", df$rules)
rule_halves <- strsplit(stripped_rules, "=>")
dt <- data.table(do.call(rbind, rule_halves))

# Split the comma-separated lhs items into separate columns and put the rhs
# column back beside them.
lhs_wide <- cSplit(dt[, .(V1)], "V1", ",")
cbind(lhs_wide, dt[, .(V2)])

#     V1_1  V1_2  V1_3    V2
# 1: 45107    NA    NA 62557
# 2: 17759    NA    NA 60521
# 3: 53721    NA    NA 53720
# 4: 63830    NA    NA 17753
# 5: 45413    NA    NA 45412
# 6:  3885 59800 17759  4749
# 7: 17721 55906    NA  9314

答案 1 :(得分:1)

使用您的data.frame df并将=>之后的所有数字放入rhs

# Define the maximum number of "lhs" items; there are 2 options:
#   option 1, if there are few rules and the maximum is obvious:
maxlhs <- 3
#   option 2, if there are many rules and you don't want to count by hand.
#   Only the text before "=>" is inspected, so commas on the rhs are ignored.
#   Splitting on "," (instead of counting gregexpr() matches) also avoids an
#   off-by-one: gregexpr() returns -1, a length-1 result, when a rule has no
#   comma, which made a comma-free rule set report 2 lhs items instead of 1.
lhs_text <- sub("=>.*$", "", as.character(df$rules))
maxlhs <- max(lengths(strsplit(lhs_text, ",", fixed = TRUE)))

# Create the new matrix by "reformatting" the rules: one row per rule, with
# the lhs items padded with NA up to `maxlhs` columns, followed by the rhs.
# The `rules` column is addressed by name so extra columns in `df` (or a
# factor-typed `rules` column) do not affect the parsing.
# NOTE(review): assumes each rule has a single rhs item, as in the sample data.
newdf <- t(vapply(
  as.character(df$rules),
  function(rule, maxlhs) {
    # Drop braces and spaces, then separate lhs from rhs at "=>".
    sides <- strsplit(gsub("[ }{]", "", rule), "=>")[[1]]
    lhs_items <- strsplit(sides[1], ",")[[1]]
    lhs_items <- c(lhs_items, rep(NA, maxlhs - length(lhs_items)))
    as.numeric(c(lhs_items, sides[2]))
  },
  numeric(maxlhs + 1),
  maxlhs = maxlhs,
  USE.NAMES = FALSE
))
# Name the new matrix's columns: lhs1 ... lhs<maxlhs>, rhs.
colnames(newdf) <- c(paste0("lhs", seq_len(maxlhs)), "rhs")

> newdf
      lhs1  lhs2  lhs3   rhs
[1,] 45107    NA    NA 62557
[2,] 17759    NA    NA 60521
[3,] 53721    NA    NA 53720
[4,] 63830    NA    NA 17753
[5,] 45413    NA    NA 45412
[6,]  3885 59800 17759  4749
[7,] 17721 55906    NA  9314

这样可以,还是您希望新的data.frame与问题中显示的完全一样?

答案 2 :(得分:1)

# Sample data: one "{lhs} => {rhs}" rule string per row.
library(stringr)
data <- structure(list(rules = c("{45107} => {62557}", "{17759} => {60521 }", "{53721} =>     {53720}", "{63830} => {17753}", "{45413} => {45412}", "{3885,59800,17759} => {4749}", "{17721,55906} => {9314}")), .Names = "rules", class = "data.frame", row.names = c(NA, -7L))

# Pull every run of digits out of each rule; the last number of a rule is its
# rhs, everything before it belongs to the lhs.
lhs <- str_extract_all(data$rules, "\\d+")
mx <- max(lengths(lhs))

# Stack the rules into a matrix with `mx` columns. Shorter rules get NA
# inserted between their lhs items and the final (rhs) item so that the rhs
# always lands in the last column.
do.call("rbind", lapply(lhs, function(items) {
  n_items <- length(items)
  if (n_items >= mx) {
    return(items)
  }
  c(head(items, -1), rep(NA, mx - n_items), tail(items, 1))
}))

     [,1]    [,2]    [,3]    [,4]   
[1,] "45107" NA      NA      "62557"
[2,] "17759" NA      NA      "60521"
[3,] "53721" NA      NA      "53720"
[4,] "63830" NA      NA      "17753"
[5,] "45413" NA      NA      "45412"
[6,] "3885"  "59800" "17759" "4749" 
[7,] "17721" "55906" NA      "9314"