arules:如何在R或SQL WHERE子句中找到与lhs(规则)匹配的数据?

时间:2014-04-24 22:55:37

标签: r apriori arules

我发现使用arule包有点棘手。我正在使用apriori算法来查找关联规则;类似于arules文档中的示例。

data("AdultUCI")
dim(AdultUCI)
AdultUCI[1:2,]

#Ignore everything from here to the last two lines, this is just data preparation

## remove attributes
AdultUCI[["fnlwgt"]] <- NULL
AdultUCI[["education-num"]] <- NULL

## map metric attributes
AdultUCI[[ "age"]] <- ordered(cut(AdultUCI[[ "age"]], c(15,25,45,65,100)),
                              labels = c("Young", "Middle-aged", "Senior", "Old"))

AdultUCI[[ "hours-per-week"]] <- ordered(cut(AdultUCI[[ "hours-per-week"]],
                                             c(0,25,40,60,168)),
                                         labels = c("Part-time", "Full-time", "Over-time", "Workaholic"))

AdultUCI[[ "capital-gain"]] <- ordered(cut(AdultUCI[[ "capital-gain"]],
                                           c(-Inf,0,median(AdultUCI[[ "capital-gain"]][AdultUCI[[ "capital-gain"]]>0]),
                                             Inf)), labels = c("None", "Low", "High"))

AdultUCI[[ "capital-loss"]] <- ordered(cut(AdultUCI[[ "capital-loss"]],
                                           c(-Inf,0, median(AdultUCI[[ "capital-loss"]][AdultUCI[[ "capital-loss"]]>0]),
                                             Inf)), labels = c("None", "Low", "High"))

#resume reading here
rules <- apriori(AdultUCI, parameter=list(support=0.6, confidence=0.75, minlen=4))
inspect(rules)

返回以下四条规则

lhs                               rhs                             support confidence      lift
1 {race=White,                                                                                  
   capital-gain=None,                                                                           
   native-country=United-States} => {capital-loss=None}            0.680398  0.9457029 0.9920537
2 {race=White,                                                                                  
   capital-loss=None,                                                                           
   native-country=United-States} => {capital-gain=None}            0.680398  0.9083504 0.9901500
3 {race=White,                                                                                  
   capital-gain=None,                                                                           
   capital-loss=None}            => {native-country=United-States} 0.680398  0.9189249 1.0239581
4 {capital-gain=None,                                                                           
   capital-loss=None,                                                                           
   native-country=United-States} => {race=White}                   0.680398  0.8730100 1.0210133

我一定错过了一些东西:如何使用arules函数找到源数据中与lhs规则匹配的行?

是否有一种简单的方法可以从lhs(规则)构建SQL WHERE子句?

由于

2 个答案:

答案 0 :(得分:3)

这个答案基于以下答案:https://stats.stackexchange.com/questions/21340/finding-suitable-rules-for-new-data-using-arules。 解决方案非常慢,我不确定是否适用于大型应用程序。

library(arules)

rules <- apriori(AdultUCI, parameter=list(support=0.4, confidence=0.75, minlen=4))
inspect(rules)

rec <- function(rules, data, iter){
  basket <- data[iter]
  rulesMatchLHS <- is.subset(rules@lhs,basket)
  suitableRules <-  rulesMatchLHS & !(is.subset(rules@rhs,basket))
  rules <- sort(rules[rulesMatchLHS], decreasing=TRUE, by="lift")
  as(head(rules, 1), "data.frame")
}

recom_loop <- function(rules, data){
  temp <- lapply(seq_along(data), function(x) rec(rules, data, x))
  temp <- do.call("rbind", temp)
  recom <- gsub(".*=> |\\{|\\}", "", temp$rules)
  as.data.frame(cbind(as(data, "data.frame"), recom))  
}

trans <- as(AdultUCI, "transactions")
recom <- recom_loop(rules, trans[1:50])

以下是一些示例输出:

head(recom)
  transactionID
1             1
2             2
3             3
4             4
5             5
6             6
                                                                                                                                                                                                                                                                     items
1      {age=Middle-aged,workclass=State-gov,education=Bachelors,marital-status=Never-married,occupation=Adm-clerical,relationship=Not-in-family,race=White,sex=Male,capital-gain=Low,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
2 {age=Senior,workclass=Self-emp-not-inc,education=Bachelors,marital-status=Married-civ-spouse,occupation=Exec-managerial,relationship=Husband,race=White,sex=Male,capital-gain=None,capital-loss=None,hours-per-week=Part-time,native-country=United-States,income=small}
3         {age=Middle-aged,workclass=Private,education=HS-grad,marital-status=Divorced,occupation=Handlers-cleaners,relationship=Not-in-family,race=White,sex=Male,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
4             {age=Senior,workclass=Private,education=11th,marital-status=Married-civ-spouse,occupation=Handlers-cleaners,relationship=Husband,race=Black,sex=Male,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
5                {age=Middle-aged,workclass=Private,education=Bachelors,marital-status=Married-civ-spouse,occupation=Prof-specialty,relationship=Wife,race=Black,sex=Female,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=Cuba,income=small}
6        {age=Middle-aged,workclass=Private,education=Masters,marital-status=Married-civ-spouse,occupation=Exec-managerial,relationship=Wife,race=White,sex=Female,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
              recom
1        race=White
2        race=White
3        race=White
4        race=White
5        race=White
6 capital-gain=None

答案 1 :(得分:0)

关于第一个问题,可以使用此函数找到支持事务(应该比其他响应更快地工作):

supp_trans_ids = function(items, transactions){
  # makes a logical matrix showing which set of items in rows are fully contains in transactions on rows 
  tmp = is.subset(items, transactions)

  tmp2 = lapply(
    seq_len(nrow(tmp)),
    # 'which' alone would leave a name for each index, which is a complete rule (and would use a lot of memory therefore)
    function(i) {
      t = which(tmp[i,])
      names(t) = NULL
      t
    }
  )

  # to easily idenfify sets of items
  names(tmp2) = rownames(tmp)

  tmp2
}

现在,您可以通过以下方式找到哪些交易支持每个规则&l; lhs:

AdultUCI_trans = as(AdultUCI, 'transactions')
trans_supporting = supp_trans_ids(lhs(rules), AdultUCI_trans)

e.g。

> str(trans_supporting)
List of 4
 $ {race=White,capital-gain=None,native-country=United-States}       : int [1:35140] 2 3 6 8 13 17 18 19 20 21 ...
 $ {race=White,capital-loss=None,native-country=United-States}       : int [1:36585] 1 2 3 6 8 9 10 13 17 18 ...
 $ {race=White,capital-gain=None,capital-loss=None}                  : int [1:36164] 2 3 6 8 13 17 18 19 20 21 ...
 $ {capital-gain=None,capital-loss=None,native-country=United-States}: int [1:38066] 2 3 4 6 8 11 13 14 17 18 ...

您可以找到的数据:

AdultUCI_trans[trans_supporting[[1]]] # transactions supporting
AdultUCI[trans_supporting[[1]],] # data on which these transactions are based