我发现使用arule包有点棘手。我正在使用apriori算法来查找关联规则;类似于arules文档中的示例。
data("AdultUCI")
dim(AdultUCI)
AdultUCI[1:2,]
#Ignore everything from here to the last two lines, this is just data preparation
## remove attributes
AdultUCI[["fnlwgt"]] <- NULL
AdultUCI[["education-num"]] <- NULL
## map metric attributes
AdultUCI[[ "age"]] <- ordered(cut(AdultUCI[[ "age"]], c(15,25,45,65,100)),
labels = c("Young", "Middle-aged", "Senior", "Old"))
AdultUCI[[ "hours-per-week"]] <- ordered(cut(AdultUCI[[ "hours-per-week"]],
c(0,25,40,60,168)),
labels = c("Part-time", "Full-time", "Over-time", "Workaholic"))
AdultUCI[[ "capital-gain"]] <- ordered(cut(AdultUCI[[ "capital-gain"]],
c(-Inf,0,median(AdultUCI[[ "capital-gain"]][AdultUCI[[ "capital-gain"]]>0]),
Inf)), labels = c("None", "Low", "High"))
AdultUCI[[ "capital-loss"]] <- ordered(cut(AdultUCI[[ "capital-loss"]],
c(-Inf,0, median(AdultUCI[[ "capital-loss"]][AdultUCI[[ "capital-loss"]]>0]),
Inf)), labels = c("None", "Low", "High"))
#resume reading here
rules <- apriori(AdultUCI, parameter=list(support=0.6, confidence=0.75, minlen=4))
inspect(rules)
返回以下四条规则
lhs rhs support confidence lift
1 {race=White,
capital-gain=None,
native-country=United-States} => {capital-loss=None} 0.680398 0.9457029 0.9920537
2 {race=White,
capital-loss=None,
native-country=United-States} => {capital-gain=None} 0.680398 0.9083504 0.9901500
3 {race=White,
capital-gain=None,
capital-loss=None} => {native-country=United-States} 0.680398 0.9189249 1.0239581
4 {capital-gain=None,
capital-loss=None,
native-country=United-States} => {race=White} 0.680398 0.8730100 1.0210133
我一定错过了一些东西:如何使用arules函数找到源数据中与lhs规则匹配的行?
是否有一种简单的方法可以从lhs(规则)构建SQL WHERE子句?
由于
答案 0 :(得分:3)
这个答案基于以下答案:https://stats.stackexchange.com/questions/21340/finding-suitable-rules-for-new-data-using-arules。 解决方案非常慢,我不确定是否适用于大型应用程序。
library(arules)
rules <- apriori(AdultUCI, parameter=list(support=0.4, confidence=0.75, minlen=4))
inspect(rules)
rec <- function(rules, data, iter){
basket <- data[iter]
rulesMatchLHS <- is.subset(rules@lhs,basket)
suitableRules <- rulesMatchLHS & !(is.subset(rules@rhs,basket))
rules <- sort(rules[rulesMatchLHS], decreasing=TRUE, by="lift")
as(head(rules, 1), "data.frame")
}
recom_loop <- function(rules, data){
temp <- lapply(seq_along(data), function(x) rec(rules, data, x))
temp <- do.call("rbind", temp)
recom <- gsub(".*=> |\\{|\\}", "", temp$rules)
as.data.frame(cbind(as(data, "data.frame"), recom))
}
trans <- as(AdultUCI, "transactions")
recom <- recom_loop(rules, trans[1:50])
以下是一些示例输出:
head(recom)
transactionID
1 1
2 2
3 3
4 4
5 5
6 6
items
1 {age=Middle-aged,workclass=State-gov,education=Bachelors,marital-status=Never-married,occupation=Adm-clerical,relationship=Not-in-family,race=White,sex=Male,capital-gain=Low,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
2 {age=Senior,workclass=Self-emp-not-inc,education=Bachelors,marital-status=Married-civ-spouse,occupation=Exec-managerial,relationship=Husband,race=White,sex=Male,capital-gain=None,capital-loss=None,hours-per-week=Part-time,native-country=United-States,income=small}
3 {age=Middle-aged,workclass=Private,education=HS-grad,marital-status=Divorced,occupation=Handlers-cleaners,relationship=Not-in-family,race=White,sex=Male,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
4 {age=Senior,workclass=Private,education=11th,marital-status=Married-civ-spouse,occupation=Handlers-cleaners,relationship=Husband,race=Black,sex=Male,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
5 {age=Middle-aged,workclass=Private,education=Bachelors,marital-status=Married-civ-spouse,occupation=Prof-specialty,relationship=Wife,race=Black,sex=Female,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=Cuba,income=small}
6 {age=Middle-aged,workclass=Private,education=Masters,marital-status=Married-civ-spouse,occupation=Exec-managerial,relationship=Wife,race=White,sex=Female,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
recom
1 race=White
2 race=White
3 race=White
4 race=White
5 race=White
6 capital-gain=None
答案 1 :(得分:0)
关于第一个问题,可以使用此函数找到支持事务(应该比其他响应更快地工作):
supp_trans_ids = function(items, transactions){
# makes a logical matrix showing which set of items in rows are fully contains in transactions on rows
tmp = is.subset(items, transactions)
tmp2 = lapply(
seq_len(nrow(tmp)),
# 'which' alone would leave a name for each index, which is a complete rule (and would use a lot of memory therefore)
function(i) {
t = which(tmp[i,])
names(t) = NULL
t
}
)
# to easily idenfify sets of items
names(tmp2) = rownames(tmp)
tmp2
}
现在,您可以通过以下方式找到哪些交易支持每个规则&l; lhs:
AdultUCI_trans = as(AdultUCI, 'transactions')
trans_supporting = supp_trans_ids(lhs(rules), AdultUCI_trans)
e.g。
> str(trans_supporting)
List of 4
$ {race=White,capital-gain=None,native-country=United-States} : int [1:35140] 2 3 6 8 13 17 18 19 20 21 ...
$ {race=White,capital-loss=None,native-country=United-States} : int [1:36585] 1 2 3 6 8 9 10 13 17 18 ...
$ {race=White,capital-gain=None,capital-loss=None} : int [1:36164] 2 3 6 8 13 17 18 19 20 21 ...
$ {capital-gain=None,capital-loss=None,native-country=United-States}: int [1:38066] 2 3 4 6 8 11 13 14 17 18 ...
您可以找到的数据:
AdultUCI_trans[trans_supporting[[1]]] # transactions supporting
AdultUCI[trans_supporting[[1]],] # data on which these transactions are based