I have been trying to develop an Apriori algorithm on this data. I am able to get the associations and confidences for both pairs and triples, but I am having trouble formatting the output and extracting the correct elements.
I am running the algorithm on this test data, which is just a subset of the original dataset. Currently the output looks like this:
[[frozenset({'GRO73461'}), frozenset({'ELE17451'}), 1.0],
 [frozenset({'GRO99222'}), frozenset({'ELE17451'}), 0.8125],
 [frozenset({'ELE17451'}), frozenset({'GRO99222'}), 0.5],
 [frozenset({'ELE17451'}), frozenset({'GRO73461'}), 0.38461538461538464],
 ...
 [..., frozenset({'GRO73461', 'ELE17451'}), 0.8],
 [frozenset({'GRO73461'}), frozenset({'DAI22896', 'ELE17451'}), 0.8],
 ...]
As you can see, it is a mess. The list is sorted by confidence in descending order. I want to separate the frequent pairs from the frequent triples and arrange the output so it looks like this:
OUTPUT A
FRO11987 FRO12685 0.4325
FRO11987 ELE11375 0.4225
FRO11987 GRO94758 0.4125
FRO11987 SNA80192 0.4025
FRO11987 FRO18919 0.4015
OUTPUT B
FRO11987 FRO12685 DAI95741 0.4325
FRO11987 ELE11375 GRO73461 0.4225
FRO11987 GRO94758 ELE26917 0.4125
FRO11987 SNA80192 ELE28189 0.4025
FRO11987 FRO18919 GRO68850 0.4015
The above shows the top 5 frequent pairs and the top 5 frequent triples, ranked by confidence.
The main area I am having trouble with is distinguishing between frequent pairs and triples, and then extracting the items from the frozensets so they come out in the format above.
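To make the question concrete, here is a rough sketch of the kind of extraction I think I need (my assumption: each rule is an (antecedent, consequence, confidence) tuple, which is what my generateRules below produces):

# Sketch only: classify one rule and flatten its frozensets for printing
rule = (frozenset({'GRO73461'}), frozenset({'ELE17451'}), 1.0)
antecedent, consequence, confidence = rule
total_items = len(antecedent) + len(consequence)  # 2 -> pair rule, 3 -> triple rule
items = sorted(antecedent) + sorted(consequence)  # frozensets are unordered, so sort for stable output
print(' '.join(items), round(confidence, 4))      # GRO73461 ELE17451 1.0

My full code follows: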
import pandas as pd
from operator import itemgetter

def loadDataSet(data=None):
    # Read space-separated transactions; skip malformed lines
    # (on_bad_lines replaces the deprecated error_bad_lines flag)
    return pd.read_csv(data, sep=' ', on_bad_lines='skip')

def createCandidateSet(data):
    # Build the initial 1-item candidate sets (C1)
    C1 = []
    for transaction in data:
        for item in transaction:
            if [item] not in C1:
                C1.append([item])
    C1.sort()
    return list(map(frozenset, C1))

def scanData(dataset, Ck, support):
    # Count how many transactions each candidate itemset appears in
    ssCount = {}
    for tID in dataset:
        for candidate in Ck:
            if candidate.issubset(tID):
                if candidate not in ssCount:
                    ssCount[candidate] = 1
                else:
                    ssCount[candidate] += 1
    # numItems = float(len(dataset))
    res = []
    supportData = {}
    for key in ssCount:
        # Support is a proportion or an integer: the occurrence of the itemset in relation to the dataset
        # currSupport = ssCount[key] / numItems
        currSupport = ssCount[key]
        if currSupport >= support:
            res.insert(0, key)
        supportData[key] = currSupport  # record support for every candidate (needed later for confidence)
    return res, supportData

def aprioriHelper(Lk, k):
    # Create candidate k-itemsets by joining frequent (k-1)-itemsets that
    # share their first k-2 items, e.g. {A,B} | {A,C} -> {A,B,C}
    res = []
    freqItemLen = len(Lk)
    for i in range(freqItemLen):
        for j in range(i + 1, freqItemLen):
            L1 = sorted(Lk[i])[:k-2]  # sort before slicing so the prefix comparison is deterministic
            L2 = sorted(Lk[j])[:k-2]
            if L1 == L2:
                res.append(Lk[i] | Lk[j])
    return res

def apriori(dataset, minSupport=100):
    C1 = createCandidateSet(dataset)
    D = list(map(set, dataset))
    L1, supportData = scanData(D, C1, minSupport)
    L = [L1]
    k = 2
    while len(L[k-2]) > 0:
        Ck = aprioriHelper(L[k-2], k)
        Lk, supportK = scanData(D, Ck, minSupport)  # scan dataset for frequent itemsets of size k
        supportData.update(supportK)
        L.append(Lk)
        k += 1
    return L, supportData

def generateRules(L, supportData, conf=0.7):
    # supportData maps each itemset to its support count (comes from scanData)
    rules = []  # tuples of (antecedent, consequence, confidence)
    for i in range(1, len(L)):  # only itemsets with >= 2 items can form rules
        for freq in L[i]:
            association = [frozenset([item]) for item in freq]
            if i > 1:
                # evaluate single-item consequences first, then merge them
                # into larger consequences for more rules
                calculateConfidence(freq, association, supportData, rules, conf)
                rulesFromConsequences(freq, association, supportData, rules, conf)
            else:
                calculateConfidence(freq, association, supportData, rules, conf)
    return rules

def calculateConfidence(freq, association, supportData, rules, conf=0.7):
    filteredAssociations = []
    for consequence in association:
        # confidence(I -> J) = support(I u J) / support(I)
        # note: freq - consequence is frozenset difference, e.g. {'A','B'} - {'B'} == {'A'}
        confidence = supportData[freq] / supportData[freq - consequence]
        if confidence >= conf:
            # print(freq - consequence, ' ', consequence, ' ', confidence)  # debug: rule and confidence
            rules.append((freq - consequence, consequence, confidence))
            filteredAssociations.append(consequence)
    return filteredAssociations

def rulesFromConsequences(freq, association, supportData, rules, conf=0.7):
    # Generate more rules as the frequent itemsets grow larger
    a_len = len(association[0])
    if len(freq) > (a_len + 1):  # try to merge into a bigger consequence that is still frequent
        association_p1 = aprioriHelper(association, a_len + 1)  # candidate consequences one item larger
        association_p1 = calculateConfidence(freq, association_p1, supportData, rules, conf)
        if len(association_p1) > 1:  # need at least two sets in order to merge
            rulesFromConsequences(freq, association_p1, supportData, rules, conf)  # recurse for bigger consequences

def main():
    dataset = [line.split() for line in open('datatest.txt')]
    L, supportData = apriori(dataset, minSupport=8)
    rules = generateRules(L, supportData, conf=0)
    rules = sorted(rules, key=itemgetter(2), reverse=True)
    # A rule covers len(antecedent) + len(consequence) items in total:
    # 2 items -> frequent pair, 3 items -> frequent triple
    triples = []
    doubles = []
    i = 0
    while len(triples) < 5 and i < len(rules):
        if len(rules[i][0]) + len(rules[i][1]) == 3:
            triples.append(rules[i])
        i += 1
    j = 0
    while len(doubles) < 5 and j < len(rules):
        if len(rules[j][0]) + len(rules[j][1]) == 2:
            doubles.append(rules[j])
        j += 1

if __name__ == '__main__':
    main()
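For the final printing step, this is the direction I am leaning (a sketch only; printTopRules is a hypothetical helper of my own, not something from a library):

def printTopRules(doubles, triples):
    # Hypothetical helper: print each rule's items space-separated, then its confidence
    print('OUTPUT A')
    for antecedent, consequence, confidence in doubles:
        print(' '.join(sorted(antecedent) + sorted(consequence)), round(confidence, 4))
    print('OUTPUT B')
    for antecedent, consequence, confidence in triples:
        print(' '.join(sorted(antecedent) + sorted(consequence)), round(confidence, 4))

I would call printTopRules(doubles, triples) at the end of main(). Is that a reasonable way to unpack the frozensets, or is there something cleaner?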
Any advice on this problem is appreciated. If you have any questions about the code or my thought process, please let me know. Apologies in advance for any careless mistakes.
Thanks for reading.