在 pandas 中实现 Apriori 算法的最佳方法是什么?到目前为止,我卡在了用 for 循环提取模式(pattern)的这一步:从 for 循环开始的所有代码都无法运行。在 pandas 中是否有一种向量化的方式可以做到这一点?
import pandas as pd
import numpy as np
# Load the tab-separated input: the file has no header row, and its first
# column becomes the row index.
trans = pd.read_table(
    'output.txt',
    index_col=0,
    header=None,
)
def apriori(trans, support=4):
    """Mine frequent itemsets from a table of transactions, level by level.

    Args:
        trans: DataFrame with one transaction per row. The row index is the
            transaction id and every non-null cell holds one item label.
            Rows that share the same index value are treated as a single
            transaction.
        support: Minimum absolute number of transactions that must contain
            an itemset for it to be reported (inclusive threshold).

    Returns:
        A list of DataFrames, one per itemset size (1, 2, ...) for which at
        least one frequent itemset exists. Each DataFrame has the columns
        "Pattern" (comma-joined item labels) and "supp" (transaction count).
        The list is empty when nothing reaches the threshold.
    """
    from itertools import combinations

    cells = trans.unstack().dropna()
    if cells.empty:
        return []

    # One-hot encode every (column, transaction) cell, then collapse back to
    # one boolean row per transaction id (level 1 of the unstacked index).
    ts = pd.get_dummies(cells).groupby(level=1).sum() > 0

    # No itemset can be longer than the longest transaction.
    maxlen = int(ts.sum(axis=1).max())
    items = list(ts.columns)
    results = []
    previous = None  # frequent itemsets (as tuples) of the previous size

    for size in range(1, maxlen + 1):
        frequent = []
        for cols in combinations(items, size):
            # Apriori property: every (size - 1)-subset must be frequent,
            # otherwise this candidate cannot be frequent either.
            if previous is not None and not all(
                sub in previous for sub in combinations(cols, size - 1)
            ):
                continue
            count = int(ts[list(cols)].all(axis=1).sum())
            if count >= support:
                frequent.append((cols, count))

        if not frequent:
            # No frequent itemset of this size means none of any larger size.
            break

        results.append(pd.DataFrame({
            "Pattern": [",".join(map(str, cols)) for cols, _ in frequent],
            "supp": [count for _, count in frequent],
        }))
        previous = {cols for cols, _ in frequent}
        if size == 1:
            # Infrequent single items can never appear in a frequent itemset.
            items = [cols[0] for cols, _ in frequent]

    return results
# Run the miner with its default support threshold and show the result.
results = apriori(trans)
# Call form of print works on both Python 2 and Python 3.
print(results)
当我以支持度(support)3 作为阈值,并输入如下数据时:
a b c d e
0
11 1 1 1 0 0
666 1 0 0 1 1
10101 0 1 1 1 0
1010 1 1 1 1 0
414147 0 1 1 0 0
10101 1 1 0 1 0
1242 0 0 0 1 1
101 1 1 1 1 0
411 0 0 1 1 1
444 1 1 1 0 0
它应输出类似如下的内容:
Pattern support
a 6
b 7
c 7
d 7
e 3
a,b 5
a,c 4
a,d 4
答案 0 :(得分:4)
假设我理解了你想要实现的目标,也许
from itertools import combinations
def get_support(df, min_support=0):
    """Count, for each combination of columns, the rows where all are truthy.

    Args:
        df: DataFrame whose columns are items and whose rows are
            transactions; a truthy cell means the item is present.
        min_support: Minimum count a pattern needs in order to be reported.
            With the default of 0 every column combination is reported.
            With a positive value, a combination is only evaluated when all
            of its sub-combinations met the threshold, because a pattern
            can never be more frequent than any of its subsets.

    Returns:
        DataFrame with the columns "Pattern" (comma-joined column labels)
        and "Support" (row count), ordered by pattern size and then by
        column order.
    """
    rows = []
    pool = list(df.columns)
    previous = None  # frequent patterns (as label tuples) of the previous size

    for size in range(1, len(df.columns) + 1):
        frequent = set()
        for cols in combinations(pool, size):
            if previous is not None and not all(
                sub in previous for sub in combinations(cols, size - 1)
            ):
                continue
            count = df[list(cols)].all(axis=1).sum()
            if count < min_support:
                continue
            frequent.add(cols)
            # str() keeps non-string column labels from breaking the join.
            rows.append([",".join(map(str, cols)), count])

        if not frequent:
            # Nothing of this size qualified, so nothing larger can either.
            break
        if size == 1:
            # Columns below the threshold cannot be part of any larger pattern.
            pool = [label for label in pool if (label,) in frequent]
        previous = frequent

    return pd.DataFrame(rows, columns=["Pattern", "Support"])
可以帮你入门:
>>> s = get_support(df)
>>> s[s.Support >= 3]
Pattern Support
0 a 6
1 b 7
2 c 7
3 d 7
4 e 3
5 a,b 5
6 a,c 4
7 a,d 4
9 b,c 6
10 b,d 4
12 c,d 4
14 d,e 3
15 a,b,c 4
16 a,b,d 3
21 b,c,d 3
[15 rows x 2 columns]