我已经实现了 Apriori 算法来查找重复出现的字母序列,但我真正想做的是查找重复出现的单词。目前的输出给出的是各个字母序列的计数,而我想要的是单词(以及词组)的计数:
输入数据 = ["i am donald trump", "i am donald duck"]
当前输出 -> {' d':3, ' d':3, ' on':3, ' ld':3, ' am':3, ' do':3, ' al':3, ' na', ...}
期望输出 -> i:2, am:2, trump:2, donald:3, i am:2, am donald:2, donald trump:1, i am donald:2
import re
import unittest
from collections import defaultdict
import itertools
class Apriori(dict):
    """Mine frequent contiguous patterns (substrings) from a list of strings.

    The instance is itself a dict that maps every frequent pattern to the
    number of input sequences containing it. Patterns are grown one event
    (character) at a time starting from the single events, so the shortest
    pattern stored is two events long.
    """

    def __init__(self, listOfSequences, support):
        """Run the mining pass and fill the mapping.

        Args:
            listOfSequences (list): A list of strings, each letter representing a specific event.
            support (int): The minimum percentage of sequences a pattern must match.
        """
        super(Apriori, self).__init__()
        self.data = listOfSequences
        # Convert the percentage into an absolute number of sequences.
        self.thres = (support * len(self.data)) / 100.0
        self.primitives = self.getPrimitives()
        self.apriori()
        # The raw sequences are only needed while mining; the helper methods
        # below read self.data and therefore cannot be called afterwards.
        del self.data

    def apriori(self):
        """Grow candidates level by level until no frequent pattern remains."""
        candidates = self.getNewCandidates(self.primitives)
        while candidates:
            res = self.getPatternsCount(candidates)
            self.update(res)
            # Only patterns that were frequent at this level are extended.
            candidates = self.getNewCandidates(res.keys())

    def getPrimitives(self):
        """Return the set of distinct single events found in the data."""
        return {event for seq in self.data for event in seq}

    def getNewCandidates(self, candidates):
        """Return every candidate extended by the event that follows it.

        Each occurrence of a candidate in each sequence is examined,
        including overlapping occurrences, and the candidate plus the next
        event is collected. Plain string searching is used instead of a
        regular expression so that events such as "(" or "." are treated
        literally.

        Args:
            candidates: An iterable of pattern strings to extend.

        Returns:
            set: The extended patterns, each one event longer than its source.
        """
        newCandidates = set()
        for seq in self.data:
            for can in candidates:
                width = len(can) + 1
                start = seq.find(can)
                while start != -1:
                    extended = seq[start:start + width]
                    # An occurrence at the very end has no following event.
                    if len(extended) == width:
                        newCandidates.add(extended)
                    # Advance by one so overlapping occurrences are seen.
                    start = seq.find(can, start + 1)
        return newCandidates

    def getPatternsCount(self, candidates):
        """Count the sequences containing each candidate and keep frequent ones.

        Args:
            candidates: An iterable of pattern strings to count.

        Returns:
            dict: Pattern -> number of sequences containing it, restricted to
            patterns whose count reaches the support threshold.
        """
        patternsCount = defaultdict(int)
        for seq in self.data:
            for can in candidates:
                if can in seq:
                    patternsCount[can] += 1
        # `support` is documented as a minimum, so the bound is inclusive.
        return {k: v for k, v in patternsCount.items() if v >= self.thres}
# Entry-point guard: nothing is executed when this module is run directly.
if __name__ == '__main__':
    pass
运行脚本(调用上面的 Apriori 类):
import csv
from ne import *
# Two sample sentences used as the input sequences.
data = [
    "i am donald trump",
    "i am donald duck",
]

# Second argument is the support percentage passed to Apriori.
patterns = Apriori(data, 15)
print(patterns)
答案 0 :(得分:0)
只针对您的具体问题(暂且忽略您所贴代码中的其他一些问题),您只需按空白字符拆分每个输入字符串,并统计拆分所得各个单词(token)的出现频率,即可得到您想要的结果。
例如,使用 `collections.Counter`:
from collections import Counter
# Sample sentences whose words are to be counted.
data = ["i am donald trump", "i am donald duck"]

# Running tally of how often each word occurs across all sentences.
c = Counter()
for seq in data:
    # split() without an argument splits on any run of whitespace, so
    # repeated spaces do not produce empty-string "words".
    c.update(seq.split())

# print() as a function works on both Python 2 and Python 3.
print(c)  # e.g. Counter({'i': 2, 'am': 2, 'donald': 2, 'trump': 1, 'duck': 1})
P.S. 您感兴趣的可能并不是 `most_common` 返回的值,而是计数达到某个阈值的那些条目,例如:
# Keep only the (word, count) pairs whose count is at least 2; items() and
# print() as a function work on both Python 2 and Python 3.
print({(k, v) for k, v in c.items() if v >= 2})  # {('am', 2), ('donald', 2), ('i', 2)}