使用 Apriori 算法查找重复出现的单词序列

时间:2018-03-01 10:51:23

标签: python apriori

我已经实现了 Apriori 算法来查找重复出现的字母序列,但我真正想要的是查找重复出现的单词(以及单词序列)。目前的输出统计的是各个字母序列的出现次数,而我想要的是单词的出现次数:

输入(I/P):data = ["i am donald trump", "i am donald duck"]

当前输出(O/P):{' d':3,' d':3,' on':3,' ld':3,' am':3,' do':3 ,' al':3,' na'}

我想要的输出:

i: 2

am: 2

trump: 2

donald: 3

i am: 2

donald: 2

donald trump: 1

i am donald: 2

import re
import unittest
from collections import defaultdict
import itertools

class Apriori(dict):
    """Mine frequent contiguous sub-sequences from a list of strings.

    The instance is itself a dict mapping each frequent pattern (a substring
    of at least two characters) to the number of input sequences that contain
    it. Single characters are only used as seeds and are never stored.
    """

    def __init__(self, listOfSequences, support):
        """Run the search and fill the dict with the frequent patterns.

        Args:
            listOfSequences (list): A list of strings, each letter representing a specific event.
            support (int): The minimum percentage of sequences a pattern must match.
        """

        super(Apriori, self).__init__()
        self.data = listOfSequences
        # Convert the percentage into an absolute number of sequences.
        self.thres = (support * len(self.data)) / 100.0
        self.primitives = self.getPrimitives()
        self.apriori()
        # The raw input is only needed while mining; drop the reference.
        del self.data

    def apriori(self):
        """Grow patterns one character at a time until none stay frequent."""
        candidates = self.getNewCandidates(self.primitives)
        while len(candidates) > 0:
            res = self.getPatternsCount(candidates)
            self.update(res)
            # Only patterns that were frequent are worth extending further.
            candidates = self.getNewCandidates(res.keys())

    def getPrimitives(self):
        """Return the set of distinct single events (characters) in the data."""
        primitives = set()
        for seq in self.data:
            for event in seq:
                primitives.add(event)
        return primitives

    def getNewCandidates(self, candidates):
        """Return every candidate extended by the character that follows it.

        Plain substring search is used instead of a regular expression so
        that characters such as ``(`` or ``.`` in the data are taken
        literally, and so that overlapping occurrences are all visited
        (e.g. "a" in "aab" yields both "aa" and "ab").
        """
        newCandidates = set()
        for seq in self.data:
            for can in candidates:
                width = len(can) + 1
                start = seq.find(can)
                while start != -1:
                    # Skip occurrences at the very end: nothing follows them.
                    if start + width <= len(seq):
                        newCandidates.add(seq[start:start + width])
                    # Advance by one so overlapping occurrences are found.
                    start = seq.find(can, start + 1)
        return newCandidates

    def getPatternsCount(self, candidates):
        """Count, per candidate, how many sequences contain it.

        Returns:
            dict: candidate -> number of containing sequences, restricted to
            candidates that reach the support threshold. The comparison is
            inclusive so that ``support`` really is a *minimum* (a pattern
            present in every sequence satisfies ``support=100``).
        """
        patternsCount = defaultdict(int)
        for seq in self.data:
            for can in candidates:
                if can in seq:
                    patternsCount[can] += 1
        return {k: v for k, v in patternsCount.items() if v >= self.thres}


if __name__ == '__main__':
    pass

执行命令

import csv
from ne import *

# Two sample sentences that share the prefix "i am donald ".
data = ["i am donald trump", "i am donald duck"]

# Second argument is the support setting passed to Apriori (15 here).
patterns = Apriori(data, 15)

# Show whatever Apriori collected for the sample sentences.
print(patterns)

1 个答案:

答案 0 :(得分:0)

只针对您的具体问题(暂且不论您所贴代码中的其他一些问题):您只需按空格拆分输入字符串,并统计拆分得到的各个单词(token)的出现次数,即可得到想要的结果。

例如,使用Counter

from collections import Counter

data = ["i am donald trump", "i am donald duck"]

# Tally how often each whitespace-separated word occurs across all sentences.
c = Counter()
for seq in data:
    # split() with no argument collapses runs of whitespace, so repeated
    # spaces never produce empty-string "words"; update() adds to the running
    # tally in place instead of building a new Counter per sentence.
    c.update(seq.split())

# print() with a single argument works on both Python 2 and Python 3.
print(c)  # Counter({'i': 2, 'am': 2, 'donald': 2, 'trump': 1, 'duck': 1})

P.S. 您关心的可能并不是 most_common 返回的值,而是那些出现次数达到某个阈值的条目,例如:

# Keep only the (word, count) pairs whose count is at least 2.
# items() exists on both Python 2 and 3, whereas iteritems() is Python 2 only.
print({(k, v) for k, v in c.items() if v >= 2})  # {('am', 2), ('donald', 2), ('i', 2)}