Python:创建多个字母过渡字典

时间:2013-11-29 21:27:47

标签: python dictionary markov

我和我的同事正在尝试构建一个马尔可夫模型,用来计算文本文件中字母过渡(转移)的概率。文本文件中有一组单词:“Steam, Teams, Meets, Teems, Eat, Ate, State, Tease, Test, Mast, Mates”。在代码中,我们在每个单词的第一个字母之前和最后一个字母之后各加了一个空格。我们遇到的问题是:如何编写一个函数,把字母过渡分别放入各自独立的字典中。例如,所有与 e 相关的过渡(如“_e”、“ea”等,其中 _ 代表空格)放入一个字典,t、s、a 和 m 的过渡也分别放入各自的字典。

这是我们到目前为止的代码:

import random
import re

# Load the corpus and lower-case it so that letter statistics are
# case-insensitive.  The context manager closes the file even if read() fails.
# NOTE: the name `file` is kept because later statements refer to it.
with open("markov.txt",'r') as inFile:
    file = inFile.read().lower()

# Collapse every run of characters other than a-z, space and apostrophe into a
# single space.  A raw string is used, and the stray '[' that used to sit
# inside the character class (which let literal '[' characters survive) and
# the invalid '\ ' escape are gone.
file = re.sub(r"[^a-z ']+", " ", file)

fileTuple = tuple(file.split())   # the words, as an immutable sequence
fileList = list(fileTuple)        # the same words, as a list
fileString = file                 # the cleaned text as one string


def addSpaces(atuple):
    """Return the items of *atuple* concatenated, each wrapped in spaces.

    Every item contributes ``' ' + item + ' '``, so neighbouring items end up
    separated by two spaces.  An empty input yields the empty string.
    """
    padded = [' ' + word + ' ' for word in atuple]
    return ''.join(padded)

# Show the words that were read, each padded with a space on both sides.
print('The words in the text file:',addSpaces(fileTuple))


# One entry per distinct word: the key is the word wrapped in underscores
# (standing in for the surrounding spaces), the value an empty placeholder.
fileDict = {'_' + word + '_': '' for word in fileList}

print("This is a dictionary of the words in the text file with underscores as spaces:",fileDict)

def countTotalWords(atuple):
    """Return how many items iterating over *atuple* produces."""
    return sum(1 for _ in atuple)

# Report the number of whitespace-separated words in the cleaned text.
print('Total amount of words:',countTotalWords(fileTuple))

def findFirstLetter(aDict):
    """Set every value of *aDict* to the first two characters of its key.

    With keys of the form ``'_word_'`` the stored value is the leading
    underscore plus the word's first letter (e.g. ``'_s'``).  The dictionary
    is modified in place and the same object is returned.
    """
    prefixes = {key: key[:2] for key in aDict}
    aDict.update(prefixes)
    return aDict

# findFirstLetter() rewrites fileDict in place, so after this line every value
# holds the first two characters of its key.
print('The first letters of each word in the file:',findFirstLetter(fileDict))



# Snapshot the (now filled-in) values and the keys as plain lists.
valueList=list(fileDict.values())
keyList=list(fileDict.keys())



def countFirstLetters(alist):
    """Return a dict mapping each distinct item of *alist* to its count.

    Works on any iterable of hashable items; a string is tallied character
    by character.
    """
    tally = {}
    for item in alist:
        tally[item] = tally.get(item, 0) + 1
    return tally

# Tally how often each stored prefix (the values of fileDict) occurs.
print('Total amount of occurences of each first letter:',countFirstLetters(valueList))

def countFirstLettersProbability(alist):
    """Return a dict mapping each distinct item of *alist* to its share.

    Every occurrence of an item adds ``1 / countTotalWords(fileTuple)`` to that
    item's entry, so the denominator is the word count of the module-level
    ``fileTuple``, not the length of *alist*.
    """
    shares = {}
    for item in alist:
        shares[item] = shares.get(item, 0) + (1/countTotalWords(fileTuple))
    return shares


# Each prefix's occurrence count expressed as a fraction of the total number
# of words in fileTuple.
print('Probility that each letter is the first in the word:',countFirstLettersProbability(valueList))


def countAllLetters(alist):
    """Return a dict mapping each character to its total count.

    *alist* is an iterable of words; the characters of all words are tallied
    together.
    """
    tally = {}
    for char in (c for word in alist for c in word):
        tally[char] = tally.get(char, 0) + 1
    return tally

# Use countAllLetters over the word list so that only characters inside words
# are tallied; counting over the raw text string would also count the spaces
# between words.
print('Total amount of occurences of each letter:',countAllLetters(fileList))

1 个答案:

答案 0 :(得分:1)

这是一个坚实的开始;我已将您的代码重写为马尔可夫类。

from random import choice
import re
from collections import defaultdict
from itertools import chain, tee, izip

def strip_non_alpha(text, reg=re.compile(r"[^a-z']+", re.IGNORECASE)):
    """Replace each run of non-letter characters with one space and trim the ends.

    Letters (either case) and apostrophes are kept; anything else is treated
    as a separator.

        text
            string to clean
        reg
            compiled pattern matching the runs to replace; the default is
            compiled once, when the function is defined

    Trimming happens after the substitution, so punctuation at either end of
    *text* does not leave a stray leading or trailing space behind.
    """
    return reg.sub(' ', text).strip()

def nwise(iterable, n):
    """Return a lazy iterator over the overlapping length-n windows of *iterable*.

    s -> (s0,s1, ... sn-1), (s1,s2, ... sn), (s2, s3, ... sn+1), ...

        iterable
            any iterable; it is consumed in a single pass
        n
            window length; 0 yields nothing, negative raises ValueError

    Each window is a tuple.  Inputs shorter than n yield nothing.  The
    implementation uses no Python-2-only helpers, so it behaves the same on
    Python 2 and Python 3.
    """
    # Local import: deque is not among the names imported at the top of the file.
    from collections import deque

    if n < 0:
        raise ValueError("n must be >= 0")
    source = iter(iterable)  # raises TypeError right away for non-iterables

    def windows():
        if n == 0:
            return
        window = deque(maxlen=n)  # the oldest item drops out automatically
        for item in source:
            window.append(item)
            if len(window) == n:
                yield tuple(window)

    return windows()

class Markov():
    """
    Character-level Markov chain over lower-cased text.

    Each string added is padded on both sides with PRE.  `lookup` maps every
    stem (CHAINLEN - 1 consecutive characters) to the list of characters seen
    right after it; a character appears once per observation, so picking
    uniformly from the list samples by observed frequency.
    """
    CHAINLEN = 3
    # Padding of CHAINLEN - 1 spaces; used as both the start state and the
    # terminal state of the chain.
    PRE = ' '*(CHAINLEN - 1)

    @classmethod
    def from_file(cls, fname):
        """
        Create a model from a text file, adding each line as one string

            fname
                path of the file to read
        """
        with open(fname) as inf:
            # cls rather than Markov, so a subclass gets an instance of itself
            return cls(inf)

    def __init__(self, text):
        """
        Create a new Markov chain model

            text
                Either a string or a sequence of strings
        """
        self.lookup = defaultdict(list)
        self.words = 0
        self.strings = 0

        # A single string must be added whole.  Checking for '__iter__' alone
        # cannot decide that: Python 3 strings have it, which would split the
        # string into one-character texts.
        try:
            string_types = basestring   # Python 2: str and unicode
        except NameError:
            string_types = str          # Python 3

        if isinstance(text, string_types):
            self.add_text(text)
        elif hasattr(text, '__iter__'):
            for s in text:
                self.add_text(s)
        else:
            self.add_text(text)

    def add_text(self, text):
        """
        Add a string to the lookup table

            text
                string to add
        """
        text = strip_non_alpha(text).lower()
        self.words += len(text.split())
        self.strings += 1
        # Slide a CHAINLEN-wide window over the padded text: the first
        # CHAINLEN - 1 characters form the stem, the last one its successor.
        for chars in nwise(chain(Markov.PRE, text, Markov.PRE), Markov.CHAINLEN):
            stem = ''.join(chars[:-1])
            self.lookup[stem].append(chars[-1])

    def gen_text(self, upto=200):
        """
        Generate a string

            upto
                maximum length of string to be generated

        Returns an empty string when no text has been added yet.
        """
        if not self.lookup.get(Markov.PRE):
            # Nothing learned: there is no successor to choose for the start state.
            return ''

        s = Markov.PRE
        res = []
        for i in range(upto + Markov.CHAINLEN):
            ch = choice(self.lookup[s])
            res.append(ch)
            s = s[1:] + ch
            if s == Markov.PRE:    # terminal string
                break
        # Drop the trailing padding characters from the result.
        return ''.join(res[:-(Markov.CHAINLEN - 1)])

    def __str__(self):
        """Return one line per stem, sorted: 'stem': [successors]."""
        return '\n'.join("'{}': {}".format(k, self.lookup[k]) for k in sorted(self.lookup))

def main():
    """Build a model from the sample words, print its statistics and lookup
    table, then print ten generated strings."""
    # Alternative: load the words from a file instead.
    # mc = Markov.from_file('markov.txt')
    mc = Markov('Steam,Teams,Meets,Teems,Eat,Ate,State,Tease,Test,Mast,Mates'.split(','))

    # print(...) with a single pre-formatted argument produces the same output
    # on Python 2 and Python 3, and matches the print() call in the loop below.
    print('{} {}'.format(mc.strings, mc.words))
    print(mc)

    for i in range(10):
        print(mc.gen_text())

if __name__=="__main__":
    main()