因此,我和我的同事正在尝试制作马尔可夫模型,以便在文本文件中找到字母过渡的概率。在文本文件中,我们有一组词“Steam,Teams,Meets,Teems,Eat,Ate,State,Tease,Test,Mast,Mates”。在代码中,我们将空格添加到第一个字母的开头和每个单词的最后一个字母之后。所以我们遇到的问题是创建一个函数,将字母转换放入单独的字典中。例如,所有e转换(例如:“_ e”,“ea”等等,_是空格)将进入字典,然后进入t,s,a和m。
这是我们到目前为止的代码:
import random
import re
inFile = open("markov.txt",'r')
file = inFile.read().lower()
inFile.close()
file=re.sub('[^[a-z\ \']+', " ", file)
fileTuple=tuple(file.split())
fileList=list(fileTuple)
fileString=file
def addSpaces(atuple):
theString=''
for i in atuple:
theString=theString+' '+i+' '
return(theString)
print('The words in the text file:',addSpaces(fileTuple))
fileDict = { }
for i in fileList:
fileDict['_'+i+'_']=''
print("This is a dictionary of the words in the text file with underscores as spaces:",fileDict)
def countTotalWords(atuple):
count=0
for i in atuple:
count=count+1
return(count)
print('Total amount of words:',countTotalWords(fileTuple))
def findFirstLetter(aDict):
for i in aDict:
aDict[i]=i[0:2]
return(aDict)
print('The first letters of each word in the file:',findFirstLetter(fileDict))
valueList=list(fileDict.values())
keyList=list(fileDict.keys())
def countFirstLetters(alist):
d={}
count = 0
for character in alist:
if character in d:
d[character] += 1
else:
d[character] = 1
return d
print('Total amount of occurences of each first letter:',countFirstLetters(valueList))
def countFirstLettersProbability(alist):
d={}
count = 0
for character in alist:
if character in d:
d[character] += (1/countTotalWords(fileTuple))
else:
d[character] = (1/countTotalWords(fileTuple))
return d
print('Probility that each letter is the first in the word:',countFirstLettersProbability(valueList))
def countAllLetters(alist):
d={}
for word in alist:
for char in word:
if char in d:
d[char] += 1
else:
d[char] = 1
return d
print('Total amount of occurences of each letter:',countFirstLetters(fileString))
答案 0 :(得分:1)
这是一个坚实的开始;我已将您的代码重写为马尔可夫类。
from random import choice
import re
from collections import defaultdict
from itertools import chain, tee, izip
def strip_non_alpha(text, reg=re.compile('[^a-z\']+', re.IGNORECASE)):
return reg.sub(' ', text.strip())
def nwise(iterable, n):
"s -> (s0,s1, ... sn-1), (s1,s2, ... sn), (s2, s3, ... sn+1), ..."
args = tee(iterable, n)
for i,t in enumerate(args):
for j in range(i):
next(t, None)
return izip(*args)
class Markov():
CHAINLEN = 3
PRE = ' '*(CHAINLEN - 1)
@classmethod
def from_file(cls, fname):
with open(fname) as inf:
return Markov(inf)
def __init__(self, text):
"""
Create a new Markov chain model
text
Either a string or a sequence of strings
"""
self.lookup = defaultdict(list)
self.words = 0
self.strings = 0
if hasattr(text, '__iter__'):
for s in text:
self.add_text(s)
else:
self.add_text(text)
def add_text(self, text):
"""
Add a string to the lookup table
text
string to add
"""
text = strip_non_alpha(text).lower()
self.words += len(text.split())
self.strings += 1
for chars in nwise(chain(Markov.PRE, text, Markov.PRE), Markov.CHAINLEN):
stem = ''.join(chars[:-1])
self.lookup[stem].append(chars[-1])
def gen_text(self, upto=200):
"""
Generate a string
upto
maximum length of string to be generated
"""
s = Markov.PRE
res = []
for i in range(upto + Markov.CHAINLEN):
ch = choice(self.lookup[s])
res.append(ch)
s = s[1:] + ch
if s == Markov.PRE: # terminal string
break
return ''.join(res[:-(Markov.CHAINLEN - 1)])
def __str__(self):
return '\n'.join("'{}': {}".format(k, self.lookup[k]) for k in sorted(self.lookup))
def main():
# mc = Markov.from_file('markov.txt')
mc = Markov('Steam,Teams,Meets,Teems,Eat,Ate,State,Tease,Test,Mast,Mates'.split(','))
print mc.strings, mc.words
print mc
for i in range(10):
print(mc.gen_text())
if __name__=="__main__":
main()