Word alignment - MemoryError in Python

Asked: 2015-07-22 05:25:57

Tags: python memory alignment

******** Read from corpus ********************
NEWRESULT
Traceback (most recent call last):
  File "em2.py", line 168, in <module>
    main()
  File "em2.py", line 165, in main
    print  em_run(NEWSENTENCES)
  File "em2.py", line 54, in em_run
    for source, target in sentence_pairs]
MemoryError

I get the above error when I try to run the program below. Can anyone help me fix it?

The program takes two text files, "sample1" and "sample2". "sample1" contains one sentence in Malayalam with 14 words; "sample2" contains the corresponding English translation, which has 23 words. The program computes the corresponding word alignments between the two languages.

The program actually works for sentences of up to 10 words; the problem shows up once a sentence has more than 10 words.
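
For scale: the traceback points at the `alignments` list comprehension, which materializes one candidate alignment per permutation of the target sentence, so the number of alignments grows factorially with target-sentence length. A quick check of the sizes involved (a sketch; 23 is the word count of the "sample2" sentence):

import math

# One alignment per permutation of the target sentence:
# n words give n! candidate alignments.
for n in (10, 14, 23):
    print "%2d words -> %d alignments" % (n, math.factorial(n))

# 10 words -> 3628800 alignments (still tractable)
# 23 words -> 25852016738884976640000 alignments (~2.6e22; no amount
# of RAM can hold them, hence the MemoryError)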

#!/usr/bin/env python

from itertools import izip
from collections import defaultdict
import copy
import itertools
import operator
import codecs
import StringIO


def em_run(sentence_pairs):
    """Run expectation-maximization on a list of pairs of the form
    `(source_tokens, target_tokens)`, where `source_tokens` is a list of
    tokens in the source language and `target_tokens` is a list of tokens
    for a translationally equivalent sentence in the target language.

    Returns a mapping `(t1, t2) => p` where `t1` is a source-language
    token, `t2` is a target-language token, and the value `p` represents
    $P(t1|t2)$.
    """
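    # A sketch of what the returned mapping looks like (the tokens here
    # are hypothetical, not from the corpus):
    #   {('source_token', 'target_token'): 0.25, ...}
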
    source_sentences, target_sentences = zip(*sentence_pairs)

    source_vocabulary = set(itertools.chain.from_iterable(source_sentences))
    target_vocabulary = set(itertools.chain.from_iterable(target_sentences))

    # Value with which to initialize each conditional probability
    uniform_prob = 1.0 / len(source_vocabulary)

    conditional_probs_old = None
    conditional_probs = {(source_w, target_w): uniform_prob
                         for source_w in source_vocabulary
                         for target_w in target_vocabulary}

    # One candidate alignment per permutation of the target sentence;
    # this list grows factorially with the target-sentence length, and
    # it is the line the MemoryError traceback points at
    alignments = [[zip(source, target_perm)
                   for target_perm in itertools.permutations(target)]
                  for source, target in sentence_pairs]
    # Repeat until convergence (successive probability tables compare
    # exactly equal)
    while conditional_probs_old != conditional_probs:
        conditional_probs_old = copy.copy(conditional_probs)

        # Expectation step: score every alignment of every sentence by
        # the product of its word-pair conditional probabilities
        alignment_probs = {
            i: {tuple(alignment):
                reduce(operator.mul, [conditional_probs[pair]
                                      for pair in alignment])
                for alignment in sentence_alignments}
            for i, sentence_alignments in enumerate(alignments)
        }

        # Normalize alignment probabilities within each sentence
        for sentence_idx, sentence_alignments in alignment_probs.iteritems():
            total = float(sum(sentence_alignments.values()))
            probs = {alignment: value / total
                     for alignment, value in sentence_alignments.iteritems()}
            alignment_probs[sentence_idx] = probs

        # Now join all alignments and begin the maximization step: group
        # by target-language word and collect corresponding
        # source-language probabilities
        word_translations = defaultdict(lambda: defaultdict(float))
        for sentence_alignments in alignment_probs.itervalues():
            for word_pairs, prob in sentence_alignments.iteritems():
                for source_word, target_word in word_pairs:
                    word_translations[target_word][source_word] += prob

        # Calculate the new conditional probability mapping, ungrouping
        # the `word_translations` tree and normalizing values into
        # conditional probabilities
        conditional_probs = {}
        for target_word, translations in word_translations.iteritems():
            total = float(sum(translations.values()))
            for source_word, score in translations.iteritems():
                conditional_probs[source_word, target_word] = score / total

    # After convergence, write the best pairing for each source word and
    # dump the full probability table
    source_words = set(key[0] for key in conditional_probs)
    for source_word in source_words:
        final(source_word, conditional_probs)

    with codecs.open('output_data.txt', 'w', 'utf8') as fp:
        for key, value in conditional_probs.iteritems():
            fp.write(key[0].decode('utf8') + " , " + key[1] + " : "
                     + str(value) + "\n")

    return conditional_probs

def final(i, conditional_probs):
    # Append the highest-probability (source, target) pair for the
    # source token `i` to final.txt
    output = StringIO.StringIO()
    val = 0
    key2 = key3 = None
    for key, value in conditional_probs.iteritems():
        if i in key[0] and val < value:
            key2, key3 = key
            val = value
    if key2 is not None:  # guard: `i` may match no key at all
        output.write(key2 + "," + key3 + '=' + str(val) + "\n")
    fd = open('final.txt', 'a')
    fd.write(output.getvalue())
    fd.close()
    output.close()

def main():
    print "******** Read from corpus ********************"
    NEWSENTENCES = []
    with open("sample1") as textMal, open("sample2") as textEn:
        for x, y in izip(textMal, textEn):
            NEWSENTENCES.append((x.strip().split(), y.strip().split()))

    print "NEWRESULT"
    print em_run(NEWSENTENCES)


if __name__ == '__main__':
    main()
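
Not part of the program above, but for comparison: the standard way to avoid this blow-up is IBM Model 1's expectation step, which never enumerates permutations. Each source word simply spreads fractional counts over the target words of its sentence, costing O(len(source) * len(target)) per sentence pair instead of factorial time and memory. A minimal sketch under that assumption (the name `em_run_model1` and the fixed iteration count are illustrative, not from the original post):

from collections import defaultdict
import itertools


def em_run_model1(sentence_pairs, iterations=20):
    # EM for IBM Model 1 without materializing any alignments. The table
    # maps (source_w, target_w) to P(source_w | target_w), the same
    # mapping em_run returns.
    source_vocab = set(itertools.chain.from_iterable(
        source for source, _ in sentence_pairs))
    uniform = 1.0 / len(source_vocab)
    t = defaultdict(lambda: uniform)

    for _ in xrange(iterations):
        counts = defaultdict(float)   # expected count of (source_w, target_w)
        totals = defaultdict(float)   # expected count of target_w
        for source, target in sentence_pairs:
            for s_w in source:
                z = sum(t[(s_w, t_w)] for t_w in target)  # normalizer
                for t_w in target:
                    c = t[(s_w, t_w)] / z
                    counts[(s_w, t_w)] += c
                    totals[t_w] += c
        # Maximization: renormalize expected counts into probabilities
        t = defaultdict(lambda: uniform,
                        ((pair, counts[pair] / totals[pair[1]])
                         for pair in counts))
    return dict(t)

A fixed iteration count stands in here for the exact-equality convergence test used above, which floating-point probabilities are not guaranteed ever to satisfy.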

SAMPLE1

(14-word Malayalam sentence; the text is garbled in the original post)

SAMPLE2

And God said, Let there be a firmament in the midst of the waters, and let it divide the waters from the waters.

0 Answers:

No answers