现在,我有一个这样的文件:
# Question script: count the bigrams of a text file and score them by PMI.
import nltk

with open('BigramCounter.txt', encoding='utf-8') as wordfile:
    text = wordfile.read()
words = nltk.word_tokenize(text)
bigram_measures = nltk.collocations.BigramAssocMeasures()
# Raw bigram frequencies, exactly as before.
bgs = nltk.bigrams(words)
fdist = nltk.FreqDist(bgs)
# nltk.bigrams() only yields tuples, so ``bgs`` has no score_ngrams() method.
# Scoring is done by a collocation finder built from the same tokens; it
# returns (bigram, score) pairs.
finder = nltk.collocations.BigramCollocationFinder.from_words(words)
pmi = finder.score_ngrams(bigram_measures.pmi)
现在我可以得到文件中每个 bigram 的频率,也可以单独计算文件中 bigram 的 PMI,但我不知道如何把两者结合起来,让 NLTK 生成 bigram 并同时给出它们的 PMI 得分!还有其他人遇到过这个问题吗?谢谢!
答案 0 :(得分:1)
您可以使用此代码提取bigrams及其频率,或提取某个二元组的pmi分数:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import math
import nltk
from collections import defaultdict
def generateUnigramsInMovie(Tokens,freqThreshold):
    """Count the tokens that occur more than ``freqThreshold`` times.

    Args:
        Tokens: iterable of tokens; handed directly to ``nltk.FreqDist``.
        freqThreshold: exclusive lower bound; a token is kept only when its
            count is strictly greater than this value.

    Returns:
        A ``defaultdict(int)`` mapping each kept token to its count, so a
        lookup of a token that was filtered out or never seen yields 0.

    NOTE(review): the bigram counterpart in this file filters through
    ``apply_freq_filter``, which NLTK documents as keeping counts greater
    than or equal to the threshold. Confirm the strict ``>`` here is intended.
    """
    unigrams_in_movie = defaultdict(int)
    fdistUnigrams = nltk.FreqDist(Tokens)
    # items() with an indexing key runs on Python 2 and 3; iteritems() and the
    # tuple-unpacking lambda ``lambda (k, v): ...`` exist on Python 2 only.
    ordered = sorted(fdistUnigrams.items(), key=lambda pair: (pair[1], pair[0]))
    for unigram, freq in ordered:
        if freq > freqThreshold:
            unigrams_in_movie[unigram] = freq
    return unigrams_in_movie
def generateBigramsInMovie(Tokens,freqThreshold):
    """Map each bigram of ``Tokens`` to its frequency.

    A ``BigramCollocationFinder`` is built from the tokens and its frequency
    filter is applied with ``freqThreshold``; whatever remains in the finder's
    ``ngram_fd`` is copied out.

    Args:
        Tokens: sequence of tokens to pair up.
        freqThreshold: value passed to ``apply_freq_filter``.

    Returns:
        A ``defaultdict(int)`` keyed by ``"first second"`` (the two tokens
        joined with one space) whose values are the bigram counts.
    """
    finder = nltk.collocations.BigramCollocationFinder.from_words(Tokens)
    finder.apply_freq_filter(freqThreshold)
    bigrams_in_movie = defaultdict(int)
    for (first, second), count in finder.ngram_fd.items():
        key = " ".join((first, second))
        bigrams_in_movie[key] = count
    return bigrams_in_movie
#This method is copied from the code given by "alvas"
#Taken from this project: Multi-Word Expression (MWE) extractor from the "Terminator" project
#Liling Tan. 2013. Terminator - Terminology Extraction to Improve
#Machine Translation [Software]. Available from
#https://github.com/alvations/Terminator.
def pmi(word1, word2, unigram_freq, bigram_freq):
    """Return the base-2 pointwise mutual information of ``word1 word2``.

    Probabilities are relative frequencies: each count divided by the sum of
    all values in its table.

    Args:
        word1: first word of the bigram.
        word2: second word of the bigram.
        unigram_freq: mapping from word to count.
        bigram_freq: mapping from ``"word1 word2"`` (space-joined) to count.

    Returns:
        ``log2(P(word1 word2) / (P(word1) * P(word2)))``, or 0 when the score
        is undefined (unseen word or bigram, or an empty frequency table).
    """
    # Sum each table once instead of once per probability.
    unigram_total = float(sum(unigram_freq.values()))
    bigram_total = float(sum(bigram_freq.values()))
    if not unigram_total or not bigram_total:
        return 0
    # .get() avoids a KeyError on plain dicts and stops a defaultdict from
    # gaining a zero entry for every out-of-vocabulary lookup.
    prob_word1 = unigram_freq.get(word1, 0) / unigram_total
    prob_word2 = unigram_freq.get(word2, 0) / unigram_total
    prob_word1_word2 = bigram_freq.get(" ".join([word1, word2]), 0) / bigram_total
    try:
        return math.log(prob_word1_word2 / (prob_word1 * prob_word2), 2)
    except (ValueError, ZeroDivisionError):
        # log(0) or a zero denominator: out-of-vocabulary word or bigram.
        return 0
# Driver: read a text file, count its unigrams and bigrams, then list the
# 50 best bigrams by PMI together with their frequencies.
with open('Text.txt') as wordfile:
    text = wordfile.read()
Tokens = nltk.word_tokenize(text)
unigrams_in_movie = generateUnigramsInMovie(Tokens, 1)
bigrams_in_movie = generateBigramsInMovie(Tokens, 1)
b = nltk.collocations.BigramCollocationFinder.from_words(Tokens)
b.apply_freq_filter(1)
bigram_measures = nltk.collocations.BigramAssocMeasures()
bestBigrams = b.nbest(bigram_measures.pmi, 50)
# Print each of the best bigrams along with its frequency.
for bigram in bestBigrams:
    bigram = " ".join([bigram[0], bigram[1]])
    bigramFreq = bigrams_in_movie[bigram]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(str(bigram) + " " + str(bigramFreq))
# To get the PMI score of one particular bigram, call pmi() with its two words.
# word1/word2 were never defined in the original; the top-ranked bigram is
# used here as the example pair.
if bestBigrams:
    word1, word2 = bestBigrams[0]
    print(pmi(word1, word2, unigrams_in_movie, bigrams_in_movie))
希望这有帮助。 干杯
答案 1 :(得分:0)
试试这个:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This is a Multi-Word Expression (MWE) extractor from the "Terminator" project,
see https://github.com/alvations/Terminator.
Here's some legalese:
##############################################################################
Terminator is copyrighted under MIT License by alvations.
Copyright (c) 2013-2014 Liling Tan (@alvations)
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
Please cite the following when using part-of or the full code/software:
Liling Tan. 2013. Terminator - Terminology Extraction to Improve
Machine Translation [Software]. Available from
https://github.com/alvations/Terminator.
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
# Authorship Info.
# Module-level metadata only; none of the code visible in this file reads
# these names.
__author__ = "Liling Tan (aka @alvations)"
__copyright__ = "(c) Copyright 2013"
__license__ = "MIT"
__date__ = "20 Dec 2013"
__version__ = "0.1"
__maintainer__ = "Liling Tan"
__email__ = "alvations@gmail.com"
__status__ = "pre-development"
import codecs, math, os
from collections import Counter
import cPickle as pickle
def ngram(text,n=2):
    """Split ``text`` on whitespace and build its n-grams.

    Args:
        text: string to split.
        n: n-gram size (default 2).

    Returns:
        For ``n == 1`` the list of tokens itself. Otherwise the result of
        zipping ``n`` progressively shifted copies of the token list, i.e.
        tuples of ``n`` consecutive tokens.
    """
    tokens = text.split()
    if n == 1:
        return tokens
    shifted = [tokens[offset:] for offset in range(n)]
    return zip(*shifted)
def pmi(word1, word2, unigram_freq, bigram_freq):
    """Compute base-2 pointwise mutual information for ``word1 word2``.

    Each probability is a relative frequency: the count divided by the sum of
    all values in the corresponding table.

    Args:
        word1: first word of the bigram.
        word2: second word of the bigram.
        unigram_freq: mapping from word to count.
        bigram_freq: mapping from the space-joined bigram to count.

    Returns:
        ``log2(P(word1 word2) / (P(word1) * P(word2)))``; 0 when that value
        is undefined (unseen word or bigram, or an empty table).
    """
    # Each table is summed a single time rather than per probability.
    total_unigrams = float(sum(unigram_freq.values()))
    total_bigrams = float(sum(bigram_freq.values()))
    if not total_unigrams or not total_bigrams:
        return 0
    # .get() keeps plain dicts from raising KeyError and keeps Counter or
    # defaultdict lookups from being relied on for missing words.
    prob_word1 = unigram_freq.get(word1, 0) / total_unigrams
    prob_word2 = unigram_freq.get(word2, 0) / total_unigrams
    prob_word1_word2 = bigram_freq.get(" ".join([word1, word2]), 0) / total_bigrams
    try:
        return math.log(prob_word1_word2 / (prob_word1 * prob_word2), 2)
    except (ValueError, ZeroDivisionError):
        # Raised for out-of-vocabulary input: log(0) or division by zero.
        return 0
def phi2(word1,word2, unigram_freq, bigram_freq):
    """Return log2 of the phi-squared association score for ``word1 word2``.

    The 2x2 contingency table is built from ``bigram_freq`` alone:

    * n11: count of the exact bigram ``"word1 word2"``
    * n12: total count of bigrams containing word1 but not word2
    * n21: total count of bigrams containing word2 but not word1
    * n22: every remaining bigram

    Args:
        word1: first word of the bigram.
        word2: second word of the bigram.
        unigram_freq: unused; kept so the signature matches the other scores.
        bigram_freq: mapping from space-joined bigram to count.

    Returns:
        ``log2((n11*n22 - n21*n12)**2 / (n1p * np1 * np2 * n2p))``.

    Raises:
        ZeroDivisionError: when a marginal total is zero.
        ValueError: when the numerator is zero (logarithm of 0).

    NOTE(review): membership is tested in either position, so the reversed
    bigram ``"word2 word1"`` lands in n22. Confirm this is intended.
    """
    n11 = bigram_freq.get(word1 + " " + word2, 0)
    n12 = 0
    n21 = 0
    # One pass over the table, splitting each key once.
    for bigram, freq in bigram_freq.items():
        tokens = bigram.split()
        has_word1 = word1 in tokens
        has_word2 = word2 in tokens
        if has_word1 and not has_word2:
            n12 += freq
        elif has_word2 and not has_word1:
            n21 += freq
    # The four cells must add up to the grand total. The previous
    # ``total - n11`` counted the n12 and n21 bigrams a second time.
    n22 = sum(bigram_freq.values()) - n11 - n12 - n21
    n1p = n11 + n12
    n2p = n21 + n22
    np1 = n11 + n21
    np2 = n12 + n22
    determinant = n11 * n22 - n21 * n12
    return math.log(determinant * determinant / float(n1p * np1 * np2 * n2p), 2)
def llr(word1,word2, unigram_freq):
    """Return the base-2 logarithm of the product of the two words' counts.

    Args:
        word1: first word; looked up in ``unigram_freq``.
        word2: second word; looked up in ``unigram_freq``.
        unigram_freq: mapping from word to count.

    NOTE(review): the name suggests a log-likelihood ratio, but the code only
    takes log2 of ``count(word1) * count(word2)``. Confirm the intended score.
    """
    count_product = unigram_freq[word1] * unigram_freq[word2]
    return math.log(count_product, 2)
def load_ngramfreq_pickle(filename):
    """Load n-gram counts from a pickle cache, building the cache if absent.

    ``filename`` has the form ``<corpus>-<n>gram.pk``. When that file exists
    it is unpickled and returned. Otherwise ``<corpus>`` is read as UTF-8,
    each line is lower-cased and split into n-grams, and the resulting counts
    are pickled to ``filename`` before being returned.

    Args:
        filename: path of the cache file; the corpus path and n-gram size
            are parsed from it.

    Returns:
        The unpickled object, or a freshly built ``Counter`` keyed by token
        (n == 1) or by space-joined n-gram (n > 1).
    """
    if os.path.exists(filename):
        # NOTE(security): unpickling can execute arbitrary code; only load
        # cache files this program wrote itself.
        with open(filename, 'rb') as cached:
            return pickle.load(cached)
    # Split on the last hyphen only, so corpus paths that contain '-'
    # themselves still parse.
    infile, suffix = filename.rsplit("-", 1)
    n = int(suffix[0])
    ngram_freq = Counter()
    with codecs.open(infile, 'r', 'utf8') as fin:
        for line in fin:
            line = line.lower()
            if n > 1:
                ngram_freq.update(" ".join(j) for j in ngram(line, n))
            else:
                ngram_freq.update(ngram(line, n))
    # The context manager closes the cache file once the dump is written.
    with open(filename, 'wb') as cache:
        pickle.dump(ngram_freq, cache)
    return ngram_freq
def load_ngramfreq(srcfile, trgfile):
    """Load unigram and bigram counts for the source and target corpora.

    Args:
        srcfile: path of the source-language corpus.
        trgfile: path of the target-language corpus.

    Returns:
        A 4-tuple ``(src_unigrams, src_bigrams, trg_unigrams, trg_bigrams)``,
        each obtained from ``load_ngramfreq_pickle`` with the cache name
        ``<corpus>-1gram.pk`` or ``<corpus>-2gram.pk``.
    """
    cache_names = [
        corpus + suffix
        for corpus in (srcfile, trgfile)
        for suffix in ("-1gram.pk", "-2gram.pk")
    ]
    return tuple(load_ngramfreq_pickle(name) for name in cache_names)
def load_precalculated_pmi(srcfile,trgfile):
    """Load previously computed PMI scores for a corpus pair, if cached.

    Args:
        srcfile: source corpus path, first part of the cache file name.
        trgfile: target corpus path, second part of the cache file name.

    Returns:
        The object unpickled from ``<srcfile>_<trgfile>_pmi.pk`` when that
        file exists, otherwise an empty dict.
    """
    filename = srcfile + "_" + trgfile + "_pmi.pk"
    if not os.path.exists(filename):
        return {}
    # NOTE(security): unpickling can execute arbitrary code; only load cache
    # files this program wrote itself.
    # The context manager closes the handle that was previously leaked.
    with open(filename, 'rb') as cached:
        return pickle.load(cached)
def extract_mwe(sentence, unigramfreq, bigramfreq, precal_pmi, threshold=10):
    """Collect the bigrams of ``sentence`` whose PMI exceeds ``threshold``.

    Args:
        sentence: whitespace-separated text.
        unigramfreq: mapping from word to count, passed to ``pmi``.
        bigramfreq: mapping from space-joined bigram to count, passed to ``pmi``.
        precal_pmi: dict used as a score cache, keyed by the lower-cased
            space-joined bigram. It is updated in place with every score
            computed here.
        threshold: a bigram is kept only when its score is strictly greater.

    Returns:
        The kept bigrams joined into one space-separated string; empty when
        none qualify.
    """
    mwes = []
    for first, second in ngram(sentence, 2):
        first, second = first.lower(), second.lower()
        ng = first + " " + second
        if ng in precal_pmi:
            score = precal_pmi[ng]
        else:
            # Score the two words. Indexing the joined string ``ng`` here
            # would pass its first two characters instead.
            score = pmi(first, second, unigramfreq, bigramfreq)
            precal_pmi[ng] = score
        if score > threshold:
            mwes.append(ng)
    return " ".join(mwes)
def main(srcfile, trgfile):
    """Extract multi-word expressions from a parallel corpus.

    Reads ``srcfile`` and ``trgfile`` line by line in lockstep, extracts the
    high-PMI bigrams of each line pair, and writes
    ``<source MWEs>\\t<target MWEs>`` to ``mwe_pmi.de-en`` for every pair where
    the source has at least one MWE and both sides have the same number.

    Args:
        srcfile: path of the UTF-8 source-language corpus.
        trgfile: path of the UTF-8 target-language corpus.

    NOTE(review): one ``precal_pmi`` cache is shared by both languages, so a
    bigram spelled identically in both reuses whichever score was stored
    first. Confirm this is acceptable.
    """
    src_unigram, src_bigram, trg_unigram, trg_bigram = \
        load_ngramfreq(srcfile, trgfile)
    precal_pmi = load_precalculated_pmi(srcfile, trgfile)
    # The output file is opened first, as before, and is now closed on exit.
    with codecs.open('mwe_pmi.de-en', 'w', 'utf8') as fout, \
            codecs.open(srcfile, 'r', 'utf8') as srcfin, \
            codecs.open(trgfile, 'r', 'utf8') as trgfin:
        for src, trg in zip(srcfin, trgfin):
            src_mwe = extract_mwe(src.strip().lower(), src_unigram, src_bigram,
                                  precal_pmi)
            trg_mwe = extract_mwe(trg.strip().lower(), trg_unigram, trg_bigram,
                                  precal_pmi)
            # extract_mwe returns one space-joined string. Compare token
            # counts (two tokens per MWE) rather than character counts, and
            # write the strings directly: " ".join() on a string would put a
            # space between every character.
            if src_mwe and len(src_mwe.split()) == len(trg_mwe.split()):
                fout.write(src_mwe + "\t" + trg_mwe + "\n")
# Command-line entry point: python <script> srcfile trgfile
if __name__ == '__main__':
    import sys
    # Both corpus paths are required. Counting the script name, argv needs
    # three entries, otherwise sys.argv[2] below raises IndexError.
    if len(sys.argv) < 3:
        sys.stderr.write('Usage: python %s srcfile trgfile \n' % sys.argv[0])
        sys.exit(1)
    main(sys.argv[1], sys.argv[2])