使用NLTK查找Bigram的频率和PMI分数(Python 3)

时间:2014-09-19 01:22:55

标签: python-3.x pandas nltk

现在,我有一个这样的文件:

# Snippet from the question; assumes `import nltk` has been executed earlier.
# Read the whole corpus file as UTF-8 text.
with open('BigramCounter.txt', encoding='utf-8') as wordfile:
    text = wordfile.read()
# Split the raw text into word tokens.
words = nltk.word_tokenize(text)
bigram_measures = nltk.collocations.BigramAssocMeasures()
# Pair up adjacent tokens and count how often each pair occurs.
bgs = nltk.bigrams(words)
fdist = nltk.FreqDist(bgs)
# NOTE(review): `bgs` is the value returned by nltk.bigrams(), not a
# collocation finder; score_ngrams() appears to be a method of finder objects
# such as BigramCollocationFinder, so this call presumably fails -- confirm
# against the NLTK collocations documentation.
pmi = bgs.score_ngrams(bigram_measures.pmi)

现在我可以获得文件中每个bigram(二元组)的频率,也可以单独计算文件中各个bigram的PMI分数,但我不知道如何把这两者结合起来,让NLTK既生成bigram又为它们计算PMI分数!还有其他人遇到过这个问题吗?谢谢!

2 个答案:

答案 0 :(得分:1)

您可以使用此代码提取bigrams及其频率,或提取某个二元组的pmi分数:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import math
import nltk
from collections import defaultdict

def generateUnigramsInMovie(Tokens, freqThreshold):
    """Count unigrams and keep those seen more than ``freqThreshold`` times.

    Args:
        Tokens: iterable of tokens (e.g. the output of nltk.word_tokenize).
        freqThreshold: exclusive lower bound; a token is kept only when its
            count is strictly greater than this value.

    Returns:
        A defaultdict(int) mapping token -> count. Entries are inserted in
        ascending (count, token) order.
    """
    fdistUnigrams = nltk.FreqDist(Tokens)
    unigrams_in_movie = defaultdict(int)
    # dict.iteritems() and tuple-unpacking lambdas (``lambda (k, v): ...``)
    # exist only in Python 2; items() with an indexing key runs on 2 and 3.
    by_count_then_token = sorted(fdistUnigrams.items(),
                                 key=lambda item: (item[1], item[0]))
    for unigram, freq in by_count_then_token:
        if freq > freqThreshold:
            unigrams_in_movie[unigram] = freq
    return unigrams_in_movie

def generateBigramsInMovie(Tokens,freqThreshold):
    """Map every retained bigram, written as "word1 word2", to its count.

    A BigramCollocationFinder is built from ``Tokens`` and
    ``apply_freq_filter(freqThreshold)`` is applied before the counts are
    read from its ``ngram_fd`` table.

    Args:
        Tokens: sequence of tokens to pair up.
        freqThreshold: value handed to ``apply_freq_filter``.

    Returns:
        A defaultdict(int) keyed by the space-joined bigram.
    """
    finder = nltk.collocations.BigramCollocationFinder.from_words(Tokens)
    finder.apply_freq_filter(freqThreshold)

    joined_counts = defaultdict(int)
    # Keys become plain strings so they can be looked up as "word1 word2".
    joined_counts.update(
        (" ".join([first, second]), count)
        for (first, second), count in finder.ngram_fd.items()
    )
    return joined_counts


#This method is copied from the code given by "alvas"
#Taken from this project: Multi-Word Expression (MWE) extractor from the "Terminator" project   
#Liling Tan. 2013. Terminator - Terminology Extraction to Improve 
#Machine Translation [Software]. Available from
#https://github.com/alvations/Terminator.

def pmi(word1, word2, unigram_freq, bigram_freq):
    """Return the pointwise mutual information (log base 2) of a bigram.

    PMI = log2( P(word1 word2) / (P(word1) * P(word2)) ), where each
    probability is a count divided by the sum of all counts in its table.

    Args:
        word1: first word of the bigram.
        word2: second word of the bigram.
        unigram_freq: mapping word -> count.
        bigram_freq: mapping "word1 word2" (space-joined) -> count.

    Returns:
        The PMI as a float, or 0 when it cannot be computed: a word or the
        bigram is missing / has a zero count, or one of the tables is empty.
    """
    unigram_total = float(sum(unigram_freq.values()))
    bigram_total = float(sum(bigram_freq.values()))
    if not unigram_total or not bigram_total:
        # Empty tables would otherwise divide by zero before the try block.
        return 0

    # .get() rather than [] so that a plain dict does not raise KeyError and
    # a defaultdict is not silently grown with zero-count entries.
    prob_word1 = unigram_freq.get(word1, 0) / unigram_total
    prob_word2 = unigram_freq.get(word2, 0) / unigram_total
    prob_word1_word2 = bigram_freq.get(" ".join([word1, word2]), 0) / bigram_total

    try:
        return math.log(prob_word1_word2 / (prob_word1 * prob_word2), 2)
    except (ZeroDivisionError, ValueError):
        # Out-of-vocabulary input: a zero unigram probability divides by
        # zero, a zero bigram probability asks for log(0).
        return 0



# Driver script (Python 3): tokenizes Text.txt, prints the 50 best bigrams by
# PMI together with their frequency, then prints the PMI of one bigram.
with open('Text.txt', encoding='utf-8') as wordfile:
    text = wordfile.read()
Tokens = nltk.word_tokenize(text)

unigrams_in_movie = generateUnigramsInMovie(Tokens, 1)
bigrams_in_movie = generateBigramsInMovie(Tokens, 1)

b = nltk.collocations.BigramCollocationFinder.from_words(Tokens)
b.apply_freq_filter(1)
bigram_measures = nltk.collocations.BigramAssocMeasures()
bestBigrams = b.nbest(bigram_measures.pmi, 50)

# This is probably what the question asks for: each top-PMI bigram is printed
# along with its frequency.
for first, second in bestBigrams:
    bigram = " ".join([first, second])
    bigramFreq = bigrams_in_movie[bigram]
    print(str(bigram) + " " + str(bigramFreq))

# To get the PMI score of one particular bigram, call pmi() with its two
# words. The original snippet used undefined names word1/word2; the best
# bigram is used here as a concrete example.
# As stated before, pmi() is copied from the code given by "alvas".
if bestBigrams:
    word1, word2 = bestBigrams[0]
    print(pmi(word1, word2, unigrams_in_movie, bigrams_in_movie))

希望这有帮助。 干杯

答案 1 :(得分:0)

试试这个:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
This is a Multi-Word Expression (MWE) extractor from the "Terminator" project,
see https://github.com/alvations/Terminator.

Here's some legalese:

##############################################################################
Terminator is copyrighted under MIT License by alvations.

Copyright (c) 2013-2014 Liling Tan (@alvations)

Permission is hereby granted, free of charge, to any person obtaining a 
copy of this software and associated documentation files (the "Software"), 
to deal in the Software without restriction, including without limitation the 
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell copies of the Software, and to permit persons to whom the Software is 
furnished to do so, subject to the following conditions:

Please cite the following when using part-of or the full code/software:

  Liling Tan. 2013. Terminator - Terminology Extraction to Improve 
  Machine Translation [Software]. Available from 
  https://github.com/alvations/Terminator.

The above copyright notice and this permission notice shall be included in all 
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
THE SOFTWARE.

"""

# Authorship Info.
# Module-level metadata dunders; nothing in the visible code reads them.
__author__ = "Liling Tan (aka @alvations)"
__copyright__ = "(c) Copyright 2013"
__license__ = "MIT"
__date__ = "20 Dec 2013"
__version__ = "0.1"
__maintainer__ = "Liling Tan"
__email__ = "alvations@gmail.com"
__status__ = "pre-development"

import codecs
import math
import os
from collections import Counter

try:
    import cPickle as pickle  # Python 2: C-accelerated implementation
except ImportError:
    import pickle  # Python 3: cPickle was merged into pickle

def ngram(text,n=2):
  """Split ``text`` on whitespace and return its n-grams as a list.

  Args:
    text: the string to tokenize with str.split().
    n: n-gram order (default 2).

  Returns:
    For n == 1 the list of tokens themselves (strings); otherwise a list of
    n-tuples of consecutive tokens. The list is empty when the text has fewer
    than n tokens.
  """
  tokens = text.split()  # tokenize once instead of once per offset
  if n == 1:
    return tokens
  # zip() is a one-shot iterator on Python 3; materialize it so both branches
  # return a list that can be indexed, measured and iterated repeatedly.
  return list(zip(*[tokens[i:] for i in range(n)]))

def pmi(word1, word2, unigram_freq, bigram_freq):
  """Return the pointwise mutual information (log base 2) of a bigram.

  PMI = log2( P(word1 word2) / (P(word1) * P(word2)) ), where each
  probability is a count divided by the sum of all counts in its table.

  Args:
    word1: first word of the bigram.
    word2: second word of the bigram.
    unigram_freq: mapping word -> count.
    bigram_freq: mapping "word1 word2" (space-joined) -> count.

  Returns:
    The PMI as a float, or 0 when it cannot be computed: a word or the bigram
    is missing / has a zero count, or one of the tables is empty.
  """
  unigram_total = float(sum(unigram_freq.values()))
  bigram_total = float(sum(bigram_freq.values()))
  if not unigram_total or not bigram_total:
    # Empty tables would otherwise divide by zero before the try block.
    return 0

  # .get() rather than [] so that a plain dict does not raise KeyError.
  prob_word1 = unigram_freq.get(word1, 0) / unigram_total
  prob_word2 = unigram_freq.get(word2, 0) / unigram_total
  prob_word1_word2 = bigram_freq.get(" ".join([word1, word2]), 0) / bigram_total

  try:
    return math.log(prob_word1_word2 / (prob_word1 * prob_word2), 2)
  except (ZeroDivisionError, ValueError):
    # Out-of-vocabulary input: a zero unigram probability divides by zero,
    # a zero bigram probability asks for log(0).
    return 0

def phi2(word1,word2, unigram_freq, bigram_freq):
  """Return log2 of the phi-squared association score of a bigram.

  A 2x2 contingency table is built from ``bigram_freq``:
    n11 -- count of the exact bigram "word1 word2"
    n12 -- bigrams containing word1 but not word2
    n21 -- bigrams containing word2 but not word1
    n22 -- every remaining bigram
  Membership is tested on the whitespace-split key, so the position of the
  word inside the bigram is not taken into account.

  Args:
    word1: first word of the bigram.
    word2: second word of the bigram.
    unigram_freq: unused; kept so the signature matches pmi().
    bigram_freq: mapping "word1 word2" (space-joined) -> count.

  Returns:
    log2( (n11*n22 - n21*n12)**2 / (n1p * np1 * np2 * n2p) ) as a float.

  Raises:
    ZeroDivisionError: when any row/column marginal is zero.
    ValueError: when the numerator is zero (log of 0).
  """
  n11 = bigram_freq.get(word1 + " " + word2, 0)
  n12 = n21 = 0
  # Single pass: each key is split once instead of twice per sum.
  for bigram, freq in bigram_freq.items():
    tokens = bigram.split()
    has_word1 = word1 in tokens
    has_word2 = word2 in tokens
    if has_word1 and not has_word2:
      n12 += freq
    elif has_word2 and not has_word1:
      n21 += freq
  # The last cell is what remains after removing the other three cells; the
  # earlier "total - n11" left n12 and n21 counted twice in the marginals.
  n22 = sum(bigram_freq.values()) - n11 - n12 - n21
  n1p = n11 + n12
  n2p = n21 + n22
  np1 = n11 + n21
  np2 = n12 + n22
  cross = n11*n22 - n21*n12
  return math.log(cross*cross/float(n1p*np1*np2*n2p),2)

def llr(word1,word2, unigram_freq):
  """Return log2 of the product of the two words' unigram counts.

  Only the two unigram counts are used; despite the name, no bigram counts or
  likelihood ratio enter the computation.

  Args:
    word1: first word, looked up in ``unigram_freq``.
    word2: second word, looked up in ``unigram_freq``.
    unigram_freq: mapping word -> count.

  Returns:
    math.log(count(word1) * count(word2), 2).
  """
  count_product = unigram_freq[word1] * unigram_freq[word2]
  return math.log(count_product, 2)

def load_ngramfreq_pickle(filename):
  """Load an n-gram frequency table from its pickle cache, building it if absent.

  ``filename`` has the form "<corpus path>-<n>gram.pk". When that file exists
  it is unpickled and returned. Otherwise the corpus is read line by line
  (UTF-8, lower-cased), its n-grams are counted, and the table is pickled to
  ``filename`` before being returned.

  Args:
    filename: path of the pickle cache; the text after the last "-" must
      start with the n-gram order digit.

  Returns:
    The unpickled object when the cache exists, otherwise a Counter mapping
    n-gram -> count (keys are space-joined strings for n > 1).

  Raises:
    ValueError: when ``filename`` has no "-" or no leading digit after it.
  """
  if os.path.exists(filename):
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # this program (or another trusted source) produced.
    with open(filename, 'rb') as cached:
      return pickle.load(cached)

  # Split on the LAST hyphen only, so corpus paths that themselves contain
  # hyphens (e.g. "europarl-v7.de") are still parsed correctly.
  infile, suffix = filename.rsplit("-", 1)
  n = int(suffix[0])
  ngram_freq = Counter()
  with codecs.open(infile,'r','utf8') as fin:
    for line in fin:
      line = line.lower()
      if n > 1:
        ngram_freq.update(" ".join(j) for j in ngram(line,n))
      else:
        ngram_freq.update(ngram(line,n))
  # Context manager guarantees the cache is flushed and closed.
  with open(filename, 'wb') as cache_out:
    pickle.dump(ngram_freq, cache_out)
  return ngram_freq

def load_ngramfreq(srcfile, trgfile):
  """Load the unigram and bigram frequency tables for both corpora.

  Args:
    srcfile: path of the source-language corpus.
    trgfile: path of the target-language corpus.

  Returns:
    A 4-tuple (source unigrams, source bigrams, target unigrams,
    target bigrams), each obtained from load_ngramfreq_pickle() with the
    cache name "<corpus>-1gram.pk" or "<corpus>-2gram.pk".
  """
  cache_suffixes = ("-1gram.pk", "-2gram.pk")
  return tuple(
    load_ngramfreq_pickle(corpus + suffix)
    for corpus in (srcfile, trgfile)
    for suffix in cache_suffixes
  )

def load_precalculated_pmi(srcfile,trgfile):
  """Load the cached PMI scores for a corpus pair, if a cache file exists.

  The cache is looked up at "<srcfile>_<trgfile>_pmi.pk".

  Args:
    srcfile: path of the source-language corpus.
    trgfile: path of the target-language corpus.

  Returns:
    The unpickled object when the cache file exists, otherwise a new empty
    dict.
  """
  filename = srcfile+"_"+trgfile+"_pmi.pk"
  if not os.path.exists(filename):
    return {}
  # NOTE: unpickling can execute arbitrary code; only load cache files from a
  # trusted source.
  # The context manager closes the handle that was previously leaked.
  with open(filename, 'rb') as cached:
    return pickle.load(cached)

def extract_mwe(sentence, unigramfreq, bigramfreq, precal_pmi, threshold=10):
  """Return the bigrams of ``sentence`` whose PMI exceeds ``threshold``.

  Args:
    sentence: whitespace-tokenizable text.
    unigramfreq: mapping word -> count, passed to pmi().
    bigramfreq: mapping "word1 word2" -> count, passed to pmi().
    precal_pmi: dict used as a score cache, keyed by the lower-cased
      "word1 word2" string. It is read first and updated in place with every
      newly computed score.
    threshold: a bigram is kept only when its score is strictly greater
      than this value (default 10).

  Returns:
    The selected bigrams ("word1 word2" each) joined by single spaces; an
    empty string when none qualifies.
  """
  mwes = []
  for first, second in ngram(sentence,2):
    first, second = first.lower(), second.lower()
    bigram = first + " " + second
    if bigram in precal_pmi:
      score = precal_pmi[bigram]
    else:
      # Pass the two WORDS to pmi(). Indexing the joined string (as the
      # earlier code did) would pass its first two characters instead.
      score = pmi(first, second, unigramfreq, bigramfreq)
      precal_pmi[bigram] = score
    if score > threshold:
      mwes.append(bigram)
  return " ".join(mwes)

def main(srcfile, trgfile):
  """Extract MWEs from a line-aligned corpus pair into 'mwe_pmi.de-en'.

  The two files are read in lock-step (UTF-8). For each line pair the
  multi-word expressions of both sides are extracted with extract_mwe(); a
  pair is written as "<source MWEs>\\t<target MWEs>" when the source side is
  non-empty and both sides contain the same number of tokens.

  Args:
    srcfile: path of the source-language corpus.
    trgfile: path of the target-language corpus.
  """
  src_unigram, src_bigram, trg_unigram, trg_bigram = \
  load_ngramfreq(srcfile, trgfile)

  # NOTE(review): one PMI cache is shared by both languages and is never
  # written back to disk here -- confirm that this is intended.
  precal_pmi = load_precalculated_pmi(srcfile, trgfile)

  # All three files are managed by the with-statement so the output is
  # flushed and closed even if processing fails part-way.
  with codecs.open('mwe_pmi.de-en','w','utf8') as fout, \
  codecs.open(srcfile,'r','utf8') as srcfin, \
  codecs.open(trgfile, 'r','utf8') as trgfin:
    for src, trg in zip(srcfin,trgfin):
      src_mwe = extract_mwe(src.strip().lower(), src_unigram, src_bigram,
                            precal_pmi)
      trg_mwe = extract_mwe(trg.strip().lower(), trg_unigram, trg_bigram,
                            precal_pmi)
      # extract_mwe() returns ONE space-joined string. Compare token counts
      # (presumably the intent: same number of MWEs on both sides) rather
      # than character counts, and write the strings as they are -- joining
      # a string with " " would put a space between every character.
      if src_mwe and len(src_mwe.split()) == len(trg_mwe.split()):
        fout.write(src_mwe + "\t" + trg_mwe + "\n")

if __name__ == '__main__':
  import sys
  # Two positional arguments are required, so argv must hold at least three
  # entries (script name + srcfile + trgfile); "< 2" let a single argument
  # through and then failed with IndexError on sys.argv[2].
  if len(sys.argv) < 3:
    sys.stderr.write('Usage: python %s srcfile trgfile \n' % sys.argv[0])
    sys.exit(1)
  main(sys.argv[1], sys.argv[2])