from __future__ import division
import urllib
import json
from math import log
def hits(word1,word2=""):
query = ""
if word2 == "":
results = urllib.urlopen(query % word1)
results = urllib.urlopen(query % word1+" "+"AROUND(10)"+" "+word2)
json_res = json.loads(
return google_hits
def so(phrase):
num = hits(phrase,"excellent")
#print num
den = hits(phrase,"poor")
#print den
ratio = num / den
#print ratio
sop = log(ratio)
return sop
print so("ugly product")
我需要此代码来计算Point wise Mutual Information,它可用于将评论分类为正面或负面。基本上我使用Turney(2002)指定的技术:作为情感分析的无监督分类方法的一个例子。
计算出的值非常不稳定。他们没有坚持特定的模式。 例如,SO("丑陋的产品")结果是2.85462098541而SO("漂亮的产品")是1.71395061117。虽然前者预计是负面而另一个是积极的。
答案 0 :(得分:14)
log(p(a,b) / ( p(a) * p(b) ))
def pmi(word1, word2, unigram_freq, bigram_freq):
prob_word1 = unigram_freq[word1] / float(sum(unigram_freq.values()))
prob_word2 = unigram_freq[word2] / float(sum(unigram_freq.values()))
prob_word1_word2 = bigram_freq[" ".join([word1, word2])] / float(sum(bigram_freq.values()))
return math.log(prob_word1_word2/float(prob_word1*prob_word2),2)
$ wget
$ printf "This is a foo bar sentence .\nI need multi-word expression from this text file.\nThe text file is messed up , I know you foo bar multi-word expression thingy .\n More foo bar is needed , so that the text file is populated with some sort of foo bar bigrams to extract the multi-word expression ." > src.txt
$ printf "" > trg.txt
$ python
>>> import codecs
>>> from mwe import load_ngramfreq, extract_mwe
>>> # Calculates the unigrams and bigrams counts.
>>> # More superfluously, "Training a bigram 'language model'."
>>> unigram, bigram, _ , _ = load_ngramfreq('src.txt','trg.txt')
>>> sent = "This is another foo bar sentence not in the training corpus ."
>>> for threshold in range(-2, 4):
... print threshold, [mwe for mwe in extract_mwe(sent.strip().lower(), unigram, bigram, threshold)]
-2 ['this is', 'is another', 'another foo', 'foo bar', 'bar sentence', 'sentence not', 'not in', 'in the', 'the training', 'training corpus', 'corpus .']
-1 ['this is', 'is another', 'another foo', 'foo bar', 'bar sentence', 'sentence not', 'not in', 'in the', 'the training', 'training corpus', 'corpus .']
0 ['this is', 'foo bar', 'bar sentence']
1 ['this is', 'foo bar', 'bar sentence']
2 ['this is', 'foo bar', 'bar sentence']
3 ['foo bar', 'bar sentence']
4 []
答案 1 :(得分:5)
Python库DISSECT包含共生矩阵的a few methods to compute Pointwise Mutual Information。
Enter a valid option
Code on GitHub for the PMI methods
参考: Georgiana Dinu,Nghia The Pham和Marco Baroni。 2013. DISSECT: DIStributional SEmantics Composition Toolkit。在系统演示的论文集中 ACL 2013,索非亚,保加利亚
相关:Calculating pointwise mutual information between two strings
答案 2 :(得分:3)
如需更深入地讨论此事,请阅读Adam Kilgarriff撰写的“Googleology is bad science”。