我尝试将NLTK工具包中已有的QuadgramCollocationFinder类改写为PentagramCollocationFinder,即使用5个词项而不仅仅是4个,但没有成功。
这是NLTK中的类:
这就是我所想的
from __future__ import print_function

import itertools as _itertools

import nltk
from nltk.collocations import AbstractCollocationFinder
from nltk.compat import iteritems
from nltk.probability import FreqDist
from nltk.util import ngrams
from nltk.metrics import ContingencyMeasures, BigramAssocMeasures,TrigramAssocMeasures
from nltk.metrics.spearman import ranks_from_scores, spearman_correlation
class QuadgramCollocationFinder(AbstractCollocationFinder):
    """A tool for the finding and ranking of quadgram collocations or other
    association measures.

    It is often useful to use from_words() rather than constructing an
    instance directly.

    Pattern names used throughout: each letter is one position of a
    four-word span; ``i`` marks a position whose word is part of the key and
    ``x`` a position that is skipped (e.g. ``ixi`` is keyed by the first and
    third word).
    """

    default_ws = 4

    def __init__(self, word_fd, quadgram_fd, ii, iii, ixi, ixxi, iixi, ixii):
        """Construct a QuadgramCollocationFinder from precomputed counts.

        :param word_fd: counts of single words.
        :param quadgram_fd: counts of (w1, w2, w3, w4) tuples.
        :param ii: counts of adjacent pairs.
        :param iii: counts of adjacent triples.
        :param ixi: counts of pairs with one word between them.
        :param ixxi: counts of pairs with two words between them.
        :param iixi: counts of triples with a gap before the last word.
        :param ixii: counts of triples with a gap after the first word.
        """
        AbstractCollocationFinder.__init__(self, word_fd, quadgram_fd)
        self.iii = iii
        self.ii = ii
        self.ixi = ixi
        self.ixxi = ixxi
        self.iixi = iixi
        self.ixii = ixii

    @classmethod
    def from_words(cls, words, window_size=4):
        """Build a finder by counting every pattern over a word sequence.

        :param words: iterable of words to scan.
        :param window_size: width of the sliding window; must be >= 4.
        :raises ValueError: if ``window_size`` is smaller than 4.
        :return: a new instance of ``cls`` holding the collected counts.
        """
        if window_size < 4:
            raise ValueError("Specify window_size at least 4")
        ixxx = FreqDist()
        iiii = FreqDist()
        ii = FreqDist()
        iii = FreqDist()
        ixi = FreqDist()
        ixxi = FreqDist()
        iixi = FreqDist()
        ixii = FreqDist()

        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            for w2, w3, w4 in _itertools.combinations(window[1:], 3):
                ixxx[w1] += 1
                # A None entry is padding past the end of the input; the
                # shorter patterns counted above it are still valid.
                if w2 is None:
                    continue
                ii[(w1, w2)] += 1
                if w3 is None:
                    continue
                iii[(w1, w2, w3)] += 1
                ixi[(w1, w3)] += 1
                if w4 is None:
                    continue
                iiii[(w1, w2, w3, w4)] += 1
                ixxi[(w1, w4)] += 1
                ixii[(w1, w3, w4)] += 1
                iixi[(w1, w2, w4)] += 1

        return cls(ixxx, iiii, ii, iii, ixi, ixxi, iixi, ixii)

    def score_ngram(self, score_fn, w1, w2, w3, w4):
        """Score the quadgram (w1, w2, w3, w4) with ``score_fn``.

        ``score_fn`` is called with the quadgram count, a tuple of the four
        trigram-pattern counts, a tuple of the six bigram-pattern counts, a
        tuple of the four unigram counts, and the total word count.

        :return: the value of ``score_fn``, or None when the quadgram was
            never counted.
        """
        n_all = self.word_fd.N()
        n_iiii = self.ngram_fd[(w1, w2, w3, w4)]
        if not n_iiii:
            return
        n_iiix = self.iii[(w1, w2, w3)]
        n_xiii = self.iii[(w2, w3, w4)]
        n_iixi = self.iixi[(w1, w2, w4)]
        n_ixii = self.ixii[(w1, w3, w4)]

        n_iixx = self.ii[(w1, w2)]
        n_xxii = self.ii[(w3, w4)]
        n_xiix = self.ii[(w2, w3)]
        n_ixix = self.ixi[(w1, w3)]
        n_ixxi = self.ixxi[(w1, w4)]
        n_xixi = self.ixi[(w2, w4)]

        n_ixxx = self.word_fd[w1]
        n_xixx = self.word_fd[w2]
        n_xxix = self.word_fd[w3]
        n_xxxi = self.word_fd[w4]

        return score_fn(n_iiii,
                        (n_iiix, n_iixi, n_ixii, n_xiii),
                        (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
                        (n_ixxx, n_xixx, n_xxix, n_xxxi),
                        n_all)
# What is needed is an updated class that detects five-word collocations too:
class fivegramCollocationFinder(AbstractCollocationFinder):
    """A tool for the finding and ranking of five-word collocations or other
    association measures.

    It is often useful to use from_words() rather than constructing an
    instance directly.

    Pattern names used throughout: each letter is one position of a
    five-word span; ``i`` marks a position whose word is part of the key and
    ``x`` a position that is skipped (e.g. ``ixxii`` is keyed by the first,
    fourth and fifth word).
    """

    # Kept equal to the default window_size of from_words().
    default_ws = 5

    def __init__(self, word_fd, quingram_fd, ii, iii, ixi, iiii, ixxi, ixii,
                 iixi, ixxxi, iixxi, ixixi, ixxii, iiixi, ixiii, iixii):
        """Construct a fivegramCollocationFinder from precomputed counts.

        :param word_fd: counts of single words.
        :param quingram_fd: counts of (w1, w2, w3, w4, w5) tuples.

        Every remaining parameter is the count table for the gap pattern it
        is named after (``ii``, ``iii`` and ``iiii`` are the adjacent
        pair/triple/quadruple tables); each is stored on the instance under
        the same name.
        """
        AbstractCollocationFinder.__init__(self, word_fd, quingram_fd)
        self.iiii = iiii
        self.iii = iii
        self.ii = ii
        self.ixi = ixi
        self.ixxi = ixxi
        self.iixi = iixi
        self.ixii = ixii
        self.ixxxi = ixxxi
        self.iixxi = iixxi
        self.ixixi = ixixi
        self.ixxii = ixxii
        self.iiixi = iiixi
        self.ixiii = ixiii
        self.iixii = iixii

    @classmethod
    def from_words(cls, words, window_size=5):
        """Build a finder by counting every pattern over a word sequence.

        :param words: iterable of words to scan.
        :param window_size: width of the sliding window; must be >= 5.
        :raises ValueError: if ``window_size`` is smaller than 5.
        :return: a new instance of ``cls`` holding the collected counts.
        """
        if window_size < 5:
            raise ValueError("Specify window_size at least 5")
        ixxxx = FreqDist()
        ii = FreqDist()
        iii = FreqDist()
        iiii = FreqDist()
        iiiii = FreqDist()
        ixi = FreqDist()
        ixxi = FreqDist()
        iixi = FreqDist()
        ixii = FreqDist()
        ixxxi = FreqDist()
        iixxi = FreqDist()
        ixixi = FreqDist()
        ixxii = FreqDist()
        iiixi = FreqDist()
        ixiii = FreqDist()
        iixii = FreqDist()

        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            for w2, w3, w4, w5 in _itertools.combinations(window[1:], 4):
                ixxxx[w1] += 1
                # A None entry is padding past the end of the input; the
                # shorter patterns counted above it are still valid.
                if w2 is None:
                    continue
                ii[(w1, w2)] += 1
                if w3 is None:
                    continue
                iii[(w1, w2, w3)] += 1
                ixi[(w1, w3)] += 1
                if w4 is None:
                    continue
                iiii[(w1, w2, w3, w4)] += 1
                ixxi[(w1, w4)] += 1
                ixii[(w1, w3, w4)] += 1
                iixi[(w1, w2, w4)] += 1
                if w5 is None:
                    continue
                iiiii[(w1, w2, w3, w4, w5)] += 1
                ixxxi[(w1, w5)] += 1
                iixxi[(w1, w2, w5)] += 1
                ixixi[(w1, w3, w5)] += 1
                ixxii[(w1, w4, w5)] += 1
                iiixi[(w1, w2, w3, w5)] += 1
                ixiii[(w1, w3, w4, w5)] += 1
                iixii[(w1, w2, w4, w5)] += 1

        # Keyword arguments keep every table bound to the parameter of the
        # same name, independent of the positional order of __init__.
        return cls(ixxxx, iiiii,
                   ii=ii, iii=iii, ixi=ixi, iiii=iiii,
                   ixxi=ixxi, ixii=ixii, iixi=iixi,
                   ixxxi=ixxxi, iixxi=iixxi, ixixi=ixixi, ixxii=ixxii,
                   iiixi=iiixi, ixiii=ixiii, iixii=iixii)

    def score_ngram(self, score_fn, w1, w2, w3, w4, w5):
        """Score the five-gram (w1, w2, w3, w4, w5) with ``score_fn``.

        ``score_fn`` is called with the five-gram count, a tuple of the five
        four-word pattern counts, a tuple of the ten three-word pattern
        counts, a tuple of the ten two-word pattern counts, a tuple of the
        five unigram counts, and the total word count.

        NOTE(review): ``score_fn`` must accept this six-argument layout;
        confirm a matching association measure exists before relying on it.

        :return: the value of ``score_fn``, or None when the five-gram was
            never counted.
        """
        n_all = self.word_fd.N()
        n_iiiii = self.ngram_fd[(w1, w2, w3, w4, w5)]
        if not n_iiiii:
            return

        # Four of the five words. A shorter pattern shifted inside the span
        # (e.g. n_xiiii) reuses the table of its unshifted shape (iiii).
        n_iiixi = self.iiixi[(w1, w2, w3, w5)]
        n_ixiii = self.ixiii[(w1, w3, w4, w5)]
        n_iixii = self.iixii[(w1, w2, w4, w5)]
        n_iiiix = self.iiii[(w1, w2, w3, w4)]
        n_xiiii = self.iiii[(w2, w3, w4, w5)]

        # Three of the five words.
        n_iixix = self.iixi[(w1, w2, w4)]
        n_iixxi = self.iixxi[(w1, w2, w5)]
        n_ixixi = self.ixixi[(w1, w3, w5)]
        n_ixxii = self.ixxii[(w1, w4, w5)]
        n_xiixi = self.iixi[(w2, w3, w5)]
        n_xixii = self.ixii[(w2, w4, w5)]
        n_ixiix = self.ixii[(w1, w3, w4)]
        n_iiixx = self.iii[(w1, w2, w3)]
        n_xiiix = self.iii[(w2, w3, w4)]
        n_xxiii = self.iii[(w3, w4, w5)]

        # Two of the five words.
        n_ixixx = self.ixi[(w1, w3)]
        n_ixxix = self.ixxi[(w1, w4)]
        n_ixxxi = self.ixxxi[(w1, w5)]
        n_xixix = self.ixi[(w2, w4)]
        n_xixxi = self.ixxi[(w2, w5)]
        n_xxixi = self.ixi[(w3, w5)]
        n_iixxx = self.ii[(w1, w2)]
        n_xiixx = self.ii[(w2, w3)]
        n_xxiix = self.ii[(w3, w4)]
        n_xxxii = self.ii[(w4, w5)]

        # Single words.
        n_ixxxx = self.word_fd[w1]
        n_xixxx = self.word_fd[w2]
        n_xxixx = self.word_fd[w3]
        n_xxxix = self.word_fd[w4]
        n_xxxxi = self.word_fd[w5]

        return score_fn(n_iiiii,
                        (n_iiixi, n_ixiii, n_iixii, n_iiiix, n_xiiii),
                        (n_iixix, n_iixxi, n_ixixi, n_ixxii, n_xiixi,
                         n_xixii, n_ixiix, n_iiixx, n_xiiix, n_xxiii),
                        (n_ixixx, n_ixxix, n_ixxxi, n_xixix, n_xixxi,
                         n_xxixi, n_iixxx, n_xiixx, n_xxiix, n_xxxii),
                        (n_ixxxx, n_xixxx, n_xxixx, n_xxxix, n_xxxxi),
                        n_all)
有人可以帮忙吗?
答案 0 :(得分:1)
构建这些模式似乎是主要的难点,所以下面的代码可以生成所有合法的i模式,以及需要用到的n模式。
import collections
def make_ngram_ipatterns(n):
    """Make all needed patterns used by *gramCollocationFinder up to n words.

    A pattern is a string over ``i`` (word used) and ``x`` (word skipped).
    Lengths 1 and 2 yield only ``'i'`` and ``'ii'``; every longer length
    yields each string that starts and ends with ``i``, with the interior
    enumerated in binary counting order (``x`` = 0, ``i`` = 1).

    :param n: largest pattern length to generate.
    :return: list of pattern strings, shortest first; empty when n < 1.
    """
    i_patterns = []
    # range (not xrange) keeps this runnable on both Python 2 and Python 3.
    for length in range(1, n + 1):
        if length <= 2:
            i_patterns.append('i' * length)
            continue
        interior = length - 2
        for j in range(2 ** interior):
            # Zero-padded binary of j, one digit per interior position.
            bin_str = '{0:0{1}b}'.format(j, interior)
            ix_pattern = bin_str.replace('0', 'x').replace('1', 'i')
            i_patterns.append('i{}i'.format(ix_pattern))
    return i_patterns
def make_ngram_npatterns(n):
    """Make all needed n-patterns used by *gramCollocationFinder up to n words.

    Each i-pattern from make_ngram_ipatterns(n) is slid across a span of
    ``n`` positions; unused positions on either side are filled with ``x``
    and the result is prefixed with ``n_``.

    :param n: number of words in the full n-gram.
    :return: sorted list of ``(i_order, ipattern, npattern)`` tuples, where
        ``i_order`` is the number of ``i`` characters in ``ipattern``.
    """
    npatterns = []
    for ipattern in make_ngram_ipatterns(n):
        i_order = sum(c == 'i' for c in ipattern)
        slack = n - len(ipattern)
        # range (not xrange) keeps this runnable on both Python 2 and 3.
        for offset in range(slack + 1):
            npattern = 'n_{}{}{}'.format('x' * offset,
                                         ipattern,
                                         'x' * (slack - offset))
            npatterns.append((i_order, ipattern, npattern))
    return sorted(npatterns)
def main():
    """Print every i-pattern for n = 5, then one line per n-pattern showing
    which i-pattern table and which w# arguments it maps to."""
    n = 5

    # print() with a single argument behaves the same with or without
    # `from __future__ import print_function`, and on Python 3.
    print('\n'.join(make_ngram_ipatterns(n)))

    for order, ipattern, npattern in make_ngram_npatterns(n):
        # npattern[2:] drops the 'n_' prefix so position k maps to w(k+1).
        wparams = ', '.join('w{}'.format(i+1)
                            for i, c in enumerate(npattern[2:])
                            if c == 'i'
                            )
        print('order: {1:2} ipattern: {2:{0}s} npattern: {3}'
              ' -> {3} = self.{2}({4})'.format(
                  n, order, ipattern, npattern, wparams))


if __name__ == '__main__':
    main()
n=5 时的输出如下:
i
ii
ixi
iii
ixxi
ixii
iixi
iiii
ixxxi
ixxii
ixixi
ixiii
iixxi
iixii
iiixi
iiiii
order: 1 ipattern: i npattern: n_ixxxx -> n_ixxxx = self.i(w1)
order: 1 ipattern: i npattern: n_xixxx -> n_xixxx = self.i(w2)
order: 1 ipattern: i npattern: n_xxixx -> n_xxixx = self.i(w3)
order: 1 ipattern: i npattern: n_xxxix -> n_xxxix = self.i(w4)
order: 1 ipattern: i npattern: n_xxxxi -> n_xxxxi = self.i(w5)
order: 2 ipattern: ii npattern: n_iixxx -> n_iixxx = self.ii(w1, w2)
order: 2 ipattern: ii npattern: n_xiixx -> n_xiixx = self.ii(w2, w3)
order: 2 ipattern: ii npattern: n_xxiix -> n_xxiix = self.ii(w3, w4)
order: 2 ipattern: ii npattern: n_xxxii -> n_xxxii = self.ii(w4, w5)
order: 2 ipattern: ixi npattern: n_ixixx -> n_ixixx = self.ixi(w1, w3)
order: 2 ipattern: ixi npattern: n_xixix -> n_xixix = self.ixi(w2, w4)
order: 2 ipattern: ixi npattern: n_xxixi -> n_xxixi = self.ixi(w3, w5)
order: 2 ipattern: ixxi npattern: n_ixxix -> n_ixxix = self.ixxi(w1, w4)
order: 2 ipattern: ixxi npattern: n_xixxi -> n_xixxi = self.ixxi(w2, w5)
order: 2 ipattern: ixxxi npattern: n_ixxxi -> n_ixxxi = self.ixxxi(w1, w5)
order: 3 ipattern: iii npattern: n_iiixx -> n_iiixx = self.iii(w1, w2, w3)
order: 3 ipattern: iii npattern: n_xiiix -> n_xiiix = self.iii(w2, w3, w4)
order: 3 ipattern: iii npattern: n_xxiii -> n_xxiii = self.iii(w3, w4, w5)
order: 3 ipattern: iixi npattern: n_iixix -> n_iixix = self.iixi(w1, w2, w4)
order: 3 ipattern: iixi npattern: n_xiixi -> n_xiixi = self.iixi(w2, w3, w5)
order: 3 ipattern: iixxi npattern: n_iixxi -> n_iixxi = self.iixxi(w1, w2, w5)
order: 3 ipattern: ixii npattern: n_ixiix -> n_ixiix = self.ixii(w1, w3, w4)
order: 3 ipattern: ixii npattern: n_xixii -> n_xixii = self.ixii(w2, w4, w5)
order: 3 ipattern: ixixi npattern: n_ixixi -> n_ixixi = self.ixixi(w1, w3, w5)
order: 3 ipattern: ixxii npattern: n_ixxii -> n_ixxii = self.ixxii(w1, w4, w5)
order: 4 ipattern: iiii npattern: n_iiiix -> n_iiiix = self.iiii(w1, w2, w3, w4)
order: 4 ipattern: iiii npattern: n_xiiii -> n_xiiii = self.iiii(w2, w3, w4, w5)
order: 4 ipattern: iiixi npattern: n_iiixi -> n_iiixi = self.iiixi(w1, w2, w3, w5)
order: 4 ipattern: iixii npattern: n_iixii -> n_iixii = self.iixii(w1, w2, w4, w5)
order: 4 ipattern: ixiii npattern: n_ixiii -> n_ixiii = self.ixiii(w1, w3, w4, w5)
order: 5 ipattern: iiiii npattern: n_iiiii -> n_iiiii = self.iiiii(w1, w2, w3, w4, w5)
现在,扩展到新维度要做的事情就是:像低阶类那样把所有i模式设置为属性,替换n模式,并把所有阶数相同的n模式整理成传给score_fn()的各组参数。
编辑:使用对应的w#参数补全了n模式的设置。
答案 1 :(得分:0)
它似乎不需要做太多工作,但你有一些定义要做。你应该允许超过两次分离吗?这使内部逻辑更长,但仍然可行。
另一个问题是,您是想继续沿用大量的ixxi、ixii、...这类变量,还是把它们全部整理到一个字典里,用来保存n个变量所有允许的排列?
所有四个字母的变量仍然需要保留,但您需要添加五个字母的变体,即需要把以下所有变量添加到__init__:ixxxi、iiixi、iixii、...、iixxi、ixxii。这些变量还需要作为参数添加到类中,并添加到from_words()里使用的cls()初始化调用中。
在from_words()中,您还需要初始化新的五个字母的变量,并增加一个为全部8个变量计数的代码块:
if w5 is None:
    continue
iiiii[(w1, w2, w3, w4, w5)] += 1
ixxxi[(w1, w5)] += 1
iiixi[(w1, w2, w3, w5)] += 1
...
ixiii[(w1, w3, w4, w5)] += 1
类似地,您需要把这8个五个字母的变量添加到score_ngram()中。注意它们全都以i开头并以i结尾。
您当然还需要把所有的4改成5,在适当的位置添加w5,依此类推。
您需要做的最后一项更改是把相应的行添加到score_ngram()中,同时还要为低阶的n_iixx等变量补充新的情形。score_fn()调用中的每一行参数对应一组已知i个数相同的计数。
关于n_####组合,您需要把它们扩展为n_#####组合,并添加所有相关的排列。第一个包含三个单词的块需要扩展为:
n_iiixx, n_xiiix, +n_xxiii ~ self.iii(...)
n_iixix, +n_xiixi ~ self.iixi(...)
n_ixiix, +n_xixii ~ self.ixii(...)
其中+表示新增的变体,其余的只是对相应块的修改。对于n_xiixi,您需要传入与各个i位置相对应的w#,换句话说就是self.iixi[(w2, w3, w5)]。
您还可以按照我在相关问题“NgramCollocationFinder in NLTK”的回答中的建议,去掉所有的i和n变量。但这需要更多的工作!