MapReduce & Python:Bigrams(二元词组)

时间:2014-10-14 04:17:44

标签: python mapreduce

我希望修改我的 MapReduce 脚本,使其输出一大段文本中出现次数最多的二元词组(bigram),而不是单词计数,也就是输出每个词对以及该词对的出现次数。

这是我目前的代码和方法。

Map(映射)脚本:

import sys

def map_words(lines):
    """Yield a ``(word, 1)`` pair for every whitespace-separated word.

    ``lines`` is any iterable of text lines (for example ``sys.stdin``).
    Blank lines yield nothing.

    NOTE: this is still the plain word-count mapper.  To count bigrams, this
    is the place to pair each word with the one that follows it and emit
    that pair as the key instead of the single word.
    """
    for line in lines:
        # split() with no separator already ignores leading and trailing
        # whitespace, so a separate strip() is not needed.
        for word in line.split():
            yield word, 1


if __name__ == '__main__':
    for word, count in map_words(sys.stdin):
        # One tab-separated key/value record per word.  The parenthesised
        # single-argument print emits the same text on Python 2 and 3.
        print('%s\t%s' % (word, count))

Reduce(归约)脚本:

import sys


def reduce_counts(lines):
    """Sum the counts of tab-separated ``key<TAB>count`` records.

    ``lines`` is any iterable of text lines (for example ``sys.stdin``).
    Returns a dict mapping each key (a word, or a bigram once the mapper
    emits bigrams) to the total of its counts.  Blank lines are skipped.

    Raises:
        ValueError: if a non-blank line has no tab or a non-integer count.
    """
    totals = {}
    for line in lines:
        line = line.strip()
        if not line:
            continue
        # Split on the last tab only, so the count is always the final field.
        key, count = line.rsplit('\t', 1)
        # Add the parsed count rather than a fixed 1, so records that were
        # already partially summed are still totalled correctly.
        totals[key] = totals.get(key, 0) + int(count)
    return totals


if __name__ == '__main__':
    for key, total in reduce_counts(sys.stdin).items():
        # Prints "key total"; the parenthesised single-argument print emits
        # the same text on Python 2 and 3.
        print('%s %s' % (key, total))

谢谢。

我认为 nltk 是计算二元词组(bigram)的常用方案;即使采用我这种 MapReduce 形式,也应该使用 nltk 吗?

1 个答案:

答案 0 :(得分:0)

我不会用 stdin 和 stdout 来做,而是会使用 multiprocessing,并从已保存的文件中读取数据:

import multiprocessing as mp

def main(infilepath):
    """Count the bigrams (adjacent word pairs) of a text file and print them.

    The work runs in two stages, each using one worker process per CPU:

    1. every bigram read from the file is queued for the ``mapper``
       processes, which emit partial ``(bigram, count)`` totals followed by
       a ``None`` sentinel;
    2. the partial totals are routed to the ``reducer`` processes so that a
       given bigram always reaches the same reducer, which sums them.

    Words are lower-cased and split on whitespace; a bigram may span a line
    break.  One line per distinct bigram is printed, in no particular order.

    Args:
        infilepath: path of the text file to read.
    """
    worker_count = mp.cpu_count()

    # ---- map stage -------------------------------------------------------
    map_in, map_out = mp.Queue(), mp.Queue()
    procs = [mp.Process(target=mapper, args=(map_in, map_out))
             for _ in range(worker_count)]
    for p in procs:
        p.start()

    with open(infilepath) as infile:
        previous = None
        for line in infile:
            for word in line.lower().split():
                # No pair exists until a second word has been seen, so the
                # very first word only primes `previous`.
                if previous is not None:
                    map_in.put((previous, word))
                previous = word
    for _ in procs:
        map_in.put(None)  # one end-of-input sentinel per mapper

    # ---- reduce stage ----------------------------------------------------
    # One (input, output) queue pair per reducer process.
    rqs = [(mp.Queue(), mp.Queue()) for _ in range(worker_count)]
    rprocs = [mp.Process(target=reducer, args=pair) for pair in rqs]
    for p in rprocs:
        p.start()

    dones = 0
    while dones != len(procs):
        t = map_out.get()
        if t is None:
            dones += 1  # one mapper has finished
        else:
            # Partition on the first character of the bigram's first word.
            # Words come from split(), so they are never empty, and the
            # modulo keeps the index valid for any character.
            first_word = t[0][0]
            rqs[ord(first_word[0]) % len(rqs)][0].put(t)
    for reduce_in, _ in rqs:
        reduce_in.put(None)

    answer = {}
    for _, reduce_out in rqs:
        for bg, count in iter(reduce_out.get, None):
            answer[bg] = answer.get(bg, 0) + count

    # Join only after every queue has been drained: a process that still has
    # undelivered queue items will not exit.
    for p in procs + rprocs:
        p.join()

    for bg, count in answer.items():
        print('There are %s occurrences of %s' % (count, bg))

def mapper(qIn, qOut):
    """Tally the bigrams received on ``qIn`` and emit the totals on ``qOut``.

    Items are read from ``qIn`` until a ``None`` sentinel arrives.  One
    ``(bigram, count)`` tuple per distinct bigram is then put on ``qOut``,
    followed by a single ``None`` to signal that this mapper is finished.

    Args:
        qIn: queue-like object whose ``get()`` returns hashable bigrams and
            finally ``None``.
        qOut: queue-like object with a ``put()`` method.
    """
    counts = {}
    # iter(callable, sentinel) keeps calling qIn.get() until it returns None.
    for bg in iter(qIn.get, None):
        counts[bg] = counts.get(bg, 0) + 1
    # items() exists on both Python 2 and 3 dicts; iteritems() is 2-only.
    for pair in counts.items():
        qOut.put(pair)
    qOut.put(None)

def reducer(qIn, qOut):
    """Sum the partial bigram counts from ``qIn`` and emit totals on ``qOut``.

    ``(bigram, count)`` tuples are read from ``qIn`` until a ``None``
    sentinel arrives.  One ``(bigram, total)`` tuple per distinct bigram is
    then put on ``qOut``, followed by a single ``None`` to signal that this
    reducer is finished.

    Args:
        qIn: queue-like object whose ``get()`` returns ``(bigram, count)``
            tuples and finally ``None``.
        qOut: queue-like object with a ``put()`` method.
    """
    counts = {}
    # iter(callable, sentinel) keeps calling qIn.get() until it returns None.
    for bg, count in iter(qIn.get, None):
        counts[bg] = counts.get(bg, 0) + count
    # items() exists on both Python 2 and 3 dicts; iteritems() is 2-only.
    for pair in counts.items():
        qOut.put(pair)
    qOut.put(None)

我没有对此进行过测试,但它是一个基本的骨架,可以帮助你入门。