我希望改变我的地图缩小文件以输出一大块文本中的顶级双字母而不是单词计数,所以这两个单词和双字母组合计数
这是我目前的代码和方法。
地图:
import sys
for line in sys.stdin:
line = line.strip()
words = line.split() #bigrams = line.split()
for word in words: #for bigram in words
print '%s\t%s' % (word,1) #print ... word pair???
减少
mydict = dict()
for line in sys.stdin:
(word,cnt) = line.strip().split('\t') #bigram and bigram count
mydict[word] = mydict.get(word,0) 1
for word,cnt in mydict.items():
print word,cnt #print bigram and bigram count
谢谢。
我认为nltk是计算双字母组合的流行解决方案,即使我的mapreduce格式也应该采用这种方法吗?
答案 0 :(得分:0)
我不会用stdin和stdout来做。我宁愿抛出multiprocessing
并阅读一些保存的文件:
import multiprocessing as mp
def main(infilepath):
bgqIn, bgqOut = [mp.Queue() for _ in xrange(2)]
procs = [mp.Process(target=mapper, args=(bgqIn, bgqOut)) for _ in xrange(mp.cpu_count())]
for p in procs:
p.start()
with open(infilepath) as infile:
first = ''
second = ''
for line in infile:
line = line.lower()
for word in line.split():
first = second
second = word
bigram = (first, second)
bgqIn.put(bigram)
for p in procs:
bgqIn.put(None)
rqs = [(mp.Queue() for _ in xrange(2)) for i in xrange(mp.cpu_count())]
rprocs = [mp.Process(target=reducer, args=(*rqs[i])) for i in xrange(mp.cpu_count())]
for p in rprocs:
p.start()
qmap = {}
for char in xrange(97,123):
qmap[ord(char)] = rqs[(char-97)/len(rqs)]
dones = 0
while dones != len(procs):
t = bgqOut.get()
if t is None:
dones += 1
else:
qmap[t[0][0]].put(t)
for q in rqs:
q.put(None)
answer = {}
for q in rqs:
for bg,count in iter(q.get, None):
if bg not in answer:
answer[bg] = 0
answer[bg] += count
for bg,count in answer.iteritems():
print "There are", count, "occurrences of", bg
def mapper(qIn, qOut):
counts = {}
for bg in iter(qIn.get, None):
if bg not in counts:
counts[bg] = 0
counts[bg] += 1
for k,v in counts.iteritems():
qOut.put((k,v))
qOut.put(None)
def reducer(qIn, qOut):
counts = {}
for bg,count in iter(qIn.get, None):
if bg not in counts:
counts[bg] = 0
counts[bg] += count
for bg,count in counts.iteritems():
qOut.put((bg,count))
qOut.put(None)
我没有对此进行过测试,但它是一个基本的骨架,可以帮助你入门。