我是 MapReduce 的新手,有一个非常简单的问题。我已经解决了 WordCount 问题,现在想把它改成求文本中出现次数最多的前 N 个单词。虽然我已经能够按出现次数对文本中的所有单词排序,但无法只取出最后的 N 个值。首先,我读取文本并把每个单词发送给第一个 reducer,由它统计每个不同单词的出现次数;然后我尝试按出现次数对这些单词排序,但我不知道如何只取出前 N 条记录。
from mrjob.job import MRJob
from mrjob.step import MRStep
from stemming.porter2 import stem
class MRWordCount(MRJob):
    """Two-step job: count stemmed words, then re-key each word by its count.

    Step 1 emits ``(stemmed_word, 1)`` per token and sums the ones per word.
    Step 2 swaps the pair so the zero-padded count becomes the key and then
    emits every ``(word, padded_count)`` pair it receives.

    NOTE: this version emits *all* words; nothing here limits the output to
    the top N records.
    """

    def steps(self):
        """Return the two steps of the job, in execution order."""
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer),
            MRStep(mapper=self.secondmapper,
                   reducer=self.secondreducer),
        ]

    def mapper(self, _, lines):
        """Yield ``(stemmed lower-cased word, 1)`` for each token of the input.

        The incoming key is ignored; ``lines`` is split on whitespace.
        """
        for w in lines.strip().split():
            yield stem(w.lower()), 1

    def reducer(self, key, values):
        """Yield ``(word, total)`` where ``total`` is the sum of ``values``."""
        yield key, sum(values)

    def secondmapper(self, key, value):
        """Yield ``(zero-padded count, word)`` so the count becomes the key.

        The count is padded to four digits, so string order only matches
        numeric order for counts below 10000.
        NOTE(review): presumably the padding is there so the framework's key
        ordering sorts by count -- confirm against the runner in use.
        """
        yield '%04d' % int(value), key

    def secondreducer(self, key, values):
        """Yield ``(word, padded_count)`` for every word sharing this count."""
        for v in values:
            yield v, key
if __name__ == '__main__':
    # Only start the job when this file is executed as a script, not when it
    # is imported.
    MRWordCount.run()
答案 0 :(得分:0)
我使用以下代码解决了这个问题:
from mrjob.job import MRJob
from mrjob.step import MRStep
from stemming.porter2 import stem
class MRWordCount(MRJob):
    """Two-step job that emits the ``TOP_N`` most frequent stemmed words.

    Step 1 counts each stemmed word and sends every ``(padded_count, word)``
    pair to the single key ``None``. Step 2 therefore receives all pairs in
    one reducer call, ranks them by count and emits the highest ``TOP_N``.
    """

    # Number of highest-count words emitted by the final step.
    TOP_N = 5

    def steps(self):
        """Return the two steps of the job, in execution order."""
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer),
            MRStep(reducer=self.secondreducer),
        ]

    def mapper(self, _, lines):
        """Yield ``(stemmed lower-cased word, 1)`` for each token of the input.

        The incoming key is ignored; ``lines`` is split on whitespace.
        """
        for w in lines.strip().split():
            # Decode only raw byte strings (dropping undecodable bytes); text
            # that is already decoded is passed through untouched. This works
            # on Python 2 and 3, unlike a call to the ``unicode`` builtin.
            if isinstance(w, bytes):
                w = w.decode("utf-8", errors="ignore")
            yield stem(w.lower()), 1

    def reducer(self, key, values):
        """Yield ``(None, (padded_count, word))`` for one word.

        Using ``None`` as the key for every word groups all pairs together
        for the next step. The count is formatted with ``%04d``.
        """
        yield None, ('%04d' % sum(values), key)

    @staticmethod
    def _rank_key(pair):
        """Return the sort key ``(numeric count, word)`` for a count/word pair."""
        # Compare counts as integers: the padded strings stop sorting
        # numerically once a count needs more than four digits.
        return int(pair[0]), pair[1]

    def secondreducer(self, key, values):
        """Yield the ``TOP_N`` highest ``(padded_count, word)`` pairs.

        Pairs are emitted in ascending count order (ties broken by word). If
        fewer than ``TOP_N`` pairs exist, all of them are emitted exactly once.
        """
        top_n = max(int(self.TOP_N), 0)
        if not top_n:
            return
        # Sort explicitly instead of relying on the order in which the values
        # happen to arrive.
        ranked = sorted(values, key=self._rank_key)
        # A negative-start slice clamps at the beginning of the list, so short
        # inputs never wrap around and repeat records.
        for count, word in ranked[-top_n:]:
            yield count, word
if __name__ == '__main__':
    # Only start the job when this file is executed as a script, not when it
    # is imported.
    MRWordCount.run()