我希望下面这个 MapReduce 作业(代码见下)输出被评价次数最多的前 10 个产品。但它一直报如下错误:
it = izip(iterable, count(0,-1))  # decorate
TypeError: izip argument #1 must support iteration
我认为这与我使用 nlargest 函数的方式有关。
有什么建议吗?
谢谢!
from mrjob.job import MRJob
from mrjob.step import MRStep
from heapq import nlargest
class MostRatedProduct(MRJob):
    """Two-step MapReduce job that reports the 10 most-rated products.

    Step 1 counts how many ratings each item received. Step 2 gathers every
    (count, itemID) pair under a single key and keeps the 10 largest counts.
    """

    def steps(self):
        """Return the job's steps: count ratings per item, then rank them."""
        return [
            MRStep(mapper=self.mapper_get_ratings,
                   reducer=self.reducer_count_ratings),
            MRStep(reducer=self.reducer_find_top10),
        ]

    def mapper_get_ratings(self, _, line):
        """Emit ``(itemID, 1)`` for one ``userID,itemID,rating,timestamp`` line."""
        (userID, itemID, rating, timestamp) = line.split(',')
        yield itemID, 1

    def reducer_count_ratings(self, itemID, ratingCount):
        """Sum an item's ratings and emit ``(total, itemID)`` under key ``None``.

        Sharing one key routes every pair to the same reducer call in the
        next step so they can be ranked against each other. The total comes
        first in the pair so that pairs order by rating count.
        """
        yield None, (sum(ratingCount), itemID)

    def top_10(self, ratingPair):
        """Return up to 10 ``(ratingTotal, itemID)`` pairs, largest total first.

        ``nlargest`` must receive the whole iterable of pairs. Passing it a
        single integer total is what raised
        ``TypeError: izip argument #1 must support iteration``.
        """
        return nlargest(10, ratingPair)

    def reducer_find_top10(self, key, ratingPair):
        """Yield each of the 10 most-rated items as ``(ratingTotal, itemID)``."""
        for ratingTotal, itemID in self.top_10(ratingPair):
            yield ratingTotal, itemID
# Run the job only when executed as a script, not when imported.
if __name__ == '__main__':
    MostRatedProduct.run()
答案 0 :(得分:1)
使用 mrjob 库,可以用 Python 这样实现类似的功能(下面以输出出现次数最多的前 5 个单词为例):
# Print the top 5 most frequent words
# Import dependencies
from mrjob.job import MRJob
from mrjob.step import MRStep
class MRWordCount(MRJob):
    """Two-step MapReduce job that prints the 5 most frequent words.

    Step 1 counts occurrences of each lower-cased word. Step 2 collects every
    (count, word) pair under one key and emits the 5 largest.
    """

    def steps(self):
        """Return the job's steps: count words, then pick the top 5."""
        return [
            MRStep(mapper=self.mapper, reducer=self.reducer),
            MRStep(reducer=self.secondreducer),
        ]

    def mapper(self, _, lines):
        """Emit ``(word, 1)`` for every whitespace-separated word in the line."""
        for word in lines.split():
            yield word.lower(), 1

    def reducer(self, key, values):
        """Sum a word's occurrences and emit ``(count, word)`` under key ``None``.

        The count stays numeric so the next step ranks by value. Ranking the
        zero-padded strings instead would misorder counts of 10000 or more.
        """
        yield None, (sum(values), key)

    def secondreducer(self, key, values):
        """Yield up to 5 ``(zero-padded count, word)`` pairs, largest first.

        Sorting and slicing copes with fewer than 5 distinct words, where
        repeatedly calling ``max`` on the shrinking list raised ValueError.
        """
        top_pairs = sorted(values, reverse=True)[:5]
        for count, word in top_pairs:
            # Keep the original '%04d' output format for the emitted key.
            yield '%04d' % count, word
# Run the job only when executed as a script, not when imported.
if __name__ == '__main__':
    MRWordCount.run()
答案 1 :(得分:0)
我还没有使用过 mrjob,但我之前在 AWS 集群上用 MapReduce 找过最大值。下面是我的代码,它没有使用 heapq。希望您能把同样的思路应用到您的代码中。这是 mapper 函数:
import sys, time
def Parser():
    """Yield the whitespace-separated fields of each non-blank stdin line.

    Each yielded item is a list of strings. Blank lines are skipped because
    they would produce an empty list and break code that indexes field 1.
    """
    for line in sys.stdin:
        # split() with no argument already discards the trailing newline.
        fields = line.split()
        if fields:
            yield fields
def mapper():
    """Print the 10 records with the largest count, one tab-joined per line.

    Reads every record from ``Parser()``, orders them by the integer in
    field 1 (ascending) and keeps the last 10. Prints nothing when there
    are no records, so no blank line is passed downstream.
    """
    counts = list(Parser())
    z = sorted(counts, key=lambda x: int(x[1]))[-10:]
    if z:
        # Single-argument print() behaves the same on Python 2 and 3.
        print('\n'.join('\t'.join(x) for x in z))
# Run the mapper only when executed as a script, not when imported.
if __name__ == '__main__':
    mapper()
以下是reducer的代码
import sys, operator, itertools
def Parser():
    """Yield the tab-separated fields of each non-blank stdin line as a tuple.

    Blank lines are skipped because they would produce ``('',)`` and break
    code that indexes field 1.
    """
    for line in sys.stdin:
        line = line.strip('\n')
        if line:
            yield tuple(line.split('\t'))
def reducer():
    """Print the 10 records with the largest count, one tab-joined per line.

    Stdin is consumed exactly once. Starting a second ``Parser()`` inside a
    ``groupby`` loop made both generators read the same stream, so the first
    record was silently dropped and the grouping was never used.

    Records are ordered by the integer in field 1 (ascending) and the last
    10 are kept. Prints nothing when there are no records.
    """
    counts = list(Parser())
    z = sorted(counts, key=lambda x: int(x[1]))[-10:]
    if z:
        # Single-argument print() behaves the same on Python 2 and 3.
        print('\n'.join('\t'.join(x) for x in z))
# Run the reducer only when executed as a script, not when imported.
if __name__ == '__main__':
    reducer()
我把它改成了输出前 10 个单词。请注意,这是一个单词计数示例,我解析的是一个文本文档。希望这多少能有所帮助!