你可能知道在python中使用MapReduce的人

时间:2017-10-19 09:11:11

标签: python hadoop mapreduce

我试图在python中使用MapReduce构建你可能知道的人,这样它应该显示最多没有的人(前5名)的结果。建议。 数据集的格式为

#!/usr/bin/env python2

import itertools
import sys

for line in sys.stdin:
    line = line.strip()
    line = line.split("\t")
    key = int(line[0])
    if len(line)>1:
        friends = line[1]
        if friends!='':
            friends = line[1].split(",")
            friends = sorted(map(int,friends))
            for friend in friends:
                pair = tuple(sorted([key,friend]))
                pair = ','.join(map(str,pair))
                print pair,"\t",1
            for pair in itertools.combinations(friends,2):
                pair = ','.join(map(str,pair))
                print pair,"\t",0

我使用的代码是。 说明:http://andresromero.github.io/People-you-may-know/

映射

#!/usr/bin/env python2
import itertools
import sys

flist={}
def checking(k,size):
    if len(flist)==0 | len(flist)<5:
        return 0
    elif len(flist) > 5:
        for i in flist.keys():
            if len(flist[i]) < size:
                return i;
    else:
        return -1
def addPair(users,k1,k2,friendFlag):
    if friendFlag==1:
        friendFlag = True
    else:
        friendFlag = False

    if k1 not in users:
        users[k1] = {}
        users[k1][k2] = [1,False]
    else:
        if k2 in users[k1]:
            users[k1][k2][0] += 1            
        else:
            users[k1][k2] = [1,False]
    if friendFlag==True:
        users[k1][k2][0] -= 1
        users[k1][k2][1] = True

users = {}

for line in sys.stdin:
    line = line.strip()
    line = line.split("\t")
    key = tuple(map(int,line[0].strip().split(",")))
    friendFlag = int(line[1])
    k1,k2 = key
    addPair(users,k1,k2,friendFlag)
    addPair(users,k2,k1,friendFlag)
for k1 in users.keys():
    recommendations = []
    for k2 in users[k1].keys():
        n,flag = users[k1][k2]
        if flag==False:
            recommendations.append((k2,n))
    recommendations = sorted(recommendations,key=lambda x: x[0])
    recommendations = sorted(recommendations,key=lambda x: x[1],reverse=True)
    if len(recommendations)>0:
        recommendations = list(map(str,zip(*recommendations)[0]))
        flag=checking(k1,len(recommendations))
        if flag==0:
            flist[k1]=[]
            flist[k1]=recommendations
            temp3 = sorted(flist.items(), key = lambda item : len(item[1]))
            flist = dict(temp3)
        elif flag!= -1:
            flist[k1]=flist.pop(flag)
            flist.update({k1,recommendations})
            temp3 = sorted(flist.items(), key = lambda item : len(item[1]))
            flist = dict(temp3)
        #print k1,"\t",','.join(recommendations[:-1])
for i in flist.keys():
    print i,"has", len(flist[i])," recommendations"

减速

0 has 830  recommendations  
1 has 1802  recommendations 
2 has 499  recommendations  
3 has 309  recommendations  
4 has 245  recommendations  

但我得到的结果是

=ARRAYFORMULA(if(isblank(B2:B),"",B2:B))

但推荐最多的人是8685,有4600条建议

0 个答案:

没有答案