我试图在python中使用MapReduce构建你可能知道的人,这样它应该显示最多没有的人(前5名)的结果。建议。 数据集的格式为
#!/usr/bin/env python2
import itertools
import sys
for line in sys.stdin:
line = line.strip()
line = line.split("\t")
key = int(line[0])
if len(line)>1:
friends = line[1]
if friends!='':
friends = line[1].split(",")
friends = sorted(map(int,friends))
for friend in friends:
pair = tuple(sorted([key,friend]))
pair = ','.join(map(str,pair))
print pair,"\t",1
for pair in itertools.combinations(friends,2):
pair = ','.join(map(str,pair))
print pair,"\t",0
我使用的代码是。 说明:http://andresromero.github.io/People-you-may-know/
映射
#!/usr/bin/env python2
import itertools
import sys
flist={}
def checking(k,size):
if len(flist)==0 | len(flist)<5:
return 0
elif len(flist) > 5:
for i in flist.keys():
if len(flist[i]) < size:
return i;
else:
return -1
def addPair(users,k1,k2,friendFlag):
if friendFlag==1:
friendFlag = True
else:
friendFlag = False
if k1 not in users:
users[k1] = {}
users[k1][k2] = [1,False]
else:
if k2 in users[k1]:
users[k1][k2][0] += 1
else:
users[k1][k2] = [1,False]
if friendFlag==True:
users[k1][k2][0] -= 1
users[k1][k2][1] = True
users = {}
for line in sys.stdin:
line = line.strip()
line = line.split("\t")
key = tuple(map(int,line[0].strip().split(",")))
friendFlag = int(line[1])
k1,k2 = key
addPair(users,k1,k2,friendFlag)
addPair(users,k2,k1,friendFlag)
for k1 in users.keys():
recommendations = []
for k2 in users[k1].keys():
n,flag = users[k1][k2]
if flag==False:
recommendations.append((k2,n))
recommendations = sorted(recommendations,key=lambda x: x[0])
recommendations = sorted(recommendations,key=lambda x: x[1],reverse=True)
if len(recommendations)>0:
recommendations = list(map(str,zip(*recommendations)[0]))
flag=checking(k1,len(recommendations))
if flag==0:
flist[k1]=[]
flist[k1]=recommendations
temp3 = sorted(flist.items(), key = lambda item : len(item[1]))
flist = dict(temp3)
elif flag!= -1:
flist[k1]=flist.pop(flag)
flist.update({k1,recommendations})
temp3 = sorted(flist.items(), key = lambda item : len(item[1]))
flist = dict(temp3)
#print k1,"\t",','.join(recommendations[:-1])
for i in flist.keys():
print i,"has", len(flist[i])," recommendations"
减速
0 has 830 recommendations
1 has 1802 recommendations
2 has 499 recommendations
3 has 309 recommendations
4 has 245 recommendations
但我得到的结果是
=ARRAYFORMULA(if(isblank(B2:B),"",B2:B))
但推荐最多的人是8685,有4600条建议