python:按年龄段细分平均好友数

时间:2019-01-17 14:19:40

标签: python pyspark

我使用python串行编写了一个代码,并在spark数据帧中编写了同一代码的并行版本。我的并行实现看起来不错,但是将其转换为串行实现有点挑战,尤其是代码效率。目的是评估并行实施的性能提升。下面是示例数据,该数据具有4个属性,即用户ID,姓名,年龄和与他链接的朋友的姓名。

enter image description here

下面是我的串行和并行实现:

 parallel implementation
         sc = SparkContext(master='local[4]')
         from pyspark.sql import SQLContext
         sqlContext = SQLContext(sc)
    data=sc.textFile('C:/Users/abanfo/Desktop/Assignment_parralel/testing_data.csv')
    def parse(line):
        fields = line.split(",")
        name = (fields[1])
        friends = (fields[3])
        friends = 1
        age = int(fields[2])
        name_age = (name,age) # the name and age combination uniquely identifies individuals
        return (name_age,friends)
    name_age_friend =data.map(parse)
    # name_age_friend is an RDD with name and age as a key and friends name is replaced with 1, 
    #meanining the person has one friend.
    print(name_age_friend.take(30))
# the number of friends for a user are added and the name is removed, it is not important for further calculation
first_RDD = name_age_friend.reduceByKey(lambda x,y : x + y).map(lambda row: (row[0][0],(row[0][1],row[1]))).map(lambda x : x[1])
print(first_RDD.take(30))
# the value is replaced with the number of friends and a number one ,
#means he is a single person that wil help latter to know the size of age group 
Second_RDD = first_RDD.mapValues(lambda x : (x,1))
print(Second_RDD.take(30))
# breaking age range into age group
def age(line):
    ageRange = int(line[0])
    number_friends =line[1] 
    if ageRange in range(16,20):
        ageRange ='teens'        
    elif ageRange in range(20,40):
        ageRange = 'Adult'
    elif ageRange in range(40,50):
        ageRange = 'MiddleAge'
    else:
        ageRange = 'old'        
    return (ageRange,number_friends)
third_RDD = Second_RDD.map(age)
print(third_RDD.take(30))
## collected user an the same age range
fourth_RDD = third_RDD.reduceByKey(lambda x,y : (x[0]+y[0],x[1]+y[1]))
print(fourth_RDD.take(30))
age_group_average_friends = fourth_RDD.mapValues(lambda x : int(x[0]/x[1]))
print(age_group_average_friends.take(5))

并且串行实现是

#loc the attribute or features of interest
friends = df.iloc[:,3]
ages = df.iloc[:,2]

# default of dictionary with age as key and value as a list of friends 
dictionary_age_friends = defaultdict(list)

# populating the dictionary with key age and values friend
for i,j in zip(ages,friends):
    dictionary_age_friends[i].append(j)
print("first dict")
print(dictionary_age_friends)

#second dictionary, the same age is collected and the number of friends is added 
set_dict ={}
for x in dictionary_age_friends:
    list_friends =[]
    for y in dictionary_age_friends[x]:
        list_friends.append(y)
    set_list_len = len(list_friends) # assign a friend with a number 1
    set_dict[x] = set_list_len
print(set_dict)

# set_dict ={}
# for x in dictionary_age_friends:
#     print("inside the loop")
#     lis_1 =[]
#     for y in dictionary_age_friends[x]:
#         lis_1.append(y)
#         set_list = lis_1
#         set_list = [1 for x in set_list] # assign a friend with a number 1
#         set_dict[x] = sum(set_list)

# a dictionary that assign the age range into age-groups
second_dict = defaultdict(list)
for i,j in set_dict.items(): 
    if i in range(16,20):           
        i = 'teens_youthAdult'
        second_dict[i].append(j)
    elif i in range(20,40):       
        i ="Adult"
        second_dict[i].append(j)
    elif i in  range(40,60):        
        i ="MiddleAge"
        second_dict[i].append(j)
    elif i in range(60,72):       
        i = "old"
        second_dict[i].append(j)
print(second_dict)
print("final dict stared")
new_dic ={}

for key,value in second_dict.items():
    if key == 'teens_youthAdult':
        new_dic[key] = round((sum(value)/len(value)),2)
    elif key =='Adult':
        new_dic[key] = round((sum(value)/len(value)),2)
    elif key =='MiddleAge' :
        new_dic[key] = round((sum(value)/len(value)),2)
    else:
        new_dic[key] = round((sum(value)/len(value)),2)
new_dic
end_time = datetime.datetime.now()


print(end_time-start_time)


print(new_dic)

我从串行实现中得到的反馈是

  1. 不确定是否执行正确的任务
  2. 未优化
  3. 有很多不必要的词典和代码

iam栈将并行实现转换为串行。任何建议或帮助都将受到高度赞赏。

0 个答案:

没有答案