有没有一种快速的方法可以在列表中组合单词词典(MapType)?
单词
[[word1-> 2],[word2-> 3] .... [word2-> 4]]
--------------------------------------结果-------- ---------------
单词
[[word1-> 2],[word2-> 7]]
使用udf函数会花费很长时间。
def dictsum(keywords):
    """Merge a list of word-count dicts into a single summed dict.

    Values that share the same key are added together, e.g.
    ``[{"word1": 2}, {"word2": 3}, {"word2": 4}]`` becomes
    ``[{"word1": 2, "word2": 7}]``.

    Args:
        keywords: An iterable of ``{word: count}`` mappings. ``None`` (a
            null column value) is treated as an empty list, and ``None``
            entries inside the list are skipped.

    Returns:
        A one-element list holding the merged dict, so the result still
        fits a UDF declared with an array-of-maps return type.
    """
    sumdict = {}
    # A null array or null elements would otherwise crash the UDF.
    for wordcounts in keywords or ():
        if wordcounts is None:
            continue
        for word, count in wordcounts.items():
            # Add the actual count; the previous version added 1 per
            # occurrence, which counted maps rather than summing values.
            sumdict[word] = sumdict.get(word, 0) + count
    return [sumdict]
# Step 1: per-row word counts. The wordcount UDF is applied to each row's
# "nounwords" column and stored in a new "wordcount" column.
wordcountUdf = udf(
    wordcount,
    ArrayType(MapType(StringType(), IntegerType())),
)
dict_df = noun_df.select("createDate", "nounwords")
dict_df = dict_df.withColumn(
    "wordcount",
    wordcountUdf(dict_df['nounwords']),
)
# Debug: dict_df.show(100, False)

# Step 2: gather every row's word-count array per createDate, flatten the
# collected arrays into one "keywords" array, and tag the rows with their
# statistic type.
keyword_f = (
    dict_df.select("createDate", "wordcount")
    .groupby("createDate")
    .agg(flatten(collect_list("wordcount")).alias("keywords"))
    .withColumn("statistic_type", lit("keyword_f"))
)
# Debug: keyword_f.show(10, False)

# Step 3: run the dictsum UDF over the flattened "keywords" array, keep its
# result as "wordcounts", and drop the intermediate column.
dictsumUdf = udf(
    dictsum,
    ArrayType(MapType(StringType(), IntegerType())),
)
keyword_f = keyword_f.withColumn(
    "wordcounts",
    dictsumUdf(keyword_f['keywords']),
).drop("keywords")
# Debug: keyword_f.show(100, False)