PySpark 中的 Recall@k(前 k 项召回率)

时间:2018-10-30 14:06:51

标签: pyspark apache-spark-sql recommendation-engine precision-recall

有人用 PySpark 实现过 Recall@k,用来评估基于 PySpark 构建的推荐系统吗?我已经实现了 Precision@k,具体做法参考了这篇文章(使用内置的 RankingMetrics 类):Build a recommender system with Spark: Implicit ALS

class RankingEvaluator(Evaluator):
    """Evaluator that scores ranked recommendations with precision at k.

    The DataFrame handed to ``_evaluate`` must expose the columns
    ``user_id``, ``article_id``, ``prediction`` and ``article_count``.
    For every user the top-k articles by ``prediction`` are compared with
    the top-k articles by ``article_count`` using
    ``RankingMetrics.precisionAt``.

    Params:
        k: cut-off for both rankings ("Top K"); defaults to 20.
    """

    @keyword_only
    def __init__(self, k=None):
        """Create the evaluator, declaring param ``k`` with default 20."""
        super(RankingEvaluator, self).__init__()
        self.k = Param(self, 'k', 'Top K')
        self._setDefault(k=20)
        # keyword_only stores the explicitly passed keyword arguments here.
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, k=None):
        """Set the params passed by keyword and return ``self``."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def isLargerBetter(self):
        """Return True: a larger metric value is treated as better."""
        return True

    def setK(self, value):
        """Set the cut-off ``k`` and return ``self`` for chaining."""
        # Go through _set (as setParams does) instead of writing to
        # _paramMap directly, so both setters share one code path.
        return self._set(k=value)

    def getK(self):
        """Return the configured ``k``, or its default when unset."""
        return self.getOrDefault(self.k)

    @staticmethod
    def _topKItems(df, orderColumn, k, outputColumn):
        """Collect, per user, the articles ranked in the top k by ``orderColumn``.

        Returns a DataFrame with ``user_id`` and ``outputColumn`` (a list of
        ``article_id`` values).
        """
        windowSpec = Window.partitionBy('user_id').orderBy(col(orderColumn).desc())
        # NOTE(review): rank() gives tied rows the same rank, so a user may
        # contribute more than k articles here; the order of the collected
        # list is also not explicitly enforced -- confirm this is acceptable.
        return df \
            .select('user_id', 'article_id', orderColumn,
                    F.rank().over(windowSpec).alias('rank')) \
            .where(col('rank') <= k) \
            .groupBy('user_id') \
            .agg(F.collect_list('article_id').alias(outputColumn))

    def _evaluate(self, predictedDF):
        """Return precision at k for ``predictedDF``.

        Returns 0.0 when no user has both predicted and actual items.
        """
        k = self.getK()

        perUserPredictedItemsDF = self._topKItems(
            predictedDF, 'prediction', k, 'predicted_items')
        perUserActualItemsDF = self._topKItems(
            predictedDF, 'article_count', k, 'actual_items')

        # Distinct column names let the pairs be read by name rather than by
        # position in the joined row.
        perUserItemsRDD = perUserPredictedItemsDF \
            .join(F.broadcast(perUserActualItemsDF), 'user_id', 'inner') \
            .rdd \
            .map(lambda row: (row['predicted_items'], row['actual_items']))

        if perUserItemsRDD.isEmpty():
            return 0.0

        rankingMetrics = RankingMetrics(perUserItemsRDD)
        return rankingMetrics.precisionAt(k)

但是,我找不到任何可以帮助我解决问题的评估类。

谢谢。

0 个答案:

没有答案