I am trying to build a recommendation engine in PySpark, using the collaborative filtering approach from Spark's ALS library. I have already trained the model with the best set of parameters and saved it; now I want to load the model and make predictions. Running the code below gives me an error. Can someone tell me what is wrong with it and how to fix it?
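For context, the model was trained and saved beforehand along these lines (the training RDD name and the rank/iterations/lambda_ values are placeholders here, not my actual tuned parameters):

from pyspark.mllib.recommendation import ALS

# training_RDD holds (userID, jobID, clicks) tuples built from job_clicks.csv
# rank, iterations and lambda_ are placeholders, not the tuned values
model = ALS.train(training_RDD, rank=8, iterations=10, lambda_=0.1)
model.save(sc, os.path.join('.', 'models'))

The loading and prediction code is below: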
"""
This code loads saved model and makes predictions
"""
import os
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.mllib.recommendation import MatrixFactorizationModel
from pyspark.sql import functions as F
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class RecommendationEngine:
    """
    A job recommendation engine
    """

    def load_model(self, model_path):
        """
        Load the saved ALS model with the current dataset
        """
        logger.info("Loading the ALS model with best set of parameters...")
        self.model = MatrixFactorizationModel.load(sc, model_path)
        logger.info("ALS model loaded!")
        # return model
    def get_counts_and_averages(self, ID_and_clicks_tuple):
        """
        Turns (jobID, <iterable of clicks>) into (jobID, (click_count, average_clicks))
        """
        nclicks = len(ID_and_clicks_tuple[1])
        return ID_and_clicks_tuple[0], (nclicks, float(sum(x for x in ID_and_clicks_tuple[1])) / nclicks)
    def count_and_average_ratings(self):
        """
        Updates the job clicks counts from the current data self.clicks_RDD
        """
        logger.info("Counting job clicks...")
        job_ID_with_clicks_RDD = self.clicks_RDD.map(lambda x: (x[1], x[2])).groupByKey()
        job_ID_with_avg_clicks_RDD = job_ID_with_clicks_RDD.map(self.get_counts_and_averages)
        self.job_click_counts_RDD = job_ID_with_avg_clicks_RDD.map(lambda x: (x[0], x[1][0]))
        # return job_click_counts_RDD
    def predict_clicks(self, user_unclicked_jobs_RDD):
        """
        Gets predictions for a given (userID, jobID) formatted RDD
        Returns: an RDD with format (jobTitle, predictedClick, numClicks)
        """
        # Pre-calculate job click counts
        self.count_and_average_ratings()
        predicted_RDD = self.model.predictAll(user_unclicked_jobs_RDD)
        predicted_clicks_RDD = predicted_RDD.map(lambda x: (x.product, x.rating))
        predicted_clicks_title_and_count_RDD = predicted_clicks_RDD.join(self.jobs_titles_RDD).join(self.job_click_counts_RDD)
        predicted_clicks_title_and_count_RDD = predicted_clicks_title_and_count_RDD.map(lambda r: (r[1][0][1], r[1][0][0], r[1][1]))
        return predicted_clicks_title_and_count_RDD
    def get_top_ratings(self, user_id):
        """
        Recommends the top unclicked jobs to user_id
        """
        # Get pairs of (userID, jobID) for the jobs user_id has not clicked yet
        hasattr(self.clicks_RDD, "toDF")
        all_user_clicks_DF = self.clicks_RDD.toDF()
        user_clicks_DF = all_user_clicks_DF.filter(F.col('_1') == user_id)
        # user_clicks_RDD = self.clicks_RDD.filter( lambda tokens: (int(tokens[0]) == user_id, int(tokens[1]), int(float(tokens[2]))))
        user_clicks_RDD = user_clicks_DF.rdd
        user_clicks_RDD = user_clicks_RDD.map(lambda tokens: (int(tokens[0]), int(tokens[1]), int(float(tokens[2])))).cache()
        user_clicks_ids = user_clicks_RDD.map(lambda x: x[1]).collect()
        user_unclicked_jobs_RDD = (self.jobs_RDD.filter(lambda x: x[0] not in user_clicks_ids).map(lambda x: (user_id, x[0])))
        # Get predicted ratings
        top_jobs = self.predict_clicks(user_unclicked_jobs_RDD)
        top_jobs = top_jobs.filter(lambda r: r[2] >= 25).takeOrdered(25, key=lambda x: -x[1])
        return top_jobs
"""
def get_ratings_for_movie_ids(self, user_id, movie_ids):
Given a user_id and a list of movie_ids, predict ratings for them
requested_movies_RDD = self.sc.parallelize(movie_ids).map(lambda x: (user_id, x))
# Get predicted ratings
ratings = self.predict_clicks(requested_movies_RDD).collect()
return ratings
"""
    def __init__(self, sc, dataset_path, model_path):
        """
        Init the recommendation engine given a Spark context and a dataset path
        """
        logger.info("Starting up the Recommendation Engine: ")
        self.sc = sc
        # Load clicks data for later use
        logger.info("Loading Clicks data...")
        clicks_file_path = os.path.join(dataset_path, 'job_clicks.csv')
        clicks_raw_RDD = self.sc.textFile(clicks_file_path)
        clicks_raw_data_header = clicks_raw_RDD.take(1)[0]
        self.clicks_RDD = clicks_raw_RDD.filter(lambda line: line != clicks_raw_data_header) \
            .map(lambda line: line.split(",")) \
            .map(lambda tokens: (int(tokens[0]), int(tokens[1]), float(tokens[2]))).cache()
        # Load jobs data for later use
        logger.info("Loading Jobs data...")
        jobs_file_path = os.path.join(dataset_path, 'jobs.csv')
        jobs_raw_RDD = self.sc.textFile(jobs_file_path)
        jobs_raw_data_header = jobs_raw_RDD.take(1)[0]
        self.jobs_RDD = jobs_raw_RDD.filter(lambda line: line != jobs_raw_data_header) \
            .map(lambda line: line.split(",")) \
            .map(lambda tokens: (int(tokens[0]), tokens[1])).cache()
        # Get job titles
        self.jobs_titles_RDD = self.jobs_RDD.map(lambda x: (int(x[0]), x[1])).cache()
        self.load_model(model_path)
if __name__ == "__main__":
spark = SparkSession.builder \
.master("local[*]") \
.appName("Job Recommender ALS") \
.config("spark.executor.memory", "1gb") \
.getOrCreate()
sc = spark.sparkContext
dataset_path = './datasets/'
model_path = os.path.join('.', 'models')
recommendation_engine = RecommendationEngine(sc, dataset_path, model_path)
top_ratings = recommendation_engine.get_top_ratings(1)
print(top_ratings)
# return top_ratings
# ratings = recommendation_engine.get_ratings_for_movie_ids(1, [movie_id])
This is the traceback I get:

Traceback (most recent call last):
File "C:/Users/ywwl/Desktop/Blogging/recommendation system/recommender.py", line 130, in <module>
top_ratings = recommendation_engine.get_top_ratings(1)
File "C:/Users/ywwl/Desktop/Blogging/recommendation system/recommender.py", line 75, in get_top_ratings
top_jobs = self.predict_clicks(user_unclicked_jobs_RDD)
File "C:/Users/ywwl/Desktop/Blogging/recommendation system/recommender.py", line 54, in predict_clicks
predicted_clicks_title_and_count_RDD = predicted_clicks_title_and_count_RDD.join(self.job_click_counts_RDD)
File "C:\Users\ywwl\anaconda3\envs\RecSys\lib\site-packages\pyspark\rdd.py", line 1680, in join
return python_join(self, other, numPartitions)
File "C:\Users\ywwl\anaconda3\envs\RecSys\lib\site-packages\pyspark\join.py", line 53, in python_join
return _do_python_join(rdd, other, numPartitions, dispatch)
File "C:\Users\ywwl\anaconda3\envs\RecSys\lib\site-packages\pyspark\join.py", line 41, in _do_python_join
return vs.union(ws).groupByKey(numPartitions).flatMapValues(lambda x: dispatch(x.__iter__()))
File "C:\Users\ywwl\anaconda3\envs\RecSys\lib\site-packages\pyspark\rdd.py", line 560, in union
rdd = RDD(self._jrdd.union(other._jrdd), self.ctx,
File "C:\Users\ywwl\anaconda3\envs\RecSys\lib\site-packages\pyspark\rdd.py", line 2532, in _jrdd
self._jrdd_deserializer, profiler)
File "C:\Users\ywwl\anaconda3\envs\RecSys\lib\site-packages\pyspark\rdd.py", line 2434, in _wrap_function
pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
File "C:\Users\ywwl\anaconda3\envs\RecSys\lib\site-packages\pyspark\rdd.py", line 2420, in _prepare_for_python_RDD
pickled_command = ser.dumps(command)
File "C:\Users\ywwl\anaconda3\envs\RecSys\lib\site-packages\pyspark\serializers.py", line 607, in dumps
raise pickle.PicklingError(msg)
_pickle.PicklingError: Could not serialize object: Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.