I am trying to build a recommendation engine in PySpark, using the collaborative filtering approach from Spark's ALS library. I have already trained the model with the best set of parameters and saved it; now I want to load the model and make predictions. Running the code below gives me an error. Can someone tell me what is wrong with it and how to fix it?
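For context, the model was trained and saved beforehand along these lines (the training RDD name and the rank/iterations/lambda_ values are placeholders here, not my actual tuned parameters):

from pyspark.mllib.recommendation import ALS

# training_RDD holds (userID, jobID, clicks) tuples built from job_clicks.csv
# rank, iterations and lambda_ are placeholders, not the tuned values
model = ALS.train(training_RDD, rank=8, iterations=10, lambda_=0.1)
model.save(sc, os.path.join('.', 'models'))

The loading and prediction code is below: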
"""
This code loads saved model and makes predictions
"""
import os
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.mllib.recommendation import MatrixFactorizationModel
from pyspark.sql import functions as F
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class RecommendationEngine:
    """
    A job recommendation engine
    """

    def load_model(self, model_path):
        """
        Load the saved ALS model with the current dataset
        """
        logger.info("Loading the ALS model with best set of parameters...")
        self.model = MatrixFactorizationModel.load(sc, model_path)
        logger.info("ALS model loaded!")
        # return model
    def get_counts_and_averages(self, ID_and_clicks_tuple):
        """
        Turns (jobID, <iterable of clicks>) into (jobID, (click_count, average_clicks))
        """
        nclicks = len(ID_and_clicks_tuple[1])
        return ID_and_clicks_tuple[0], (nclicks, float(sum(x for x in ID_and_clicks_tuple[1])) / nclicks)
    def count_and_average_ratings(self):
        """
        Updates the job clicks counts from the current data self.clicks_RDD
        """
        logger.info("Counting job clicks...")
        job_ID_with_clicks_RDD = self.clicks_RDD.map(lambda x: (x[1], x[2])).groupByKey()
        job_ID_with_avg_clicks_RDD = job_ID_with_clicks_RDD.map(self.get_counts_and_averages)
        self.job_click_counts_RDD = job_ID_with_avg_clicks_RDD.map(lambda x: (x[0], x[1][0]))
        # return job_click_counts_RDD
    def predict_clicks(self, user_unclicked_jobs_RDD):
        """
        Gets predictions for a given (userID, jobID) formatted RDD
        Returns: an RDD with format (jobTitle, predictedClick, numClicks)
        """
        # Pre-calculate job click counts
        self.count_and_average_ratings()
        predicted_RDD = self.model.predictAll(user_unclicked_jobs_RDD)
        predicted_clicks_RDD = predicted_RDD.map(lambda x: (x.product, x.rating))
        predicted_clicks_title_and_count_RDD = predicted_clicks_RDD.join(self.jobs_titles_RDD).join(self.job_click_counts_RDD)
        predicted_clicks_title_and_count_RDD = predicted_clicks_title_and_count_RDD.map(lambda r: (r[1][0][1], r[1][0][0], r[1][1]))
        return predicted_clicks_title_and_count_RDD
    def get_top_ratings(self, user_id):
        """
        Recommends the top unclicked jobs to user_id
        """
        # Get pairs of (userID, jobID) for the jobs user_id has not clicked yet
        hasattr(self.clicks_RDD, "toDF")
        all_user_clicks_DF = self.clicks_RDD.toDF()
        user_clicks_DF = all_user_clicks_DF.filter(F.col('_1') == user_id)
        # user_clicks_RDD = self.clicks_RDD.filter( lambda tokens: (int(tokens[0]) == user_id, int(tokens[1]), int(float(tokens[2]))))
        user_clicks_RDD = user_clicks_DF.rdd
        user_clicks_RDD = user_clicks_RDD.map(lambda tokens: (int(tokens[0]), int(tokens[1]), int(float(tokens[2])))).cache()
        user_clicks_ids = user_clicks_RDD.map(lambda x: x[1]).collect()
        user_unclicked_jobs_RDD = (self.jobs_RDD.filter(lambda x: x[0] not in user_clicks_ids).map(lambda x: (user_id, x[0])))
        # Get predicted ratings
        top_jobs = self.predict_clicks(user_unclicked_jobs_RDD)
        top_jobs = top_jobs.filter(lambda r: r[2] >= 25).takeOrdered(25, key=lambda x: -x[1])
        return top_jobs
"""
def get_ratings_for_movie_ids(self, user_id, movie_ids):
Given a user_id and a list of movie_ids, predict ratings for them
requested_movies_RDD = self.sc.parallelize(movie_ids).map(lambda x: (user_id, x))
# Get predicted ratings
ratings = self.predict_clicks(requested_movies_RDD).collect()
return ratings
"""
    def __init__(self, sc, dataset_path, model_path):
        """
        Init the recommendation engine given a Spark context and a dataset path
        """
        logger.info("Starting up the Recommendation Engine: ")
        self.sc = sc
        # Load clicks data for later use
        logger.info("Loading Clicks data...")
        clicks_file_path = os.path.join(dataset_path, 'job_clicks.csv')
        clicks_raw_RDD = self.sc.textFile(clicks_file_path)
        clicks_raw_data_header = clicks_raw_RDD.take(1)[0]
        self.clicks_RDD = clicks_raw_RDD.filter(lambda line: line != clicks_raw_data_header) \
            .map(lambda line: line.split(",")) \
            .map(lambda tokens: (int(tokens[0]), int(tokens[1]), float(tokens[2]))).cache()
        # Load jobs data for later use
        logger.info("Loading Jobs data...")
        jobs_file_path = os.path.join(dataset_path, 'jobs.csv')
        jobs_raw_RDD = self.sc.textFile(jobs_file_path)
        jobs_raw_data_header = jobs_raw_RDD.take(1)[0]
        self.jobs_RDD = jobs_raw_RDD.filter(lambda line: line != jobs_raw_data_header) \
            .map(lambda line: line.split(",")) \
            .map(lambda tokens: (int(tokens[0]), tokens[1])).cache()
        # Get job titles
        self.jobs_titles_RDD = self.jobs_RDD.map(lambda x: (int(x[0]), x[1])).cache()
        self.load_model(model_path)
if __name__ == "__main__":
spark = SparkSession.builder \
.master("local[*]") \
.appName("Job Recommender ALS") \
.config("spark.executor.memory", "1gb") \
.getOrCreate()
sc = spark.sparkContext
dataset_path = './datasets/'
model_path = os.path.join('.', 'models')
recommendation_engine = RecommendationEngine(sc, dataset_path, model_path)
top_ratings = recommendation_engine.get_top_ratings(1)
print(top_ratings)
# return top_ratings
# ratings = recommendation_engine.get_ratings_for_movie_ids(1, [movie_id])
This is the traceback I get:

Traceback (most recent call last):
File "C:/Users/ywwl/Desktop/Blogging/recommendation system/recommender.py", line 130, in <module>
top_ratings = recommendation_engine.get_top_ratings(1)
File "C:/Users/ywwl/Desktop/Blogging/recommendation system/recommender.py", line 75, in get_top_ratings
top_jobs = self.predict_clicks(user_unclicked_jobs_RDD)
File "C:/Users/ywwl/Desktop/Blogging/recommendation system/recommender.py", line 54, in predict_clicks
predicted_clicks_title_and_count_RDD = predicted_clicks_title_and_count_RDD.join(self.job_click_counts_RDD)
File "C:\Users\ywwl\anaconda3\envs\RecSys\lib\site-packages\pyspark\rdd.py", line 1680, in join
return python_join(self, other, numPartitions)
File "C:\Users\ywwl\anaconda3\envs\RecSys\lib\site-packages\pyspark\join.py", line 53, in python_join
return _do_python_join(rdd, other, numPartitions, dispatch)
File "C:\Users\ywwl\anaconda3\envs\RecSys\lib\site-packages\pyspark\join.py", line 41, in _do_python_join
return vs.union(ws).groupByKey(numPartitions).flatMapValues(lambda x: dispatch(x.__iter__()))
File "C:\Users\ywwl\anaconda3\envs\RecSys\lib\site-packages\pyspark\rdd.py", line 560, in union
rdd = RDD(self._jrdd.union(other._jrdd), self.ctx,
File "C:\Users\ywwl\anaconda3\envs\RecSys\lib\site-packages\pyspark\rdd.py", line 2532, in _jrdd
self._jrdd_deserializer, profiler)
File "C:\Users\ywwl\anaconda3\envs\RecSys\lib\site-packages\pyspark\rdd.py", line 2434, in _wrap_function
pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
File "C:\Users\ywwl\anaconda3\envs\RecSys\lib\site-packages\pyspark\rdd.py", line 2420, in _prepare_for_python_RDD
pickled_command = ser.dumps(command)
File "C:\Users\ywwl\anaconda3\envs\RecSys\lib\site-packages\pyspark\serializers.py", line 607, in dumps
raise pickle.PicklingError(msg)
_pickle.PicklingError: Could not serialize object: Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.