package com.summer.movie;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;

public class MovieRecommendation {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").appName("movie").getOrCreate();

        // Load ratings (userId, movieId, rating, timestamp) and drop the unused column.
        Dataset<Row> movies = spark.read().option("inferSchema", true).option("header", true)
                .csv("/Users/rushikesh/Desktop/spark-aadhar/movie/ml-20m/ratings.csv")
                .drop(functions.col("timestamp"));
        movies.persist();

        // Number of ratings per movie, joined back onto each rating row.
        Dataset<Row> raters = movies.groupBy(functions.col("movieId")).agg(functions.count("rating").as("count"));
        Dataset<Row> data = movies.join(raters, "movieId");
        data.persist();
        movies.unpersist();

        // Rename for the self-join. withColumnRenamed replaces the positional toDF(...)
        // because the USING join above moves movieId to the first column, so
        // toDF("userId", "movieId2", "rating2", "count2") was mislabeling the columns.
        Dataset<Row> dupdata = data.withColumnRenamed("movieId", "movieId2")
                .withColumnRenamed("rating", "rating2")
                .withColumnRenamed("count", "count2");

        // Self-join on userId: one row per pair of movies rated by the same user;
        // keep each unordered pair only once.
        Dataset<Row> selfjoined = data.join(dupdata, "userId");
        Dataset<Row> filterdata = selfjoined.filter(functions.col("movieId").lt(functions.col("movieId2")));
        data.unpersist();

        // Per-pair terms needed for a cosine/correlation-style similarity.
        Dataset<Row> caldata = filterdata
                .withColumn("r1*r2", functions.col("rating").multiply(functions.col("rating2")))
                .withColumn("sqrating1", functions.col("rating").multiply(functions.col("rating")))
                .withColumn("sqrating2", functions.col("rating2").multiply(functions.col("rating2")));

        caldata.groupBy("movieId", "movieId2")
                .agg(functions.sum("r1*r2"), functions.sum("rating"), functions.sum("rating2"),
                        functions.sum("sqrating1"), functions.sum("sqrating2"))
                .show(100, false);
    }
}
I am trying to build a movie recommendation system on the MovieLens 20M (ml-20m) ratings dataset, which has about 20 million rows, but the job takes almost half an hour to finish and the shuffle write exceeds 30 GB. What am I doing wrong here? Is the code itself inefficient, or is this just a consequence of running on a single-node cluster? My understanding was that the DataFrame API manages execution on its own and picks the most optimized plan.
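For reference, here is a minimal sketch of the only mitigation I have been able to reason about so far, assuming the cost comes from the userId self-join exploding into movie pairs: prune movies with few raters before the join, and lower spark.sql.shuffle.partitions from its default of 200, which seems high for one local machine. The threshold of 50 raters, the 16 partitions, and the popular/popular2/pairs names are arbitrary choices of mine, not tuned or tested values:

// Fewer shuffle partitions than the default 200; local[*] has limited parallelism anyway.
spark.conf().set("spark.sql.shuffle.partitions", "16");

// Hypothetical pruning step: drop movies rated by fewer than 50 users before the
// self-join, so the per-user cross product yields far fewer (movieId, movieId2) pairs.
Dataset<Row> popular = data.filter(functions.col("count").geq(50));
Dataset<Row> popular2 = popular.withColumnRenamed("movieId", "movieId2")
        .withColumnRenamed("rating", "rating2")
        .withColumnRenamed("count", "count2");
Dataset<Row> pairs = popular.join(popular2, "userId")
        .filter(functions.col("movieId").lt(functions.col("movieId2")));

I have not verified whether this meaningfully reduces the shuffle, so I would also like to know whether the pairwise self-join approach itself is simply too expensive at this scale.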