Spark tuning: job taking more time

Date: 2018-06-04 11:49:04

Tags: apache-spark apache-spark-sql

package com.summer.movie;

import org.apache.spark.sql.*;

public class MovieRecommendation {

    public static void main(String[] args) {

        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("movie")
                .getOrCreate();

        // Load the ratings file and drop the unused timestamp column.
        Dataset<Row> movies = spark.read()
                .option("inferSchema", true)
                .option("header", true)
                .csv("/Users/rushikesh/Desktop/spark-aadhar/movie/ml-20m/ratings.csv")
                .drop(functions.col("timestamp"));

        movies.persist();

        // Number of ratings each movie has received.
        Dataset<Row> raters = movies.groupBy(functions.col("movieId"))
                .agg(functions.count("rating").as("count"));

        // A using-column join puts the key first, so the columns of `data`
        // are (movieId, userId, rating, count).
        Dataset<Row> data = movies.join(raters, "movieId");

        data.persist();
        // Note: no action has run yet, so this cache was never materialized.
        movies.unpersist();

        // Rename for the self-join. Since movieId is the first column of
        // `data`, it must map to "movieId2" here, not "userId".
        Dataset<Row> dupdata = data.toDF("movieId2", "userId", "rating2", "count2");

        // Pair up every two movies rated by the same user.
        Dataset<Row> selfjoined = data.join(dupdata, "userId");

        // Keep each unordered movie pair exactly once.
        Dataset<Row> filterdata = selfjoined.filter(
                functions.col("movieId").lt(functions.col("movieId2")));

        // Also unpersisted before any action, so the cache above never takes effect.
        data.unpersist();

        // Products and squares needed for the per-pair similarity sums.
        Dataset<Row> caldata = filterdata
                .withColumn("r1r2", functions.col("rating").multiply(functions.col("rating2")))
                .withColumn("sqrating1", functions.col("rating").multiply(functions.col("rating")))
                .withColumn("sqrating2", functions.col("rating2").multiply(functions.col("rating2")));

        caldata.groupBy("movieId", "movieId2")
                .agg(functions.sum("r1r2"), functions.sum("rating"), functions.sum("rating2"),
                        functions.sum("sqrating1"), functions.sum("sqrating2"))
                .show(100, false);

        spark.stop();
    }
}

I am trying out a movie recommendation system on the 20-million-row imdb movie dataset, but the job takes almost half an hour to complete and the shuffle write exceeds 30 GB. What am I doing wrong here? Is it because the code is incorrect, or because of the single-node cluster? As far as I know, the DataFrame API manages execution itself and chooses the most optimized plan.
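One way to reason about the shuffle volume: with a self-join on userId, every user who rated n movies contributes n*(n-1)/2 rows after the movieId < movieId2 filter, so a few heavy raters can dominate the output. Below is a minimal sketch for estimating that pair count up front, reusing the movies Dataset from the code above (pairEstimate and approxPairs are illustrative names, not from the original post):

    // Each user with n ratings produces n*(n-1)/2 movie pairs once the
    // movieId < movieId2 filter is applied; sum this over all users.
    Dataset<Row> pairEstimate = movies
            .groupBy("userId")
            .agg(functions.count("rating").as("n"))
            .agg(functions.sum(functions.col("n")
                    .multiply(functions.col("n").minus(1)).divide(2)).as("approxPairs"));
    pairEstimate.show();

If this estimate lands in the billions, a 30 GB shuffle write is roughly what the plan implies rather than a planner failure, and shrinking the input before the self-join (for instance using the count column the code computes but never uses) is likely to help more than configuration tuning.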

0 Answers:

There are no answers yet.