When submitting the following code to the cluster with this configuration:

We are using a two-node cluster:
node1: 24 GB RAM, 16 cores, 1 TB disk
node2: 24 GB RAM, 4 cores, 1 TB disk

The data size is 2.4 GB.
spark-submit --master yarn --deploy-mode client --executor-cores 3 --num-executors 2 --executor-memory 14g --class trial_1 ./Item_Based.jar datas_3.txt
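For reference, the same resources can also be set programmatically on the SparkConf instead of on the command line; a minimal sketch using the standard Spark configuration keys, with the values copied from the flags above:

// Sketch: programmatic equivalent of the spark-submit flags above
val conf = new SparkConf()
  .setAppName("New_Profile_2")
  .set("spark.executor.instances", "2") // --num-executors 2
  .set("spark.executor.cores", "3")     // --executor-cores 3
  .set("spark.executor.memory", "14g")  // --executor-memory 14g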
This application is divided into 5 job IDs.
Job IDs 0-3 complete successfully.
At job ID 4, stage ID 12 corresponds to the following line:

val profileRatings = profilePairs.aggregateByKey(initialSet)( addop, merge )

On this line the Spark job stalls and makes no progress.
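For context, aggregateByKey with the (0, 0) zero value and the two functions shown in the code simply sums both components of each key's (Int, Int) values. A minimal local sketch on hypothetical toy data (not the real input):

// Toy illustration: (key, (a, b)) pairs reduced to per-key component-wise sums
val toy = sc.parallelize(Seq((1, (2, 1)), (1, (3, 1)), (2, (5, 1))))
val sums = toy.aggregateByKey((0, 0))(
  (acc, v) => (acc._1 + v._1, acc._2 + v._2), // within a partition
  (a, b) => (a._1 + b._1, a._2 + b._2)        // across partitions
)
// sums.collect() => Array((1,(5,2)), (2,(5,1))), order may vary

The complete main method: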
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

object trial_1 {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.ERROR)

    val conf = new SparkConf()
    conf.setAppName("New_Profile_2")
    val sc = new SparkContext(conf)

    // Each input line is "profileId itemId"; map to (profileId, (itemId, 1))
    val data = sc.textFile(args(0))
    val ratings = data.map(x => x.split(" ")).map(x => (x(0).toInt, (x(1).toInt, 1)))
    ratings.take(10).foreach(println)

    // Self-join on profileId to pair up every two items seen by the same profile
    val joinedRatings = ratings.join(ratings)
    println("*******************Joined Ratings*****************************")
    joinedRatings.take(10).foreach(println)

    println("**********************Unique Ids*************************")
    // filterDuplicates and makepairs are defined elsewhere (not shown here)
    val uniqueJoinedRatings = joinedRatings.filter(filterDuplicates)
    uniqueJoinedRatings.take(10).foreach(println)

    println("******************mapped by Profiles***********")
    val profilePairs = uniqueJoinedRatings.map(makepairs) //.partitionBy(new HashPartitioner(38))
    profilePairs.take(10).foreach(println)

    println("***********printing group by keys***************")
    // Sum both components of the (Int, Int) values per key
    val initialSet = (0, 0)
    val addop = (x: (Int, Int), y: (Int, Int)) => (x._1 + y._1, x._2 + y._2)
    val merge = (p1: (Int, Int), p2: (Int, Int)) => (p1._1 + p2._1, p1._2 + p2._2)
    val profileRatings = profilePairs.aggregateByKey(initialSet)(addop, merge)
    profileRatings.take(10).foreach(println)
  }
}
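filterDuplicates and makepairs are not shown above. For readers, here is a plausible sketch of what they might look like in this item-based setup; this is an assumption, not the actual code:

// Hypothetical sketches of the two helpers (assumptions, not the original code)
// ratings.join(ratings) yields rows of type (profile, ((item1, c1), (item2, c2)))
type JoinedRow = (Int, ((Int, Int), (Int, Int)))

// Keep each item pair once per profile: drop self-pairs and mirrored pairs
def filterDuplicates(row: JoinedRow): Boolean = {
  val ((item1, _), (item2, _)) = row._2
  item1 < item2
}

// Re-key by the item pair so aggregateByKey can sum the counts per pair
def makepairs(row: JoinedRow): ((Int, Int), (Int, Int)) = {
  val ((item1, c1), (item2, c2)) = row._2
  ((item1, item2), (c1, c2))
}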