I have noticed that when I increase the number of slaves in my cluster, performance drops. The machines I use have 2x10 cores and 18 GB of RAM; the first cluster has 5 slaves, the second has 7. On AWS I have 8-core machines with 30 GB of RAM, with 4 slaves in one case and 8 in the other.
I increased the parallelism accordingly with the number of cores, but it changed nothing.
Also, I don't really understand the spark.driver.cores property. Should I set it to the maximum number of cores?
Conf 5 slaves
.set("spark.driver.cores","10")
.set("spark.driver.memory","15g")
//.set("spark.akka.threads","16")
.set("spark.executor.memory","3g")
.set("spark.executor.cores","5")
.set("spark.executor.instances","20")
.set("spark.default.parallelism","100")
Conf 7 slaves
.set("spark.driver.cores","10")
.set("spark.driver.memory","15g")
.set("spark.executor.memory","3g")
.set("spark.executor.cores","5")
.set("spark.executor.instances","28")
.set("spark.default.parallelism","140")
Code
import spire.implicits._
import org.apache.spark.mllib.rdd.MLPairRDDFunctions._
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.rdd.RDD._
// wrapper pairing an id with its feature vector
case class Data_Object(private val id: String, private var vector: Vector) extends Serializable {
  def get_id = this.id
  def get_vector = this.vector
}
// barycentre (element-wise mean) of the k neighbour vectors
def bary(tab1: Array[(Vector, Double)], k: Int): Vector = {
  var tab2 = tab1.map(_._1.toArray)
  var bary1 = tab2.reduce(_ + _)   // element-wise sum, via spire's array operators
  bary1 = bary1.map(x => x / k)
  Vectors.dense(bary1)
}
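// quick illustration of bary with made-up values: the barycentre of [1.0,2.0] and [3.0,4.0] with k = 2
// bary(Array((Vectors.dense(1.0, 2.0), 0.0), (Vectors.dense(3.0, 4.0), 0.0)), 2)  // => [2.0,3.0]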
val data = sc.textFile("/ds10.csv")
// one Data_Object per line: the first column is the id, the remaining columns are the coordinates
var parsedData = data.map(x => x.split(',')).map(y => new Data_Object(y(0), Vectors.dense(y.tail.map(_.toDouble))))
var k = 60
var numPart2 = 10
var rdd_temp = parsedData
parsedData.cache
// parsedData.count is 10000
val maxIterForYstar = 10   // number of iterations; not defined in this snippet, value assumed
for (ind <- 1 to maxIterForYstar) {
  // squared distance from every current point to every original point (quadratic in the dataset size)
  var rdd_distance = rdd_temp.cartesian(parsedData).flatMap { case (x, y) =>
    Some((x.get_id, (y.get_vector, -Vectors.sqdist(x.get_vector, y.get_vector))))
  } //.repartition(1000)
  // keep the k nearest neighbours per id (largest values of the negated squared distance)
  var rdd_knn_bykey = rdd_distance.topByKey(k)(Ordering[Double].on(x => x._2)).coalesce(numPart2)
  // replace each point by the barycentre of its k nearest neighbours
  var rdd_knn_bary = rdd_knn_bykey.map(x => (x._1, bary(x._2, k)))
  rdd_temp = rdd_knn_bary.map(x => new Data_Object(x._1, x._2))
}
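// note: the transformations in the loop are lazy; an action after the loop (a count, collect or save)
// is what actually triggers the computation, e.g. (output path is a placeholder):
// rdd_temp.map(x => (x.get_id, x.get_vector)).saveAsTextFile("/output_placeholder")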