This is the first question I am posting here.
I want to change this code. It currently uses a Java ArrayList, and I would like to use Spark DataFrames or Datasets instead. The goal is to get the distance between records. Right now it processes one record at a time, but each record takes about 6 seconds, and I want to reduce that time.
First, the Dataset[Row] is converted to a List[Row]:
var resultNNRows = df_testc.collectAsList()
var cData = new ClaimOutputBean()
var claimData = new ArrayList[ClaimOutputBean]()
var neighborResult = ""
Then I loop over every record in the list and call the getNeighbor method to get the distances to the 5 nearest records. Each record takes about 6 seconds, and the process has to handle around 1 million records. I want to change this and find out whether there is a Spark function I can use to process multiple records in parallel (I sketch the kind of thing I have in mind right after this loop).
for (i <- 0 until resultNNRows.size()) {
  // Each record takes about 6 seconds; we want to reduce this time,
  // ideally by using a Spark function that processes multiple records in parallel.
  var resultList = getNeighbor(df_trainc, resultNNRows.get(i), k, mlFeatures).collectAsList()
  neighborResult = ""
  for (j <- 0 until resultList.size()) {
    if (neighborResult.equals(""))
      neighborResult = resultList.get(j).getAs("Result").toString()
    else
      neighborResult = neighborResult + " ~ " + resultList.get(j).getAs("Result").toString()
  }
  println(i + " - neighborResult:::" + neighborResult)
  cData = new ClaimOutputBean()
  cData.setNeighborResults(neighborResult)
  cData.setUniqueId(resultNNRows.get(i).getAs("UniqueId"))
  claimData.add(cData)
}
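What I am hoping for is to replace this driver-side loop with plain DataFrame operations. Something along these lines is the kind of thing I have in mind (just a rough, untested sketch: it only covers the first filter level and the Euclidean part of the distance, and assumes the df_trainc / df_testc columns shown in the full code below):

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

// Rough idea (untested): join every test row with its candidate train rows in one pass,
// instead of filtering the train DataFrame once per test row on the driver.
val joined = df_testc.as("t").join(df_trainc.as("n"),
  col("t.payorId") === col("n.payorId") &&
  col("t.therapyType") === col("n.therapyType") &&
  col("t.NDCNumber") === col("n.NDCNumber") &&
  col("t.procedureCode") === col("n.procedureCode"))

// Euclidean part of the distance on the three scaled numeric columns
// (the dot-product similarity on the Level_* vectors is left out of this sketch).
val withDist = joined.withColumn("Distance",
  sqrt(pow(col("t.Charges_SS") - col("n.Charges_SS"), 2) +
       pow(col("t.DaysOrUnits_SS") - col("n.DaysOrUnits_SS"), 2) +
       pow(col("t.Date_SS") - col("n.Date_SS"), 2)))

// Keep the k nearest train rows per test row and concatenate their Result values.
val w = Window.partitionBy(col("t.UniqueId")).orderBy(col("Distance"))
val neighborResults = withDist
  .withColumn("rank", row_number().over(w))
  .filter(col("rank") <= k)
  .groupBy(col("t.UniqueId").as("UniqueId"))
  .agg(concat_ws(" ~ ", collect_list(col("n.Result"))).as("NeighborResults"))

I am not sure whether this is the right direction (for one thing, collect_list does not guarantee that the Result values are concatenated in order of Distance, and this does not handle the fallback filter levels), or whether there is a better Spark way to parallelise this.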
Here is the complete code:
import java.util.{ArrayList, Date}

import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions.col
import org.apache.spark.storage.StorageLevel
// plus imports for our own ClaimOutputBean, MLFeatures and NearestNeighborTransform classes

object NearestNeighborPredict {
  def predictNN(train: DataFrame, test: DataFrame, k: Int, mlFeatures: MLFeatures): DataFrame = {
    val spark = mlFeatures.spark

    var startTime = new Date().getTime()
    var dataResult = new NearestNeighborTransform().transformData(train, test)
    var df_train = dataResult(0)
    var df_test = dataResult(1)
    //println("df_train : " + df_train.count())
    //println("df_test : " + df_test.count())
    var endTime = new Date().getTime()
    println("NN Predict Got Transformed Train Data : " + (endTime - startTime) / 1000 + " seconds")

    var df_trainc = df_train.select(
      "Charges_SS", "DaysOrUnits_SS", "Date_SS",
      "Level_PayorId", "Level_TherapyType", "Level_NDCNumber", "Level_ProcedureCode",
      "Level_PatientId", "Level_ServiceBranchId", "Level_AuthNbr",
      "rejectOutcome", "label", "payorId", "Result", "therapyType", "patientId", "NDCNumber", "procedureCode", "UniqueId"
    ).persist(StorageLevel.MEMORY_AND_DISK_SER)
    df_trainc.createOrReplaceTempView("df_trainc1")

    var df_testc = df_test.select(
      "Charges_SS", "DaysOrUnits_SS", "Date_SS",
      "Level_PayorId", "Level_TherapyType", "Level_NDCNumber", "Level_ProcedureCode",
      "Level_PatientId", "Level_ServiceBranchId", "Level_AuthNbr",
      "rejectOutcome", "label", "payorId", "Result", "therapyType", "patientId", "NDCNumber", "procedureCode", "UniqueId"
    ).persist(StorageLevel.MEMORY_AND_DISK_SER)

    var resultNNRows = df_testc.collectAsList()
    var cData = new ClaimOutputBean()
    var claimData = new ArrayList[ClaimOutputBean]()
    var neighborResult = ""
    for (i <- 0 until resultNNRows.size()) {
      // Each record takes about 6 seconds; we want to reduce this time,
      // ideally by using a Spark function that processes multiple records in parallel.
      var resultList = getNeighbor(df_trainc, resultNNRows.get(i), k, mlFeatures).collectAsList()
      neighborResult = ""
      for (j <- 0 until resultList.size()) {
        if (neighborResult.equals(""))
          neighborResult = resultList.get(j).getAs("Result").toString()
        else
          neighborResult = neighborResult + " ~ " + resultList.get(j).getAs("Result").toString()
      }
      println(i + " - neighborResult:::" + neighborResult)
      cData = new ClaimOutputBean()
      cData.setNeighborResults(neighborResult)
      cData.setUniqueId(resultNNRows.get(i).getAs("UniqueId"))
      claimData.add(cData)
    }

    var resultsNN = mlFeatures.convertClaimOutputBeanToDataFrame(claimData)
    //resultsNN.show(false)
    df_trainc.unpersist()
    df_testc.unpersist()
    resultsNN
  }
  def getNeighbor(train: DataFrame, key: Row, num: Int, mlFeatures: MLFeatures): DataFrame = {
    val spark = mlFeatures.spark

    // Filter the training data down to candidates sharing the same categorical keys as the test row,
    // falling back to coarser levels when nothing matches.
    var trainFiltered = spark.sql("SELECT * FROM df_trainc1")
      .filter(
        col("payorId").equalTo(key.getAs("payorId").toString().toInt).&&
        (col("therapyType").equalTo(key.getAs("therapyType").toString())).&&
        (col("NDCNumber").equalTo(key.getAs("NDCNumber").toString())).&&
        (col("procedureCode").equalTo(key.getAs("procedureCode").toString()))
      )
    if (trainFiltered.count() == 0) {
      println("can't find at payor, therapy, NDCNumber and ProcCode level")
      trainFiltered = spark.sql("SELECT * FROM df_trainc1")
        .filter(
          col("payorId").equalTo(key.getAs("payorId").toString().toInt).&&
          (col("therapyType").equalTo(key.getAs("therapyType").toString())).&&
          (col("NDCNumber").equalTo(key.getAs("NDCNumber").toString()))
        )
    }
    if (trainFiltered.count() == 0) {
      println("can't find at payor, therapy and NDCNumber level")
      trainFiltered = spark.sql("SELECT * FROM df_trainc1")
        .filter(
          col("payorId").equalTo(key.getAs("payorId").toString().toInt).&&
          (col("therapyType").equalTo(key.getAs("therapyType").toString()))
        )
    }

    import spark.implicits._

    // Differences of the scaled numeric features against the key row,
    // plus the one-hot encoded level vectors and pass-through columns.
    var train1 = trainFiltered.map(r => (
      r.getAs[Double]("Charges_SS") - key.getAs("Charges_SS").toString().toDouble, //charges
      r.getAs[Double]("DaysOrUnits_SS") - key.getAs("DaysOrUnits_SS").toString().toDouble, //daysOrUnits
      r.getAs[Double]("Date_SS") - key.getAs("Date_SS").toString().toDouble, //date
      r.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_PayorId"), //level_payorId
      r.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_TherapyType"), //level_therapyType
      r.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_NDCNumber"), //level_NDCNumber
      r.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_ProcedureCode"), //level_procCode
      r.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_PatientId"), //level_patientId
      r.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_ServiceBranchId"), //level_serviceBranchId
      r.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_AuthNbr"), //level_AuthNbr
      r.getAs[String]("rejectOutcome"), //rejectOutcome
      r.getAs[Double]("label"), //label
      r.getAs[String]("Result"), //Result
      r.getAs[Long]("UniqueId") //UniqueId
    ))

    // Squared Euclidean distance on the numeric differences.
    var train2 = train1.map(r => (
      r._1 * r._1 + r._2 * r._2 + r._3 * r._3,
      r._4, //level_payorId
      r._5, //level_therapyType
      r._6, //level_ndcNumber
      r._7, //level_procCode
      r._8, //level_patientId
      r._9, //level_serviceBranchId
      r._10, //level_authNbr
      r._11, //rejectOutcome
      r._12, //label
      r._13, //Result
      r._14 //UniqueId
    ))

    // Euclidean distance plus dot-product similarity of each level vector against the key row.
    var train3 = train2.map(r => (
      scala.math.sqrt(r._1),
      dotpro(r._2, key.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_PayorId")),
      dotpro(r._3, key.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_TherapyType")),
      dotpro(r._4, key.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_NDCNumber")),
      dotpro(r._5, key.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_ProcedureCode")),
      dotpro(r._6, key.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_PatientId")),
      dotpro(r._7, key.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_ServiceBranchId")),
      dotpro(r._8, key.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_AuthNbr")),
      r._9, //rejectOutcome
      r._10, //label
      r._11, //Result
      r._12 //UniqueId
    ))

    // Normalize the Euclidean part and average the first four similarities.
    var train4 = train3.map(r => (
      r._1 / scala.math.sqrt(3),
      //(r._2 + r._3 + r._4 + r._5 + r._6 + r._7 + r._8) / 7,
      (r._2 + r._3 + r._4 + r._5) / 4,
      r._9, //rejectOutcome
      r._10, //label
      r._11, //Result
      r._12 //UniqueId
    ))

    // Weighted combination of the two parts into the final distance.
    var train5 = train4.map(r => (
      r._1 * 0.15 + (1 - r._2) * 0.85, //Distance
      r._2, //similarity
      r._3, //rejectOutcome
      r._4, //label
      r._5, //Result
      r._6 //UniqueId
    ))

    var train6 = train5.toDF("Distance", "Sim", "rejectOutcome", "label", "Result", "UniqueId")
    train6.createOrReplaceTempView("train6")
    var train7 = spark.sql("SELECT * FROM train6 ORDER BY Distance ASC")
    train7 = train7.limit(num)
    train7
  }
  // Dot product of two vectors of the same length (the one-hot encoded level columns).
  def dotpro(vec1: org.apache.spark.ml.linalg.SparseVector, vec2: org.apache.spark.ml.linalg.SparseVector): Double = {
    var arr1 = vec1.toArray
    var arr2 = vec2.toArray
    var d = 0.0
    for (d1 <- 0 to arr1.length - 1) {
      d = d + (arr1(d1) * arr2(d1))
    }
    d
  }
  def main(args: Array[String]): Unit = {
  }
}
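If it helps clarify what I am after: this is how I imagined carrying the dot-product similarity over to a join-based version, by wrapping the same logic as dotpro in a UDF applied to the Level_* vector columns (again untested, just to illustrate the idea; I use the Vector supertype so it also works if a column happens to be dense):

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.functions.{col, udf}

// Same computation as dotpro, but usable directly on the vector columns of a joined DataFrame.
val dotproUdf = udf((v1: Vector, v2: Vector) => {
  val a1 = v1.toArray
  val a2 = v2.toArray
  var d = 0.0
  for (i <- a1.indices) d += a1(i) * a2(i)
  d
})

// e.g. applied to the payor level of the aliased join from the sketch above:
// joined.withColumn("simPayor", dotproUdf(col("t.Level_PayorId"), col("n.Level_PayorId")))

Would a join plus window approach like this scale better than the current per-row loop, or is there a more standard Spark way to do this?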