Changing a Java ArrayList to a Spark Dataset / DataFrame

Asked: 2018-01-12 23:34:19

Tags: scala apache-spark dataframe

This is the first question I have posted here.

I want to change this code. It currently uses a Java ArrayList, and I would like to use Spark DataFrames or Datasets instead. We need to compute the distance between records. Right now the code processes one record at a time, each record takes about 6 seconds, and I want to reduce that time.

First, the Dataset[Row] is converted to a List[Row]:

          var resultNNRows = df_testc.collectAsList()

          var cData = new ClaimOutputBean()
          var claimData = new ArrayList[ClaimOutputBean]()
          var neighborResult = ""

Then I loop over every record in the list and call the getNeighbor method to get the distances of the top 5 records. Each record takes about 6 seconds, and the process has to handle 1 million records. I want to change this and find out whether there is a Spark function I can use to process multiple records in parallel. A rough sketch of the kind of approach I am hoping for follows the loop below.

          for ( i <- 0 until resultNNRows.size()){
            // Each record takes ~6 seconds; we want to reduce this time.
            // Goal: use a Spark function to process multiple records in parallel.
              var resultList = getNeighbor(df_trainc, resultNNRows.get(i), k, mlFeatures).collectAsList()
              neighborResult = ""
              for ( j <- 0 until resultList.size()){
                  if (neighborResult.equals(""))
                    neighborResult = resultList.get(j).getAs("Result").toString()
                  else  
                    neighborResult = neighborResult + " ~ " + resultList.get(j).getAs("Result").toString()
              }
              println(i + " - neighborResult:::"+neighborResult)

              cData = new ClaimOutputBean()
              cData.setNeighborResults(neighborResult)
              cData.setUniqueId(resultNNRows.get(i).getAs("UniqueId"))

              claimData.add(cData)
          }
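
For example, something along these lines is roughly what I have in mind: join the test set to the training set on the matching keys and use a window to keep the top k training rows per test record. This is only a rough, untested sketch of the idea; it uses just the numeric part of the distance, and I don't know how to fold in the SparseVector similarities or the fallback filter levels from getNeighbor this way:

    import org.apache.spark.sql.expressions.Window
    import org.apache.spark.sql.functions._

    // Join each test row to its candidate training rows on the blocking keys,
    // and compute a simplified numeric distance per pair.
    val joined = df_testc.as("t")
      .join(df_trainc.as("n"),
            col("t.payorId") === col("n.payorId") &&
            col("t.therapyType") === col("n.therapyType") &&
            col("t.NDCNumber") === col("n.NDCNumber") &&
            col("t.procedureCode") === col("n.procedureCode"))
      .withColumn("Distance",
          sqrt(pow(col("t.Charges_SS") - col("n.Charges_SS"), 2) +
               pow(col("t.DaysOrUnits_SS") - col("n.DaysOrUnits_SS"), 2) +
               pow(col("t.Date_SS") - col("n.Date_SS"), 2)))

    // Keep the k closest training rows per test row, then concatenate their
    // Result values (note: the order of the collected Results is not guaranteed).
    val byTestRow = Window.partitionBy(col("t.UniqueId")).orderBy(col("Distance"))
    val neighborResults = joined
      .withColumn("rank", row_number().over(byTestRow))
      .filter(col("rank") <= k)
      .groupBy(col("t.UniqueId"))
      .agg(concat_ws(" ~ ", collect_list(col("n.Result"))).as("NeighborResults"))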

Here is the full code.

import java.util.{ArrayList, Date}

import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions.col
import org.apache.spark.storage.StorageLevel

// MLFeatures, ClaimOutputBean and NearestNeighborTransform are project-specific classes
// (not shown here).

object NearestNeighborPredict {

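    // Predicts, for each test row, the Result values of its k nearest training rows.
    // Right now the test set is collected to the driver and processed one row at a
    // time in the loop below, which is the part I would like to parallelize.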
    def predictNN (train : DataFrame, test : DataFrame, k : Int, mlFeatures : MLFeatures) : DataFrame = {

          val spark = mlFeatures.spark

          var startTime  = new Date().getTime();

          var dataResult = new NearestNeighborTransform().transformData(train, test)
          var df_train   = dataResult(0)
          var df_test    = dataResult(1)

          //println("df_train : "+ df_train.count())
          //println("df_test : "+ df_test.count())

          var endTime    = new Date().getTime();
          println("NN Predict Got Transformed Train Data : "+ (endTime - startTime) / 1000 + " seconds")

          var df_trainc = df_train.select(
          "Charges_SS", "DaysOrUnits_SS", "Date_SS",   
          "Level_PayorId", "Level_TherapyType", "Level_NDCNumber", "Level_ProcedureCode", 
          "Level_PatientId", "Level_ServiceBranchId", "Level_AuthNbr", 
          "rejectOutcome", "label", "payorId", "Result", "therapyType", "patientId", "NDCNumber", "procedureCode", "UniqueId"
          ).persist(StorageLevel.MEMORY_AND_DISK_SER)

          df_trainc.createOrReplaceTempView("df_trainc1")

          var df_testc = df_test.select(
          "Charges_SS", "DaysOrUnits_SS", "Date_SS",   
          "Level_PayorId", "Level_TherapyType", "Level_NDCNumber", "Level_ProcedureCode", 
          "Level_PatientId", "Level_ServiceBranchId", "Level_AuthNbr", 
          "rejectOutcome", "label", "payorId", "Result", "therapyType", "patientId", "NDCNumber", "procedureCode", "UniqueId"
          ).persist(StorageLevel.MEMORY_AND_DISK_SER)

          var resultNNRows = df_testc.collectAsList()

          var cData = new ClaimOutputBean()
          var claimData = new ArrayList[ClaimOutputBean]()
          var neighborResult = ""

          for ( i <- 0 until resultNNRows.size()){
            // Each record takes ~6 seconds; we want to reduce this time.
            // Goal: use a Spark function to process multiple records in parallel.
              var resultList = getNeighbor(df_trainc, resultNNRows.get(i), k, mlFeatures).collectAsList()
              neighborResult = ""
              for ( j <- 0 until resultList.size()){
                  if (neighborResult.equals(""))
                    neighborResult = resultList.get(j).getAs("Result").toString()
                  else  
                    neighborResult = neighborResult + " ~ " + resultList.get(j).getAs("Result").toString()
              }
              println(i + " - neighborResult:::"+neighborResult)

              cData = new ClaimOutputBean()
              cData.setNeighborResults(neighborResult)
              cData.setUniqueId(resultNNRows.get(i).getAs("UniqueId"))

              claimData.add(cData)
          }

          var resultsNN = mlFeatures.convertClaimOutputBeanToDataFrame(claimData)
          //resultsNN.show(false)

          df_trainc.unpersist()
          df_testc.unpersist()

          resultsNN
   }


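   // Finds the `num` nearest training rows for a single test row (`key`): filters the
   // registered df_trainc1 view on progressively looser key matches, scores each
   // candidate by a weighted mix of numeric distance and one-hot similarity, and
   // returns the top `num` rows ordered by Distance.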
   def getNeighbor (train : DataFrame, key : Row, num : Int, mlFeatures : MLFeatures) : DataFrame = {

      val spark = mlFeatures.spark

      var trainFiltered = spark.sql("SELECT * FROM df_trainc1")
                               .filter(
                                        col("payorId").equalTo(key.getAs("payorId").toString().toInt).&& 
                                       (col("therapyType").equalTo(key.getAs("therapyType").toString())).&&
                                       (col("NDCNumber").equalTo(key.getAs("NDCNumber").toString())).&&
                                       (col("procedureCode").equalTo(key.getAs("procedureCode").toString()))
                                      )
      if(trainFiltered.count() == 0){      
          println("cant find at payor, therapy, NDCNumber and ProcCode Level")
          trainFiltered = spark.sql("SELECT * FROM df_trainc1")
                               .filter(
                                        col("payorId").equalTo(key.getAs("payorId").toString().toInt).&& 
                                       (col("therapyType").equalTo(key.getAs("therapyType").toString())).&&
                                       (col("NDCNumber").equalTo(key.getAs("NDCNumber").toString()))
                                      )  
      }

      if(trainFiltered.count() == 0){
          println("cant find at payor, therapy and NDCNumber Level")
          trainFiltered = spark.sql("SELECT * FROM df_trainc1")
                               .filter(
                                        col("payorId").equalTo(key.getAs("payorId").toString().toInt).&&
                                        (col("therapyType").equalTo(key.getAs("therapyType").toString()))
                                      )  
      }          

      import spark.implicits._

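      // train1..train5 build up the distance step by step: numeric deltas against the
      // key row, their Euclidean norm, dot products of the one-hot vectors with the
      // key's vectors, and finally a weighted combination of the two parts.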
      var train1 = trainFiltered.map(r => (r.getAs[Double]("Charges_SS") - key.getAs("Charges_SS").toString().toDouble,             //charges
                                   r.getAs[Double]("DaysOrUnits_SS") - key.getAs("DaysOrUnits_SS").toString().toDouble,             //daysOrUnits
                                   r.getAs[Double]("Date_SS") - key.getAs("Date_SS").toString().toDouble,                           //date 
                                   r.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_PayorId"),                               //level_payorId
                                   r.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_TherapyType"),                           //level_therapyType 
                                   r.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_NDCNumber"),                             //level_NDCNumber
                                   r.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_ProcedureCode"),                         //level_procCode  
                                   r.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_PatientId"),                             //level_patientId
                                   r.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_ServiceBranchId"),                       //level_serviceBranchId
                                   r.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_AuthNbr"),                               //level_AuthNbr
                                   r.getAs[String]("rejectOutcome"),                                                                //rejectOutome 
                                   r.getAs[Double]("label"),                                                                        //label
                                   r.getAs[String]("Result"),                                                                       //Result
                                   r.getAs[Long]("UniqueId")                                                                        //UniqueId
                                  )                                    
                            )                                

      var train2 = train1.map(r => (r._1*r._1 + r._2*r._2 + r._3*r._3,
                                     r._4,                              //level_payorId
                                     r._5,                              //level_therapyType
                                     r._6,                              //level_ndcNumber
                                     r._7,                              //level_procCode
                                     r._8,                              //level_patientId
                                     r._9,                              //level_serviceBranchId
                                     r._10,                             //level_authNbr
                                     r._11,                             //rejectOutcome 
                                     r._12,                             //label
                                     r._13,                             //Result 
                                     r._14                              //UniqueId
                                    )
                              )

      var train3 = train2.map(r => (scala.math.sqrt(r._1),
                                     dotpro(r._2, key.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_PayorId")),
                                     dotpro(r._3, key.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_TherapyType")),
                                     dotpro(r._4, key.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_NDCNumber")),
                                     dotpro(r._5, key.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_ProcedureCode")),
                                     dotpro(r._6, key.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_PatientId")),
                                     dotpro(r._7, key.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_ServiceBranchId")),
                                     dotpro(r._8, key.getAs[org.apache.spark.ml.linalg.SparseVector]("Level_AuthNbr")),
                                     r._9,                              //rejectOutcome
                                     r._10,                             //label
                                     r._11,                             //Result 
                                     r._12                              //UniqueId
                                    )
                              )

      var train4 = train3.map(r => (r._1/scala.math.sqrt(3),
                                     //(r._2+r._3+r._4+r._5+r._6+r._7+r._8)/7,
                                     (r._2+r._3+r._4+r._5)/4,
                                     r._9,                              //rejectOutcome
                                     r._10,                             //label
                                     r._11,                             //Result
                                     r._12                              //UniqueId
                                    )
                              )

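      // Final distance: 15% of the scaled numeric distance + 85% of the dissimilarity (1 - Sim).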
      var train5 = train4.map(r => (r._1*0.15+(1-r._2)*0.85,            //Distance
                                     r._2,                              //similarity 
                                     r._3,                              //rejectOutcome
                                     r._4,                              //label
                                     r._5,                              //Result
                                     r._6                               //UniqueId
                                    )
                              )

      var train6 = train5.toDF("Distance", "Sim", "rejectOutcome", "label", "Result", "UniqueId")
      train6.createOrReplaceTempView("train6")

      var train7 = spark.sql("SELECT * FROM train6 ORDER BY Distance ASC")
      train7 = train7.limit(num)

      train7
   }  


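   // Dot product of two SparseVectors; assumes both vectors have the same length.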
   def dotpro (vec1 : org.apache.spark.ml.linalg.SparseVector, vec2 : org.apache.spark.ml.linalg.SparseVector) : Double = {
        var arr1 = vec1.toArray
        var arr2 = vec2.toArray
        var d = 0.0
        for (d1 <- 0 to arr1.length-1){
          d = d+(arr1(d1)*arr2(d1))
        }
        d
   }  

   def main(args: Array[String]) { 
   }
}

0 Answers