Spark jobs freeze when launched from a ParArray

Asked: 2016-02-21 03:42:28

Tags: scala apache-spark scala-collections

I want to convert time-series data from a set of CSV files into LabeledPoints and save them to Parquet files. The CSV files are small, typically < 10 MiB each.
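For context, the target format is MLlib's LabeledPoint, which pairs a Double label with a feature vector; a minimal example (the values here are made up):

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors

// one training example: a label plus a dense feature vector built from a CSV row
val example = LabeledPoint(1.03, Vectors.dense(10.2, 9.8, 10.1, 9.9))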

When I launch the job with a ParArray, it submits four Spark jobs at a time and then freezes. Here is the code:

val idx = Another_DataFrame
ListFiles(new File("data/stock data"))
  .filter(_.getName.contains(".csv"))
  .zipWithIndex
  .par // comment this line and the code runs smoothly
  .foreach { f =>
    val stk = spark_csv(f._1.getPath) // doing good
    ColMerge(stk, idx, RESULT_PATH(f)) // freezes here
    stk.unpersist()
  }

And the part that freezes:

def ColMerge(ori: DataFrame, index: DataFrame, PATH: String) = {
  val df = ori.join(index, ori("date") === index("index_date"))
    .drop("index_date").orderBy("date").cache
  val head = df.head
  val col = df.columns.filter(e => e != "code" && e != "date" && e != "name")
  // string-typed columns get multi-hot encoded; the remaining ones are read as Double
  val toMap = col.filter { e =>
    head.get(head.fieldIndex(e)).isInstanceOf[String]
  }.sorted
  val toCast = col.diff(toMap).filterNot(_ == "data")
  val res: Array[((String, String, Array[Double]), Long)] = df.sort("date").map { row =>
    val res1 = toCast.map { col =>
      row.getDouble(row.fieldIndex(col))
    }
    val res2 = toMap.flatMap { col =>
      val mapping = new Array[Double](GlobalConfig.ColumnMapping(col).size)
      row.getString(row.fieldIndex(col)).split(";").par.foreach { word =>
        mapping(GlobalConfig.ColumnMapping(col)(word)) = 1
      }
      mapping
    }
    (
      row.getString(row.fieldIndex("code")),
      row.getString(row.fieldIndex("date")),
      res1 ++ res2 ++ row.getAs[Seq[Double]]("data")
    )
  }.zipWithIndex.collect
  df.unpersist
  val dataset = GlobalConfig.sctx.makeRDD(res.map { day =>
    (
      day._1._1,
      day._1._2,
      try {
        // label: high price two rows ahead divided by the current row's low price
        new LabeledPoint(
          GetHighPrice(res(day._2.toInt + 2)._1._3.slice(0, 4)) /
            GetLowPrice(res(day._2.toInt)._1._3.slice(0, 4)) * 1.03,
          Vectors.dense(day._1._3))
      } catch {
        case ex: ArrayIndexOutOfBoundsException =>
          // rows near the end have no row two steps ahead; flag them for removal
          new LabeledPoint(-1, Vectors.dense(day._1._3))
      }
    )
  }).filter(_._3.label != -1).toDF("code", "date", "labeledpoint")
  dataset.write.mode(SaveMode.Overwrite).parquet(PATH)
}
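(For reference, the res2 branch above is essentially a multi-hot encoding of each ";"-separated string column. A standalone sketch of that step, where columnMapping is a hypothetical stand-in for GlobalConfig.ColumnMapping(col) and the inner loop is kept sequential:)

// hypothetical vocabulary for one string column
val columnMapping: Map[String, Int] = Map("tech" -> 0, "energy" -> 1, "bank" -> 2)

def multiHot(cell: String): Array[Double] = {
  val vec = new Array[Double](columnMapping.size)
  // set 1.0 at each word's slot; the original parallelises this inner loop with .par
  cell.split(";").foreach { word => vec(columnMapping(word)) = 1.0 }
  vec
}

multiHot("tech;bank") // Array(1.0, 0.0, 1.0)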

The exact job that freezes is either the DataFrame.sort() or the zipWithIndex when generating res in ColMerge.
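One detail worth noting: RDD.zipWithIndex is documented to trigger a Spark job of its own whenever the RDD has more than one partition, because it must count the records in each partition before it can assign global indices. A minimal illustration, reusing the same GlobalConfig.sctx as above:

val rdd = GlobalConfig.sctx.parallelize(Seq("a", "b", "c"), numSlices = 2)
val indexed = rdd.zipWithIndex() // runs one job up front to size each partition
indexed.collect()                // Array((a,0), (b,1), (c,2))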

Since most of the work is done after the collect, I would really like to use a ParArray to speed up ColMerge, but this strange freeze prevents me from doing so. Do I need to create a new thread pool?
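One idea I have been looking at (a sketch only, not verified to fix the freeze): give the ParArray its own bounded pool via tasksupport instead of the default shared one. The default pool is sized to the number of cores, which would explain the four jobs submitted at a time on a 4-core machine. This assumes Scala 2.11, where ForkJoinPool lives under scala.concurrent.forkjoin (in 2.12+ it is java.util.concurrent.ForkJoinPool):

import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool

val files = ListFiles(new File("data/stock data"))
  .filter(_.getName.contains(".csv"))
  .zipWithIndex
  .par
// cap the collection's parallelism at 2 instead of the CPU count
files.tasksupport = new ForkJoinTaskSupport(new ForkJoinPool(2))
files.foreach { f =>
  val stk = spark_csv(f._1.getPath)
  ColMerge(stk, idx, RESULT_PATH(f))
  stk.unpersist()
}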

0 Answers:

No answers yet.