我想将一组时间序列数据从多个csv文件转换为Labeledpoint
并保存到镶木地板文件中。 Csv文件很小,通常< 10MiB
当我使用ParArray启动它时,它每次提交4个作业并冻结。代码在这里
val idx = Another_DataFrame
ListFiles(new File("data/stock data"))
.filter(_.getName.contains(".csv")).zipWithIndex
.par //comment this line and code runs smoothly
.foreach{
f=>
val stk = spark_csv(f._1.getPath) //doing good
ColMerge(stk,idx,RESULT_PATH(f)) //freeze here
stk.unpersist()
}
和冻结部分:
def ColMerge(ori:DataFrame,index:DataFrame,PATH:String) = {
val df = ori.join(index,ori("date")===index("index_date")).drop("index_date").orderBy("date").cache
val head = df.head
val col = df.columns.filter(e=>e!="code"&&e!="date"&&e!="name")
val toMap = col.filter{
e=>head.get(head.fieldIndex(e)).isInstanceOf[String]
}.sorted
val toCast = col.diff(toMap).filterNot(_=="data")
val res: Array[((String, String, Array[Double]), Long)] = df.sort("date").map{
row=>
val res1= toCast.map{
col=>
row.getDouble(row.fieldIndex(col))
}
val res2= toMap.flatMap{
col=>
val mapping = new Array[Double](GlobalConfig.ColumnMapping(col).size)
row.getString(row.fieldIndex(col)).split(";").par.foreach{
word=>
mapping(GlobalConfig.ColumnMapping(col)(word)) = 1
}
mapping
}
(
row.getString(row.fieldIndex("code")),
row.getString(row.fieldIndex("date")),
res1++res2++row.getAs[Seq[Double]]("data")
)
}.zipWithIndex.collect
df.unpersist
val dataset = GlobalConfig.sctx.makeRDD(res.map{
day=>
(day._1._1,
day._1._2,
try{
new LabeledPoint(GetHighPrice(res(day._2.toInt+2)._1._3.slice(0,4))/GetLowPrice(res(day._2.toInt)._1._3.slice(0,4))*1.03,Vectors.dense(day._1._3))
}
catch {
case ex:ArrayIndexOutOfBoundsException=>
new LabeledPoint(-1,Vectors.dense(day._1._3))
}
)
}).filter(_._3.label != -1).toDF("code","date","labeledpoint")
dataset.write.mode(SaveMode.Overwrite).parquet(PATH)
}
在DataFrame.sort()
zipWithIndex
时冻结的确切作业是res
或ColMerge
由于大部分工作都是在collect
之后完成的,我真的想用ParArray来加速ColMerge
,但是这种奇怪的冻结让我无法这样做。我需要新建一个线程池吗?